# Rossmann Challenge

## 0. Entorno de ejecución
* pip install pandas
* pip install pandas_summary
* pip install pyarrow
* pip install scikit-learn
* pip install sklearn-pandas

In [18]:
import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper

## 1. Levantamos los datos preprocesados

In [33]:
# Load the preprocessed train and test tables from their Feather files
# (train first, then test).
df_train, df_test = (
    pd.read_feather(path)
    for path in ('all_preprocessed_train.fth', 'all_preprocessed_test.fth')
)

In [34]:
# Display the (rows, columns) shape of the train and test tables.
df_train.shape, df_test.shape

((1017209, 81), (41088, 80))

In [35]:
# Check that the test table went through the same preprocessing as train:
# list the columns that exist in train but are absent from test.
set(df_train.columns) - set(df_test.columns)

{'Customers', 'Sales'}

In [36]:
# Display every column name available in the training table.
df_train.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Week', 'Day',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
       'CompetitionOpenSince', 'Promo2Since', 'State', 'file', 'week', 'trend',
       'Date_y', 'Month_y', 'Day_y', 'file_DE', 'week_DE', 'trend_DE',
       'Date_DE', 'State_DE', 'Month_DE', 'Day_DE', 'file_y',
       'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
       'Dew_PointC', 'MeanDew_PointC', 'Min_DewpointC', 'Max_Humidity',
       'Mean_Humidity', 'Min_Humidity', 'Max_Sea_Level_PressurehPa',
       'Mean_Sea_Level_PressurehPa', 'Min_Sea_Level_PressurehPa',
       'Max_VisibilityKm', 'Mean_VisibilityKm', 'Min_VisibilitykM',
       'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'Max_Gust_SpeedKm_h',
       'Precipitationmm', 'Cl

## 2. Variables categóricas

In [37]:
# Columns treated as categorical. The order matters: it is the column order of
# the array produced by the categorical mapper built from this list.
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day',
    'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks',
    'StoreType', 'Assortment', 'PromoInterval',
    'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events',
    'Promo_fw', 'Promo_bw',
    'StateHoliday_bool_fw', 'StateHoliday_bool_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw',
]

In [38]:
# Per-column overview of the categorical variables: number of unique values,
# detected type and missing count, shown with one row per variable.
(
    DataFrameSummary(df_train[cat_vars])
    .summary()
    .loc[['uniques', 'types', 'missing']]
    .T
)

Unnamed: 0,uniques,types,missing
Store,1115,numeric,0
DayOfWeek,7,numeric,0
Year,3,numeric,0
Month,12,numeric,0
Day,31,numeric,0
StateHoliday,4,categorical,0
CompetitionMonthsOpen,25,numeric,0
Promo2Weeks,26,numeric,0
StoreType,4,categorical,0
Assortment,3,categorical,0


No hay datos faltantes en las variables categóricas.

## 3. Variables continuas

In [39]:
# Columns treated as continuous. The order matters: it is the column order of
# the array produced by the continuous mapper built from this list.
contin_vars = [
    'CompetitionDistance',
    'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Precipitationmm',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
    'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h',
    'CloudCover',
    'trend', 'trend_DE',
    'AfterStateHoliday_bool', 'BeforeStateHoliday_bool',
    'Promo', 'SchoolHoliday', 'StateHoliday_bool',
]

In [40]:
# Per-column overview of the continuous variables: number of unique values,
# detected type and missing count, shown with one row per variable.
(
    DataFrameSummary(df_train[contin_vars])
    .summary()
    .loc[['uniques', 'types', 'missing']]
    .T
)

Unnamed: 0,uniques,types,missing
CompetitionDistance,654,numeric,0
Max_TemperatureC,50,numeric,0
Mean_TemperatureC,45,numeric,0
Min_TemperatureC,40,numeric,0
Precipitationmm,40,numeric,0
Max_Humidity,52,numeric,0
Mean_Humidity,71,numeric,0
Min_Humidity,93,numeric,0
Max_Wind_SpeedKm_h,42,numeric,0
Mean_Wind_SpeedKm_h,27,numeric,0


No hay datos faltantes en las variables continuas

## 4. Normalización de las variables continuas y LabelEncode de las categóricas

Lista de tuplas, donde cada tupla contiene: nombre de la variable y función Encoder/Normalizadora.

In [41]:
# Build one (column selector, transformer) pair per variable, each with its own
# transformer instance. Categorical columns are selected by bare name, while
# continuous columns are selected with a one-element list.
cat_maps = [(name, LabelEncoder()) for name in cat_vars]
contin_maps = [([name], StandardScaler()) for name in contin_vars]

In [42]:
# Wrap the categorical (column, LabelEncoder) pairs in a DataFrameMapper and
# fit it on the training table only.
mapper_cat = DataFrameMapper(cat_maps)
# The value returned by fit() is bound to `_` so the cell echoes no output.
_ = mapper_cat.fit(df_train)

Vemos algunos ejemplos de cómo se realiza el LabelEncode:

In [26]:
# Show the first N raw values of a few categorical columns next to their
# label-encoded values.
#
# Only the previewed rows are passed through the mapper, and only once, instead
# of transforming the whole training table for every printed column. Each
# column's position in the mapper output is looked up by name in cat_vars, so
# the preview stays correct if the list is reordered.
N = 10
preview_rows = df_train.head(N)
preview_encoded = mapper_cat.transform(preview_rows)
for preview_col in ('Assortment', 'Events', 'Year'):
    encoded_col = preview_encoded[:, cat_vars.index(preview_col)]
    print(list(zip(preview_rows[preview_col].values, encoded_col)))

[('a', 0), ('a', 0), ('a', 0), ('c', 2), ('a', 0), ('a', 0), ('c', 2), ('a', 0), ('c', 2), ('a', 0)]
[('Fog', 0), ('Fog', 0), ('Fog', 0), ('Sunny', 20), ('Sunny', 20), ('Sunny', 20), ('Rain', 10), ('Rain', 10), ('Fog', 0), ('Sunny', 20)]
[(2015, 2), (2015, 2), (2015, 2), (2015, 2), (2015, 2), (2015, 2), (2015, 2), (2015, 2), (2015, 2), (2015, 2)]


Vemos que para el surtido (`Assortment`), la 'a' se codifica con 0 y la 'c' con 2.
Para Events, 'Fog' se codifica con 0, 'Rain' con 10 y 'Sunny' con 20.
Por último, para la columna de año, 2015 se codifica como 2.

In [43]:
# Wrap the continuous (column, StandardScaler) pairs in a DataFrameMapper and
# fit it on the training table only.
mapper_cont = DataFrameMapper(contin_maps)
# The value returned by fit() is bound to `_` so the cell echoes no output.
_ = mapper_cont.fit(df_train)

Vemos ejemplos de la normalización de las variables continuas:

In [28]:
# Compare the first N raw CompetitionDistance values with their scaled values.
#
# Only the previewed rows are passed through the mapper instead of the whole
# training table, and the column position is looked up by name in contin_vars
# rather than hard-coded.
N = 10
preview_cont_rows = df_train.head(N)
print(preview_cont_rows['CompetitionDistance'].values)
print(mapper_cont.transform(preview_cont_rows)[:, contin_vars.index('CompetitionDistance')])

[ 1270.   570. 14130.   620. 29910.   310. 24000.  7520.  2030.  3160.]
[-0.51104292 -0.59341201  1.00219483 -0.5875285   2.85902934 -0.62400624
  2.16359893  0.22439533 -0.42161363 -0.28864639]


Ahora lo hacemos con todas las variables y todos los datos.

In [44]:
# Now encode every categorical column, for all rows of both tables, using the
# mapper that was fitted on the training table.
# NOTE: the raw columns are overwritten in place, so re-running this cell would
# feed already-encoded values back into encoders fitted on the raw values.
# Run it once per kernel session.
df_train[cat_vars] = mapper_cat.transform(df_train)
df_test[cat_vars] = mapper_cat.transform(df_test)

In [47]:
# Display the first rows of the categorical columns after encoding.
df_train[cat_vars].head()

Unnamed: 0,Store,DayOfWeek,Year,Month,Day,StateHoliday,CompetitionMonthsOpen,Promo2Weeks,StoreType,Assortment,...,Promo2SinceYear,State,Week,Events,Promo_fw,Promo_bw,StateHoliday_bool_fw,StateHoliday_bool_bw,SchoolHoliday_fw,SchoolHoliday_bw
0,0,4,2,6,30,0,24,25,2,0,...,0,4,30,0,1,5,0,0,1,5
1,1,4,2,6,30,0,24,25,0,0,...,2,11,30,0,1,5,0,0,1,5
2,2,4,2,6,30,0,24,25,0,0,...,3,6,30,0,1,5,0,0,1,5
3,3,4,2,6,30,0,24,25,2,2,...,0,0,30,20,1,5,0,0,1,5
4,4,4,2,6,30,0,3,25,0,0,...,0,9,30,20,1,5,0,0,1,5


Vemos que todas las categorías están codificadas

In [48]:
# Summarize the unique counts and detected types of the categorical columns
# after encoding.
(
    DataFrameSummary(df_train[cat_vars])
    .summary()
    .loc[['uniques', 'types']]
)

Unnamed: 0,Store,DayOfWeek,Year,Month,Day,StateHoliday,CompetitionMonthsOpen,Promo2Weeks,StoreType,Assortment,...,Promo2SinceYear,State,Week,Events,Promo_fw,Promo_bw,StateHoliday_bool_fw,StateHoliday_bool_bw,SchoolHoliday_fw,SchoolHoliday_bw
uniques,1115,7,3,12,31,4,25,26,4,3,...,8,12,52,22,6,6,3,3,8,8
types,numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric,...,numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric


Todas las variables ahora son numéricas

In [49]:
# Scale every continuous column of both tables with the mapper that was fitted
# on the training table.
# NOTE: the raw columns are overwritten in place, so re-running this cell would
# apply the scaling a second time to already-scaled values. Run it once per
# kernel session.
df_train[contin_vars] = mapper_cont.transform(df_train)
df_test[contin_vars] = mapper_cont.transform(df_test)

In [50]:
# Display the first rows of the continuous columns after scaling.
df_train[contin_vars].head()

Unnamed: 0,CompetitionDistance,Max_TemperatureC,Mean_TemperatureC,Min_TemperatureC,Precipitationmm,Max_Humidity,Mean_Humidity,Min_Humidity,Max_Wind_SpeedKm_h,Mean_Wind_SpeedKm_h,CloudCover,trend,trend_DE,AfterStateHoliday_bool,BeforeStateHoliday_bool,Promo,SchoolHoliday,StateHoliday_bool
0,-0.511043,1.024446,0.818632,0.325047,-0.315778,0.61112,-1.510789,-1.62656,0.148437,-0.147662,-1.859165,1.744361,1.743049,0.644376,1.072424,1.273237,2.144211,-0.177442
1,-0.593412,0.551899,0.404215,0.170242,-0.315778,0.870277,-0.905024,-1.269897,-0.97008,-0.147662,-0.493858,1.303439,1.743049,0.965073,1.072424,1.273237,2.144211,-0.177442
2,1.002195,0.788172,0.404215,0.015437,-0.315778,0.870277,-0.980745,-1.320848,-0.97008,-1.165096,-1.404063,1.832546,1.743049,0.644376,1.072424,1.273237,2.144211,-0.177442
3,-0.587528,0.551899,0.542354,0.479853,-0.315778,0.092806,-0.980745,-1.015137,0.036585,0.7002,0.416347,0.774331,1.743049,0.965073,1.072424,1.273237,2.144211,-0.177442
4,2.859029,0.670036,0.680493,0.634658,-0.315778,-1.462136,-1.435069,-1.218945,-0.97008,-0.147662,-0.493858,1.479808,1.743049,0.644376,1.072424,1.273237,2.144211,-0.177442


Vemos que todos los valores están normalizados

## 5. Procesamiento final

In [51]:
# Keep only the training rows whose Sales value is non-zero.
df_sales = df_train.loc[df_train['Sales'] != 0]

In [52]:
# Display the (rows, columns) shape left after dropping zero-sales rows.
df_sales.shape

(844338, 81)

In [53]:
# Replace the test table's index with a fresh default one, in place. Because
# `drop` is not set, the previous index is kept as a new 'index' column.
# NOTE(review): presumably done for the Feather export, which expects a default
# index — confirm whether the extra 'index' column is wanted downstream.
df_test.reset_index(inplace=True)

In [54]:
# Replace the filtered training table's index with a fresh default one, in
# place. Because `drop` is not set, the previous row labels are kept as a new
# 'index' column.
# NOTE(review): presumably required for the Feather export, since the filter
# left gaps in the index — confirm against the pandas to_feather docs.
df_sales.reset_index(inplace=True)

In [55]:
# Write the encoded, scaled and filtered training table to a Feather file.
df_sales.to_feather('train_normalized_data.fth')

In [56]:
# Write the encoded and scaled test table to a Feather file.
df_test.to_feather('test_normalized_data.fth')