In [1]:
import pandas as pd
import pickle
import numpy as np

https://arxiv.org/abs/1604.06737

https://www.fast.ai/2018/04/29/categorical-embeddings/

https://youtu.be/XJ_waZlJU8g?t=4660

https://www.youtube.com/watch?v=5_xFdhfUnvQ&feature=youtu.be

https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

https://github.com/entron/entity-embedding-rossmann

https://github.com/terry-li-hm/rossmann

In [2]:
def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using trn as
    a template for the category codes.
    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values. The category codes are determined by trn.
    trn: A pandas dataframe. When creating a category for df, it looks up the
        what the category's code were in trn and makes those the category codes
        for df.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a
    note the type of col2 is string
    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a
    now the type of col2 is category {a : 1, b : 2}
    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, df)
           col1 col2
        0     1    b
        1     2    a
        2     3    a
    now the type of col is category {a : 1, b : 2}
    """
    for n,c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name=='category'):
            df[n] = c.astype('category').cat.as_ordered()
            df[n].cat.set_categories(trn[n].cat.categories, ordered=True, inplace=True)

In [3]:
PATH='dataset/rossmann/'

In [46]:
joined = pd.read_feather(f'{PATH}joined.feather')
joined_test = pd.read_feather(f'{PATH}joined_test.feather')

Como elejir Validación:

https://www.fast.ai/2017/11/13/validation-sets/

In [5]:
joined.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
Store,1,2,3,4,5
DayOfWeek,5,5,5,5,5
Date,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00
Sales,5263,6064,8314,13995,4822
Customers,555,625,821,1498,559
Open,1,1,1,1,1
Promo,1,1,1,1,1
StateHoliday,False,False,False,False,False
SchoolHoliday,1,1,1,1,1


In [6]:
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

contin_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

n = len(joined); n

844338

In [7]:
# Me quedo solo con las variables de interes
dep = 'Sales'
joined = joined[cat_vars+contin_vars+[dep, 'Date']].copy()

joined_test[dep] = 0
joined_test = joined_test[cat_vars+contin_vars+[dep, 'Date', 'Id']].copy()

In [8]:
for v in cat_vars: joined[v] = joined[v].astype('category').cat.as_ordered()

In [9]:
# el +1 es para las unknown
cat_sz = [(c, len(joined[c].cat.categories)+1, min(50, (len(joined[c].cat.categories)+1+1)//2)) for c in cat_vars]

In [10]:
cat_sz

[('Store', 1116, 50),
 ('DayOfWeek', 8, 4),
 ('Year', 4, 2),
 ('Month', 13, 7),
 ('Day', 32, 16),
 ('StateHoliday', 3, 2),
 ('CompetitionMonthsOpen', 26, 13),
 ('Promo2Weeks', 27, 14),
 ('StoreType', 5, 3),
 ('Assortment', 4, 2),
 ('PromoInterval', 4, 2),
 ('CompetitionOpenSinceYear', 24, 12),
 ('Promo2SinceYear', 9, 5),
 ('State', 13, 7),
 ('Week', 53, 27),
 ('Events', 22, 11),
 ('Promo_fw', 7, 4),
 ('Promo_bw', 7, 4),
 ('StateHoliday_fw', 4, 2),
 ('StateHoliday_bw', 4, 2),
 ('SchoolHoliday_fw', 9, 5),
 ('SchoolHoliday_bw', 9, 5)]

In [11]:
# Aplica la misma regla de encoding usada en joined a joined_test
apply_cats(joined_test, joined)

In [12]:
for v in contin_vars:
    joined[v] = joined[v].fillna(0).astype('float32')
    joined_test[v] = joined_test[v].fillna(0).astype('float32')

In [13]:
joined = joined.set_index("Date")
joined_test = joined_test.set_index("Date")

In [48]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler

In [49]:
cat_maps = [(o, LabelEncoder()) for o in cat_vars]
contin_maps = [([o], StandardScaler()) for o in contin_vars]

In [56]:
joined['PromoInterval'].astype('str')

0                     None
1          Jan,Apr,Jul,Oct
2          Jan,Apr,Jul,Oct
3                     None
4                     None
5                     None
6                     None
7                     None
8                     None
9                     None
10         Jan,Apr,Jul,Oct
11         Jan,Apr,Jul,Oct
12         Feb,May,Aug,Nov
13         Jan,Apr,Jul,Oct
14         Jan,Apr,Jul,Oct
15                    None
16         Jan,Apr,Jul,Oct
17         Jan,Apr,Jul,Oct
18        Mar,Jun,Sept,Dec
19         Jan,Apr,Jul,Oct
20         Jan,Apr,Jul,Oct
21         Jan,Apr,Jul,Oct
22                    None
23         Jan,Apr,Jul,Oct
24                    None
25                    None
26         Jan,Apr,Jul,Oct
27        Mar,Jun,Sept,Dec
28                    None
29        Mar,Jun,Sept,Dec
                ...       
844308     Feb,May,Aug,Nov
844309     Jan,Apr,Jul,Oct
844310     Feb,May,Aug,Nov
844311     Jan,Apr,Jul,Oct
844312     Jan,Apr,Jul,Oct
844313                None
8

In [54]:
cat_mapper = DataFrameMapper(cat_maps)
cat_map_fit = cat_mapper.fit(joined)
# cat_cols = len(cat_map_fit.features)
# cat_cols

TypeError: PromoInterval: argument must be a string or number

In [14]:
from keras.layers import Flatten, Embedding, Input, merge, Dense, Dropout
from keras.models import Model

Using TensorFlow backend.


In [15]:
cont_out = len(contin_vars)

In [16]:
def get_emb(cat_sz):
    name, c, c2 = cat_sz
    inp = Input((1,), dtype='int64', name=name+'_in')
    u = Flatten(name=name+'_flt')(Embedding(c, c2, input_length=1)(inp))
    return inp, u

def get_contin(name):
    inp = Input((1,), name=name+'_in')
    return inp, Dense(1, name=name+'_d')(inp)

def get_model(cat_sz, contin_vars):
    conts = [get_contin(feat) for feat in contin_vars]
    cont_out = [d for inp,d in conts]
    cont_inp = [inp for inp,d in conts]

    embs = [get_emb(feat) for feat in cat_sz]
    # print([emb for inp,emb in embs] + cont_out)
    x = merge.Concatenate()([emb for inp,emb in embs] + cont_out)

    x = Dropout(0.02)(x)
    x = Dense(1000, activation='relu', init='uniform')(x)
    x = Dense(500, activation='relu', init='uniform')(x)
    x = Dense(1, activation='sigmoid')(x)

    model = Model([inp for inp,emb in embs] + cont_inp, x)
    model.compile('adam', 'mean_absolute_error')
    #model.compile(Adam(), 'mse')
    return model

In [17]:
model = get_model(cat_sz, contin_vars)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [18]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Store_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
DayOfWeek_in (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
Year_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
Month_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
Day_in (In

In [19]:
joined.shape

(844338, 39)

In [20]:
joined.columns

Index(['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
       'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
       'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State',
       'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw',
       'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw',
       'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC',
       'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
       'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend',
       'trend_DE', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo',
       'SchoolHoliday', 'Sales'],
      dtype='object')

In [21]:
len(model.input)

38

In [22]:
X_train = joined[['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
       'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
       'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State',
       'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw',
       'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw',
       'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC',
       'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
       'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend',
       'trend_DE', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo',
       'SchoolHoliday']]
y_train = joined[[dep]]

In [23]:
X_train.shape

(844338, 38)

In [35]:
X_train_list = np.hsplit(X_train.values, X_train.shape[1])

In [36]:
len(X_train_list)

38

In [37]:
model.fit(X_train_list, y_train.values[:,0])

Epoch 1/1


ValueError: invalid literal for int() with base 10: 'a'

In [41]:
for d in X_train_list:
    print(d)

[[1]
 [2]
 [3]
 ...
 [769]
 [948]
 [1097]]
[[5]
 [5]
 [5]
 ...
 [2]
 [2]
 [2]]
[[2015]
 [2015]
 [2015]
 ...
 [2013]
 [2013]
 [2013]]
[[7]
 [7]
 [7]
 ...
 [1]
 [1]
 [1]]
[[31]
 [31]
 [31]
 ...
 [1]
 [1]
 [1]]
[[False]
 [False]
 [False]
 ...
 [True]
 [True]
 [True]]
[[24]
 [24]
 [24]
 ...
 [0]
 [0]
 [24]]
[[0]
 [25]
 [25]
 ...
 [5]
 [0]
 [0]]
[['c']
 ['a']
 ['a']
 ...
 ['b']
 ['b']
 ['b']]
[['a']
 ['a']
 ['a']
 ...
 ['b']
 ['b']
 ['b']]
[[nan]
 ['Jan,Apr,Jul,Oct']
 ['Jan,Apr,Jul,Oct']
 ...
 ['Jan,Apr,Jul,Oct']
 [nan]
 [nan]]
[[2008]
 [2007]
 [2006]
 ...
 [1900]
 [1900]
 [2002]]
[[1900]
 [2010]
 [2011]
 ...
 [2012]
 [1900]
 [1900]]
[['HE']
 ['TH']
 ['NW']
 ...
 ['NW']
 ['BW']
 ['RP']]
[[31]
 [31]
 [31]
 ...
 [1]
 [1]
 [1]]
[['Fog']
 ['Fog']
 ['Fog']
 ...
 ['Rain']
 ['Fog-Rain']
 ['Rain']]
[[5.0]
 [1.0]
 [5.0]
 ...
 [1.0]
 [1.0]
 [1.0]]
[[5.0]
 [5.0]
 [5.0]
 ...
 [0.0]
 [0.0]
 [0.0]]
[[0.0]
 [0.0]
 [0.0]
 ...
 [1.0]
 [2.0]
 [1.0]]
[[0.0]
 [0.0]
 [0.0]
 ...
 [1.0]
 [1.0]
 [1.0]]
[[7.0]
 [1.