In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import date, timedelta
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from scipy import sparse as ssp

df_train = pd.read_csv(
    'train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},  
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0}, 
    parse_dates=['date'], 
    skiprows=range(1, 66458909)
)

df_test = pd.read_csv(
    'test.csv', usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=['date']  
).set_index(
    ['store_nbr', 'item_nbr', 'date'] 
)

items = pd.read_csv(
    'items.csv',
).set_index('item_nbr')

df_2017 = df_train.loc[df_train.date>=pd.datetime(2000,1,1)]
print("LEN DF: ", len(df_2017))
del df_train



  df_2017 = df_train.loc[df_train.date>=pd.datetime(2000,1,1)]


LEN DF:  59038132


In [2]:
df_2017

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True
...,...,...,...,...,...
59038127,2017-08-15,54,2089339,1.609438,False
59038128,2017-08-15,54,2106464,0.693147,True
59038129,2017-08-15,54,2110456,5.262690,False
59038130,2017-08-15,54,2113914,5.293305,True


In [3]:

promo_2017_train = df_2017.set_index(
    ['store_nbr', 'item_nbr', 'date'])[['onpromotion']].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[['onpromotion']].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ['store_nbr', 'item_nbr', 'date'])[['unit_sales']].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

df_2017

Unnamed: 0_level_0,date,2016-01-01,2016-01-02,2016-01-03,2016-01-04,2016-01-05,2016-01-06,2016-01-07,2016-01-08,2016-01-09,2016-01-10,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.098612,1.098612,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000
1,99197,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.098612,0.000000,1.098612,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,103520,0.0,0.693147,0.000000,1.386294,0.000000,0.693147,1.098612,0.693147,1.386294,0.000000,...,0.000000,0.000000,1.386294,0.000000,1.386294,0.693147,0.693147,0.693147,0.000000,0.000000
1,103665,0.0,0.000000,0.000000,0.000000,1.098612,1.386294,0.693147,0.000000,1.945910,0.000000,...,0.693147,1.098612,0.000000,2.079442,2.302585,1.098612,0.000000,0.000000,0.693147,0.693147
1,105574,0.0,1.098612,1.098612,2.639057,2.302585,2.079442,1.386294,1.609438,1.609438,1.791759,...,0.000000,1.791759,2.079442,1.945910,2.397895,1.791759,1.791759,0.000000,1.386294,1.609438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2110456,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,7.203406,6.481577,6.586172,3.218876,0.000000,0.000000,0.000000,0.000000,4.795791,5.262690
54,2113343,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.098612,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000
54,2113914,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,2.890372,0.000000,2.397895,2.397895,1.609438,0.000000,0.000000,2.833213,2.197225,5.293305
54,2116416,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.693147,0.693147,0.000000,0.000000,0.000000,0.693147,0.693147,1.098612,1.098612,1.098612


In [4]:
items = items.reindex(df_2017.index.get_level_values(1))

items

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0
...,...,...,...
2110456,BEVERAGES,1120,0
2113343,BEVERAGES,1114,0
2113914,CLEANING,3040,0
2116416,GROCERY I,1060,0


In [5]:
#extract a range of rows from the table (starts at dt - minus days and up to periods days (or another time unit))
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [6]:
#aggregates are over unit_sales
def timeSeriesProcess(df_2017, t2017, label):
    X = pd.DataFrame()
    duration_list = [1, 3, 7, 14, 28, 60, 140]
    for d in duration_list:
        timespan = get_timespan(df_2017, t2017, d, d)#get rows from t2017-d that cover d days (so the window of d days ago)
        #build aggregates
        X['{}_day_{}_2017'.format(label, d)] = timespan.mean(axis=1).values
        if d != 1:
            #compute aggregates of the window (for the same store and item aggregate over the timespan the variable (onpromotion or unit_sales))
            X['{}_day_{}_2017_max'.format(label,d)] = timespan.max(axis=1).values
            X['{}_day_{}_2017_min'.format(label,d)] = timespan.min(axis=1).values
            X['{}_day_{}_2017_var'.format(label,d)] = timespan.var(axis=1).values
            X['{}_day_{}_2017_skew'.format(label,d)] = timespan.skew(axis=1).values
            X['{}_day_{}_2017_kurt'.format(label,d)] = timespan.kurt(axis=1).values
            
            exp_sum = np.zeros(timespan.shape[0])
            for i in range(timespan.shape[1]):
                exp_sum += np.exp(-i/5) * timespan.iloc[:,i]
            X['{}_exp_moving_sum_{}'.format(label,d)] = exp_sum.values
    
    for idx in range(1,len(duration_list)):
        a = duration_list[idx-1]
        b = duration_list[idx]
        X['{}_day_{}sub{}_2017'.format(label, a,b)] = X['{}_day_{}_2017'.format(label, a)]                                                     - X['{}_day_{}_2017'.format(label, b)]
        
    for i in range(7):
        for j in [4, 10, 20]:
            timespan = get_timespan(df_2017, t2017, j*7-i, j, freq='7D')
            X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values
            
        date = t2017-timedelta(7-i)#go up to 7-i days in the past
        for m in [3,7,14,28,60,130]:
            X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
                df_2017, date, m, m).mean(axis=1).values#go up to 7-i days in the past. start: 7-i-m
            X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
                df_2017, date-timedelta(7), m, m).mean(axis=1).values
    print("After building aggregates ", X)
    
    return X

In [7]:
def prepare_dataset(df_2017, t2017, promo_2017, is_train=True):
    X = pd.DataFrame({
        'store_nbr':df_2017.index.get_level_values(0),
        'item_nbr':df_2017.index.get_level_values(1),
        'unpromo_16aftsum_2017':(1-get_timespan(promo_2017, t2017, 0, 16)).sum(axis=1).values
        #how many days is on promo in the next 16 days
    })
    print(X)   
    
    duration_list = [1, 3, 7, 14, 28, 60, 140]
    for d in duration_list:       
        X['promo_{}_2017'.format(d)] = get_timespan(promo_2017, t2017, d, d).sum(axis=1).values
        #how many days the item has been in promo in the last d days
        
    for i in range(16):#when is on promo in the next 16 days
        X['promo_{}'.format(i)] = promo_2017[str(t2017 + timedelta(days=i))].values.astype(np.uint8)
    
    print(X)
    
    X = pd.concat([X, timeSeriesProcess(df_2017, t2017, 'item')], axis=1)

    if is_train:
        y = df_2017[pd.date_range(t2017, periods=16)].values#next 16 days of sales, so item, store features -> sales
        return X, y
    return X

In [8]:
num_training_weeks = 8
print('Preparing dataset...')
t2017 = date(2017, 5, 31) 
X_l, y_l = [], []
for i in range(num_training_weeks):
    print('training set ' + str(i) + ':')
    delta = timedelta(days = 7 * i)
    X_tmp, y_tmp = prepare_dataset(
        df_2017, t2017 + delta, promo_2017
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)



Preparing dataset...
training set 0:
        store_nbr  item_nbr  unpromo_16aftsum_2017
0               1     96995                     16
1               1     99197                     16
2               1    103520                     16
3               1    103665                     16
4               1    105574                     16
...           ...       ...                    ...
172125         54   2110456                     16
172126         54   2113343                     16
172127         54   2113914                     16
172128         54   2116416                     16
172129         54   2124052                     16

[172130 rows x 3 columns]
        store_nbr  item_nbr  unpromo_16aftsum_2017  promo_1_2017  \
0               1     96995                     16             0   
1               1     99197                     16             0   
2               1    103520                     16             0   
3               1    103665                     16  

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.231049             0.693147   
1              0.000000         0.597253             1.098612   
2              0.000000         0.000000             0.000000   
3              0.000000         0.366204             1.098612   
4              1.098612         1.059351             1.386294   
...                 ...              ...                  ...   
172125         0.000000         0.000000             0.000000   
172126         0.000000         0.000000             0.000000   
172127         0.000000         0.000000             0.000000   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.160151              1.732051   
1                  0.000000             0.30

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              0.000000         0.732408             1.098612   
2              1.609438         0.767528             1.609438   
3              1.791759         1.425555             1.791759   
4              2.079442         1.617343             2.079442   
...                 ...              ...                  ...   
172125         0.000000         0.000000             0.000000   
172126         0.000000         0.000000             0.000000   
172127         0.000000         0.000000             0.000000   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.40

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              0.693147         0.597253             1.098612   
2              1.098612         0.828302             1.386294   
3              0.693147         1.059351             1.386294   
4              1.098612         1.329661             2.197225   
...                 ...              ...                  ...   
172125         0.000000         0.000000             0.000000   
172126         0.000000         0.000000             0.000000   
172127         0.000000         0.000000             0.000000   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.30

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              2.639057         1.245890             2.639057   
2              0.000000         0.462098             1.386294   
3              1.098612         1.059351             1.386294   
4              1.945910         1.416165             2.302585   
...                 ...              ...                  ...   
172125         0.000000         0.000000             0.000000   
172126         0.000000         0.000000             0.000000   
172127         0.000000         0.000000             0.000000   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             1.75

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.231049             0.693147   
1              1.386294         0.462098             1.386294   
2              1.098612         0.732408             1.098612   
3              0.000000         0.366204             1.098612   
4              1.098612         1.595831             2.302585   
...                 ...              ...                  ...   
172125         4.990433         4.487194             5.252273   
172126         0.000000         0.000000             0.000000   
172127         0.000000         0.000000             0.000000   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.160151              1.732051   
1                  0.000000             0.64

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              0.000000         0.000000             0.000000   
2              0.000000         0.231049             0.693147   
3              0.000000         0.231049             0.693147   
4              2.197225         1.531707             2.397895   
...                 ...              ...                  ...   
172125         5.796058         4.352428             6.568078   
172126         0.000000         0.000000             0.000000   
172127         2.302585         2.097190             2.302585   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.00

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              0.693147         0.462098             0.693147   
2              1.098612         0.366204             1.098612   
3              1.098612         0.828302             1.386294   
4              1.609438         1.416165             1.945910   
...                 ...              ...                  ...   
172125         5.117994         2.951888             5.117994   
172126         0.000000         0.000000             0.000000   
172127         3.583519         2.957262             3.583519   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.16

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              1.386294         0.693147             1.386294   
2              0.000000         0.366204             1.098612   
3              1.791759         0.597253             1.791759   
4              2.079442         1.656604             2.079442   
...                 ...              ...                  ...   
172125         3.044522         4.242496             6.424869   
172126         0.693147         0.231049             0.693147   
172127         2.197225         2.490075             2.708050   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.48

In [9]:
X_train = pd.concat(X_l, axis=0, ignore_index=True)
y_train = np.concatenate(y_l, axis=0)  
del X_l, y_l

#same preprocessing
print('validation set:')
X_val, y_val = prepare_dataset(df_2017, date(2017, 7, 26), promo_2017)
print('testing set:')
X_test = prepare_dataset(df_2017, date(2017, 8, 16), promo_2017, is_train=False)

del X_train['item_nbr']
del X_val['item_nbr']
del X_test['item_nbr']

le = LabelEncoder()
items['family'] = le.fit_transform(items['family'])
X_train['family'] = pd.concat([items['family']] * num_training_weeks).values
X_val['family'] = items['family'].values
X_test['family'] = items['family'].values
X_train['class'] = pd.concat([items['class']] * num_training_weeks).values
X_val['class'] = items['class'].values
X_test['class'] = items['class'].values

store_info = pd.read_csv('stores.csv', usecols=[0, 3, 4])
X_train = pd.merge(X_train, store_info, on='store_nbr', how='left')
X_val = pd.merge(X_val, store_info, on='store_nbr', how='left')
X_test = pd.merge(X_test, store_info, on='store_nbr', how='left')

cat_features = ['store_nbr','type','cluster','family','class']
num_features = [i for i in X_train.columns if i not in cat_features]
for col in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([X_train[col].drop_duplicates(), X_val[col].drop_duplicates(), X_test[col].drop_duplicates()]))
    X_train[col] = le.transform(X_train[col])
    X_val[col] = le.transform(X_val[col])
    X_test[col] = le.transform(X_test[col])

num_features.remove('item_day_3_2017_kurt')
X_train.drop(['item_day_3_2017_kurt'], axis=1, inplace=True)



validation set:
        store_nbr  item_nbr  unpromo_16aftsum_2017
0               1     96995                     16
1               1     99197                     16
2               1    103520                     16
3               1    103665                     16
4               1    105574                     16
...           ...       ...                    ...
172125         54   2110456                     16
172126         54   2113343                     16
172127         54   2113914                      3
172128         54   2116416                     15
172129         54   2124052                     16

[172130 rows x 3 columns]
        store_nbr  item_nbr  unpromo_16aftsum_2017  promo_1_2017  \
0               1     96995                     16             0   
1               1     99197                     16             0   
2               1    103520                     16             0   
3               1    103665                     16             0   
4    

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              0.000000         0.000000             0.000000   
2              1.098612         1.059351             1.386294   
3              1.386294         1.229626             1.609438   
4              1.791759         1.866141             2.197225   
...                 ...              ...                  ...   
172125         2.564949         4.683775             6.293419   
172126         0.000000         0.462098             0.693147   
172127         2.484907         1.290400             2.484907   
172128         0.000000         0.000000             0.000000   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.00

  X['{}_mean_{}_2017_{}_2'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_2017_{}_1'.format(label, m,i)]= get_timespan(
  X['{}_mean_{}_dow{}_2017'.format(label, j, i)] = timespan.mean(axis=1).values


After building aggregates          item_day_1_2017  item_day_3_2017  item_day_3_2017_max  \
0              0.000000         0.000000             0.000000   
1              0.000000         0.000000             0.000000   
2              0.000000         0.231049             0.693147   
3              0.693147         0.462098             0.693147   
4              1.609438         0.998577             1.609438   
...                 ...              ...                  ...   
172125         5.262690         3.352827             5.262690   
172126         0.000000         0.231049             0.693147   
172127         5.293305         3.441248             5.293305   
172128         1.098612         1.098612             1.098612   
172129         0.000000         0.000000             0.000000   

        item_day_3_2017_min  item_day_3_2017_var  item_day_3_2017_skew  \
0                  0.000000             0.000000              0.000000   
1                  0.000000             0.00

  X_train['family'] = pd.concat([items['family']] * num_training_weeks).values
  X_val['family'] = items['family'].values
  X_test['family'] = items['family'].values
  X_train['class'] = pd.concat([items['class']] * num_training_weeks).values
  X_val['class'] = items['class'].values
  X_test['class'] = items['class'].values


In [10]:


enc = OneHotEncoder()
enc.fit(pd.concat([X_train[cat_features],X_val[cat_features],X_test[cat_features]]))
X_train_cat = enc.transform(X_train[cat_features])
X_val_cat = enc.transform(X_val[cat_features])
X_test_cat = enc.transform(X_test[cat_features])

cat_count_features = []
for col in cat_features:
    d = pd.concat([X_train[col],X_val[col],X_test[col]]).value_counts().to_dict()
    X_train['%s_count'%col] = X_train[col].apply(lambda x:d.get(x,0))
    X_val['%s_count'%col] = X_val[col].apply(lambda x:d.get(x,0))   
    X_test['%s_count'%col] = X_test[col].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%col)

X_train = ssp.hstack([X_train[num_features+cat_count_features].values,X_train_cat,]).tocsr()
X_val = ssp.hstack([X_val[num_features+cat_count_features].values,X_val_cat,]).tocsr()
X_test = ssp.hstack([X_test[num_features+cat_count_features].values,X_test_cat,]).tocsr()

print('Training and predicting ')

from sklearn.linear_model import LinearRegression

model = LinearRegression()

import time

#next 16 days of sales, so given item, store and temporal features, predict sales
#a model is trained for every day in the future (e.g., item, store and temporal features in the past -> 2nd day in the future)

for i in range(16):
    t = time.time()
    print('Step %d' % (i+1))
    model.fit(X_train, y_train[:, i])
    print(time.time()-t)
    pred = model.predict(X_val)
    
    print("mean_squared_error: ",np.sqrt(mean_squared_error(y_val[:, i], pred))) 
    print("eval: ", model.score(X_val, y_val[:, i]))




Training and predicting 
Step 1
60.23252320289612
mean_squared_error:  0.5471919778679295
eval:  0.7160291064694246
Step 2
65.63619017601013
mean_squared_error:  0.5713135538222999
eval:  0.6643942618006875
Step 3
64.89129090309143
mean_squared_error:  0.5844005197649521
eval:  0.6944016236925363
Step 4
67.9899799823761
mean_squared_error:  0.594651770078511
eval:  0.7134799642382867
Step 5
68.1728162765503
mean_squared_error:  0.5971027661821124
eval:  0.7257926387242475
Step 6
66.01492094993591
mean_squared_error:  0.6018477337568783
eval:  0.6872971176543063
Step 7
68.52496695518494
mean_squared_error:  0.6504563260565677
eval:  0.654696969698067
Step 8
58.87123680114746
mean_squared_error:  0.6314855648495705
eval:  0.6704393648888066
Step 9
57.66181421279907
mean_squared_error:  0.6183043949681394
eval:  0.6346252699606842
Step 10
67.93816590309143
mean_squared_error:  0.6113712266146879
eval:  0.6681471706844593
Step 11
67.52902889251709
mean_squared_error:  0.6158832981494515
ev

In [11]:
np.isnan(X_train.data).any()


False