In [51]:
import pandas as pd
import numpy as np
from datetime import datetime

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline


In [52]:
df_train_store = pd.read_csv('data/train_store.csv', low_memory = False, index_col=['Unnamed: 0'])

In [53]:
df_train_store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844338 entries, 0 to 844337
Data columns (total 24 columns):
Store                        844338 non-null int64
DayOfWeek                    844338 non-null int64
Sales                        844338 non-null int64
Customers                    844338 non-null int64
Open                         844338 non-null int64
Promo                        844338 non-null int64
StateHoliday                 844338 non-null object
SchoolHoliday                844338 non-null int64
Year                         844338 non-null int64
Month                        844338 non-null int64
Day                          844338 non-null int64
WeekOfYear                   844338 non-null int64
SalePerCustomer              844338 non-null float64
StoreType                    844338 non-null object
Assortment                   844338 non-null object
CompetitionDistance          844338 non-null float64
CompetitionOpenSinceMonth    844338 non-null float64
CompetitionOp

In [54]:
df_train_store.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenByMonth,PromoOpenByMonth
0,1,5,5263,555,1,1,0,1,2015,7,...,a,1270.0,9.0,2008.0,0,0.0,0.0,0,82.0,24187.126437
1,1,4,5020,546,1,1,0,1,2015,7,...,a,1270.0,9.0,2008.0,0,0.0,0.0,0,82.0,24187.126437
2,1,3,4782,523,1,1,0,1,2015,7,...,a,1270.0,9.0,2008.0,0,0.0,0.0,0,82.0,24187.126437
3,1,2,5011,560,1,1,0,1,2015,7,...,a,1270.0,9.0,2008.0,0,0.0,0.0,0,82.0,24187.126437
4,1,1,6102,612,1,1,0,1,2015,7,...,a,1270.0,9.0,2008.0,0,0.0,0.0,0,82.0,24187.126437


### Converts categorical features into numerical features

In [55]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tmp = le.fit_transform(df_train_store['PromoInterval'])
df_train_store['PromoInterval'] = tmp

tmp1 = le.fit_transform(df_train_store['StateHoliday'])
df_train_store['StateHoliday'] = tmp1

tmp2 = le.fit_transform(df_train_store['StoreType'])
df_train_store['StoreType'] = tmp2

tmp3 = le.fit_transform(df_train_store['Assortment'])
df_train_store['Assortment'] = tmp3

### Splitting time series data into Train set and Test set

In [56]:
from sklearn.model_selection import TimeSeriesSplit

In [57]:
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)  


TimeSeriesSplit(max_train_size=None, n_splits=3)


In [58]:
X_columns = df_train_store.columns.values
X_columns = X_columns.tolist()
X_columns.remove('Sales')

In [59]:
X, y = df_train_store[X_columns], df_train_store['Sales']

In [60]:
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: [     0      1      2 ... 211083 211084 211085] TEST: [211086 211087 211088 ... 422167 422168 422169]
TRAIN: [     0      1      2 ... 422167 422168 422169] TEST: [422170 422171 422172 ... 633251 633252 633253]
TRAIN: [     0      1      2 ... 633251 633252 633253] TEST: [633254 633255 633256 ... 844335 844336 844337]


### Modeling

In [61]:
from sklearn.linear_model import LinearRegression


In [62]:
%%time

# instantiate and train model
lr = LinearRegression().fit(X_train, y_train)

print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))

lr.coef_: [-1.18048144e-01 -4.41384215e+01  7.37725053e+00  2.36883402e-10
  2.71083396e+02 -6.99307906e+02  1.64902992e+01  2.42121317e+00
  1.19588304e+01  1.13004419e-01  4.44337848e-02  7.25288950e+02
 -1.23467816e+02  4.43065204e+01 -1.58175448e-04 -3.80303248e+00
  3.70928231e+00 -1.27091356e+03 -1.80469557e+00  2.44326303e+00
  3.13209681e+00  3.05033190e-01  1.60488229e-01]
lr.intercept_: -21495.05850643999
CPU times: user 1.33 s, sys: 264 ms, total: 1.59 s
Wall time: 889 ms


In [63]:
%%time

print("Training set sore: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

Training set sore: 0.92
Test set score: 0.91
CPU times: user 176 ms, sys: 73.9 ms, total: 250 ms
Wall time: 207 ms


In [64]:
%%time

from sklearn.linear_model import Ridge

ridge = Ridge().fit(X_train, y_train)
print("Training set sore: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

Training set sore: 0.92
Test set score: 0.91
CPU times: user 335 ms, sys: 138 ms, total: 474 ms
Wall time: 431 ms


In [65]:
%%time

from sklearn.linear_model import Lasso

lasso = Lasso().fit(X_train, y_train)
print("Training set sore: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))

Training set sore: 0.92
Test set score: 0.91
Number of features used: 20
CPU times: user 1min 22s, sys: 312 ms, total: 1min 22s
Wall time: 1min 22s


