In [82]:
import pandas as pd
import numpy as np
from datetime import datetime

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline


In [83]:
df_train_store = pd.read_csv('data/train_store.csv', low_memory = False, index_col=['Unnamed: 0'])

In [84]:
df_train_store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844338 entries, 0 to 844337
Data columns (total 24 columns):
Store                        844338 non-null int64
DayOfWeek                    844338 non-null int64
Sales                        844338 non-null int64
Customers                    844338 non-null int64
Open                         844338 non-null int64
Promo                        844338 non-null int64
StateHoliday                 844338 non-null object
SchoolHoliday                844338 non-null int64
Year                         844338 non-null int64
Month                        844338 non-null int64
Day                          844338 non-null int64
WeekOfYear                   844338 non-null int64
SalePerCustomer              844338 non-null float64
StoreType                    844338 non-null object
Assortment                   844338 non-null object
CompetitionDistance          844338 non-null float64
CompetitionOpenSinceMonth    844338 non-null float64
CompetitionOp

In [85]:
df_train_store.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenByMonth,PromoOpenByMonth
0,292,3,5076,672,1,0,0,1,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
1,292,4,4580,662,1,0,0,1,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
2,292,5,4202,560,1,0,0,1,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
3,292,6,2748,340,1,0,0,0,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
4,292,1,9291,1002,1,1,0,0,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.45977


### Converts categorical features into numerical features

In [86]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tmp = le.fit_transform(df_train_store['PromoInterval'])
df_train_store['PromoInterval'] = tmp

tmp1 = le.fit_transform(df_train_store['StateHoliday'])
df_train_store['StateHoliday'] = tmp1

tmp2 = le.fit_transform(df_train_store['StoreType'])
df_train_store['StoreType'] = tmp2

tmp3 = le.fit_transform(df_train_store['Assortment'])
df_train_store['Assortment'] = tmp3

### Splitting time series data into Train set and Test set

In [87]:
from sklearn.model_selection import TimeSeriesSplit

In [88]:
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)  


TimeSeriesSplit(max_train_size=None, n_splits=3)


In [89]:
X_columns = df_train_store.columns.values
X_columns = X_columns.tolist()
X_columns.remove('Sales')
#X_columns.remove('PromoOpenByMonth')
X_columns.remove('SalePerCustomer')
X_columns.remove('Customers')

In [90]:
X_columns

['Store',
 'DayOfWeek',
 'Open',
 'Promo',
 'StateHoliday',
 'SchoolHoliday',
 'Year',
 'Month',
 'Day',
 'WeekOfYear',
 'StoreType',
 'Assortment',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'PromoInterval',
 'CompetitionOpenByMonth',
 'PromoOpenByMonth']

In [91]:
X, y = df_train_store[X_columns], df_train_store['Sales']

In [92]:
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: [     0      1      2 ... 211083 211084 211085] TEST: [211086 211087 211088 ... 422167 422168 422169]
TRAIN: [     0      1      2 ... 422167 422168 422169] TEST: [422170 422171 422172 ... 633251 633252 633253]
TRAIN: [     0      1      2 ... 633251 633252 633253] TEST: [633254 633255 633256 ... 844335 844336 844337]


In [93]:
X.head()

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenByMonth,PromoOpenByMonth
0,292,3,1,0,0,1,2013,1,2,1,...,0,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
1,292,4,1,0,0,1,2013,1,3,1,...,0,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
2,292,5,1,0,0,1,2013,1,4,1,...,0,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
3,292,6,1,0,0,0,2013,1,5,1,...,0,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
4,292,1,1,1,0,0,2013,1,7,2,...,0,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.45977


In [94]:
y.head()

0    5076
1    4580
2    4202
3    2748
4    9291
Name: Sales, dtype: int64

In [95]:
X_train_2 = X.iloc[0:round(len(X)*.7)]
X_test_2 = X.iloc[round(len(X)*.7):len(X)]

y_train_2 = y.iloc[0:round(len(y)*.7)]
y_test_2 = y.iloc[round(len(y)*.7):len(y)]

In [96]:
X_train_2.tail()

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenByMonth,PromoOpenByMonth
591032,337,2,1,0,0,0,2015,7,21,30,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.551724
591033,337,3,1,0,0,0,2015,7,22,30,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.551724
591034,337,4,1,0,0,0,2015,7,23,30,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.551724
591035,337,5,1,0,0,0,2015,7,24,30,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.551724
591036,337,6,1,0,0,0,2015,7,25,30,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.551724


In [97]:
X_test_2.head()

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenByMonth,PromoOpenByMonth
591037,337,1,1,1,0,1,2015,7,27,31,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.781609
591038,337,2,1,1,0,1,2015,7,28,31,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.781609
591039,337,3,1,1,0,1,2015,7,29,31,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.781609
591040,337,4,1,1,0,1,2015,7,30,31,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.781609
591041,337,5,1,1,0,1,2015,7,31,31,...,2,10600.0,7.0,2005.0,1,45.0,2014.0,1,120.0,8.781609


### Modeling

In [98]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from pandas import Series
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [99]:
scaler = MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler(copy=True, feature_range=(0,1))
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [100]:
X_train_scaled

array([[1.43712575e-02, 3.33333333e-01, 0.00000000e+00, ...,
        0.00000000e+00, 3.05557850e-03, 9.98724136e-01],
       [1.43712575e-02, 5.00000000e-01, 0.00000000e+00, ...,
        0.00000000e+00, 3.05557850e-03, 9.98724136e-01],
       [1.43712575e-02, 6.66666667e-01, 0.00000000e+00, ...,
        0.00000000e+00, 3.05557850e-03, 9.98724136e-01],
       ...,
       [0.00000000e+00, 5.00000000e-01, 0.00000000e+00, ...,
        1.00000000e+00, 9.90998431e-04, 6.66410988e-04],
       [0.00000000e+00, 6.66666667e-01, 0.00000000e+00, ...,
        1.00000000e+00, 9.90998431e-04, 6.66410988e-04],
       [0.00000000e+00, 8.33333333e-01, 0.00000000e+00, ...,
        1.00000000e+00, 9.90998431e-04, 6.66410988e-04]])

In [101]:
%%time

# instantiate and train model
lr = LinearRegression().fit(X_train_scaled, y_train)

print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))

lr.coef_: [-1.84999632e+02 -8.58701080e+02  1.77159325e+13  2.14530590e+03
  2.27749396e+03  8.02561859e+01  6.39446880e+12  1.71243011e+13
 -7.80745908e+01 -1.51278715e+13 -3.09693359e+02  7.65678955e+02
 -2.40185938e+03 -1.86810558e+13 -3.76423274e+16  7.26858770e+04
  1.48312465e+13  3.11999001e+16 -2.64929688e+02 -3.77014841e+16
  3.12465889e+16]
lr.intercept_: 6448058983414589.0
CPU times: user 1.05 s, sys: 143 ms, total: 1.19 s
Wall time: 598 ms


In [102]:
%%time

print("Training set sore: {:.2f}".format(lr.score(X_train_scaled, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test_scaled, y_test)))

Training set sore: 0.19
Test set score: 0.19
CPU times: user 323 ms, sys: 26.3 ms, total: 350 ms
Wall time: 84.8 ms


In [103]:
%%time

# instantiate and train model
lr = LinearRegression().fit(X_train_2, y_train_2)

print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))

lr.coef_: [-4.31456722e-01 -1.38802636e+02 -8.98242974e-09  2.14933686e+03
  8.66913790e+02  7.92794221e+01  5.11535120e+01  7.16889560e+01
 -2.77803379e+00 -1.34495474e+00 -1.13839098e+02  3.91898471e+02
 -2.84300761e-02 -4.83518253e+01  6.07373548e+01  7.16496564e+04
  2.27438721e+01  5.00925665e+01 -4.86179610e+01  5.03466844e+00
  7.19368594e+00]
lr.intercept_: -391947.620809601
CPU times: user 1.09 s, sys: 146 ms, total: 1.24 s
Wall time: 538 ms


In [104]:
%%time

print("Training set sore: {:.2f}".format(lr.score(X_train_2, y_train_2)))
print("Test set score: {:.2f}".format(lr.score(X_test_2, y_test_2)))

Training set sore: 0.19
Test set score: 0.17
CPU times: user 508 ms, sys: 60.8 ms, total: 569 ms
Wall time: 163 ms


In [105]:
%%time

from sklearn.linear_model import Ridge

ridge = Ridge().fit(X_train, y_train)
print("Training set sore: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

Training set sore: 0.19
Test set score: 0.19
CPU times: user 308 ms, sys: 154 ms, total: 461 ms
Wall time: 438 ms


In [106]:
%%time

from sklearn.linear_model import Lasso

lasso = Lasso().fit(X_train, y_train)
print("Training set sore: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))

Training set sore: 0.19
Test set score: 0.19
Number of features used: 19
CPU times: user 1min 19s, sys: 545 ms, total: 1min 20s
Wall time: 1min 20s


