In [15]:
import pandas as pd
import numpy as np
from datetime import datetime

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline


In [2]:
df_train_store = pd.read_csv('data/train_store.csv', low_memory = False, index_col=['Unnamed: 0'])

In [3]:
df_train_store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844338 entries, 0 to 844337
Data columns (total 24 columns):
Store                        844338 non-null int64
DayOfWeek                    844338 non-null int64
Sales                        844338 non-null int64
Customers                    844338 non-null int64
Open                         844338 non-null int64
Promo                        844338 non-null int64
StateHoliday                 844338 non-null object
SchoolHoliday                844338 non-null int64
Year                         844338 non-null int64
Month                        844338 non-null int64
Day                          844338 non-null int64
WeekOfYear                   844338 non-null int64
SalePerCustomer              844338 non-null float64
StoreType                    844338 non-null object
Assortment                   844338 non-null object
CompetitionDistance          844338 non-null float64
CompetitionOpenSinceMonth    844338 non-null float64
CompetitionOp

In [4]:
df_train_store.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionOpenByMonth,PromoOpenByMonth
0,292,3,5076,672,1,0,0,1,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
1,292,4,4580,662,1,0,0,1,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
2,292,5,4202,560,1,0,0,1,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
3,292,6,2748,340,1,0,0,0,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.229885
4,292,1,9291,1002,1,1,0,0,2013,1,...,a,1100.0,6.0,2009.0,0,0.0,0.0,0,43.0,24156.45977


### Converts categorical features into numerical features

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tmp = le.fit_transform(df_train_store['PromoInterval'])
df_train_store['PromoInterval'] = tmp

tmp1 = le.fit_transform(df_train_store['StateHoliday'])
df_train_store['StateHoliday'] = tmp1

tmp2 = le.fit_transform(df_train_store['StoreType'])
df_train_store['StoreType'] = tmp2

tmp3 = le.fit_transform(df_train_store['Assortment'])
df_train_store['Assortment'] = tmp3

### Splitting time series data into Train set and Test set

In [6]:
from sklearn.model_selection import TimeSeriesSplit

  return f(*args, **kwds)


In [7]:
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)  


TimeSeriesSplit(max_train_size=None, n_splits=3)


In [8]:
X_columns = df_train_store.columns.values
X_columns = X_columns.tolist()
X_columns.remove('Sales')
X_columns.remove('PromoOpenByMonth')
X_columns.remove('SalePerCustomer')
X_columns.remove('Customers')

In [9]:
X, y = df_train_store[X_columns], df_train_store['Sales']

In [10]:
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: [     0      1      2 ... 211083 211084 211085] TEST: [211086 211087 211088 ... 422167 422168 422169]
TRAIN: [     0      1      2 ... 422167 422168 422169] TEST: [422170 422171 422172 ... 633251 633252 633253]
TRAIN: [     0      1      2 ... 633251 633252 633253] TEST: [633254 633255 633256 ... 844335 844336 844337]


### Model

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

  return f(*args, **kwds)


In [12]:
scaler = MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler(copy=True, feature_range=(0,1))
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
%%time

# Setup the hyperparameter grid
param_grid = {'min_samples_leaf': [1,5,10]}

# instantiate model
dtr = DecisionTreeRegressor()

# Instantiate the GridSearchCV object: gbrt_cv
dtr_cv = GridSearchCV(dtr, param_grid, cv=3)

# Fit it to the data
dtr_cv.fit(X_train_scaled, y_train)


CPU times: user 46.5 s, sys: 1.77 s, total: 48.2 s
Wall time: 48.4 s


In [14]:
#Print the tuned parameters and score
print("Tuned parameters: {}".format(dtr_cv.best_params_)) 
print("Best score is {}".format(dtr_cv.best_score_))
print("Best estimator is {}".format(dtr_cv.best_estimator_))

Tuned parameters: {'min_samples_leaf': 5}
Best score is -2.07034459195019
Best estimator is DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')


In [16]:
%%time
dtr = DecisionTreeRegressor(min_samples_leaf=5)
dtr.fit(X_train_scaled, y_train)
print("Training set sore: {:.2f}".format(dtr.score(X_train_scaled, y_train)))
print("Test set score: {:.2f}".format(dtr.score(X_test_scaled, y_test)))

Training set sore: 0.95
Test set score: -0.33
CPU times: user 5.68 s, sys: 120 ms, total: 5.8 s
Wall time: 5.82 s


In [16]:
dtr.predict(X_test)

array([5546., 1621., 6885., ..., 4790., 5032., 5265.])