# 0.0. Imports

In [95]:
import datetime

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBRegressor

from IPython.display import HTML

from sklearn import model_selection as ms
from sklearn.base import clone
from sklearn.dummy import DummyRegressor
from sklearn import metrics
from sklearn import linear_model as lm
from sklearn import ensemble as en
from sklearn import preprocessing as pp

## 0.1. Helper Functions

In [130]:
def cross_validation( x_training, kfold, model_name, model, verbose=False ):
    """Score a model with time-ordered cross-validation on 6-week validation windows.

    For k = kfold .. 1 the validation window spans the dates from
    (latest date - k*6 weeks) to (latest date - (k-1)*6 weeks), both ends
    inclusive, and the training data is every row dated strictly before the
    window start.

    Parameters
    ----------
    x_training : pandas.DataFrame
        Must contain a 'date' column and the target column 'sales'; all other
        columns are passed to the model as features.
    kfold : int
        Number of folds; must be at least 1.
    model_name : str
        Label written to the 'Model Name' column of the result.
    model : estimator
        Object with fit/predict that sklearn.base.clone can copy. A fresh clone
        is fitted in every fold, so the estimator passed in is not refitted.
    verbose : bool, default False
        When True, print the fold number at the start of each iteration.

    Returns
    -------
    pandas.DataFrame
        One row with 'Model Name', 'MAE CV', 'MAPE CV' and 'RMSE CV'; each
        metric is a 'mean +/- std' string rounded to 3 decimals.

    Raises
    ------
    ValueError
        If kfold is smaller than 1.
    """
    if kfold < 1:
        raise ValueError( 'kfold must be >= 1, got {}'.format( kfold ) )

    mae_list = []
    mape_list = []
    rmse_list = []

    # loop-invariant pieces of the window arithmetic
    fold_span = datetime.timedelta( days=6*7 )
    last_date = x_training['date'].max()

    # oldest window first (k = kfold), most recent window last (k = 1)
    for k in reversed( range( 1, kfold+1 ) ):
        if verbose:
            print( '\nKFold Number: {}'.format( k ) )

        # start and end date for validation
        # NOTE: the end date of fold k equals the start date of fold k-1 and both
        # bounds are inclusive, so rows on that date fall into two validation windows.
        validation_start_date = last_date - k * fold_span
        validation_end_date = last_date - (k-1) * fold_span

        # filtering dataset
        training = x_training[x_training['date'] < validation_start_date]
        validation = x_training[(x_training['date'] >= validation_start_date) & (x_training['date'] <= validation_end_date)]

        # training and validation dataset
        xtraining = training.drop( ['date', 'sales'], axis=1 )
        ytraining = training['sales']

        xvalidation = validation.drop( ['date', 'sales'], axis=1 )
        yvalidation = validation['sales']

        # fit an unfitted copy so the caller's estimator keeps whatever state it had
        m = clone( model ).fit( xtraining, ytraining )

        # prediction
        yhat = m.predict( xvalidation )

        # performance
        m_result = ml_error( model_name, yvalidation, yhat )

        # store performance of each kfold iteration as plain scalars
        mae_list.append( m_result['MAE'].iloc[0] )
        mape_list.append( m_result['MAPE'].iloc[0] )
        rmse_list.append( m_result['RMSE'].iloc[0] )

    # summarise once, after every fold has been scored
    dict_ = {'Model Name': model_name,
             'MAE CV': np.round( np.mean( mae_list ), 3 ).astype( str ) + ' +/- ' + np.round( np.std( mae_list ), 3 ).astype( str ),
             'MAPE CV': np.round( np.mean( mape_list ), 3 ).astype( str ) + ' +/- ' + np.round( np.std( mape_list ), 3 ).astype( str ),
             'RMSE CV': np.round( np.mean( rmse_list ), 3 ).astype( str ) + ' +/- ' + np.round( np.std( rmse_list ), 3 ).astype( str ) }

    return pd.DataFrame( dict_, index=[0] )


def ml_error(model_name, y_test, yhat,):
    """Build a one-row table of error metrics for a set of predictions.

    Parameters
    ----------
    model_name : label stored in the 'Model' column.
    y_test : observed target values.
    yhat : predicted values, aligned with y_test.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with columns 'Model', 'MAE', 'MAPE' and 'RMSE',
        where RMSE is the square root of the mean squared error.
    """
    scores = {
        'Model': model_name,
        'MAE': metrics.mean_absolute_error(y_test, yhat),
        'MAPE': metrics.mean_absolute_percentage_error(y_test, yhat),
        'RMSE': np.sqrt(metrics.mean_squared_error(y_test, yhat)),
    }
    return pd.DataFrame(scores, index=[0])


def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    
    sns.set_style("whitegrid")
    
    sns.set()
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2. Load Data

In [97]:
# two raw CSV files, read from paths relative to the notebook
df_store_raw = pd.read_csv('../data/store.csv', low_memory=False)
df_sales_raw = pd.read_csv('../data/train.csv', low_memory=False)

# merge: left join on 'Store' with the store table on the left, so its columns
# come first in df_raw (the positional rename in section 1.1 relies on this order)
df_raw = pd.merge(df_store_raw, df_sales_raw, how='left', on='Store')

# 1.0. Data Description

In [98]:
# work on a copy so the steps in this section do not modify df_raw
df1 = df_raw.copy()

## 1.1. Rename Columns

In [99]:
# snake_case names assigned BY POSITION: the list must match the column order
# of the merged frame exactly (store-table columns first, then sales-table columns)
cols = ['store',
        'store_type',
        'assortment',
        'competition_distance',
        'competition_open_since_month',
        'competition_open_since_year',
        'promo2',
        'promo2_since_week',
        'promo2_since_year',
        'promo_interval',
        'day_of_week',
        'date',
        'sales',
        'customers',
        'open',
        'promo',
        'state_holiday',
        'school_holiday']
df1.columns = cols

## 1.2. Data Dimensions

In [100]:
# report the frame's shape as rows / columns
n_rows, n_cols = df1.shape
print(f'Number of Rows: {n_rows}')
print(f'Number of Cols: {n_cols}')

Number of Rows: 1017209
Number of Cols: 18


## 1.3. Data Types

In [101]:
# dtype of every column; 'date' is still object (string) at this point
df1.dtypes

store                             int64
store_type                       object
assortment                       object
competition_distance            float64
competition_open_since_month    float64
competition_open_since_year     float64
promo2                            int64
promo2_since_week               float64
promo2_since_year               float64
promo_interval                   object
day_of_week                       int64
date                             object
sales                             int64
customers                         int64
open                              int64
promo                             int64
state_holiday                    object
school_holiday                    int64
dtype: object

## 1.4. Check NA

In [102]:
# fraction of missing values per column
df1.isna().mean()

store                          0.00
store_type                     0.00
assortment                     0.00
competition_distance           0.00
competition_open_since_month   0.32
competition_open_since_year    0.32
promo2                         0.00
promo2_since_week              0.50
promo2_since_year              0.50
promo_interval                 0.50
day_of_week                    0.00
date                           0.00
sales                          0.00
customers                      0.00
open                           0.00
promo                          0.00
state_holiday                  0.00
school_holiday                 0.00
dtype: float64

## 1.5. Handle NA (drop rows)

In [103]:
# Drops every row that has an NA in ANY column, then renumbers the index.
# Per the NA ratios shown in 1.4, the promo2_since_* / promo_interval columns are
# missing in ~50% of rows and competition_open_since_* in ~32%, so at least half
# of the data is removed here.
# NOTE(review): confirm this row loss is intended rather than imputing these fields.
df1 = df1.dropna().reset_index(drop=True)

## 1.6. Change Dtypes

In [104]:
# parse 'date' into datetimes; later cells compare it with dates and subtract timedeltas
df1['date'] = pd.to_datetime( df1['date'] )

## 1.7. Descriptive Statistics

# 2.0. Feature Engineering

In [105]:
# stage checkpoint: section 2 works on its own copy of df1
df2 = df1.copy()

# 3.0. Data Filtering

In [106]:
# stage checkpoint: filtering below is applied to a copy, df2 stays unfiltered
df3 = df2.copy()

## 3.1. Filter Rows

In [107]:
# keep only rows where 'open' is non-zero and 'sales' is non-zero
is_open = df3['open'] != 0
has_sales = df3['sales'] != 0
df3 = df3[is_open & has_sales]

## 3.2. Filter Columns

In [108]:
# columns removed from the modelling frame
cols_drop = ['promo_interval', 'open']

df3 = df3.drop(columns=cols_drop)

# 4.0. EDA

In [109]:
# stage checkpoint: copy of the filtered frame for the EDA section
df4 = df3.copy()

# 5.0. Data Preparation

In [110]:
# stage checkpoint: the encoding below modifies df5, not df4
df5 = df4.copy()

## 5.1. Standardization

## 5.2. Rescale

## 5.3. Encoder



In [111]:
# Disabled log1p transform of the target; with this line commented out,
# 'sales' is passed to the models on its original scale.
# df5['sales'] = np.log1p(df5['sales'])

In [112]:
le = pp.LabelEncoder()

# integer-encode each categorical column in place; the one encoder is refitted
# per column, so afterwards `le` only holds the classes of the last column
for col_name in ['store_type', 'assortment', 'state_holiday']:
    df5[col_name] = le.fit_transform( df5[col_name].values )

# 6.0. Feature Selection

In [113]:
# stage checkpoint: encoded frame used for the train/test split
df6 = df5.copy()

## 6.1. Splitting data into training and test datasets

In [114]:
# first date of the 6-week hold-out window, counted back from the latest date in
# the whole dataset (previously only the first store's last date was used)
df6['date'].max() - datetime.timedelta( days=6*7 )

Timestamp('2015-06-19 00:00:00')

In [115]:
# hold-out boundary: the date printed by the previous cell
split_date = '2015-06-19'

in_train = df6['date'] < split_date
in_test = df6['date'] >= split_date

# rows before the boundary are for training, rows on or after it for testing;
# both frames still contain 'date' and 'sales'
X_train = df6[in_train]
y_train = X_train['sales']

X_test = df6[in_test]
y_test = X_test['sales']

# 7.0. Model Training

In [116]:
# columns that are not model inputs: the timestamp and the target
non_features = ['date', 'sales']

x_train = X_train.drop( columns=non_features )
x_test = X_test.drop( columns=non_features )

# cross_validation() reads 'date' and 'sales' itself, so it gets the full training frame
x_training = X_train.copy()

# NOTE(review): 'customers' is still among the features; confirm it is known at prediction time.

## 7.1. Average Model

In [117]:
# baseline: DummyRegressor with default settings, fitted on the training split
model_avg = DummyRegressor()
yhat_baseline = model_avg.fit(x_train, y_train).predict(x_test)

# hold-out performance
result_baseline = ml_error('DummyRegressor', y_test, yhat_baseline)
result_baseline

Unnamed: 0,Model,MAE,MAPE,RMSE
0,DummyRegressor,2021.75,0.37,2661.58


In [118]:
# 5-fold time-ordered CV of the baseline on the training window
# (x_training still carries 'date' and 'sales', which cross_validation needs)
result_baseline_cv = cross_validation(x_training, 5, 'DummyRegressor', model_avg)
result_baseline_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,DummyRegressor,2142.95 +/- 335.77,0.36 +/- 0.01,2839.45 +/- 487.56


## 7.2. Linear Regression Model

In [119]:
# ordinary least squares with default settings, fitted on the training split
model_lr = lm.LinearRegression()
yhat_lr = model_lr.fit(x_train, y_train).predict(x_test)

# hold-out performance
result_lr = ml_error('LinearRegression', y_test, yhat_lr)
result_lr

Unnamed: 0,Model,MAE,MAPE,RMSE
0,LinearRegression,901.44,0.14,1252.7


In [120]:
# 5-fold time-ordered CV of the linear regression on the training window
result_lr_cv = cross_validation(x_training, 5, 'LinearRegression', model_lr)
result_lr_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,LinearRegression,932.75 +/- 112.67,0.13 +/- 0.0,1276.53 +/- 176.78


## 7.3. Linear Regression - Lasso

In [121]:
# model definition
model_lrr = lm.Lasso()

# fit
model_lrr.fit(x_train, y_train)

# predictions -- from the Lasso model itself; model_lr was used here before,
# so the 'Lasso' row was scored on LinearRegression predictions
yhat_lrr = model_lrr.predict(x_test)

# performance
result_lrr = ml_error('Lasso', y_test, yhat_lrr)
result_lrr

Unnamed: 0,Model,MAE,MAPE,RMSE
0,Lasso,903.62,0.14,1256.37


In [122]:
# 5-fold time-ordered CV of the Lasso model on the training window
result_lrr_cv = cross_validation(x_training, 5, 'LinearRegression - Lasso', model_lrr)
result_lrr_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,LinearRegression - Lasso,932.87 +/- 112.86,0.13 +/- 0.0,1276.79 +/- 177.06


## 7.4. Random Forest Regressor

In [123]:
# model definition -- fixed seed so the forest (and the metrics below) are
# reproducible across Restart & Run All
model_rf = en.RandomForestRegressor( random_state=42 )

# fit
model_rf.fit(x_train, y_train)

# predictions
yhat_rf = model_rf.predict(x_test)

# performance
result_rf = ml_error('RandomForestRegressor', y_test, yhat_rf)
result_rf

Unnamed: 0,Model,MAE,MAPE,RMSE
0,RandomForestRegressor,454.46,0.07,641.04


In [124]:
# 5-fold time-ordered CV of the random forest on the training window
result_rf_cv = cross_validation(x_training, 5, 'RandomForestRegressor', model_rf)
result_rf_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,RandomForestRegressor,440.15 +/- 120.29,0.06 +/- 0.01,618.06 +/- 154.36


## 7.5. XGB Regressor

In [125]:
# XGBoost regressor with the squared-error objective, fitted on the training split
model_xgb = XGBRegressor(objective='reg:squarederror')
yhat_xgb = model_xgb.fit(x_train, y_train).predict(x_test)

# hold-out performance
result_xgb = ml_error('XGBRegressor', y_test, yhat_xgb)
result_xgb

Unnamed: 0,Model,MAE,MAPE,RMSE
0,XGBRegressor,668.52,0.1,957.39


In [126]:
# 5-fold time-ordered CV of the XGBoost model on the training window
result_xgb_cv = cross_validation(x_training, 5, 'XGBRegressor', model_xgb)
result_xgb_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBRegressor,693.29 +/- 125.89,0.1 +/- 0.01,963.24 +/- 171.03


## 7.6. Result

In [127]:
# stack the single-row hold-out results; ignore_index gives each model its own
# row label instead of five rows that all carry index 0
result = pd.concat([result_baseline, result_lr, result_lrr, result_rf, result_xgb], ignore_index=True)
result

Unnamed: 0,Model,MAE,MAPE,RMSE
0,DummyRegressor,2021.75,0.37,2661.58
0,LinearRegression,901.44,0.14,1252.7
0,Lasso,903.62,0.14,1256.37
0,RandomForestRegressor,454.46,0.07,641.04
0,XGBRegressor,668.52,0.1,957.39


In [133]:
# stack the single-row cross-validation results; ignore_index gives each model
# its own row label instead of five rows that all carry index 0
result_cv = pd.concat([result_baseline_cv, result_lr_cv, result_lrr_cv, result_rf_cv, result_xgb_cv], ignore_index=True)
result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,DummyRegressor,2142.95 +/- 335.77,0.36 +/- 0.01,2839.45 +/- 487.56
0,LinearRegression,932.75 +/- 112.67,0.13 +/- 0.0,1276.53 +/- 176.78
0,LinearRegression - Lasso,932.87 +/- 112.86,0.13 +/- 0.0,1276.79 +/- 177.06
0,RandomForestRegressor,440.15 +/- 120.29,0.06 +/- 0.01,618.06 +/- 154.36
0,XGBRegressor,693.29 +/- 125.89,0.1 +/- 0.01,963.24 +/- 171.03


# 8.0. Hyperparameter Fine Tuning

# 9.0. Model Performance

# 10.0. Deploy to Production