In [1]:
import pandas as pd
import numpy as np
import inflection
import math
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import random
import warnings

from scipy           import stats
from IPython.display import HTML
from IPython.display import Image
from tabulate        import tabulate
from boruta          import BorutaPy

from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

In [14]:
def cross_validation(x_training, kfold, model_name, model):
    mae_list = []
    mape_list = []
    rmse_list = []
    for k in range(1, kfold + 1):
    #     print('\nKFolder = {}'.format(k))
    
        # start and end date for validation
        validation_start_date = x_training['date'].max() - datetime.timedelta( days = k*6*7)
        validation_end_date = x_training['date'].max() - datetime.timedelta( days = (k - 1)*6*7)

        # filtering dataset
        training = x_training[x_training['date'] < validation_start_date]
        validation = x_training[(x_training['date'] >= validation_start_date) & (x_training['date'] <= validation_end_date)]

        # training and validation dataset
        xtraining = training.drop(['date','sales'], axis = 1)
        ytraining = training['sales']

        xvalidation = validation.drop(['date','sales'], axis = 1)
        yvalidation = validation['sales']

        # model
        m = model.fit( xtraining, ytraining)

        # prediction 
        yhat = m.predict( xvalidation)

        # performance
        m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))
        mae_list.append(m_result['MAE'])
        mape_list.append(m_result['MAPE'])
        rmse_list.append(m_result['RMSE'])

    return pd.DataFrame({'Model Name':model_name,
                        'MAE CV': np.round(np.mean(mae_list), 2).astype(str) + ' +/- ' + np.round(np.std(mae_list), 2).astype(str),
                        'MAPE CV': np.round(np.mean(mape_list), 2).astype(str) + ' +/- ' + np.round(np.std(mape_list), 2).astype(str),
                        'RMSE CV': np.round(np.mean(rmse_list), 2).astype(str) + ' +/- ' + np.round(np.std(rmse_list), 2).astype(str)}, index = [0])

def mean_absolute_percentage_error(y, yhat):
    return np.mean(np.abs((y - yhat)/y))

def ml_error(model_name, y, yhat):
    mae = mean_absolute_error(y, yhat)
    mape = mean_absolute_percentage_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    
    return pd.DataFrame({'Model Name':model_name,'MAE':mae,'MAPE':mape,'RMSE':rmse}, index = [0])


In [4]:
df6 = pd.read_csv('df6.csv')
df6.head()

Unnamed: 0.1,Unnamed: 0,store,day_of_week,date,sales,promo,school_holiday,store_type,assortment,competition_distance,...,sh_public holiday,sh_regular day,day_of_week_sin,day_of_week_cos,month_sin,month_cos,day_sin,day_cos,week_of_year_sin,week_of_year_cos
0,0,1,5,2015-07-31,8.568646,1,1,2,1,-0.170968,...,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
1,1,2,5,2015-07-31,8.71029,1,1,0,1,-0.283871,...,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
2,2,3,5,2015-07-31,9.025816,1,1,0,1,1.903226,...,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
3,3,4,5,2015-07-31,9.546527,1,1,2,2,-0.275806,...,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984
4,4,5,5,2015-07-31,8.481151,1,1,0,1,4.448387,...,0,1,-0.974928,-0.222521,-0.5,-0.866025,0.207912,0.978148,-0.568065,-0.822984


In [5]:
cols_drop = ['week_of_year','day','month','day_of_week','promo_since','competition_since']
df6.drop(cols_drop, axis = 1, inplace = True)

# Train, Test split. 

# Training dataset
X_train = df6[df6['date'] < '2015-06-19']
y_train = X_train['sales']

# test dataset
X_test = df6[df6['date'] >= '2015-06-19']
y_test = X_test['sales']

print('Training Min Date: {}'.format(X_train['date'].min()))
print('Training Max Date: {}'.format(X_train['date'].max()))

print('Test Min Date: {}'.format(X_test['date'].min()))
print('Test Max Date: {}'.format(X_test['date'].max()))

X_train_n = X_train.drop(['date','sales'], axis = 1).values
y_train_n = y_train.values.ravel()

Training Min Date: 2013-01-01
Training Max Date: 2015-06-18
Test Min Date: 2015-06-19
Test Max Date: 2015-07-31


In [6]:
# Selectec columns boruta
cols_selected_boruta = [
 'store',
 'promo',
 'store_type',
 'assortment',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2',
 'promo2_since_week',
 'promo2_since_year',
 'competition_time_month',
 'promo_time_week',
 'day_of_week_sin', 
 'day_of_week_cos',
 'month_sin',
 'month_cos',
 'day_sin',
 'day_cos',
 'week_of_year_sin',
 'week_of_year_cos']

In [7]:
x_train = X_train[ cols_selected_boruta]
x_test = X_test[ cols_selected_boruta]

In [8]:
# Adding sales and date to columns selected by boruta.
add_features = ['date','sales']
cols_selected_boruta.extend(add_features)
cols_selected_boruta_full = cols_selected_boruta.copy()

In [27]:
x_training = X_train[ cols_selected_boruta_full]
x_training['date'] = pd.to_datetime(x_training['date'])

In [28]:
aux1 = x_test.copy()
aux1['sales'] = y_test.copy()

In [29]:
# model
model_xgb = xgb.XGBRegressor(objective = 'reg:squarederror',
                             n_estimators = 100,
                             eta = 0.1,
                             max_depth = 10,
                             subsample = 0.7,
                             closample_bytree = 0.9).fit(x_train, y_train)

# prediction
yhat_xgb = model_xgb.predict(x_test)

# performance
xgb_result = ml_error('XGBoost Regressor', np.expm1(y_test), np.expm1(yhat_xgb))
xgb_result

Parameters: { "closample_bytree" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,XGBoost Regressor,864.842658,0.126081,1278.785981


In [38]:
param = {'n_estimators':[1500, 1700, 2500, 3000, 3500],
         'eta':[0.01, 0.03, 0.3],
         'max_depth':[3, 5, 10],
         'subsample':[0.1, 0.5, 0.7],
         'colsample_bytree':[0.3, 0.7, 0.9],
         'min_child_weight':[3, 8, 15]}
MAX_EVAL = 10
warnings.filterwarnings('ignore')

In [37]:
final_result = pd.DataFrame()
# model
for i in range(MAX_EVAL):
    # choose values for parameters randomly
    hp = {k: v[random.randint(0,len(v) - 1)] for k, v in param.items()}
    print(hp)
    
    # model
    model_xgb = xgb.XGBRegressor(#objecitve = 'reg:squarederror',
                                 n_estimators = hp['n_estimators'],
                                 eta = hp['eta'], 
                                 max_depth = hp['max_depth'],
                                 subsample = hp['subsample'],
                                 #closample_bytree = hp['colsample_bytree'],
                                 min_child_weight = hp['min_child_weight'])

    # performance
    xgb_result = cross_validation(x_training, 2, 'XGBoost Regressor', model_xgb)
    final_result = pd.concat([final_result, xgb_result])
final_result

{'n_estimators': 17, 'eta': 0.01, 'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 0.9, 'min_child_weight': 8}
{'n_estimators': 17, 'eta': 0.01, 'max_depth': 10, 'subsample': 0.5, 'colsample_bytree': 0.7, 'min_child_weight': 8}


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBoost Regressor,7420.51 +/- 128.29,1.0 +/- 0.0,8074.64 +/- 175.38
0,XGBoost Regressor,7420.51 +/- 128.28,1.0 +/- 0.0,8074.62 +/- 175.37
