In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime
from IPython.display import Image
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


In [2]:
df6 = pd.read_csv('df6.csv')
df6['date'] = pd.to_datetime(df6['date'])
df6 = df6.sample(int(len(df6)*0.5), random_state=42)
df6.head(5)

Unnamed: 0,store,date,sales,promo,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,...,state_holiday_public holiday,state_holiday_regular_day,day_of_week_sin,day_of_week_cos,month_sin,month_cos,day_sin,day_cos,week_of_year_sin,week_of_year_cos
43879,257,2015-06-16,8.688622,1,0,0,1,-0.308065,12,2012,...,0,1,0.974928,-0.222521,1.224647e-16,-1.0,-0.2079117,-0.978148,0.120537,-0.992709
562681,4,2013-10-30,8.962007,0,0,2,3,-0.275806,9,2009,...,0,1,0.433884,-0.900969,-0.8660254,0.5,-2.449294e-16,1.0,-0.822984,0.568065
239643,593,2014-11-05,9.117348,1,0,0,3,4.956452,3,2010,...,0,1,0.433884,-0.900969,-0.5,0.866025,0.8660254,0.5,-0.748511,0.663123
689976,1026,2013-06-19,8.965973,1,0,2,1,-0.303226,6,2011,...,0,1,0.433884,-0.900969,1.224647e-16,-1.0,-0.7431448,-0.669131,0.120537,-0.992709
397240,19,2014-04-28,9.215029,1,0,0,3,0.146774,4,2014,...,0,1,0.781831,0.62349,0.8660254,-0.5,-0.4067366,0.913545,0.822984,-0.568065


# 7. MODELAÇÃO DE APRENDIZADO DE MÁQUINA

In [3]:
def ml_error(model_name, y_test, yhat):
    mae = (np.sum(np.abs(np.expm1(y_test) - np.expm1(yhat))))/len(np.expm1(yhat))
    mape = np.sum(np.abs(np.expm1(y_test) - np.expm1(yhat))/np.expm1(yhat))/len(yhat)
    rmse = np.sqrt((np.sum((np.expm1(y_test) - np.expm1(yhat))**2))/len(np.expm1(yhat)))
    
    return {'Model name': model_name,
            'MAE': mae,
            'MAPE': mape,
            'RMSE': rmse}

def cross_validation(X_train, kfold, model):
    l_mae = []
    l_mape = []
    l_rmse = []

    for k in range(kfold,0,-1):
        start_validation = X_train['date'].max() - datetime.timedelta(days=6*7*k)
        end_validation = X_train['date'].max() - datetime.timedelta(days=6*7*(k-1))
        print(f'Start validation on {start_validation} and end at {end_validation}')
        
        train = X_train[X_train['date'] < start_validation]
        validation = X_train[(X_train['date'] >= start_validation) & (X_train['date'] <= end_validation)]
        
        X_train_temp = train.drop(['date','sales'], axis=1)
        y_train_temp = train['sales']
        
        X_test_temp = validation.drop(['date','sales'], axis=1)
        y_test_temp = validation['sales']
        
        m = model.fit(X_train_temp, y_train_temp)
        
        yhat = m.predict(X_test_temp)
        
        m_result = ml_error(model, y_test_temp, yhat)
        
        mae = (np.sum(np.abs(np.expm1(y_test_temp) - np.expm1(yhat))))/len(np.expm1(yhat))
        l_mae.append(mae)
        mape = np.sum(np.abs(np.expm1(y_test_temp) - np.expm1(yhat))/np.expm1(yhat))/len(yhat)
        l_mape.append(mape)
        rmse = np.sqrt((np.sum((np.expm1(y_test_temp) - np.expm1(yhat))**2))/len(np.expm1(yhat)))
        l_rmse.append(rmse)
           
    return {'Model Name':model,
            'MAE CV':np.round(np.mean(l_mae),2).astype(str) + ' +/- ' + np.round(np.std(l_mae),2).astype(str),
            'MAPE CV':np.round(np.mean(l_mape),2).astype(str) + ' +/- ' + np.round(np.std(l_mape),2).astype(str),
            'RMSE CV':np.round(np.mean(l_rmse),2).astype(str) + ' +/- ' + np.round(np.std(l_rmse),2).astype(str)}

In [4]:
X_train = df6[df6['date'] < '2015-06-19']
y_train = X_train['sales']

X_test = df6[df6['date'] >= '2015-06-19']
y_test = X_test['sales']

In [5]:
cols_selected_boruta = [
    'store',
    'promo',
    'store_type',
    'assortment',
    'competition_distance',
    'competition_open_since_month',
    'competition_open_since_year',
    'promo2',
    'promo2_since_week',
    'promo2_since_year',
    'competition_time_month',
    'promo_time_week',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'week_of_year_cos',
    'week_of_year_sin']

X_train_incomplete = X_train[cols_selected_boruta]
X_test_incomplete = X_test[cols_selected_boruta]

In [6]:
boruta_selection = cols_selected_boruta.copy()
boruta_selection.extend(['date','sales'])
X_train = df6[boruta_selection]

### Average Model

In [7]:
aux1 = X_test_incomplete.copy()
aux1['sales'] = y_test.copy()

#predictions
aux2 = aux1[['store','sales']].groupby('store').mean().reset_index().rename(columns={'sales':'predictions'})
aux1 = pd.merge(aux1, aux2, how='left', on='store')
yhat_baseline = aux1['predictions']

#performance
baseline_result = ml_error('Average Model',y_test, yhat_baseline) #Np.expm1 é utilizado para aplicar exponenciação (que é o oposto de logarítmo) e retornar o Sales ao valor original
baseline_result

{'Model name': 'Average Model',
 'MAE': 1434.4989793118416,
 'MAPE': 0.23131266028042133,
 'RMSE': 2759.0457120494457}

### Linear Regression Model 

In [8]:
#model
lr = LinearRegression().fit(X_train_incomplete, y_train)

#prediction
yhat_lr = lr.predict(X_test_incomplete)

#perfomance
lr_result = ml_error('Linear Regression', y_test, yhat_lr)
lr_result

{'Model name': 'Linear Regression',
 'MAE': 1858.464486374337,
 'MAPE': 0.27998846615313344,
 'RMSE': 2667.6584090729243}

### Linear Regression Model - CV

In [9]:
model = LinearRegression()
lr_result_cv = cross_validation(X_train, 5, model)
lr_result_cv

Start validation on 2015-01-02 00:00:00 and end at 2015-02-13 00:00:00
Start validation on 2015-02-13 00:00:00 and end at 2015-03-27 00:00:00
Start validation on 2015-03-27 00:00:00 and end at 2015-05-08 00:00:00
Start validation on 2015-05-08 00:00:00 and end at 2015-06-19 00:00:00
Start validation on 2015-06-19 00:00:00 and end at 2015-07-31 00:00:00


{'Model Name': LinearRegression(),
 'MAE CV': '1924.94 +/- 101.7',
 'MAPE CV': '0.29 +/- 0.01',
 'RMSE CV': '2726.03 +/- 192.73'}

### Linear Regression Regularized Model - Lasso

In [10]:
#model
lrr = Lasso(alpha=0.01).fit(X_train_incomplete, y_train)

#prediction
yhat_lrr = lrr.predict(X_test_incomplete)

#perfomance
lrr_result = ml_error('Linear Regression Lasso', y_test, yhat_lrr)
lrr_result

{'Model name': 'Linear Regression Lasso',
 'MAE': 1885.6576649715582,
 'MAPE': 0.2906173375992527,
 'RMSE': 2744.8264243233243}

### Linear Regression Regularized Model - Lasso - CV

In [11]:
model = Lasso(alpha=0.01)
lrr_result_cv = cross_validation(X_train, 5, model)
lrr_result_cv

Start validation on 2015-01-02 00:00:00 and end at 2015-02-13 00:00:00
Start validation on 2015-02-13 00:00:00 and end at 2015-03-27 00:00:00
Start validation on 2015-03-27 00:00:00 and end at 2015-05-08 00:00:00
Start validation on 2015-05-08 00:00:00 and end at 2015-06-19 00:00:00
Start validation on 2015-06-19 00:00:00 and end at 2015-07-31 00:00:00


{'Model Name': Lasso(alpha=0.01),
 'MAE CV': '1944.24 +/- 138.18',
 'MAPE CV': '0.3 +/- 0.02',
 'RMSE CV': '2815.68 +/- 228.13'}

### Random Forest Regressor

In [12]:
##model
rf = RandomForestRegressor(n_estimators = 100, n_jobs=-1, random_state=42).fit(X_train_incomplete, y_train)

##prediction
yhat_rf = rf.predict(X_test_incomplete)
#
##perfomance
rf_result = ml_error('Random Forest Regressor', y_test, yhat_rf)
rf_result

{'Model name': 'Random Forest Regressor',
 'MAE': 692.1793474050789,
 'MAPE': 0.09870067090594907,
 'RMSE': 1035.515449573678}

### Random Forest Regressor - CV

In [13]:
model = RandomForestRegressor(n_estimators = 100, n_jobs=-1, random_state=42)
rf_result_cv = cross_validation(X_train, 5, model)
rf_result_cv

Start validation on 2015-01-02 00:00:00 and end at 2015-02-13 00:00:00
Start validation on 2015-02-13 00:00:00 and end at 2015-03-27 00:00:00
Start validation on 2015-03-27 00:00:00 and end at 2015-05-08 00:00:00
Start validation on 2015-05-08 00:00:00 and end at 2015-06-19 00:00:00
Start validation on 2015-06-19 00:00:00 and end at 2015-07-31 00:00:00


{'Model Name': RandomForestRegressor(n_jobs=-1, random_state=42),
 'MAE CV': '750.06 +/- 113.79',
 'MAPE CV': '0.11 +/- 0.02',
 'RMSE CV': '1125.17 +/- 178.0'}

### XGBoost Regressor

In [20]:
#model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, eta=0.01, max_depth=10, subsample=0.7, colsample_bytree=0.9).fit(X_train_incomplete, y_train)

#prediction
yhat_xgb = model_xgb.predict(X_test_incomplete)

#perfomance
xgb_result = ml_error('XGBoost Regressor', y_test, yhat_xgb)
xgb_result

{'Model name': 'XGBoost Regressor',
 'MAE': 6676.870365104163,
 'MAPE': 21.002658028221266,
 'RMSE': 7328.829194117063}

### XGBoost Regressor - CV

In [15]:
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, eta=0.01, max_depth=10, subsample=0.7, colsample_bytree=0.9)
xgb_result_cv = cross_validation(X_train, 5, model)
xgb_result_cv

Start validation on 2015-01-02 00:00:00 and end at 2015-02-13 00:00:00
Start validation on 2015-02-13 00:00:00 and end at 2015-03-27 00:00:00
Start validation on 2015-03-27 00:00:00 and end at 2015-05-08 00:00:00
Start validation on 2015-05-08 00:00:00 and end at 2015-06-19 00:00:00
Start validation on 2015-06-19 00:00:00 and end at 2015-07-31 00:00:00


{'Model Name': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, enable_categorical=False,
              eta=0.01, gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.00999999978,
              max_delta_step=0, max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=6,
              num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None),
 'MAE CV': '6786.79 +/- 328.63',
 'MAPE CV': '21.47 +/- 0.84',
 'RMSE CV': '7413.67 +/- 379.84'}

### Module Comparison

In [16]:
modelling_result = pd.concat([pd.DataFrame([baseline_result]), pd.DataFrame([lr_result]), pd.DataFrame([lrr_result]), pd.DataFrame([rf_result]), pd.DataFrame([xgb_result])])
modelling_result.sort_values('RMSE')

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Random Forest Regressor,692.179347,0.098701,1035.51545
0,Linear Regression,1858.464486,0.279988,2667.658409
0,Linear Regression Lasso,1885.657665,0.290617,2744.826424
0,Average Model,1434.498979,0.231313,2759.045712
0,XGBoost Regressor,6676.870365,21.002658,7328.829194


### Module Comparison - CV

In [17]:
modelling_result_cv = pd.concat([pd.DataFrame([lr_result_cv]), pd.DataFrame([lrr_result_cv]), pd.DataFrame([rf_result_cv]), pd.DataFrame([xgb_result_cv])])
modelling_result.sort_values('RMSE')

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Random Forest Regressor,692.179347,0.098701,1035.51545
0,Linear Regression,1858.464486,0.279988,2667.658409
0,Linear Regression Lasso,1885.657665,0.290617,2744.826424
0,Average Model,1434.498979,0.231313,2759.045712
0,XGBoost Regressor,6676.870365,21.002658,7328.829194
