# training the selected XGB model to the final prediction

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics 

In [2]:
train = pd.read_csv("../data/trainp.csv")
dev = pd.read_csv("../data/devp.csv")
test= pd.read_csv("../data/testp.csv")


In [3]:
varsel = pd.read_csv("../data/varsel.csv")

In [9]:
varsel.head()

Unnamed: 0.1,Unnamed: 0,Variable,Lasso,Ridge,RandomForest,GradientBoost,Linear Regression,Ada Boost,dtree,Sum
0,0,monthly_oilprice_avg,0,0,1,0,0,0,1,2
1,1,total_month_holidays,0,0,0,0,0,0,0,0
2,2,perishable,0,0,0,1,0,0,0,1
3,3,BREAD.BAKERY,0,0,0,0,0,0,0,0
4,4,BEVERAGES,0,0,1,1,0,1,1,4


In [13]:
train.head()

Unnamed: 0,id,monthly_oilprice_avg,BEVERAGES,POULTRY,GROCERY.I,CLEANING,store_typeA,store_typeD,city_Guayaquil,total_city_sales.x,sales_perc.x,cluster8,top5_item,class1040,class1072,year2017,month8,item_per,store_per,total_unit_sales
0,1474241,0.201154,0,0,0,0,0,0,0,0.083314,0.048508,0,0,0,0,0,0,0.001325,0.023096,3.688879
1,973784,0.229517,0,0,1,0,0,0,0,0.021197,0.012943,0,0,0,0,0,0,0.001186,0.020886,3.89182
2,1458848,0.269875,0,0,0,0,0,0,1,0.228751,0.124626,0,0,0,0,0,0,0.001323,0.022468,3.258097
3,185972,0.942353,0,0,0,1,0,1,0,0.028841,0.018528,0,0,0,0,0,0,0.001161,0.022039,5.023881
4,483059,0.229517,0,0,0,0,0,0,0,0.021197,0.012943,0,0,0,0,0,0,0.000969,0.020886,4.85203


In [10]:
v2 = varsel.loc[varsel['Sum']>=2]['Variable'].tolist()
v3 = varsel.loc[varsel['Sum']>=3]['Variable'].tolist()
v4 = varsel.loc[varsel['Sum']>=4]['Variable'].tolist()

In [11]:
v2

['monthly_oilprice_avg',
 'BEVERAGES',
 'POULTRY',
 'GROCERY.I',
 'CLEANING',
 'store_typeA',
 'store_typeD',
 'city_Guayaquil',
 'total_city_sales.x',
 'sales_perc.x',
 'cluster8',
 'top5_item',
 'class1040',
 'class1072',
 'year2017',
 'month8',
 'item_per',
 'store_per']

In [20]:
Xtrain= train.loc[:, v2]
y = train.loc[:,'total_unit_sales']

In [14]:
Xdev= dev.loc[:, v2]
ydev = dev.loc[:,'total_unit_sales']

In [15]:
Xtest= test.loc[:, v2]
ytest = test.loc[:,'total_unit_sales']

### XGboost

In [16]:
from xgboost.sklearn import XGBRegressor

## Running the base model

In [17]:
params = {'objective':'reg:linear', 'max_depth':6, 'learning_rate':1,
         'n_estimators':5}

In [21]:
xgb = XGBRegressor(**params).fit(Xtrain,y)



In [22]:
y_hat= xgb.predict(Xtrain)

In [24]:

print('RMSE= %f' % np.sqrt(metrics.mean_squared_error(y,y_hat)))

RMSE= 0.858874


In [25]:
ydev_hat= xgb.predict(Xdev)

In [27]:
print('RMSE= %f' % np.sqrt(metrics.mean_squared_error(ydev,ydev_hat)))


RMSE= 0.861525


In [28]:
ytest_hat= xgb.predict(Xtest)

In [29]:
print('RMSE= %f' % np.sqrt(metrics.mean_squared_error(ytest,ytest_hat)))

RMSE= 0.855770


## Running the xgb model with the selectd parameters after fine tunning

In [30]:
params1 = {'n_estimators': 971,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

In [32]:
xgb1 = XGBRegressor(**params1).fit(Xtrain,y)

Parameters: { bootstrap, max_features, min_samples_leaf, min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [34]:
y_hat= xgb1.predict(Xtrain)

In [35]:
ydev_hat= xgb1.predict(Xdev)

In [37]:
ytest_hat= xgb1.predict(Xtest)

In [36]:
print('RMSE= %f' % np.sqrt(metrics.mean_squared_error(y,y_hat)))


RMSE= 0.568114


In [38]:
print('RMSE= %f' % np.sqrt(metrics.mean_squared_error(ydev,ydev_hat)))


RMSE= 0.583763


In [39]:
print('RMSE= %f' % np.sqrt(metrics.mean_squared_error(ytest,ytest_hat)))


RMSE= 0.580874


In [44]:
np.sqrt(metrics.mean_squared_error(ytest,ytest_hat))

0.5808741728768898

# Table of the final model scores

In [7]:
final_scores = pd.DataFrame(columns= ['Model','RMSE train','RMSE dev','RMSE test'])

In [5]:
add_row = pd.Series(['XGB regressor base mode',0.858874,0.861525,0.855770],index=['Model','RMSE train','RMSE dev','RMSE test'])

In [9]:
final_scores = final_scores.append(add_row, ignore_index=True)

In [11]:
add_row1 = pd.Series(['XGB regressor fine tuned mode',0.568114,0.583763,0.580874],index=['Model','RMSE train','RMSE dev','RMSE test'])

In [12]:
final_scores = final_scores.append(add_row1, ignore_index=True)

In [13]:
final_scores

Unnamed: 0,Model,RMSE train,RMSE dev,RMSE test
0,XGB regressor base mode,0.858874,0.861525,0.85577
1,XGB regressor fine tuned mode,0.568114,0.583763,0.580874


In [53]:
test['predicted_sales'] = ytest_hat

In [74]:
final = test.loc[:,['total_unit_sales','predicted_sales']]

In [75]:
final['sales_reverse_log'] = 10**final['total_unit_sales'] - 1

In [77]:
final['predicted_sales_reverse_log'] = 10**final['predicted_sales'] - 1

In [78]:
final

Unnamed: 0,total_unit_sales,predicted_sales,sales_reverse_log,predicted_sales_reverse_log
0,4.043051,4.385918,1.104109e+04,24316.457031
1,3.784190,4.431681,6.083006e+03,27018.708984
2,4.262680,4.335666,1.830864e+04,21659.361328
3,3.806662,4.383003,6.406115e+03,24153.789062
4,5.659482,4.672786,4.565426e+05,47073.554688
5,3.850148,4.256692,7.080864e+03,18057.925781
6,4.828314,4.229294,6.734530e+04,16953.845703
7,4.584967,4.184513,3.845530e+04,15292.718750
8,4.204693,4.132079,1.602011e+04,13553.363281
9,5.793014,4.679325,6.208875e+05,47787.636719


In [79]:
final.to_csv("c:/Users/Guy/Documents/DataScience/project/predictXGboost.csv")

We can now decide a threshold for selecting our variables!