# Extreme Gradient Boosting (XGBoost)

In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from aux_fun import my_eval, r2_month
# Silence every warning for the rest of the session (no category or module
# filter is given, so this applies to all of them, including deprecations).
warnings.filterwarnings('ignore')

## Dropping problematic columns, setting feature and target variables

In [2]:
# Read the imputed training set from disk and preview its first five rows.
train_data = pd.read_csv('./data/train_imputed.csv')
train_data.head(n=5)

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Day,Month,Year,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,0,1,0,326,7,495,5676,9643,17130,...,1,3,2016,1,0,0,0,1,0,0
1,1000,0,1,0,326,7,608,8111,9643,17130,...,2,3,2016,1,0,0,0,1,0,0
2,1000,0,1,0,326,7,665,8300,9643,17130,...,4,3,2016,1,0,0,0,1,0,0
3,1000,0,1,0,326,7,630,7154,9643,17130,...,5,3,2016,1,0,0,0,1,0,0
4,1000,0,0,0,326,7,0,0,9643,17130,...,6,3,2016,1,0,0,0,1,0,0


In [3]:
# Target: daily NumberOfSales.
y = train_data['NumberOfSales']

# Features: every remaining column except the target itself, WindDirDegrees
# and NumberOfCustomers (the latter does not appear in the prediction file
# preview, so it presumably is unavailable at prediction time -- confirm).
X = train_data.drop(columns=['NumberOfSales', 'WindDirDegrees', 'NumberOfCustomers'])

## Algorithm Evaluation

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Hyper-parameters shared by the evaluation model and the final model.
ind_params = dict(max_depth=10, n_estimators=1000)

In [6]:
# Train on the training split (fit returns the estimator itself), then
# predict the held-out split for scoring.
model = xgb.XGBRegressor(**ind_params).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [7]:
# Each metric is wrapped in a thunk so it is computed right before it is
# printed, in the same order as listed here.
metrics = (
    ("Prediction Error:", lambda: my_eval(X_test, y_test, y_pred)),
    ("R2:", lambda: r2_score(y_test, y_pred)),
    ("R2 Month:", lambda: r2_month(X_test, y_test, y_pred)),
)
for label, compute in metrics:
    print(label, compute())

Prediction Error: 0.0318053400264
R2: 0.957183318211
R2 Month: 0.992250546342


## Algorithm Prediction

In [8]:
# Reload the complete training file for the final fit (no hold-out split is
# made in this section) and rebuild the target and the feature matrix.
train_data = pd.read_csv('./data/train_imputed.csv')
y = train_data['NumberOfSales']
X = train_data.drop(columns=['NumberOfSales', 'WindDirDegrees', 'NumberOfCustomers'])
train_data.head()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Day,Month,Year,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,0,1,0,326,7,495,5676,9643,17130,...,1,3,2016,1,0,0,0,1,0,0
1,1000,0,1,0,326,7,608,8111,9643,17130,...,2,3,2016,1,0,0,0,1,0,0
2,1000,0,1,0,326,7,665,8300,9643,17130,...,4,3,2016,1,0,0,0,1,0,0
3,1000,0,1,0,326,7,630,7154,9643,17130,...,5,3,2016,1,0,0,0,1,0,0
4,1000,0,0,0,326,7,0,0,9643,17130,...,6,3,2016,1,0,0,0,1,0,0


In [9]:
# Load the imputed prediction set and discard WindDirDegrees, the same column
# that is removed from the training features; then preview the first rows.
prediction_data = (
    pd.read_csv('./data/test_imputed.csv')
    .drop(columns=['WindDirDegrees'])
)
prediction_data.head()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,Region_AreaKM2,Region_GDP,Region_PopulationK,Max_Dew_PointC,...,Day,Month,Year,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,0,1,0,326,7,9643,17130,2770,276,...,1,3,2018,1,0,0,0,1,0,0
1,1000,0,1,0,326,7,9643,17130,2770,277,...,2,3,2018,1,0,0,0,1,0,0
2,1000,0,1,0,326,7,9643,17130,2770,275,...,3,3,2018,1,0,0,0,1,0,0
3,1000,0,0,0,326,7,9643,17130,2770,280,...,4,3,2018,1,0,0,0,1,0,0
4,1000,0,1,1,326,7,9643,17130,2770,280,...,5,3,2018,1,0,0,0,1,0,0


In [10]:
# Final model: fit on the full training set, then predict the prediction set.
#
# The prediction frame is loaded from a separate CSV, so its columns are not
# guaranteed to match the training features in set or order. Select and
# reorder them explicitly by the training columns, and fail early with a
# readable message if a training feature is absent.
feature_cols = list(X.columns)
missing_cols = [col for col in feature_cols if col not in prediction_data.columns]
if missing_cols:
    raise ValueError(
        "prediction_data is missing columns used for training: %s" % missing_cols
    )

model = xgb.XGBRegressor(**ind_params)
model.fit(X, y)
# prediction_data itself is left untouched; later cells still read StoreID and
# Month from it.
y_pred = model.predict(prediction_data[feature_cols])

Grouping results by store and month (summing the daily predictions)

In [11]:
# Attach the daily predictions to their store/month keys and sum them into
# one NumberOfSales total per (StoreID, Month) pair.
export = (
    prediction_data[['StoreID', 'Month']]
    .assign(NumberOfSales=y_pred)
    .groupby(['StoreID', 'Month'], as_index=False)['NumberOfSales']
    .sum()
)

Exporting Predictions

In [12]:
# Write the aggregated predictions without the DataFrame index.
# to_csv does not create missing folders, so make sure the output directory
# exists first (a no-op when it is already there).
output_path = Path('./predictions/xgboost.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
export.to_csv(output_path, index=False)