## Part II: feature engineering and preliminary models

In [1]:
import  pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn import cross_validation, metrics   
from sklearn.grid_search import GridSearchCV   



In [2]:
# Load the sales rows plus the store and holiday lookup tables.
data=pd.read_csv("store_1.csv")
stores=pd.read_csv("stores.csv")
holidays=pd.read_csv("holidays_events.csv")

# merge the store attributes (city, state, type, cluster, ...) onto each sales row
data=pd.merge(data,stores,on="store_nbr")

# merge the holidays information to the dataset.
# Keep one row per (date, locale_name): the left merge below would otherwise emit
# one copy of a sales row for every holiday row that shares its key.
holidays=holidays[['date','locale_name']].drop_duplicates()
df=pd.merge(data,holidays,how='left',left_on=['date','city'],right_on=['date','locale_name'])
# holidays = 1 when a holiday row matched the store's city on that date, else 0.
# NOTE(review): only holidays whose locale_name equals the city are matched here;
# confirm whether holidays recorded under other locale names should also count.
df['holidays']=np.where(df['locale_name'].isnull(),0,1)

# extract the days from date column, and this will create a new feature, days_from
# (number of whole days elapsed since 2013-01-01)
hard_date = pd.to_datetime('20130101',format='%Y%m%d')
df['days_from'] = (pd.to_datetime(df['date'])-hard_date).dt.days

# year is converted to the years since 2013; and this will also create a new feature
df['year']=df['year']-2013

# remove the columns which are not features for training
df=df.drop(['Unnamed: 0','id','locale_name','store_nbr'],axis=1)

# create the oil price feature
oilprice=pd.read_csv("oil.csv")
df['date']=pd.to_datetime(df['date'])
oilprice['date']=pd.to_datetime(oilprice['date'])
# discard days without a quoted price so that they are forward-filled below
oilprice=oilprice.dropna()
df=pd.merge(df,oilprice,how='left',on='date')
# Forward-fill missing prices in chronological order and assign the result back.
# The previous `df.dcoilwtico.fillna(method='ffill', inplace=True)` filled in row
# order and relied on an in-place update through attribute access, which newer
# pandas versions deprecate (and which does nothing under Copy-on-Write).
# The stable sort only decides the fill order; assignment re-aligns on the index,
# so the row order of df is unchanged. Dates before the first quoted price stay NaN.
df['dcoilwtico']=df.sort_values('date',kind='mergesort')['dcoilwtico'].ffill()

# a magnitude 7.8 earthquake struck Ecuador on April 16, 2016.
# Remove the data from 2016-04-16 through 2016-05-16 (both days included).
df=df[(df['date']<'2016-04-16') | (df['date']>'2016-05-16')]

# drop the raw date (already encoded by days_from) and the store attributes,
# which presumably have zero or near-zero variance within this single-store data
df=df.drop(['date','city','state','type','cluster'],axis=1)

In [3]:
# choose one item for training
ITEM_NBR = 103665
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous df (avoids SettingWithCopyWarning).
df=df[df['item_nbr']==ITEM_NBR].copy()

# treat month and day-of-week as categorical and one-hot encode them;
# drop_first=True omits the first level of each to avoid redundant columns
df['month']=df['month'].astype('category')
df['dayofweek']=df['dayofweek'].astype('category')
Categories=df[['month','dayofweek']]
dummies=pd.get_dummies(Categories,drop_first=True)

# replace the raw categorical columns (and the now-constant item_nbr) by the dummies
df=df.drop(['item_nbr','month','dayofweek'],axis=1)
df=pd.concat([df,dummies],axis=1)

# unit_sales is the regression target; every remaining column is a feature
targets=df['unit_sales']
features=df.drop('unit_sales',axis=1)

# hold out 10% of the rows for testing (fixed seed for reproducibility)
# NOTE(review): this is a random split of date-indexed rows (days_from is a
# feature); consider a chronological split if the goal is forecasting.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.1, random_state=42)

In [4]:
# Gradient Boost Algorithm
# Booster parameters for the native xgb.train API. The objective is left at
# xgboost's default (squared-error regression): the former 'obj' key is not a
# booster parameter and was ignored. 'learning_rate' duplicated 'eta' (same
# value), so only 'eta' is kept.
params = {'max_depth':3,
          'min_child_weight':10,
          'subsample':0.5,
          'colsample_bytree':0.6,
          'eta':0.3}
# xgb.train does not read 'n_estimators' from params; the number of trees is
# set by num_boost_round (default 10). Pass the intended 1000 rounds explicitly.
NUM_BOOST_ROUND = 1000

dtrain=xgb.DMatrix(X_train.values,
                   label=y_train.values,
                   feature_names=list(X_train.columns))
xgb_model=xgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND)
dtest=xgb.DMatrix(X_test.values,
                  label=y_test.values,
                  feature_names=list(X_test.columns))
# predictions are rounded to whole units before comparison
predicted_xgb=np.around(xgb_model.predict(dtest))

# Random forest Regression
model=RandomForestRegressor(max_depth=2, random_state=0, n_jobs=-1)
model.fit(X_train,y_train)
predicted_rf=np.around(model.predict(X_test))

# Support vector Machine
# (clf ends the cell bound to the fitted SVR, as before; the booster now has its
# own name instead of sharing clf)
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X_train,y_train)
predicted_svm=np.around(clf.predict(X_test))

In [7]:
# compare these three models with the criterion of squared errors
actual=y_test.values

# per-row residuals (rounded prediction minus observed unit sales)
error_svm=predicted_svm-actual
error_rf=predicted_rf-actual
error_xgb=predicted_xgb-actual

# A notebook cell only displays its last expression, so the three sums are
# collected into one Series instead of being evaluated (and discarded) one by one.
squared_errors=pd.Series({'svm':np.sum(error_svm**2),
                          'rf':np.sum(error_rf**2),
                          'xgb':np.sum(error_xgb**2)},
                         name='sum_squared_error')

# the original author's run reported xgb as the best performer;
# the smallest value is listed first
squared_errors.sort_values()

In [None]:
# Model evaluation and tuning
