## Model Selection

In [1]:
# Load libraries
import numpy as np 
import pandas as pd 
from subprocess import check_output
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import glob, re
from sklearn import *
from datetime import datetime
from xgboost import XGBRegressor

In [3]:
np.random.seed(10)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

### 1.Data import and aggregation

In [6]:
# Data Aggregation
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [7]:
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,visit_date,visitors,air_store_id,latitude,longitude,month,date,dw,...,Ōsaka-fu,Hyōgo-ken,Hokkaidō,Shizuoka-ken,Fukuoka-ken,Hiroshima-ken,Niigata-ken,Miyagi-ken,reserve_visitors_air_1,air_date_diff_1
0,0,0,2016-01-13,25,air_ba937bf13d40fb24,35.658068,139.751599,1,13,2,...,0,0,0,0,0,0,0,0,,
1,1,1,2016-01-13,21,air_25e9888d30b386df,35.626568,139.725858,1,13,2,...,0,0,0,0,0,0,0,0,,
2,2,2,2016-01-13,40,air_fd6aac1043520e83,35.658068,139.751599,1,13,2,...,0,0,0,0,0,0,0,0,,
3,3,3,2016-01-13,5,air_64d4491ad8cdb1c6,35.658068,139.751599,1,13,2,...,0,0,0,0,0,0,0,0,,
4,4,4,2016-01-13,16,air_5c65468938c07fa5,35.661777,139.704051,1,13,2,...,0,0,0,0,0,0,0,0,,


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247009 entries, 0 to 247008
Data columns (total 58 columns):
visit_date                      247009 non-null object
visitors                        247009 non-null int64
air_store_id                    247009 non-null object
latitude                        247009 non-null float64
longitude                       247009 non-null float64
month                           247009 non-null int64
date                            247009 non-null int64
dw                              247009 non-null int64
dy                              247009 non-null int64
holiday_flg                     247009 non-null int64
sunday                          247009 non-null int64
saturday                        247009 non-null int64
sat/sun/hol                     247009 non-null float64
precipitation                   247009 non-null float64
avg_temperature                 247009 non-null float64
hours_sunlight                  247009 non-null float64
avg_wind_sp

In [9]:
train = train.drop(['Unnamed: 0' , 'Unnamed: 0.1'], axis=1)

In [11]:
test.head()

Unnamed: 0.1,Unnamed: 0,id,visitors,visit_date,air_store_id,dw,dy,month,holiday_flg,latitude,...,Saturday,Tōkyō-to,Ōsaka-fu,Hyōgo-ken,Hiroshima-ken,Fukuoka-ken,Hokkaidō,Miyagi-ken,Niigata-ken,Shizuoka-ken
0,0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,0,35.694003,...,0,1,0,0,0,0,0,0,0,0
1,1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,0,35.694003,...,0,1,0,0,0,0,0,0,0,0
2,2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,0,35.694003,...,0,1,0,0,0,0,0,0,0,0
3,3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,0,35.694003,...,0,1,0,0,0,0,0,0,0,0
4,4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,0,35.694003,...,0,1,0,0,0,0,0,0,0,0


In [16]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32019 entries, 0 to 32018
Data columns (total 56 columns):
id                              32019 non-null object
visitors                        32019 non-null int64
visit_date                      32019 non-null object
air_store_id                    32019 non-null object
dw                              32019 non-null int64
dy                              32019 non-null int64
month                           32019 non-null int64
holiday_flg                     32019 non-null int64
latitude                        32019 non-null float64
longitude                       32019 non-null float64
sunday                          32019 non-null int64
saturday                        32019 non-null int64
sat/sun/hol                     32019 non-null float64
precipitation                   32019 non-null float64
avg_temperature                 32019 non-null float64
hours_sunlight                  32019 non-null float64
avg_wind_speed              

In [15]:
test = test.drop(['Unnamed: 0'], axis=1)

In [20]:
#for c, dtype in zip(train.columns, train.dtypes):
#    if dtype == np.float64:
#        train[c] = train[c].astype(np.float32)

#for c, dtype in zip(test.columns, test.dtypes):
#    if dtype == np.float64:
 #       test[c] = test[c].astype(np.float32)

## Model selection 

In [17]:
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors']]

In [18]:
train = train.fillna(-1)
test = test.fillna(-1)

In [19]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5

In [20]:
model1 = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3, n_estimators=200, subsample=0.8, 
                      max_depth =10)

In [21]:
model2 = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=4)

In [22]:
model3 = XGBRegressor(learning_rate=0.2, n_estimators=280, subsample=0.8, 
                      colsample_bytree=0.8, max_depth =12)
#random_state=3

In [23]:
model1.fit(train[col], np.log1p(train['visitors'].values))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.2, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, n_iter_no_change=None, presort='auto',
             random_state=3, subsample=0.8, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [24]:
model2.fit(train[col], np.log1p(train['visitors'].values))

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
          weights='uniform')

In [25]:
model3.fit(train[col], np.log1p(train['visitors'].values))

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=280, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [26]:
preds1 = model1.predict(train[col])

In [27]:
preds2 = model2.predict(train[col])

In [28]:
preds3 = model3.predict(train[col])

In [29]:
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), preds1))

('RMSE GradientBoostingRegressor: ', 0.6043729591762557)


In [30]:
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), preds2))

('RMSE KNeighborsRegressor: ', 0.6657030733438531)


In [31]:
print('RMSE XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values), preds3))

('RMSE XGBRegressor: ', 0.5748429053017586)


In [None]:
preds1 = model1.predict(test[col])

In [None]:
preds2 = model2.predict(test[col])

In [None]:
preds3 = model3.predict(test[col])

In [None]:
test['visitors'] = 0.3*preds1+0.3*preds2+0.4*preds3
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)

In [None]:
sub1 = test[['id','visitors']].copy()
del train; del data;

In [32]:
X=train_io[col]
X_train = train_io[train_io.visit_date<'2017-03-01'][col]
X_test = train_io[train_io.visit_date>'2017-03-01'][col]

y_train = np.log1p(train_io[train_io.visit_date<'2017-03-01']['visitors'].values)
y_test = np.log1p(train_io[train_io.visit_date>'2017-03-01']['visitors'].values)

In [42]:
def RMSLE(y, pred):
    return mean_squared_error(y, pred)**0.5

In [50]:
model1 = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3, n_estimators=200, subsample=0.8, 
                      max_depth =10)
model2 = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=4)
model3 = XGBRegressor(learning_rate=0.2, n_estimators=280, subsample=0.8, colsample_bytree=0.8, max_depth =12)

In [51]:
model1.fit(train_io[col], np.log1p(train_io['visitors'].values))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.2, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, n_iter_no_change=None, presort='auto',
             random_state=3, subsample=0.8, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
model2.fit(train_io[col], np.log1p(train_io['visitors'].values))

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
          weights='uniform')

In [56]:
model3.fit(train_io[col], np.log1p(train_io['visitors'].values))

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=280, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [58]:
preds1 = model1.predict(train_io[col])
preds2 = model2.predict(train_io[col])
preds3 = model3.predict(train_io[col])

In [60]:
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds1))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds2))
print('RMSE XGBRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds3))

('RMSE GradientBoostingRegressor: ', 0.07063084570011836)
('RMSE KNeighborsRegressor: ', 0.5199218454804754)
('RMSE XGBRegressor: ', 0.00899810689801468)


In [30]:
preds1 = model1.predict(train_io[col])

print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds1))
#preds1 = model1.predict(test[col])

('RMSE GradientBoostingRegressor: ', 0.5707351538377076)


In [None]:
preds1 = model1.predict(test[col])
preds2 = model2.predict(test[col])
preds3 = model3.predict(test[col])

In [None]:
test['visitors'] = 0.3*preds1+0.3*preds2+0.4*preds3
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
del train; del data;

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [38]:
test.to_csv(r'C:\Users\sergey\Documents\Recruit Restaurant Visitor_2\test_2.csv')