## Model Selection

<a id='up'></a>

0.[Load libraries](#load-libraries)

1.[Data import and aggregation](#data-import)

2.[Model Selection](#model-select)

2.1.[Gradient Boosting Model](#gb-model)

2.2.[KNeighbors Model](#kn-model)

2.3.[XGB Model](#xgb-model)


3.[Prediction](#prediction)


## <a id='load-libraries'>0 Load libraries</a>

In [7]:
# Load libraries
import numpy as np 
import pandas as pd 
from subprocess import check_output
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
import glob, re
from sklearn import *
from datetime import datetime
from xgboost import XGBRegressor

In [9]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(10)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

## <a id='data-import'>1. Data import and aggregation</a>

In [16]:
# Load the training table; the first CSV column is used as the row index.
train = pd.read_csv('../input/train.csv',index_col=[0])


In [54]:
# Load the test table; the first CSV column is used as the row index.
test1 = pd.read_csv('../input/test.csv',index_col=[0])

In [17]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,Unnamed: 0.1,visit_date,visitors,air_store_id,latitude,longitude,month,date,dw,dy,...,Ōsaka-fu,Hyōgo-ken,Hokkaidō,Shizuoka-ken,Fukuoka-ken,Hiroshima-ken,Niigata-ken,Miyagi-ken,reserve_visitors_air_1,air_date_diff_1
0,0,2016-01-13,25,air_ba937bf13d40fb24,35.658068,139.751599,1,13,2,13,...,0,0,0,0,0,0,0,0,,
1,1,2016-01-13,21,air_25e9888d30b386df,35.626568,139.725858,1,13,2,13,...,0,0,0,0,0,0,0,0,,


In [18]:
# Remove the 'Unnamed: 0.1' column (in the preview above it just repeats the
# row index). errors='ignore' keeps this cell re-runnable: once the column is
# gone, a plain drop() would raise KeyError on a second execution.
train = train.drop(columns=['Unnamed: 0.1'], errors='ignore')

In [6]:
# Print the index range, column names, non-null counts and dtypes of train.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247009 entries, 0 to 247008
Data columns (total 60 columns):
Unnamed: 0                      247009 non-null int64
Unnamed: 0.1                    247009 non-null int64
visit_date                      247009 non-null object
visitors                        247009 non-null int64
air_store_id                    247009 non-null object
latitude                        247009 non-null float64
longitude                       247009 non-null float64
month                           247009 non-null int64
date                            247009 non-null int64
dw                              247009 non-null int64
dy                              247009 non-null int64
holiday_flg                     247009 non-null int64
sunday                          247009 non-null int64
saturday                        247009 non-null int64
sat/sun/hol                     247009 non-null float64
precipitation                   247009 non-null float64
avg_temperature

[Up to the header](#up)

In [55]:
# Preview the first two rows of the test frame.
test1.head(2)

Unnamed: 0,id,visitors,visit_date,air_store_id,dw,dy,date,month,holiday_flg,reserve_visitors_air_1,...,Saturday,Tōkyō-to,Ōsaka-fu,Hyōgo-ken,Hiroshima-ken,Fukuoka-ken,Hokkaidō,Miyagi-ken,Niigata-ken,Shizuoka-ken
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,23,4,0,,...,0,1,0,0,0,0,0,0,0,0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,24,4,0,,...,0,1,0,0,0,0,0,0,0,0


In [56]:
# Print the index range, column names, non-null counts and dtypes of test1.
test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32019 entries, 0 to 32018
Data columns (total 59 columns):
id                              32019 non-null object
visitors                        32019 non-null int64
visit_date                      32019 non-null object
air_store_id                    32019 non-null object
dw                              32019 non-null int64
dy                              32019 non-null int64
date                            32019 non-null int64
month                           32019 non-null int64
holiday_flg                     32019 non-null int64
reserve_visitors_air_1          1131 non-null float64
air_date_diff_1                 1131 non-null float64
latitude                        32019 non-null float64
longitude                       32019 non-null float64
sunday                          32019 non-null int64
saturday                        32019 non-null int64
sat/sun/hol                     32019 non-null float64
precipitation                   

[Up to the header](#up)

## <a id='model-select'>2. Model selection</a>


In [30]:
# Show the column set and shape as they are before the weather columns go.
print(train.columns)
print(train.shape)

# Remove the weather measurements from the training frame.
# errors='ignore' makes the cell idempotent: on a re-run the columns are
# already absent and a plain drop() would raise KeyError.
train = train.drop(
    columns=[
        'avg_temperature', 'hours_sunlight', 'avg_wind_speed',
        'avg_vapor_pressure', 'avg_humidity', 'avg_sea_pressure',
        'avg_local_pressure', 'solar_radiation', 'cloud_cover',
        'high_temperature', 'low_temperature',
    ],
    errors='ignore',
)

Index(['visit_date', 'visitors', 'air_store_id', 'latitude', 'longitude',
       'month', 'date', 'dw', 'dy', 'holiday_flg', 'sunday', 'saturday',
       'sat/sun/hol', 'precipitation', 'avg_temperature', 'hours_sunlight',
       'avg_wind_speed', 'avg_vapor_pressure', 'avg_humidity',
       'avg_sea_pressure', 'avg_local_pressure', 'solar_radiation',
       'cloud_cover', 'high_temperature', 'low_temperature', 'lon_plus_lat',
       'Dining bar', 'Izakaya', 'Other', 'Italian/French', 'Cafe/Sweets',
       'Japanese food', 'Bar/Cocktail', 'Creative cuisine', 'Western food',
       'Yakiniku/Korean food', 'Asian', 'International cuisine',
       'Okonomiyaki/Monja/Teppanyaki', 'Karaoke/Party', 'Wednesday',
       'Thursday', 'Friday', 'Saturday', 'Monday', 'Tuesday', 'Sunday',
       'Tōkyō-to', 'Ōsaka-fu', 'Hyōgo-ken', 'Hokkaidō', 'Shizuoka-ken',
       'Fukuoka-ken', 'Hiroshima-ken', 'Niigata-ken', 'Miyagi-ken',
       'reserve_visitors_air_1', 'air_date_diff_1'],
      dtype='object'

[Up to the header](#up)

In [31]:
# List the columns that remain in train after the drop above.
train.columns

Index(['visit_date', 'visitors', 'air_store_id', 'latitude', 'longitude',
       'month', 'date', 'dw', 'dy', 'holiday_flg', 'sunday', 'saturday',
       'sat/sun/hol', 'precipitation', 'lon_plus_lat', 'Dining bar', 'Izakaya',
       'Other', 'Italian/French', 'Cafe/Sweets', 'Japanese food',
       'Bar/Cocktail', 'Creative cuisine', 'Western food',
       'Yakiniku/Korean food', 'Asian', 'International cuisine',
       'Okonomiyaki/Monja/Teppanyaki', 'Karaoke/Party', 'Wednesday',
       'Thursday', 'Friday', 'Saturday', 'Monday', 'Tuesday', 'Sunday',
       'Tōkyō-to', 'Ōsaka-fu', 'Hyōgo-ken', 'Hokkaidō', 'Shizuoka-ken',
       'Fukuoka-ken', 'Hiroshima-ken', 'Niigata-ken', 'Miyagi-ken',
       'reserve_visitors_air_1', 'air_date_diff_1'],
      dtype='object')

[Up to the header](#up)

In [34]:
# Show the column set and shape as they are before the weather columns go.
print(test1.columns)
print(test1.shape)

# Remove the same weather measurements from the test frame.
# errors='ignore' makes the cell idempotent: on a re-run the columns are
# already absent and a plain drop() would raise KeyError.
test1 = test1.drop(
    columns=[
        'avg_temperature', 'hours_sunlight', 'avg_wind_speed',
        'avg_vapor_pressure', 'avg_humidity', 'avg_sea_pressure',
        'avg_local_pressure', 'solar_radiation', 'cloud_cover',
        'high_temperature', 'low_temperature',
    ],
    errors='ignore',
)

Index(['id', 'visitors', 'visit_date', 'air_store_id', 'dw', 'dy', 'month',
       'holiday_flg', 'latitude', 'longitude', 'sunday', 'saturday',
       'sat/sun/hol', 'precipitation', 'avg_temperature', 'hours_sunlight',
       'avg_wind_speed', 'avg_vapor_pressure', 'avg_humidity',
       'avg_sea_pressure', 'avg_local_pressure', 'solar_radiation',
       'cloud_cover', 'high_temperature', 'low_temperature', 'lon_plus_lat',
       'Italian/French', 'Izakaya', 'Dining bar', 'Cafe/Sweets',
       'Japanese food', 'Western food', 'Okonomiyaki/Monja/Teppanyaki',
       'Other', 'Yakiniku/Korean food', 'Asian', 'Bar/Cocktail',
       'Creative cuisine', 'International cuisine', 'Karaoke/Party', 'Sunday',
       'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Tōkyō-to', 'Ōsaka-fu', 'Hyōgo-ken', 'Hiroshima-ken', 'Fukuoka-ken',
       'Hokkaidō', 'Miyagi-ken', 'Niigata-ken', 'Shizuoka-ken'],
      dtype='object')
(32019, 56)


[Up to the header](#up)

In [57]:
# Feature lists: every column except the row identifiers and the target.
# col10 is built from train, col11 from test1.
col10 = [
    name for name in train.columns
    if name not in ('id', 'air_store_id', 'visit_date', 'visitors')
]
col11 = [
    name for name in test1.columns
    if name not in ('id', 'air_store_id', 'visit_date', 'visitors')
]

[Up to the header](#up)

In [58]:
# Replace every missing value in both frames with the marker value -1.
train = train.fillna(value=-1)
test1 = test1.fillna(value=-1)

In [44]:
# Preview one training row after the missing values were filled.
train.head(1)

Unnamed: 0,visit_date,visitors,air_store_id,latitude,longitude,month,date,dw,dy,holiday_flg,...,Ōsaka-fu,Hyōgo-ken,Hokkaidō,Shizuoka-ken,Fukuoka-ken,Hiroshima-ken,Niigata-ken,Miyagi-ken,reserve_visitors_air_1,air_date_diff_1
0,2016-01-13,25,air_ba937bf13d40fb24,35.658068,139.751599,1,13,2,13,0,...,0,0,0,0,0,0,0,0,-1.0,-1.0


In [59]:
# Preview one test row after the missing values were filled.
test1.head(1)

Unnamed: 0,id,visitors,visit_date,air_store_id,dw,dy,date,month,holiday_flg,reserve_visitors_air_1,...,Saturday,Tōkyō-to,Ōsaka-fu,Hyōgo-ken,Hiroshima-ken,Fukuoka-ken,Hokkaidō,Miyagi-ken,Niigata-ken,Shizuoka-ken
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,23,4,0,-1.0,...,0,1,0,0,0,0,0,0,0,0


In [39]:
def RMSLE(y, pred):
    """Return the root-mean-squared error between ``y`` and ``pred``.

    No logarithm is taken inside this function. The result is a root mean
    squared *logarithmic* error only when both arguments are already on the
    log1p scale, which is how the notebook calls it (log1p of the visitor
    counts against predictions of models fitted on log1p targets).

    The computation uses NumPy directly so the function does not depend on
    the ``metrics`` name being injected by ``from sklearn import *``.

    Parameters
    ----------
    y : array-like
        Reference values, 1-D or 2-D (samples x outputs).
    pred : array-like
        Predicted values with the same number of samples and outputs.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain NaN/inf.
    """
    y_arr = np.asarray(y, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)

    # Treat a 1-D vector as a single-output column so (n,) and (n, 1) agree.
    if y_arr.ndim == 1:
        y_arr = y_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)

    if y_arr.shape != pred_arr.shape:
        raise ValueError(
            "y and pred must have the same shape, got %s and %s"
            % (y_arr.shape, pred_arr.shape)
        )
    if y_arr.size == 0:
        raise ValueError("y and pred must not be empty")
    if not (np.isfinite(y_arr).all() and np.isfinite(pred_arr).all()):
        raise ValueError("y and pred must not contain NaN or infinity")

    return np.sqrt(np.mean((y_arr - pred_arr) ** 2))

[Up to the header](#up)

### <a id='gb-model'>2.1 Gradient Boosting Model</a>

In [40]:
# Gradient-boosting regressor with a fixed random_state.
model1 = ensemble.GradientBoostingRegressor(
    learning_rate=0.2,
    n_estimators=200,
    max_depth=10,
    subsample=0.8,
    random_state=3,
)

### <a id='kn-model'>2.2 KNeighbors Model</a>

In [41]:
# k-nearest-neighbours regressor using 4 neighbours; n_jobs=-1 is passed through.
model2 = neighbors.KNeighborsRegressor(
    n_neighbors=4,
    n_jobs=-1,
)

### <a id='xgb-model'>2.3 XGB Model</a>

In [42]:
# XGBoost regressor; random_state is not passed, so the library default applies
# (a random_state=3 setting was tried and commented out).
model3 = XGBRegressor(
    learning_rate=0.2,
    n_estimators=280,
    max_depth=12,
    subsample=0.8,
    colsample_bytree=0.8,
)

[Up to the header](#up)

In [43]:
# Fit the gradient-boosting model on the log1p-transformed visitor counts.
model1.fit(
    X=train[col10],
    y=np.log1p(train['visitors'].values),
)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.2, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, n_iter_no_change=None, presort='auto',
             random_state=3, subsample=0.8, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [46]:
# Fit the nearest-neighbours model on the log1p-transformed visitor counts.
model2.fit(
    X=train[col10],
    y=np.log1p(train['visitors'].values),
)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
          weights='uniform')

In [47]:
# Fit the XGBoost model on the log1p-transformed visitor counts.
model3.fit(
    X=train[col10],
    y=np.log1p(train['visitors'].values),
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=12, min_child_weight=1, missing=None, n_estimators=280,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

[Up to the header](#up)

## <a id='prediction'>3. Prediction</a>

In [48]:
# Predict on the same rows model1 was fitted on, so any score computed from
# preds1 is an in-sample figure. Predictions are on the log1p scale.
preds1 = model1.predict(train[col10])

In [49]:
# Predict on the same rows model2 was fitted on (in-sample, log1p scale).
preds2 = model2.predict(train[col10])

In [50]:
# Predict on the same rows model3 was fitted on (in-sample, log1p scale).
preds3 = model3.predict(train[col10])

[Up to the header](#up)

In [51]:
# RMSE between log1p(visitors) and the log-scale training predictions of model1.
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), preds1))

RMSE GradientBoostingRegressor:  0.6019631986290861


In [52]:
# RMSE between log1p(visitors) and the log-scale training predictions of model2.
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), preds2))

RMSE KNeighborsRegressor:  0.6649371164127708


In [53]:
# RMSE between log1p(visitors) and the log-scale training predictions of model3.
print('RMSE XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values), preds3))

RMSE XGBRegressor:  0.5713126937807613


In [60]:
# Predict for the test frame. Columns are selected with col10 (the list built
# from train), i.e. in the same order that was used for fitting.
# This overwrites the training-set predictions previously held in preds1.
preds1 = model1.predict(test1[col10])

In [61]:
# Test-frame predictions of model2, using the training column order (col10).
preds2 = model2.predict(test1[col10])

In [63]:
# Test-frame predictions of model3, using the training column order (col10).
preds3 = model3.predict(test1[col10])

In [64]:
# Blend the three log-scale predictions (weights 0.3 / 0.3 / 0.4), undo the
# log1p transform with expm1 and floor the result at zero.
test1['visitors'] = np.clip(
    np.expm1(0.3*preds1+0.3*preds2+0.4*preds3),
    0.,
    None,
)

In [69]:
# Independent copy holding only the id and the blended visitor prediction.
sub1 = test1.loc[:, ['id', 'visitors']].copy()
#del train #del data;

### <a id='export-csv'>Export to csv</a>

In [74]:
# Set the path where the result file should be stored before enabling the export below.
#export_csv = sub1.to_csv ('../input/submission.csv', index = None, header=True) 
#print (df4)

[Up to the header](#up)

### <a id='golden-week'>Golden Week</a>

In [2]:
# The previous right-hand side (`pd.r`) was a truncated expression that fails
# with AttributeError. The NameError recorded for this cell refers to sub1, so
# the working copy is taken from the submission frame built above.
# NOTE(review): confirm sub1 is the intended source.
sub11 = sub1.copy()
subCopy = sub11.copy()

NameError: name 'sub1' is not defined

In [42]:
# NOTE(review): first_sub is not defined in the visible cells - confirm its source.
first_sub['tmp'] = np.nan

# Split the submission id into the store id (everything before the last '_')
# and the date (third '_'-separated field).
first_sub['air_store_id'] = first_sub.id.map(lambda x: '_'.join(x.split('_')[:-1]))
first_sub['date'] = first_sub.id.map(lambda x: x.split('_')[2])
first_sub['date'] = first_sub['date'].astype('datetime64[ns]')

# For each target date, 'tmp' receives the geometric mean of the visitor values
# on two reference dates. The arrays are combined positionally, so this assumes
# all three dates have the same rows in the same order - TODO confirm.
for target_day, earlier_day, later_day in (
    ('2017-5-3', '2017-4-29', '2017-5-13'),
    ('2017-5-4', '2017-4-29', '2017-5-13'),
    ('2017-5-5', '2017-4-29', '2017-5-13'),
    ('2017-5-2', '2017-4-28', '2017-5-12'),
):
    earlier_visitors = first_sub.loc[first_sub.date == earlier_day, 'visitors'].values
    later_visitors = first_sub.loc[first_sub.date == later_day, 'visitors'].values
    first_sub.loc[first_sub.date == target_day, 'tmp'] = np.sqrt(earlier_visitors * later_visitors)


In [50]:
# Fresh, unfitted instances of the three regressors; these replace the
# model1/model2/model3 objects created earlier in the notebook.
model1 = ensemble.GradientBoostingRegressor(
    learning_rate=0.2,
    n_estimators=200,
    max_depth=10,
    subsample=0.8,
    random_state=3,
)
model2 = neighbors.KNeighborsRegressor(
    n_neighbors=4,
    n_jobs=-1,
)
model3 = XGBRegressor(
    learning_rate=0.2,
    n_estimators=280,
    max_depth=12,
    subsample=0.8,
    colsample_bytree=0.8,
)

In [51]:
# Fit model1 on the log1p-transformed visitor counts of train_io.
# NOTE(review): train_io and col are not defined in the visible cells - confirm their source.
model1.fit(train_io[col], np.log1p(train_io['visitors'].values))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.2, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, n_iter_no_change=None, presort='auto',
             random_state=3, subsample=0.8, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
# Fit model2 on the log1p-transformed visitor counts of train_io.
# NOTE(review): train_io and col are not defined in the visible cells - confirm their source.
model2.fit(train_io[col], np.log1p(train_io['visitors'].values))

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
          weights='uniform')

In [56]:
# Fit model3 on the log1p-transformed visitor counts of train_io.
# NOTE(review): train_io and col are not defined in the visible cells - confirm their source.
model3.fit(train_io[col], np.log1p(train_io['visitors'].values))

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=280, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [58]:
# Predict on the same train_io rows the models were fitted on (in-sample).
preds1 = model1.predict(train_io[col])
preds2 = model2.predict(train_io[col])
preds3 = model3.predict(train_io[col])

In [60]:
# RMSE between log1p(visitors) of train_io and each model's in-sample predictions.
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds1))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds2))
print('RMSE XGBRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds3))

('RMSE GradientBoostingRegressor: ', 0.07063084570011836)
('RMSE KNeighborsRegressor: ', 0.5199218454804754)
('RMSE XGBRegressor: ', 0.00899810689801468)


In [30]:
# Recompute model1's in-sample predictions on train_io and print their RMSE
# against log1p(visitors).
preds1 = model1.predict(train_io[col])

print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train_io['visitors'].values), preds1))
#preds1 = model1.predict(test[col])

('RMSE GradientBoostingRegressor: ', 0.5707351538377076)


In [None]:
# Predict with all three models on the `test` frame.
# NOTE(review): test and col are not defined in the visible cells, and this
# cell has no execution count - confirm it is still needed.
preds1 = model1.predict(test[col])
preds2 = model2.predict(test[col])
preds3 = model3.predict(test[col])

In [None]:
# Blend the three predictions (weights 0.3 / 0.3 / 0.4), undo log1p with expm1,
# floor at zero, and keep id + visitors as the submission frame.
# NOTE(review): test and data are not defined in the visible cells; `del data`
# raises NameError if data does not exist. Deleting train also prevents
# re-running any earlier cell that reads it.
test['visitors'] = 0.3*preds1+0.3*preds2+0.4*preds3
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
del train; del data;

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [38]:
# Write the `test` frame to CSV.
# NOTE(review): the target is an absolute, machine-specific Windows path, so the
# cell only works on the original author's machine; consider a path relative to
# the notebook. `test` is also not defined in the visible cells.
test.to_csv(r'C:\Users\sergey\Documents\Recruit Restaurant Visitor_2\test.csv')