## Variable Selection

In [1]:
# Load libraries
import numpy as np 
import pandas as pd 
from subprocess import check_output
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import glob, re
from sklearn import *
from datetime import datetime
from xgboost import XGBRegressor

In [3]:
# Seed NumPy's global random number generator so code drawing from it repeats across runs.
np.random.seed(10)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

### 1. Data import and aggregation

In [4]:
# Data Aggregation
# Load the two training tables from CSV files in the current working directory.
# NOTE(review): both files carry an 'Unnamed: 0' column — presumably an index that was
# written out when they were saved; confirm before relying on it.
train_wo = pd.read_csv('train_without_orders.csv')
# This table additionally has the reservation columns reserve_visitors_air / air_date_diff.
train_io = pd.read_csv('train_in_orders.csv')

In [5]:
# Preview the first rows of the table loaded from train_without_orders.csv.
train_wo.head()

Unnamed: 0.1,Unnamed: 0,visit_date,visitors,air_genre_name,air_area_name,air_store_id,latitude,longitude,cluster,prefecture,...,avg_wind_speed,avg_vapor_pressure,avg_humidity,avg_sea_pressure,avg_local_pressure,solar_radiation,cloud_cover,high_temperature,low_temperature,lon_plus_lat
0,0,2016-01-13,25,Dining bar,Tōkyō-to Minato-ku Shibakōen,air_ba937bf13d40fb24,35.658068,139.751599,1,Tōkyō-to,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.409667
1,1,2016-01-13,21,Izakaya,Tōkyō-to Shinagawa-ku Higashigotanda,air_25e9888d30b386df,35.626568,139.725858,1,Tōkyō-to,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.352426
2,2,2016-01-13,40,Izakaya,Tōkyō-to Minato-ku Shibakōen,air_fd6aac1043520e83,35.658068,139.751599,1,Tōkyō-to,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.409667
3,3,2016-01-13,5,Dining bar,Tōkyō-to Minato-ku Shibakōen,air_64d4491ad8cdb1c6,35.658068,139.751599,1,Tōkyō-to,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.409667
4,4,2016-01-13,16,Other,Tōkyō-to Shibuya-ku Shibuya,air_5c65468938c07fa5,35.661777,139.704051,1,Tōkyō-to,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.365828


In [6]:
# Preview the first rows of the table loaded from train_in_orders.csv.
train_io.head()

Unnamed: 0.1,Unnamed: 0,visit_date,visitors,reserve_visitors_air,air_date_diff,air_genre_name,air_area_name,air_store_id,latitude,longitude,...,avg_wind_speed,avg_vapor_pressure,avg_humidity,avg_sea_pressure,avg_local_pressure,solar_radiation,cloud_cover,high_temperature,low_temperature,lon_plus_lat
0,0,2016-01-13,25,,,Dining bar,Tōkyō-to Minato-ku Shibakōen,air_ba937bf13d40fb24,35.658068,139.751599,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.409667
1,1,2016-01-13,21,,,Izakaya,Tōkyō-to Shinagawa-ku Higashigotanda,air_25e9888d30b386df,35.626568,139.725858,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.352426
2,2,2016-01-13,40,,,Izakaya,Tōkyō-to Minato-ku Shibakōen,air_fd6aac1043520e83,35.658068,139.751599,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.409667
3,3,2016-01-13,5,,,Dining bar,Tōkyō-to Minato-ku Shibakōen,air_64d4491ad8cdb1c6,35.658068,139.751599,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.409667
4,4,2016-01-13,16,,,Other,Tōkyō-to Shibuya-ku Shibuya,air_5c65468938c07fa5,35.661777,139.704051,...,1.514286,4.9,60.0,1013.1,1010.1,10.86,2.5,9.171429,-2.028571,175.365828


In [7]:
# Column names, non-null counts and dtypes before any encoding is applied.
train_io.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247009 entries, 0 to 247008
Data columns (total 35 columns):
Unnamed: 0              247009 non-null int64
visit_date              247009 non-null object
visitors                247009 non-null int64
reserve_visitors_air    26729 non-null float64
air_date_diff           26729 non-null float64
air_genre_name          247009 non-null object
air_area_name           247009 non-null object
air_store_id            247009 non-null object
latitude                247009 non-null float64
longitude               247009 non-null float64
cluster                 247009 non-null int64
prefecture              247009 non-null object
month                   247009 non-null int64
date                    247009 non-null int64
dw                      247009 non-null int64
dy                      247009 non-null int64
day_of_week             247009 non-null object
holiday_flg             247009 non-null int64
sunday                  247009 non-null int64
sat

### 1.1. Convert the date format

#### One-Hot Encoding function

In [8]:
def our_hat_code(myData, colQuery, verbose=True):
    """One-hot encode a column of ``myData`` in place.

    For every distinct value found in ``myData[colQuery]`` a new column named
    after that value is added to ``myData``. The new column holds 1 on rows
    where ``colQuery`` equals the value and 0 everywhere else. The source
    column itself is left untouched.

    Parameters
    ----------
    myData : pandas.DataFrame
        Frame to modify in place.
    colQuery : str
        Name of the column whose distinct values become indicator columns.
    verbose : bool, default True
        When true, print the list of distinct values and how many there are
        (the behaviour of the original implementation).

    Returns
    -------
    None
        The function works purely through its side effect on ``myData``.

    Notes
    -----
    An existing column whose name matches one of the distinct values is
    overwritten.
    """
    # Capture the source column once so that adding indicator columns cannot
    # interfere with the comparisons that follow.
    source = myData[colQuery]
    levels = list(source.unique())
    if verbose:
        print(levels)
        print(len(levels))
    for level in levels:
        # A single vectorised comparison yields a real integer column; the
        # previous None-then-.loc approach left the indicators as object dtype.
        myData[level] = (source == level).astype('int64')

In [9]:
# One-hot encode air_genre_name: adds one 0/1 indicator column per genre to train_io.
our_hat_code(train_io,'air_genre_name')

['Dining bar', 'Izakaya', 'Other', 'Italian/French', 'Cafe/Sweets', 'Japanese food', 'Bar/Cocktail', 'Creative cuisine', 'Western food', 'Yakiniku/Korean food', 'Asian', 'International cuisine', 'Okonomiyaki/Monja/Teppanyaki', 'Karaoke/Party']
14


In [10]:
# One-hot encode day_of_week: adds one 0/1 indicator column per weekday name to train_io.
our_hat_code(train_io,'day_of_week')

['Wednesday', 'Thursday', 'Friday', 'Saturday', 'Monday', 'Tuesday', 'Sunday']
7


In [11]:
# One-hot encode prefecture.
# The column is cast to the pandas 'category' dtype first, then one indicator
# column per prefecture is added to train_io.
train_io['prefecture'] = train_io['prefecture'].astype('category')
our_hat_code(train_io,'prefecture')

['T\xc5\x8dky\xc5\x8d-to', '\xc5\x8csaka-fu', 'Hy\xc5\x8dgo-ken', 'Hokkaid\xc5\x8d', 'Shizuoka-ken', 'Fukuoka-ken', 'Hiroshima-ken', 'Niigata-ken', 'Miyagi-ken']
9


In [12]:
# One Hat Encoding cluster
#train_io['cluster'] = train_io['cluster'].astype('category')
#our_hat_code(train_io,'cluster')

In [13]:
# Re-inspect the schema after the one-hot encoding calls added their indicator columns.
train_io.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247009 entries, 0 to 247008
Data columns (total 65 columns):
Unnamed: 0                      247009 non-null int64
visit_date                      247009 non-null object
visitors                        247009 non-null int64
reserve_visitors_air            26729 non-null float64
air_date_diff                   26729 non-null float64
air_genre_name                  247009 non-null object
air_area_name                   247009 non-null object
air_store_id                    247009 non-null object
latitude                        247009 non-null float64
longitude                       247009 non-null float64
cluster                         247009 non-null int64
prefecture                      247009 non-null category
month                           247009 non-null int64
date                            247009 non-null int64
dw                              247009 non-null int64
dy                              247009 non-null int64
day_of_week 

In [14]:
# Drop calendar_date and the raw text/categorical columns. air_genre_name, prefecture and
# day_of_week were one-hot encoded above; cluster and air_area_name are removed without encoding.
# NOTE: not idempotent — running this cell a second time raises because the columns are gone.
train_io = train_io.drop(['calendar_date' , 'prefecture', 'cluster', 'air_genre_name','air_area_name','day_of_week'], axis=1)

In [15]:
# Preview the frame after the raw categorical columns were dropped.
train_io.head()

Unnamed: 0.1,Unnamed: 0,visit_date,visitors,reserve_visitors_air,air_date_diff,air_store_id,latitude,longitude,month,date,...,Sunday,Tōkyō-to,Ōsaka-fu,Hyōgo-ken,Hokkaidō,Shizuoka-ken,Fukuoka-ken,Hiroshima-ken,Niigata-ken,Miyagi-ken
0,0,2016-01-13,25,,,air_ba937bf13d40fb24,35.658068,139.751599,1,13,...,0,1,0,0,0,0,0,0,0,0
1,1,2016-01-13,21,,,air_25e9888d30b386df,35.626568,139.725858,1,13,...,0,1,0,0,0,0,0,0,0,0
2,2,2016-01-13,40,,,air_fd6aac1043520e83,35.658068,139.751599,1,13,...,0,1,0,0,0,0,0,0,0,0
3,3,2016-01-13,5,,,air_64d4491ad8cdb1c6,35.658068,139.751599,1,13,...,0,1,0,0,0,0,0,0,0,0
4,4,2016-01-13,16,,,air_5c65468938c07fa5,35.661777,139.704051,1,13,...,0,1,0,0,0,0,0,0,0,0


In [16]:
# Copy the two reservation columns under new '_1' names; as newly created columns
# they are appended as the last columns of train_io.
train_io['reserve_visitors_air_1'] = train_io['reserve_visitors_air']
train_io['air_date_diff_1'] = train_io['air_date_diff']

In [17]:
# Remove the original reservation columns; only the '_1' copies at the end of the frame remain,
# which places them inside the positional feature slice used later.
train_io = train_io.drop(['air_date_diff','reserve_visitors_air'], axis=1)

In [18]:
# Check the schema after the reservation columns were moved to the end of the frame.
train_io.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247009 entries, 0 to 247008
Data columns (total 59 columns):
Unnamed: 0                      247009 non-null int64
visit_date                      247009 non-null object
visitors                        247009 non-null int64
air_store_id                    247009 non-null object
latitude                        247009 non-null float64
longitude                       247009 non-null float64
month                           247009 non-null int64
date                            247009 non-null int64
dw                              247009 non-null int64
dy                              247009 non-null int64
holiday_flg                     247009 non-null int64
sunday                          247009 non-null int64
saturday                        247009 non-null int64
sat/sun/hol                     247009 non-null float64
precipitation                   247009 non-null float64
avg_temperature                 247009 non-null float64
hours_sunligh

In [25]:
# Persist the encoded frame to CSV (to_csv also writes the row index as a column by default).
# NOTE(review): absolute, user-specific Windows path — it will not exist on other machines;
# consider a path relative to a configurable data directory.
# NOTE(review): this cell's execution count (25) is out of sequence with its neighbours and is
# reused further down — confirm the notebook still reproduces under Restart & Run All.
train_io.to_csv(r'C:\Users\sergey\Documents\Recruit Restaurant Visitor_2\train.csv')

Now all of the data has been converted to numeric format.

### Preparation and separation of data for variable selection

In [19]:
# Start the selection scoreboard: one row per candidate feature name.
# columns[4:59] skips the first four columns (Unnamed: 0, visit_date, visitors, air_store_id);
# the slice is positional, so it depends on the current column order of train_io.
varSel = pd.DataFrame({'Variable': train_io.columns[4:59]})
varSel.head(59)

Unnamed: 0,Variable
0,latitude
1,longitude
2,month
3,date
4,dw
5,dy
6,holiday_flg
7,sunday
8,saturday
9,sat/sun/hol


In [20]:
# Keep only rows that have no missing value in any column.
# NOTE: per the info() output the reservation columns are populated for 26729 of 247009 rows,
# so this step discards most of the data (22265 rows remain in the shapes printed below).
train_io = train_io.dropna()

In [21]:
# Feature matrix: the columns at positions 4..58, i.e. everything after the
# leading Unnamed: 0 / visit_date / visitors / air_store_id columns.
X = train_io.iloc[:, 4:59]
X.head()

Unnamed: 0,latitude,longitude,month,date,dw,dy,holiday_flg,sunday,saturday,sat/sun/hol,...,Ōsaka-fu,Hyōgo-ken,Hokkaidō,Shizuoka-ken,Fukuoka-ken,Hiroshima-ken,Niigata-ken,Miyagi-ken,reserve_visitors_air_1,air_date_diff_1
26,35.661777,139.704051,1,13,2,13,0,0,0,0.0,...,0,0,0,0,0,0,0,0,2.0,0.0
30,35.658068,139.751599,1,13,2,13,0,0,0,0.0,...,0,0,0,0,0,0,0,0,8.0,4.5
39,35.658068,139.751599,1,13,2,13,0,0,0,0.0,...,0,0,0,0,0,0,0,0,2.0,0.0
40,35.658068,139.751599,1,13,2,13,0,0,0,0.0,...,0,0,0,0,0,0,0,0,12.0,0.0
49,35.661777,139.704051,1,13,2,13,0,0,0,0.0,...,0,0,0,0,0,0,0,0,17.0,3.5


In [22]:
# Target vector: the visitor count. Selected by name rather than by position
# (it was columns[2]) so the cell keeps working if the column order changes.
y = train_io['visitors']
# Sanity check: X and y must have the same number of rows.
print([X.shape,y.shape])

[(22265, 55), (22265L,)]


### Variable Selection using LASSO

In [23]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# L1-penalised linear regression of y on X with penalty strength alpha=0.1.
# NOTE(review): X is passed in as-is, without scaling; the L1 penalty is sensitive to
# feature scale, so consider whether the features should be scaled first.
lassomod = Lasso(alpha=0.1)
lassomod = lassomod.fit(X, y)

In [24]:
# Wrap the already-fitted Lasso (prefit=True, so it is not refit) and show the
# boolean mask of the features the selector keeps.
model = SelectFromModel(lassomod, prefit=True)
model.get_support()

array([ True,  True, False,  True,  True,  True,  True, False,  True,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True, False,  True,
        True,  True, False, False, False,  True, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
        True])

In [25]:
# Store the Lasso decision as 0/1 in the scoreboard. This relies on the mask following the
# same column order as varSel's rows (both were built from train_io.columns[4:59]).
varSel['Lasso'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso
0,latitude,1
1,longitude,1
2,month,0
3,date,1
4,dw,1
5,dy,1
6,holiday_flg,1
7,sunday,0
8,saturday,1
9,sat/sun/hol,0


### Variable Selection using Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a random forest with default hyper-parameters on the same X / y.
# NOTE(review): y is the integer `visitors` count, so a classifier treats every distinct
# count as its own class; a regressor may be what was intended — confirm.
rfmod = RandomForestClassifier().fit(X, y)

In [28]:
# Rebind `model` to a selector around the fitted forest (prefit=True, so it is not refit)
# and show the boolean mask of the features it keeps.
model = SelectFromModel(rfmod, prefit=True)
model.get_support()

array([ True,  True,  True,  True, False,  True, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True])

In [29]:
# Store the random-forest decision as 0/1; `model` must still be the forest-based selector here.
varSel['RandomForest'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest
0,latitude,1,1
1,longitude,1,1
2,month,0,1
3,date,1,1
4,dw,1,0
5,dy,1,1
6,holiday_flg,1,0
7,sunday,0,0
8,saturday,1,0
9,sat/sun/hol,0,0


### Variable Selection using Gradient Boosting classification

In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

In [31]:
# Fit a gradient-boosting classifier with default hyper-parameters on the same X / y.
# NOTE(review): y is the integer `visitors` count, so every distinct count becomes its own
# class; a regressor may be a better fit for this target — confirm.
gbmod = GradientBoostingClassifier().fit(X, y)

In [32]:
# Rebind `model` to a selector around the fitted gradient-boosting model (prefit=True, so it
# is not refit) and show the boolean mask of the features it keeps.
model = SelectFromModel(gbmod, prefit=True)
model.get_support()

array([ True,  True, False,  True, False,  True, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True, False, False, False, False,
       False,  True, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True])

In [33]:
# Store the gradient-boosting decision as 0/1; `model` must still be the boosting-based selector here.
varSel['GradientBoost'] = model.get_support().astype('int64')
varSel

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost
0,latitude,1,1,1
1,longitude,1,1,1
2,month,0,1,0
3,date,1,1,1
4,dw,1,0,0
5,dy,1,1,1
6,holiday_flg,1,0,0
7,sunday,0,0,0
8,saturday,1,0,0
9,sat/sun/hol,0,0,0


### Summarization and Selection of Variables 

In [34]:
# Count how many of the three methods selected each variable.
# Only the three indicator columns are summed: summing the whole frame would also touch the
# string 'Variable' column, and would fold a previous 'Sum' back in if the cell were re-run.
varSel['Sum'] = varSel[['Lasso', 'RandomForest', 'GradientBoost']].sum(axis=1)
varSel

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Sum
0,latitude,1,1,1,3
1,longitude,1,1,1,3
2,month,0,1,0,1
3,date,1,1,1,3
4,dw,1,0,0,1
5,dy,1,1,1,3
6,holiday_flg,1,0,0,1
7,sunday,0,0,0,0
8,saturday,1,0,0,1
9,sat/sun/hol,0,0,0,0


In [35]:
# Number of variables at each vote count (selected by 0, 1, 2 or 3 methods).
varSel.groupby('Sum')['Variable'].count()

Sum
0    21
1    15
2     4
3    15
Name: Variable, dtype: int64

In [36]:
# Variables that none of the three methods selected.
result = varSel.loc[varSel['Sum'].eq(0)]

In [37]:
# Show the variables with a vote count of 0.
result

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Sum
7,sunday,0,0,0,0
9,sat/sun/hol,0,0,0,0
25,Other,0,0,0,0
29,Bar/Cocktail,0,0,0,0
30,Creative cuisine,0,0,0,0
31,Western food,0,0,0,0
33,Asian,0,0,0,0
34,International cuisine,0,0,0,0
36,Karaoke/Party,0,0,0,0
37,Wednesday,0,0,0,0


In [40]:
# Variables selected by exactly one of the three methods.
result1 = varSel.loc[varSel['Sum'].eq(1)]

In [42]:
# Show the variables with a vote count of 1.
result1

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Sum
2,month,0,1,0,1
4,dw,1,0,0,1
6,holiday_flg,1,0,0,1
8,saturday,1,0,0,1
19,cloud_cover,0,1,0,1
23,Dining bar,1,0,0,1
24,Izakaya,1,0,0,1
26,Italian/French,1,0,0,1
27,Cafe/Sweets,1,0,0,1
32,Yakiniku/Korean food,1,0,0,1


In [43]:
# Variables selected by exactly two of the three methods.
result2 = varSel.loc[varSel['Sum'].eq(2)]

In [44]:
# Show the variables with a vote count of 2.
result2

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Sum
11,avg_temperature,0,1,1,2
16,avg_sea_pressure,0,1,1,2
22,lon_plus_lat,0,1,1,2
28,Japanese food,1,0,1,2


In [45]:
# Variables selected by all three methods.
result3 = varSel.loc[varSel['Sum'].eq(3)]

In [46]:
# Show the variables with a vote count of 3.
result3

Unnamed: 0,Variable,Lasso,RandomForest,GradientBoost,Sum
0,latitude,1,1,1,3
1,longitude,1,1,1,3
3,date,1,1,1,3
5,dy,1,1,1,3
10,precipitation,1,1,1,3
12,hours_sunlight,1,1,1,3
13,avg_wind_speed,1,1,1,3
14,avg_vapor_pressure,1,1,1,3
15,avg_humidity,1,1,1,3
17,avg_local_pressure,1,1,1,3
