# Recruit Restaurant Visitor Forecasting Version II

Following:
https://www.kaggle.com/tunguz/surprise-me-2

The difficulty in my baseline model: the reservation tables are missing many restaurants. How can they still be used?

In [1]:
import glob, re
import numpy as np
import pandas as pd
from sklearn import *
from datetime import datetime
from xgboost import XGBRegressor

## Load Data

In [3]:
# Read every input CSV from data/ in one pass; the names on the left line up
# one-to-one, in order, with the paths on the right.
(
    df_ar,
    df_hr,
    df_astore,
    df_hstore,
    df_storeid,
    df_av,
    date_info,
    df_submission,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/air_reserve.csv',
        'data/hpg_reserve.csv',
        'data/air_store_info.csv',
        'data/hpg_store_info.csv',
        'data/store_id_relation.csv',
        'data/air_visit_data.csv',
        'data/date_info.csv',
        'data/sample_submission.csv',
    )
)

For a description of the data, see:

https://github.com/dongzhang84/Kaggle/blob/master/Restaurant_Visitor_Forecasting/Visitor_forecasting_v1.ipynb

In [4]:
# Preview the raw HPG reservations (still keyed by hpg_store_id at this point).
df_hr.head()

Unnamed: 0,hpg_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,hpg_c63f6f42e088e50f,2016-01-01 11:00:00,2016-01-01 09:00:00,1
1,hpg_dac72789163a3f47,2016-01-01 13:00:00,2016-01-01 06:00:00,3
2,hpg_c8e24dcf51ca1eb5,2016-01-01 16:00:00,2016-01-01 14:00:00,2
3,hpg_24bb207e5fd49d4a,2016-01-01 17:00:00,2016-01-01 11:00:00,5
4,hpg_25291c542ebb3bc2,2016-01-01 17:00:00,2016-01-01 03:00:00,13


In [6]:
# Inner join against the id-relation table: keeps only HPG reservations whose
# store has a known AIR counterpart and attaches that air_store_id to each row.
df_hr = df_hr.merge(df_storeid, how='inner', on='hpg_store_id')
df_hr.head()

Unnamed: 0,hpg_store_id,visit_datetime,reserve_datetime,reserve_visitors,air_store_id
0,hpg_878cc70b1abc76f7,2016-01-01 19:00:00,2016-01-01 15:00:00,4,air_db80363d35f10926
1,hpg_878cc70b1abc76f7,2016-01-02 19:00:00,2016-01-02 14:00:00,2,air_db80363d35f10926
2,hpg_878cc70b1abc76f7,2016-01-03 18:00:00,2016-01-02 20:00:00,6,air_db80363d35f10926
3,hpg_878cc70b1abc76f7,2016-01-06 20:00:00,2016-01-04 22:00:00,3,air_db80363d35f10926
4,hpg_878cc70b1abc76f7,2016-01-11 18:00:00,2016-01-11 14:00:00,2,air_db80363d35f10926


In [7]:
# Parse the visit timestamp and keep only its calendar date (the hour is dropped).
df_ar['visit_datetime'] = pd.to_datetime(df_ar['visit_datetime']).dt.date

In [11]:
# Same treatment for the booking timestamp: parse, then keep the date part only.
df_ar['reserve_datetime'] = pd.to_datetime(df_ar['reserve_datetime']).dt.date

In [13]:
# Lead time of each reservation: whole days between the booking date and the
# visit date.
# Computed on whole columns rather than with a row-by-row `apply(axis=1)`,
# which is far slower and does not reliably return a Series on an empty frame.
# `pd.to_datetime` converts the `datetime.date` objects held in both columns
# back to datetime64 so the subtraction is vectorised.
df_ar['reserve_datetime_diff'] = (
    pd.to_datetime(df_ar['visit_datetime'])
    - pd.to_datetime(df_ar['reserve_datetime'])
).dt.days

In [16]:
# Per store and visit date: summed lead-time days (rs1) and summed reserved
# visitors (rv1). The date key is renamed to visit_date to match df_av.
tmp1 = (
    df_ar
    .groupby(['air_store_id', 'visit_datetime'], as_index=False)[
        ['reserve_datetime_diff', 'reserve_visitors']
    ]
    .sum()
    .rename(columns={
        'visit_datetime': 'visit_date',
        'reserve_datetime_diff': 'rs1',
        'reserve_visitors': 'rv1',
    })
)

In [18]:
# Per store and visit date: mean lead-time days (rs2) and mean reserved
# visitors per booking (rv2).
tmp2 = (
    df_ar
    .groupby(['air_store_id', 'visit_datetime'], as_index=False)[
        ['reserve_datetime_diff', 'reserve_visitors']
    ]
    .mean()
    .rename(columns={
        'visit_datetime': 'visit_date',
        'reserve_datetime_diff': 'rs2',
        'reserve_visitors': 'rv2',
    })
)

In [21]:
# Per-booking rows with the new lead-time column (several rows can share a store and date).
df_ar.head()

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors,reserve_datetime_diff
0,air_877f79706adbfb06,2016-01-01,2016-01-01,1,0
1,air_db4b38ebe7a7ceff,2016-01-01,2016-01-01,3,0
2,air_db4b38ebe7a7ceff,2016-01-01,2016-01-01,6,0
3,air_877f79706adbfb06,2016-01-01,2016-01-01,2,0
4,air_db80363d35f10926,2016-01-01,2016-01-01,5,0


In [23]:
# Replace the per-booking df_ar with one row per store and visit date carrying
# both the sums (rs1, rv1) and the means (rs2, rv2).
# NOTE: after this cell df_ar no longer has the raw reservation columns, so the
# aggregation cells above cannot be re-run without reloading the CSV.
df_ar = tmp1.merge(tmp2, how='inner', on=['air_store_id', 'visit_date'])
df_ar.head()

Unnamed: 0,air_store_id,visit_date,rs1,rv1,rs2,rv2
0,air_00a91d42b08b08d9,2016-10-31,0,2,0.0,2.0
1,air_00a91d42b08b08d9,2016-12-05,4,9,4.0,9.0
2,air_00a91d42b08b08d9,2016-12-14,6,18,6.0,18.0
3,air_00a91d42b08b08d9,2016-12-17,6,2,6.0,2.0
4,air_00a91d42b08b08d9,2016-12-20,2,4,2.0,4.0


In [27]:
# Number of distinct stores that have any AIR reservation data.
# dropna=False keeps the count identical to len(<column>.unique()).
df_ar['air_store_id'].nunique(dropna=False)

314

In [28]:
# Reduce both HPG timestamps to calendar dates, visit first and then booking.
df_hr['visit_datetime'] = pd.to_datetime(df_hr['visit_datetime']).dt.date

df_hr['reserve_datetime'] = pd.to_datetime(df_hr['reserve_datetime']).dt.date

In [29]:
# Lead time of each HPG reservation: whole days between the booking date and
# the visit date.
# Computed on whole columns rather than with a row-by-row `apply(axis=1)`,
# which is far slower and does not reliably return a Series on an empty frame.
# `pd.to_datetime` converts the `datetime.date` objects held in both columns
# back to datetime64 so the subtraction is vectorised.
df_hr['reserve_datetime_diff'] = (
    pd.to_datetime(df_hr['visit_datetime'])
    - pd.to_datetime(df_hr['reserve_datetime'])
).dt.days

In [30]:
# Same aggregation as for the AIR reservations, applied to the HPG rows that
# were mapped to an air_store_id: one row per store and visit date.
hr_by_store_day = df_hr.groupby(['air_store_id', 'visit_datetime'], as_index=False)[
    ['reserve_datetime_diff', 'reserve_visitors']
]

# Sums: rs1 = total lead-time days, rv1 = total reserved visitors.
tmp1 = hr_by_store_day.sum().rename(columns={
    'visit_datetime': 'visit_date',
    'reserve_datetime_diff': 'rs1',
    'reserve_visitors': 'rv1',
})

# Means: rs2 = mean lead-time days, rv2 = mean reserved visitors per booking.
tmp2 = hr_by_store_day.mean().rename(columns={
    'visit_datetime': 'visit_date',
    'reserve_datetime_diff': 'rs2',
    'reserve_visitors': 'rv2',
})

# Overwrites the per-booking df_hr with the aggregated frame.
df_hr = tmp1.merge(tmp2, how='inner', on=['air_store_id', 'visit_date'])
df_hr.head()

Unnamed: 0,air_store_id,visit_date,rs1,rv1,rs2,rv2
0,air_00a91d42b08b08d9,2016-01-14,3,2,3.0,2.0
1,air_00a91d42b08b08d9,2016-01-15,6,4,6.0,4.0
2,air_00a91d42b08b08d9,2016-01-16,3,2,3.0,2.0
3,air_00a91d42b08b08d9,2016-01-22,3,2,3.0,2.0
4,air_00a91d42b08b08d9,2016-01-29,6,5,6.0,5.0


In [31]:
# Number of distinct AIR stores covered by the mapped HPG reservations.
# dropna=False keeps the count identical to len(<column>.unique()).
df_hr['air_store_id'].nunique(dropna=False)

150

In [32]:
# Calendar features for the visit history (df_av).
# Parse the date once, derive the parts from it, then store visit_date as a
# plain date so it matches the other frames' merge key.
visit_ts = pd.to_datetime(df_av['visit_date'])

df_av['dow'] = visit_ts.dt.dayofweek   # Monday=0 ... Sunday=6
df_av['year'] = visit_ts.dt.year
df_av['month'] = visit_ts.dt.month
df_av['visit_date'] = visit_ts.dt.date

In [36]:
# Visit history with the derived dow / year / month columns.
df_av.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1


In [34]:
# Calendar features for the submission template.
# Each id looks like '<air>_<hash>_<YYYY-MM-DD>': the third '_' field is the
# visit date and the first two fields re-joined form the air_store_id.
submission_ids = df_submission['id']
df_submission['visit_date'] = submission_ids.map(lambda sub_id: str(sub_id).split('_')[2])
df_submission['air_store_id'] = submission_ids.map(lambda sub_id: '_'.join(sub_id.split('_')[:2]))

# Same date parts as for df_av, then keep visit_date as a plain date.
visit_ts = pd.to_datetime(df_submission['visit_date'])
df_submission['dow'] = visit_ts.dt.dayofweek
df_submission['year'] = visit_ts.dt.year
df_submission['month'] = visit_ts.dt.month
df_submission['visit_date'] = visit_ts.dt.date

In [35]:
# Submission rows with the parsed store id, visit date and calendar parts.
df_submission.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4


In [37]:
# Distinct store ids that appear in the submission template (order of first appearance).
unique_stores = df_submission['air_store_id'].unique()

In [39]:
# Build the store x day-of-week grid: one row for every (air_store_id, dow)
# pair, dow 0-6. The per-store weekday statistics and store attributes are
# left-merged onto this grid in the cells below.
per_dow_frames = [
    pd.DataFrame({'air_store_id': unique_stores, 'dow': [i] * len(unique_stores)})
    for i in range(7)
]
# ignore_index=True already yields a fresh 0..n-1 index, so no extra
# reset_index is needed.
stores = pd.concat(per_dow_frames, axis=0, ignore_index=True)

In [41]:
# First rows of the store x dow grid (all dow == 0, since frames are stacked dow by dow).
stores.head(15)

Unnamed: 0,air_store_id,dow
0,air_00a91d42b08b08d9,0
1,air_0164b9927d20bcc3,0
2,air_0241aa3964b7f861,0
3,air_0328696196e46f18,0
4,air_034a3d5b40d5b1b1,0
5,air_036d4f1ee7285390,0
6,air_0382c794b73b51ad,0
7,air_03963426c9312048,0
8,air_04341b588bde96cd,0
9,air_049f6d5b402a31b2,0


In [44]:
# Sanity check: one store should appear exactly once per dow value 0-6.
stores[stores.air_store_id == 'air_00a91d42b08b08d9']

Unnamed: 0,air_store_id,dow
0,air_00a91d42b08b08d9,0
821,air_00a91d42b08b08d9,1
1642,air_00a91d42b08b08d9,2
2463,air_00a91d42b08b08d9,3
3284,air_00a91d42b08b08d9,4
4105,air_00a91d42b08b08d9,5
4926,air_00a91d42b08b08d9,6


In [45]:
# Same sanity check for a second store.
stores[stores.air_store_id == 'air_0164b9927d20bcc3']

Unnamed: 0,air_store_id,dow
1,air_0164b9927d20bcc3,0
822,air_0164b9927d20bcc3,1
1643,air_0164b9927d20bcc3,2
2464,air_0164b9927d20bcc3,3
3285,air_0164b9927d20bcc3,4
4106,air_0164b9927d20bcc3,5
4927,air_0164b9927d20bcc3,6


In [46]:
# Minimum observed visitors per store and day of week, left-merged onto the
# grid so pairs without history stay in `stores` (as NaN).
# (The min/mean/median/max/count cells could be collapsed into one aggregation.)
tmp = (
    df_av
    .groupby(['air_store_id', 'dow'], as_index=False)['visitors']
    .min()
    .rename(columns={'visitors': 'min_visitors'})
)
stores = stores.merge(tmp, how='left', on=['air_store_id', 'dow'])

In [52]:
# Grid with min_visitors attached (float because the left merge can introduce NaN).
stores.head()

Unnamed: 0,air_store_id,dow,min_visitors
0,air_00a91d42b08b08d9,0,1.0
1,air_0164b9927d20bcc3,0,2.0
2,air_0241aa3964b7f861,0,2.0
3,air_0328696196e46f18,0,2.0
4,air_034a3d5b40d5b1b1,0,1.0


In [50]:
# Inspect one store's per-dow minimum in the aggregate; the output lists dow 0-5 only.
tmp[tmp.air_store_id == 'air_f927b2da69a82341']

Unnamed: 0,air_store_id,dow,min_visitors
5588,air_f927b2da69a82341,0,4
5589,air_f927b2da69a82341,1,1
5590,air_f927b2da69a82341,2,24
5591,air_f927b2da69a82341,3,3
5592,air_f927b2da69a82341,4,4
5593,air_f927b2da69a82341,5,4


In [51]:
# check why min_visitors has nan value
# The grid has a row for every dow, but the left merge finds no aggregate row
# where df_av has no visits for that store/dow pair (dow 6 for this store).

stores[stores.air_store_id == 'air_f927b2da69a82341']

Unnamed: 0,air_store_id,dow,min_visitors
799,air_f927b2da69a82341,0,4.0
1620,air_f927b2da69a82341,1,1.0
2441,air_f927b2da69a82341,2,24.0
3262,air_f927b2da69a82341,3,3.0
4083,air_f927b2da69a82341,4,4.0
4904,air_f927b2da69a82341,5,4.0
5725,air_f927b2da69a82341,6,


In [53]:
# Mean visitors per store and day of week, attached to the grid.
tmp = (
    df_av
    .groupby(['air_store_id', 'dow'], as_index=False)['visitors']
    .mean()
    .rename(columns={'visitors': 'mean_visitors'})
)
stores = stores.merge(tmp, how='left', on=['air_store_id', 'dow'])
stores.head(15)

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors
0,air_00a91d42b08b08d9,0,1.0,22.457143
1,air_0164b9927d20bcc3,0,2.0,7.5
2,air_0241aa3964b7f861,0,2.0,8.920635
3,air_0328696196e46f18,0,2.0,6.416667
4,air_034a3d5b40d5b1b1,0,1.0,11.864865
5,air_036d4f1ee7285390,0,4.0,19.6
6,air_0382c794b73b51ad,0,1.0,20.795455
7,air_03963426c9312048,0,2.0,26.030303
8,air_04341b588bde96cd,0,5.0,35.41791
9,air_049f6d5b402a31b2,0,2.0,9.027778


In [54]:
# Median visitors per store and day of week, attached to the grid.
tmp = (
    df_av
    .groupby(['air_store_id', 'dow'], as_index=False)['visitors']
    .median()
    .rename(columns={'visitors': 'median_visitors'})
)
stores = stores.merge(tmp, how='left', on=['air_store_id', 'dow'])
stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0
3,air_0328696196e46f18,0,2.0,6.416667,4.0
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0


In [55]:
# Maximum visitors per store and day of week.
tmp = (
    df_av
    .groupby(['air_store_id', 'dow'], as_index=False)['visitors']
    .max()
    .rename(columns={'visitors': 'max_visitors'})
)
stores = stores.merge(tmp, how='left', on=['air_store_id', 'dow'])

# Number of recorded visit days behind each store/dow statistic.
tmp = (
    df_av
    .groupby(['air_store_id', 'dow'], as_index=False)['visitors']
    .count()
    .rename(columns={'visitors': 'count_observations'})
)
stores = stores.merge(tmp, how='left', on=['air_store_id', 'dow'])

stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0


In [56]:
# Attach the static store attributes (genre, area, latitude, longitude) to
# every grid row of that store.
stores = stores.merge(df_astore, how='left', on=['air_store_id'])
stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,Dining bar,Ōsaka-fu Ōsaka-shi Nakanochō,34.701279,135.52809
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229


In [57]:
# NEW FEATURES FROM Georgii Vyshnia
# Turn the separators into spaces so each name becomes a list of
# space-delimited tokens: '/' in genre names, '-' in area names.
# str() also turns a missing value into the literal text 'nan'.

stores['air_genre_name'] = stores['air_genre_name'].map(lambda genre: str(genre).replace('/', ' '))
stores['air_area_name'] = stores['air_area_name'].map(lambda area: str(area).replace('-', ' '))

In [58]:
# Genre and area names now use spaces where '/' and '-' used to be.
stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,139.753595
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,Italian French,Tōkyō to Minato ku Shibakōen,35.658068,139.751599
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,Izakaya,Tōkyō to Taitō ku Higashiueno,35.712607,139.779996
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,Dining bar,Ōsaka fu Ōsaka shi Nakanochō,34.701279,135.52809
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,Cafe Sweets,Ōsaka fu Ōsaka shi Ōhiraki,34.692337,135.472229


In [60]:
# encoding
# A single LabelEncoder instance; the cells below call fit_transform() on it,
# which re-fits it each time. `preprocessing` is presumably in scope through
# the `from sklearn import *` at the top of the notebook.

lbl = preprocessing.LabelEncoder()

In [61]:
def _nth_token(name, position):
    """Return the `position`-th space-separated token of `name`, or '' if it has fewer tokens."""
    tokens = str(name).split(' ')
    return str(tokens[position]) if len(tokens) > position else ''


# For token positions 0-9, label-encode the i-th word of the genre name and of
# the area name into <column><i>. Names shorter than i+1 words contribute ''.
for i in range(10):
    for name_col in ('air_genre_name', 'air_area_name'):
        ith_tokens = stores[name_col].map(lambda name: _nth_token(name, i))
        stores[name_col + str(i)] = lbl.fit_transform(ith_tokens)

In [62]:
# Grid with the per-token label columns appended (higher token positions are 0 in these rows).
stores.head()

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,...,air_genre_name5,air_area_name5,air_genre_name6,air_area_name6,air_genre_name7,air_area_name7,air_genre_name8,air_area_name8,air_genre_name9,air_area_name9
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,Italian French,Tōkyō to Chiyoda ku Kudanminami,35.694003,...,0,0,0,0,0,0,0,0,0,0
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,Italian French,Tōkyō to Minato ku Shibakōen,35.658068,...,0,0,0,0,0,0,0,0,0,0
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,Izakaya,Tōkyō to Taitō ku Higashiueno,35.712607,...,0,0,0,0,0,0,0,0,0,0
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,Dining bar,Ōsaka fu Ōsaka shi Nakanochō,34.701279,...,0,0,0,0,0,0,0,0,0,0
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,Cafe Sweets,Ōsaka fu Ōsaka shi Ōhiraki,34.692337,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# Scratch check: how the first three genre names split on spaces (shown as the text form of each token list).
stores['air_genre_name'][:3].map(lambda x: str(str(x).split(' ')) )

0    ['Italian', 'French']
1    ['Italian', 'French']
2              ['Izakaya']
Name: air_genre_name, dtype: object

In [84]:
# Scratch check: label-encoding those stringified token lists gives one code per distinct list.
lbl.fit_transform(stores['air_genre_name'][:3].map(lambda x: str(str(x).split(' ')) ))

array([0, 0, 1])

In [88]:
# Scratch check: encoding only the first token of each name (the i = 0 case of the encoding loop).
lbl.fit_transform(stores['air_genre_name'][:3].map(lambda x: str(str(x).split(' ')[0])))

array([0, 0, 1])

In [90]:
# encoding name, i-th name in i-th column
# i.e. the i-th space-separated word of each name is label-encoded into the
# column suffixed with i.

stores['air_genre_name'].head(10)

0    Italian French
1    Italian French
2           Izakaya
3        Dining bar
4       Cafe Sweets
5       Cafe Sweets
6       Cafe Sweets
7           Izakaya
8           Izakaya
9     Japanese food
Name: air_genre_name, dtype: object

In [91]:
# NOTE(review): this loop repeats the earlier token-encoding cell verbatim.
# The cells listed between the two only read `stores`, so this should rewrite
# the same <column><i> values; consider deleting one of the copies.
# For i = 0..9: label-encode the i-th space-separated word of the genre and
# area names ('' when a name has fewer than i+1 words).
for i in range(10):
    stores['air_genre_name'+str(i)] = lbl.fit_transform(stores['air_genre_name']\
                                     .map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
    stores['air_area_name'+str(i)] = lbl.fit_transform(stores['air_area_name']\
                                     .map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))

In [92]:
# Finally replace the full genre / area strings with integer labels. This must
# run after the per-token columns are built, since it overwrites the text.
for full_name_col in ('air_genre_name', 'air_area_name'):
    stores[full_name_col] = lbl.fit_transform(stores[full_name_col])

In [94]:
# Grid after encoding: air_genre_name and air_area_name are now integer labels.
stores.head(10)

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,...,air_genre_name5,air_area_name5,air_genre_name6,air_area_name6,air_genre_name7,air_area_name7,air_genre_name8,air_area_name8,air_genre_name9,air_area_name9
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,6,44,35.694003,...,0,0,0,0,0,0,0,0,0,0
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,6,62,35.658068,...,0,0,0,0,0,0,0,0,0,0
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,7,82,35.712607,...,0,0,0,0,0,0,0,0,0,0
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,4,98,34.701279,...,0,0,0,0,0,0,0,0,0,0
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,2,102,34.692337,...,0,0,0,0,0,0,0,0,0,0
5,air_036d4f1ee7285390,0,4.0,19.6,19.0,38.0,40.0,2,31,34.799767,...,0,0,0,0,0,0,0,0,0,0
6,air_0382c794b73b51ad,0,1.0,20.795455,21.0,47.0,44.0,2,68,35.602125,...,0,0,0,0,0,0,0,0,0,0
7,air_03963426c9312048,0,2.0,26.030303,26.0,70.0,66.0,7,15,34.386245,...,0,0,0,0,0,0,0,0,0,0
8,air_04341b588bde96cd,0,5.0,35.41791,33.0,76.0,67.0,7,66,35.735623,...,0,0,0,0,0,0,0,0,0,0
9,air_049f6d5b402a31b2,0,2.0,9.027778,9.0,20.0,36.0,8,0,33.589216,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Align the calendar table with the other frames: same key name (visit_date)
# and same key type (plain date objects).
date_info = date_info.rename(columns={'calendar_date': 'visit_date'})
date_info['visit_date'] = pd.to_datetime(date_info['visit_date']).dt.date

# LabelEncoder numbers the weekday names in sorted label order, so these codes
# do not follow calendar order and differ from the Monday=0 `dow` column
# (in the output below, 2016-01-01 gets code 0).
date_info['day_of_week'] = lbl.fit_transform(date_info['day_of_week'])

In [102]:
# Calendar table with the renamed key, encoded weekday and holiday flag.
date_info.head()

Unnamed: 0,visit_date,day_of_week,holiday_flg
0,2016-01-01,0,1
1,2016-01-02,2,1
2,2016-01-03,3,1
3,2016-01-04,1,0
4,2016-01-05,5,0


## Training and Testing

In [103]:
# Start the model frames: visit history -> train, submission template -> test,
# each enriched with the calendar columns for its visit_date.
train = df_av.merge(date_info, how='left', on=['visit_date'])
test = df_submission.merge(date_info, how='left', on=['visit_date'])

In [106]:
# Test frame after the calendar merge (adds day_of_week and holiday_flg).
test.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,day_of_week,holiday_flg
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,3,0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,1,0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,5,0
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,6,0
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,4,0


In [107]:
# Train frame after the calendar merge (adds day_of_week and holiday_flg).
train.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,day_of_week,holiday_flg
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1,6,0
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1,4,0
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1,0,0
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1,2,0
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1,1,0


In [108]:
# Attach the per-store, per-weekday statistics and the encoded store
# attributes to both model frames.
# NOTE(review): train.head() shows the encoded columns as floats after this
# merge, which suggests some training rows find no match in `stores` (built
# from the submission's stores only) and hold NaN - confirm before modelling.
train, test = (
    frame.merge(stores, how='left', on=['air_store_id', 'dow'])
    for frame in (train, test)
)

In [110]:
# Test frame with the store/weekday features attached.
test.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,day_of_week,holiday_flg,min_visitors,...,air_genre_name5,air_area_name5,air_genre_name6,air_area_name6,air_genre_name7,air_area_name7,air_genre_name8,air_area_name8,air_genre_name9,air_area_name9
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,3,0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,1,0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,5,0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,6,0,15.0,...,0,0,0,0,0,0,0,0,0,0
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,4,0,15.0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
# Train frame with the store/weekday features attached.
train.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,day_of_week,holiday_flg,min_visitors,mean_visitors,...,air_genre_name5,air_area_name5,air_genre_name6,air_area_name6,air_genre_name7,air_area_name7,air_genre_name8,air_area_name8,air_genre_name9,air_area_name9
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1,6,0,7.0,23.84375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1,4,0,2.0,20.292308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1,0,0,4.0,34.738462,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1,2,0,6.0,27.651515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1,1,0,2.0,13.754386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
