In [1]:
import pandas as pd

import numpy as np

In [2]:
import datetime

In [3]:
from sklearn import preprocessing

from sklearn.model_selection import train_test_split

In [124]:
from keras.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, LSTM, TimeDistributed, Input
from keras.optimizers import Adam

In [5]:
# function for root mean squared logarithmic error
def rmsle(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)**.5

In [6]:
# DATA IMPORT

In [7]:
# air reservation system
air_reserve = pd.read_csv('air_reserve.csv')
air_store_info = pd.read_csv('air_store_info.csv')
air_visit_data = pd.read_csv('air_visit_data.csv')

In [8]:
air_store_info.head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599


In [9]:
# hpg reservation system
hpg_reserve = pd.read_csv('hpg_reserve.csv')
hpg_store_info = pd.read_csv('hpg_store_info.csv')

In [10]:
hpg_store_info.head()

Unnamed: 0,hpg_store_id,hpg_genre_name,hpg_area_name,latitude,longitude
0,hpg_6622b62385aec8bf,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
1,hpg_e9e068dd49c5fa00,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
2,hpg_2976f7acb4b3a3bc,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
3,hpg_e51a522e098f024c,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
4,hpg_e3d0e1519894f275,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221


In [11]:
# additional data
store_id_relation = pd.read_csv('store_id_relation.csv')
date_info = pd.read_csv('date_info.csv')

In [12]:
store_id_relation.head()

Unnamed: 0,air_store_id,hpg_store_id
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a
1,air_a24bf50c3e90d583,hpg_c34b496d0305a809
2,air_c7f78b4f3cba33ff,hpg_cd8ae0d9bbd58ff9
3,air_947eb2cae4f3e8f2,hpg_de24ea49dc25d6b8
4,air_965b2e0cf4119003,hpg_653238a84804d8e7


In [13]:
# test data
sample_sub = pd.read_csv('sample_submission.csv')

In [14]:
sample_sub.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0
3,air_00a91d42b08b08d9_2017-04-26,0
4,air_00a91d42b08b08d9_2017-04-27,0


In [15]:
# DATA PREPARATION

In [16]:
# transform test data
air_test = sample_sub.copy()
air_test['air_store_id'] = air_test['id'].apply(lambda x: str(x)[:-11])
air_test['visit_date'] = air_test['id'].apply(lambda x: str(x)[-10:])

In [17]:
air_test.head()

Unnamed: 0,id,visitors,air_store_id,visit_date
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25
3,air_00a91d42b08b08d9_2017-04-26,0,air_00a91d42b08b08d9,2017-04-26
4,air_00a91d42b08b08d9_2017-04-27,0,air_00a91d42b08b08d9,2017-04-27


In [18]:
# dataframe for predictions
submission_lstm = air_test.copy()

In [19]:
# test set for merger with train set
air_test = air_test.drop(['id', 'visitors'], axis=1)

In [20]:
air_test.head()

Unnamed: 0,air_store_id,visit_date
0,air_00a91d42b08b08d9,2017-04-23
1,air_00a91d42b08b08d9,2017-04-24
2,air_00a91d42b08b08d9,2017-04-25
3,air_00a91d42b08b08d9,2017-04-26
4,air_00a91d42b08b08d9,2017-04-27


In [21]:
# combine air and hpg databases
hpg_air_reserve = store_id_relation.join(hpg_reserve.set_index('hpg_store_id'), on='hpg_store_id')

In [22]:
hpg_air_reserve.head()

Unnamed: 0,air_store_id,hpg_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a,2016-01-04 12:00:00,2016-01-03 14:00:00,7
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a,2016-01-04 14:00:00,2016-01-02 13:00:00,4
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a,2016-01-05 12:00:00,2016-01-01 08:00:00,3
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a,2016-01-09 12:00:00,2016-01-07 20:00:00,6
0,air_63b13c56b7201bd9,hpg_4bc649e72e2a239a,2016-01-11 12:00:00,2016-01-10 15:00:00,3


In [23]:
air_reserve_tmp = air_reserve.copy()

In [24]:
hpg_air_reserve = hpg_air_reserve.drop('hpg_store_id', axis=1)

In [25]:
reserve = pd.concat([air_reserve_tmp, hpg_air_reserve])

In [26]:
# convert columns of 'reserve' table into datatime format
reserve['visit_datetime'] = pd.to_datetime(reserve['visit_datetime'])
reserve['reserve_datetime'] = pd.to_datetime(reserve['reserve_datetime'])

In [27]:
# create column for visit date inside 'reserve' table
reserve['visit_date'] = reserve['visit_datetime'].apply(lambda x: str(x)[0:10])

In [28]:
# calculate the gap between visit time and reservation time inside 'reserve' table
reserve['hour_gap'] = reserve['visit_datetime'].sub(reserve['reserve_datetime'])
reserve['hour_gap']= reserve['hour_gap'].apply(lambda x: x / np.timedelta64(1, 'h'))

In [29]:
reserve.head()

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_date,hour_gap
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1,2016-01-01,3.0
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3,2016-01-01,0.0
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6,2016-01-01,0.0
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2,2016-01-01,4.0
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5,2016-01-01,19.0


In [30]:
# separate reservation into 5 categories based on gap length
reserve['reserve_-12_h'] = np.where(reserve['hour_gap'] <= 12, reserve['reserve_visitors'], 0)
reserve['reserve_12_37_h'] = np.where((reserve['hour_gap'] <= 37) & (reserve['hour_gap'] > 12), reserve['reserve_visitors'], 0)
reserve['reserve_37_59_h'] = np.where((reserve['hour_gap'] <= 59) & (reserve['hour_gap'] > 37), reserve['reserve_visitors'], 0)
reserve['reserve_59_85_h'] = np.where((reserve['hour_gap'] <= 85) & (reserve['hour_gap'] > 59), reserve['reserve_visitors'], 0)
reserve['reserve_85+_h'] = np.where(reserve['hour_gap'] > 85, reserve['reserve_visitors'], 0)

In [31]:
# group by air_store_id and visit_date to enable joining with main table
group_list = ['air_store_id', 'visit_date', 'reserve_visitors', 'reserve_-12_h',
              'reserve_12_37_h', 'reserve_37_59_h', 'reserve_59_85_h', 'reserve_85+_h']

reserve = reserve[group_list].groupby(['air_store_id', 'visit_date'], as_index=False).sum()

for i in group_list[2:]:
    reserve[i] = reserve[i].apply(lambda x: np.log1p(x))

In [32]:
reserve.head()

Unnamed: 0,air_store_id,visit_date,reserve_visitors,reserve_-12_h,reserve_12_37_h,reserve_37_59_h,reserve_59_85_h,reserve_85+_h
0,air_00a91d42b08b08d9,2016-01-14,1.098612,0.0,0.0,0.0,1.098612,0.0
1,air_00a91d42b08b08d9,2016-01-15,1.609438,0.0,0.0,0.0,0.0,1.609438
2,air_00a91d42b08b08d9,2016-01-16,1.098612,0.0,0.0,0.0,1.098612,0.0
3,air_00a91d42b08b08d9,2016-01-22,1.098612,0.0,0.0,0.0,1.098612,0.0
4,air_00a91d42b08b08d9,2016-01-29,1.791759,0.0,0.0,0.0,0.0,1.791759


In [33]:
# GENRE DATA

In [34]:
# total amount of restaurants of specific genres by area_name
air_genres_area = air_store_info.copy()
air_genres_area = air_genres_area[['air_store_id', 'air_genre_name', 'air_area_name']].groupby(['air_genre_name', 'air_area_name'],
                                                                                              as_index=False).count()
air_genres_area = air_genres_area.rename(columns={'air_store_id': 'genre_in_area'})

In [35]:
air_genres_area.head()

Unnamed: 0,air_genre_name,air_area_name,genre_in_area
0,Asian,Tōkyō-to Shibuya-ku Shibuya,2
1,Bar/Cocktail,Fukuoka-ken Fukuoka-shi Daimyō,7
2,Bar/Cocktail,Fukuoka-ken Fukuoka-shi Hakata Ekimae,2
3,Bar/Cocktail,Hiroshima-ken Hiroshima-shi Kokutaijimachi,2
4,Bar/Cocktail,Hokkaidō Asahikawa-shi 6 Jōdōri,4


In [36]:
# total amount of restaurants in area
air_area = air_store_info.copy()
air_area = air_area[['air_store_id', 'air_area_name']].groupby(['air_area_name'], as_index=False).count()
air_area = air_area.rename(columns={'air_store_id': 'total_r_in_area'})

In [37]:
air_area.head()

Unnamed: 0,air_area_name,total_r_in_area
0,Fukuoka-ken Fukuoka-shi Daimyō,64
1,Fukuoka-ken Fukuoka-shi Hakata Ekimae,16
2,Fukuoka-ken Fukuoka-shi Imaizumi,2
3,Fukuoka-ken Fukuoka-shi Momochi,6
4,Fukuoka-ken Fukuoka-shi Shiobaru,7


In [38]:
# WEEKENDS AND HOLIDAYS

In [39]:
# additional features for weekends and holidays
date_info_mod = date_info.copy()
date_info_mod['holiday_eve'] = np.zeros(date_info_mod.shape[0])
date_info_mod['holiday_eve'].iloc[:-1] = date_info_mod['holiday_flg'].copy().values[1:]
date_info_mod['non_working'] = np.where(date_info_mod['day_of_week'].isin(['Saturday', 'Sunday']) |
                                       date_info_mod['holiday_flg'] == 1, 1, 0)
date_info_mod = date_info_mod.drop('holiday_flg', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [40]:
date_info_mod.head()

Unnamed: 0,calendar_date,day_of_week,holiday_eve,non_working
0,2016-01-01,Friday,1.0,1
1,2016-01-02,Saturday,1.0,1
2,2016-01-03,Sunday,0.0,1
3,2016-01-04,Monday,0.0,0
4,2016-01-05,Tuesday,0.0,0


In [41]:
# average visitors per restaurant by working and non-working days
air_visit_wd = air_visit_data.join(date_info_mod.set_index('calendar_date'), on='visit_date')
air_visit_wd['visitors'] = air_visit_wd['visitors'].apply(lambda x: np.log1p(x))

In [42]:
air_visit_wd.head()

Unnamed: 0,air_store_id,visit_date,visitors,day_of_week,holiday_eve,non_working
0,air_ba937bf13d40fb24,2016-01-13,3.258097,Wednesday,0.0,0
1,air_ba937bf13d40fb24,2016-01-14,3.496508,Thursday,0.0,0
2,air_ba937bf13d40fb24,2016-01-15,3.401197,Friday,0.0,0
3,air_ba937bf13d40fb24,2016-01-16,3.135494,Saturday,0.0,1
4,air_ba937bf13d40fb24,2016-01-18,1.94591,Monday,0.0,0


In [43]:
# average visitors per restaurant
mean_df = air_visit_wd[['visitors', 'air_store_id', 'non_working']].copy().groupby(['air_store_id', 'non_working'], as_index=False).mean()
mean_df = mean_df.rename(columns={'visitors': 'visitors_mean'})

In [44]:
mean_df.head()

Unnamed: 0,air_store_id,non_working,visitors_mean
0,air_00a91d42b08b08d9,0,3.309565
1,air_00a91d42b08b08d9,1,2.485089
2,air_0164b9927d20bcc3,0,2.205374
3,air_0164b9927d20bcc3,1,1.701912
4,air_0241aa3964b7f861,0,2.118182


In [45]:
# median visitors per restaurant
median_df = air_visit_wd[['visitors', 'air_store_id', 'non_working']].copy().groupby(['air_store_id', 'non_working'], as_index=False).median()
median_df = median_df.rename(columns={'visitors': 'visitors_median'})

In [46]:
median_df.head()

Unnamed: 0,air_store_id,non_working,visitors_median
0,air_00a91d42b08b08d9,0,3.367296
1,air_00a91d42b08b08d9,1,2.484907
2,air_0164b9927d20bcc3,0,2.197225
3,air_0164b9927d20bcc3,1,1.666102
4,air_0241aa3964b7f861,0,2.197225


In [74]:
# min visitors per restaurant
min_df = air_visit_wd[['visitors', 'air_store_id', 'non_working']].copy().groupby(['air_store_id', 'non_working'], as_index=False).min()
min_df = min_df.rename(columns={'visitors': 'visitors_min'})

In [75]:
min_df.head()

Unnamed: 0,air_store_id,non_working,visitors_min
0,air_00a91d42b08b08d9,0,0.693147
1,air_00a91d42b08b08d9,1,1.098612
2,air_0164b9927d20bcc3,0,0.693147
3,air_0164b9927d20bcc3,1,0.693147
4,air_0241aa3964b7f861,0,0.693147


In [76]:
# max visitors per restaurant
max_df = air_visit_wd[['visitors', 'air_store_id', 'non_working']].copy().groupby(['air_store_id', 'non_working'], as_index=False).max()
max_df = max_df.rename(columns={'visitors': 'visitors_max'})

In [77]:
max_df.head()

Unnamed: 0,air_store_id,non_working,visitors_max
0,air_00a91d42b08b08d9,0,4.060443
1,air_00a91d42b08b08d9,1,4.60517
2,air_0164b9927d20bcc3,0,3.332205
3,air_0164b9927d20bcc3,1,3.258097
4,air_0241aa3964b7f861,0,3.89182


In [78]:
# JOIN TABLES INTO TRAINING AND TEST SETS

In [79]:
# function for combining train/test dataset with additional information
def merge_join(df):
    # add month of visit
    df['month'] = df['visit_date'].apply(lambda x: float(str(x)[5:7]))
    # add weekday and holiday flag
    df = df.join(date_info_mod.set_index('calendar_date'), on='visit_date')
    # add genre and area name
    df = df.join(air_store_info.set_index('air_store_id'), on='air_store_id')
    # add quantity of same genre in area
    df = pd.merge(df, air_genres_area, how='left',
                 left_on=['air_genre_name', 'air_area_name'],
                 right_on=['air_genre_name', 'air_area_name'])
    # add total quantity of restaurants in area
    df = pd.merge(df, air_area, how='left',
                 left_on=['air_area_name'],
                 right_on=['air_area_name'])
    # add reservation information
    df = pd.merge(df, reserve, how = 'left',
                 left_on=['air_store_id', 'visit_date'],
                 right_on=['air_store_id', 'visit_date'])
    # add visitors number mean, median, max and min per each restaurant
    df = pd.merge(df, mean_df, how = 'left',
                  left_on = ['air_store_id', 'non_working'],
                  right_on = ['air_store_id', 'non_working'])

    df = pd.merge(df, median_df, how = 'left',
                  left_on = ['air_store_id', 'non_working'],
                  right_on = ['air_store_id', 'non_working'])
    
    df = pd.merge(df, max_df, how = 'left',
                  left_on = ['air_store_id', 'non_working'],
                  right_on = ['air_store_id', 'non_working'])
    
    df = pd.merge(df, min_df, how = 'left',
                  left_on = ['air_store_id', 'non_working'],
                  right_on = ['air_store_id', 'non_working'])
    # change NaN to 0
    df = df.fillna(0)
    
    return df

In [80]:
# combine train/test data with additional information
air_train = air_visit_data.copy()
X = merge_join(air_train)
X_test = merge_join(air_test)

In [81]:
X.head()

Unnamed: 0,air_store_id,visit_date,visitors,month,day_of_week,holiday_eve,non_working,air_genre_name,air_area_name,latitude,...,reserve_visitors,reserve_-12_h,reserve_12_37_h,reserve_37_59_h,reserve_59_85_h,reserve_85+_h,visitors_mean,visitors_median,visitors_max,visitors_min
0,air_ba937bf13d40fb24,2016-01-13,25,1.0,Wednesday,0.0,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612
1,air_ba937bf13d40fb24,2016-01-14,32,1.0,Thursday,0.0,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612
2,air_ba937bf13d40fb24,2016-01-15,29,1.0,Friday,0.0,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612
3,air_ba937bf13d40fb24,2016-01-16,22,1.0,Saturday,0.0,1,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,2.917806,3.178054,4.007333,0.693147
4,air_ba937bf13d40fb24,2016-01-18,6,1.0,Monday,0.0,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612


In [82]:
X_test.head()

Unnamed: 0,air_store_id,visit_date,month,day_of_week,holiday_eve,non_working,air_genre_name,air_area_name,latitude,longitude,...,reserve_visitors,reserve_-12_h,reserve_12_37_h,reserve_37_59_h,reserve_59_85_h,reserve_85+_h,visitors_mean,visitors_median,visitors_max,visitors_min
0,air_00a91d42b08b08d9,2017-04-23,4.0,Sunday,0.0,1,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,...,0.0,0.0,0.0,0.0,0.0,0.0,2.485089,2.484907,4.60517,1.098612
1,air_00a91d42b08b08d9,2017-04-24,4.0,Monday,0.0,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,...,0.0,0.0,0.0,0.0,0.0,0.0,3.309565,3.367296,4.060443,0.693147
2,air_00a91d42b08b08d9,2017-04-25,4.0,Tuesday,0.0,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,...,0.0,0.0,0.0,0.0,0.0,0.0,3.309565,3.367296,4.060443,0.693147
3,air_00a91d42b08b08d9,2017-04-26,4.0,Wednesday,0.0,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,...,0.0,0.0,0.0,0.0,0.0,0.0,3.309565,3.367296,4.060443,0.693147
4,air_00a91d42b08b08d9,2017-04-27,4.0,Thursday,0.0,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,...,0.0,0.0,0.0,0.0,0.0,0.0,3.309565,3.367296,4.060443,0.693147


In [83]:
# ENCODE STRING FEATURES

In [84]:
# weekday
le_weekday = preprocessing.LabelEncoder()
le_weekday.fit(X['day_of_week'])
X['day_of_week'] = le_weekday.transform(X['day_of_week'])
X_test['day_of_week'] = le_weekday.transform(X_test['day_of_week'])

In [85]:
# Genre name
le_genre = preprocessing.LabelEncoder()
le_genre.fit(X['air_genre_name'])
X['air_genre_name'] = le_genre.transform(X['air_genre_name'])
X_test['air_genre_name'] = le_genre.transform(X_test['air_genre_name'])

In [86]:
# Area name
le_area = preprocessing.LabelEncoder()
le_area.fit(X['air_area_name'])
X['air_area_name'] = le_area.transform(X['air_area_name'])
X_test['air_area_name'] = le_area.transform(X_test['air_area_name'])

In [87]:
# id
le_id = preprocessing.LabelEncoder()
le_id.fit(X['air_store_id'])
X['air_store_id'] = le_id.transform(X['air_store_id'])
X_test['air_store_id'] = le_id.transform(X_test['air_store_id'])

In [88]:
# SIMULTANEOUS TRANSFORMATION OF TRAIN AND TEST SETS

In [89]:
# combine train and test sets
X_all = X.append(X_test, sort=False)

In [90]:
X_all.head()

Unnamed: 0,air_store_id,visit_date,visitors,month,day_of_week,holiday_eve,non_working,air_genre_name,air_area_name,latitude,...,reserve_visitors,reserve_-12_h,reserve_12_37_h,reserve_37_59_h,reserve_59_85_h,reserve_85+_h,visitors_mean,visitors_median,visitors_max,visitors_min
0,603,2016-01-13,25.0,1.0,6,0.0,0,4,62,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612
1,603,2016-01-14,32.0,1.0,4,0.0,0,4,62,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612
2,603,2016-01-15,29.0,1.0,0,0.0,0,4,62,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612
3,603,2016-01-16,22.0,1.0,2,0.0,1,4,62,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,2.917806,3.178054,4.007333,0.693147
4,603,2016-01-18,6.0,1.0,1,0.0,0,4,62,35.658068,...,0.0,0.0,0.0,0.0,0.0,0.0,3.041217,3.113268,4.127134,1.098612


In [91]:
# date table (includes all dates for training and test period)
dates = np.arange(np.datetime64(X_all.visit_date.min()),
                 np.datetime64(X_all.visit_date.max()) + 1,
                 datetime.timedelta(days=1))
ids = X_all['air_store_id'].unique()
dates_all = dates.tolist() * len(ids)
ids_all = np.repeat(ids, len(dates.tolist())).tolist()
df_all = pd.DataFrame({"air_store_id": ids_all, "visit_date": dates_all})
df_all['visit_date'] = df_all['visit_date'].copy().apply(lambda x: str(x)[:10])

In [92]:
df_all.head()

Unnamed: 0,air_store_id,visit_date
0,603,2016-01-01
1,603,2016-01-02
2,603,2016-01-03
3,603,2016-01-04
4,603,2016-01-05


In [96]:
# create copy of X_all with data relevant to 'visit_date'
X_dates = X_all[['visit_date', 'month', 'day_of_week', 'holiday_eve', 'non_working']].copy()
# remove duplicates to avoid memory issues
X_dates = X_dates.drop_duplicates('visit_date')
# merge dataframe that represents all dates per each restaurant with information about each date
df_to_reshape = df_all.merge(X_dates, how='left', left_on='visit_date', right_on='visit_date')

In [97]:
# create copy of X_all with data relevant to 'air_store_id'
X_stores = X_all[['air_store_id', 'air_genre_name', 'air_area_name', 'latitude',
                 'longitude', 'genre_in_area', 'total_r_in_area']].copy()
# remove duplicates
X_stores = X_stores.drop_duplicates('air_store_id')
# merge dataframe that represents all dates per each restaurant with information about each restaurant
df_to_reshape = df_to_reshape.merge(X_stores, how = "left", left_on = 'air_store_id', right_on = 'air_store_id')

In [98]:
# merge dataframe that represents all dates per each restaurant with inf. about each restaurant per specific data
df_to_reshape = df_to_reshape.merge(X_all[['air_store_id', 'visit_date', 'reserve_visitors', 'visitors_mean',
                                          'visitors_median', 'visitors_max', 'visitors_min', 'visitors']],
                                   how='left', left_on=['air_store_id', 'visit_date'], right_on=['air_store_id', 'visit_date'])

In [99]:
df_to_reshape.head()

Unnamed: 0,air_store_id,visit_date,month,day_of_week,holiday_eve,non_working,air_genre_name,air_area_name,latitude,longitude,genre_in_area,total_r_in_area,reserve_visitors,visitors_mean,visitors_median,visitors_max,visitors_min,visitors
0,603,2016-01-01,1.0,0,1.0,1,4,62,35.658068,139.751599,8,51,,,,,,
1,603,2016-01-02,1.0,2,1.0,1,4,62,35.658068,139.751599,8,51,,,,,,
2,603,2016-01-03,1.0,3,0.0,1,4,62,35.658068,139.751599,8,51,,,,,,
3,603,2016-01-04,1.0,1,0.0,0,4,62,35.658068,139.751599,8,51,,,,,,
4,603,2016-01-05,1.0,5,0.0,0,4,62,35.658068,139.751599,8,51,,,,,,


In [101]:
# separate 'visitors' into output array
Y_lstm_df = df_to_reshape[['visit_date', 'air_store_id', 'visitors']].copy().fillna(0)
# take log(y+1)
Y_lstm_df['visitors'] = np.log1p(Y_lstm_df['visitors'].values)

In [102]:
# add flag for days when a restaurant was closed
df_to_reshape['closed_flag'] = np.where(df_to_reshape['visitors'].isnull() &
                                        df_to_reshape['visit_date'].isin(X['visit_date']).values, 1, 0)

In [103]:
# drop 'visitors' from dataset
df_to_reshape = df_to_reshape.drop(['visitors'], axis=1)

In [104]:
# fill in NaN values
df_to_reshape = df_to_reshape.fillna(-1)

In [105]:
# list of df_to_reshape columns without 'air_store_id' and 'visit_date'
columns_list = [x for x in list(df_to_reshape.iloc[:, 2:])]

In [106]:
# bound all numerical values between -1 and 1
# note: to avoid data leakage 'fit' should be made on train data and 'transform' on train and test data
# in this case all data in test set is taken from train set, thus fit/transform on all data
scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
scaler.fit(df_to_reshape[columns_list])
df_to_reshape[columns_list] = scaler.transform(df_to_reshape[columns_list])

  return self.partial_fit(X, y)


In [107]:
# SPECIFIC PREPARATION FOR NEURAL NETWORK AND ENCODER/DECODER

In [108]:
# reshape X into (samples, timesteps, features)
X_all_lstm = df_to_reshape.values[:, 2:].reshape(len(ids), len(dates), df_to_reshape.shape[1]-2)

In [112]:
# isolate output for train set and reshape it for time series
Y_lstm_df = Y_lstm_df.loc[Y_lstm_df['visit_date'].isin(X['visit_date'].values) &
                         Y_lstm_df['air_store_id'].isin(X['air_store_id'].values),]
Y_lstm = Y_lstm_df.values[:,2].reshape(len(X['air_store_id'].unique()),
                                      len(X['visit_date'].unique()),
                                      1)

In [115]:
# test dates
n_test_dates = len(X_test['visit_date'].unique())

In [116]:
# make additional features for number of visitors in t-1, t-2, ..., t-7
t_minus = np.ones([Y_lstm.shape[0], Y_lstm.shape[1], 1])

for i in range(1,8):
    temp = Y_lstm.copy()
    temp[:, i:, :] = Y_lstm[:, 0:-i, :].copy()
    t_minus = np.concatenate((t_minus[...], temp[...]), axis=2)
t_minus = t_minus[:, :, 1:]
print("t_minus shape", t_minus.shape)

t_minus shape (829, 478, 7)


In [117]:
# split X_all into training and test data
X_lstm = X_all_lstm[:, :-n_test_dates, :]
X_lstm_test = X_all_lstm[:, -n_test_dates:, :]

In [118]:
# add t-1, t-2, ..., t-7 visitors to feature vector
X_lstm = np.concatenate((X_lstm[...], t_minus[...]), axis=2)

In [119]:
# split training set into train and validation sets
X_tr = X_lstm[:, 39:-140, :]
Y_tr = Y_lstm[:, 39:-140, :]
X_val = X_lstm[:, -140:, :]
Y_val = Y_lstm[:, -140:, :]

In [120]:
# AE MODEL

In [121]:
num_encoder_tokens = X_lstm.shape[2]
latent_dim = 256 # or 64 to avoid "kernel run out of time" situation

In [122]:
# encoder training
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim,
              batch_input_shape=(1, None, num_encoder_tokens),
              stateful=False,
              return_sequences=True,
              return_state=True,
              recurrent_initializer='glorot_uniform')
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [123]:
# decoder training, using 'encoder_states' as initial state
decoder_inputs = Input(shape=(None, num_encoder_tokens))
decoder_lstm_1 = LSTM(latent_dim,
                     batch_input_shape=(1, None, num_encoder_tokens),
                     stateful=False,
                     return_sequences=True,
                     return_state=False,
                     dropout=.2,
                     recurrent_dropout=.2)
decoder_lstm_2 = LSTM(128, # or 32 to avoid 'kernel run out of time' situation
                     stateful=False,
                     return_sequences=True,
                     return_state=True,
                     dropout=.2,
                     recurrent_dropout=.2)
decoder_outputs, _, _ = decoder_lstm_2(decoder_lstm_1(decoder_inputs, initial_state=encoder_states))
decoder_dense = TimeDistributed(Dense(Y_lstm.shape[2], activation='relu'))
decoder_outputs = decoder_dense(decoder_outputs)

In [125]:
# training model
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
training_model.compile(optimizer='Adam', loss='mean_squared_error')

In [126]:
training_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 23)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 23)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  286720      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, None, 256)    286720      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [127]:
# GENERATOR APPLIED TO FEED ENCODER AND DECODER

In [130]:
# generator that randomly creates time series of 39 consecutive days
# these time series are all of shape (829 restaurants, 39 days, num_features)
def dec_enc_n_days_gen(X, Y, length):
    while True:
        decoder_boundary = X.shape[1] - length -1
        
        encoder_start = np.random.randint(0, decoder_boundary)
        encoder_end = encoder_start + length
        
        decoder_start = encoder_start + 1
        decoder_end = encoder_end + 1
        
        X_to_conc = X[:, encoder_start:encoder_end, :]
        Y_to_conc = Y[:, encoder_start:encoder_end, :]
        X_to_decoder = X[:, decoder_start:decoder_end, :]
        Y_decoder = Y[:, decoder_start:decoder_end, :]
        
        yield([X_to_conc, X_to_decoder, Y_decoder])

In [129]:
# TRAINING

In [131]:
# training on X_tr/Y_tr and validate with X_val/Y_val
"""
training_model.fit_generator(dec_enc_n_days_gen(X_tr, Y_tr, 39),
                            validation_data = dec_enc_n_days_gen(X_val, Y_val, 39),
                            steps_per_epoch=X_lstm.shape[0],
                            validation_steps=X_val.shape[0],
                            verbose=1,
                            epochs=1)
"""

# training on full dateset
training_model.fit_generator(dec_enc_n_days_gen(X_lstm[:, :, :], Y_lstm[:, :, :], 39),
                            steps_per_epoch=X_lstm[:, :, :].shape[0],
                            verbose=1,
                            epochs=1)

Epoch 1/1


ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[[-0.8181818181818182, 0.33333333333333326, -1.0, ...,
         3.1780538303479458, 3.6375861597263857, 2.772588722239781],
        [-0.8181818181818182, -1.0, -1.0, ..., 0.6931471805599453,
 ...