In [1]:
import time

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from datetime import date, timedelta

In [5]:
from dateutil.parser import parse

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# prepare datasets
air_reserve = pd.read_csv('air_reserve.csv').rename(columns={'air_store_id':'store_id'})
hpg_reserve = pd.read_csv('hpg_reserve.csv').rename(columns={'hpg_store_id':'store_id'})
air_store = pd.read_csv('air_store_info.csv').rename(columns={'air_store_id':'store_id'})
hpg_store = pd.read_csv('hpg_store_info.csv').rename(columns={'hpg_store_id':'store_id'})
air_visit = pd.read_csv('air_visit_data.csv').rename(columns={'air_store_id':'store_id'})
store_id_map = pd.read_csv('store_id_relation.csv').set_index('hpg_store_id',drop=False)
date_info = pd.read_csv('date_info.csv').rename(columns={'calendar_date': 'visit_date'}).drop('day_of_week',axis=1)
submission = pd.read_csv('sample_submission.csv')

In [8]:
submission.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0
3,air_00a91d42b08b08d9_2017-04-26,0
4,air_00a91d42b08b08d9_2017-04-27,0


In [9]:
submission['visit_date'] = submission['id'].str[-10:]
submission['store_id'] = submission['id'].str[:-11]

In [12]:
submission.head()

Unnamed: 0,id,visitors,visit_date,store_id
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9


In [10]:
air_reserve.head()

Unnamed: 0,store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5


In [14]:
air_reserve['visit_date'] = air_reserve['visit_datetime'].str[:10]
air_reserve['reserve_date'] = air_reserve['reserve_datetime'].str[:10]
air_reserve['dow'] = pd.to_datetime(air_reserve['visit_date']).dt.dayofweek

In [15]:
air_reserve.head()

Unnamed: 0,store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_date,reserve_date,dow
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1,2016-01-01,2016-01-01,4
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3,2016-01-01,2016-01-01,4
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6,2016-01-01,2016-01-01,4
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2,2016-01-01,2016-01-01,4
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5,2016-01-01,2016-01-01,4


In [16]:
hpg_reserve['visit_date'] = hpg_reserve['visit_datetime'].str[:10]
hpg_reserve['reserve_date'] = hpg_reserve['reserve_datetime'].str[:10]
hpg_reserve['dow'] = pd.to_datetime(hpg_reserve['visit_date']).dt.dayofweek

In [17]:
hpg_reserve.head()

Unnamed: 0,store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_date,reserve_date,dow
0,hpg_c63f6f42e088e50f,2016-01-01 11:00:00,2016-01-01 09:00:00,1,2016-01-01,2016-01-01,4
1,hpg_dac72789163a3f47,2016-01-01 13:00:00,2016-01-01 06:00:00,3,2016-01-01,2016-01-01,4
2,hpg_c8e24dcf51ca1eb5,2016-01-01 16:00:00,2016-01-01 14:00:00,2,2016-01-01,2016-01-01,4
3,hpg_24bb207e5fd49d4a,2016-01-01 17:00:00,2016-01-01 11:00:00,5,2016-01-01,2016-01-01,4
4,hpg_25291c542ebb3bc2,2016-01-01 17:00:00,2016-01-01 03:00:00,13,2016-01-01,2016-01-01,4


In [18]:
air_visit['id'] = air_visit['store_id'] + '_' + air_visit['visit_date']

In [19]:
air_visit.head()

Unnamed: 0,store_id,visit_date,visitors,id
0,air_ba937bf13d40fb24,2016-01-13,25,air_ba937bf13d40fb24_2016-01-13
1,air_ba937bf13d40fb24,2016-01-14,32,air_ba937bf13d40fb24_2016-01-14
2,air_ba937bf13d40fb24,2016-01-15,29,air_ba937bf13d40fb24_2016-01-15
3,air_ba937bf13d40fb24,2016-01-16,22,air_ba937bf13d40fb24_2016-01-16
4,air_ba937bf13d40fb24,2016-01-18,6,air_ba937bf13d40fb24_2016-01-18


In [20]:
hpg_reserve['store_id'] = hpg_reserve['store_id'].map(store_id_map['air_store_id']).fillna(hpg_reserve['store_id'])

In [21]:
hpg_reserve.head()

Unnamed: 0,store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_date,reserve_date,dow
0,hpg_c63f6f42e088e50f,2016-01-01 11:00:00,2016-01-01 09:00:00,1,2016-01-01,2016-01-01,4
1,hpg_dac72789163a3f47,2016-01-01 13:00:00,2016-01-01 06:00:00,3,2016-01-01,2016-01-01,4
2,hpg_c8e24dcf51ca1eb5,2016-01-01 16:00:00,2016-01-01 14:00:00,2,2016-01-01,2016-01-01,4
3,hpg_24bb207e5fd49d4a,2016-01-01 17:00:00,2016-01-01 11:00:00,5,2016-01-01,2016-01-01,4
4,hpg_25291c542ebb3bc2,2016-01-01 17:00:00,2016-01-01 03:00:00,13,2016-01-01,2016-01-01,4


In [22]:
hpg_store['store_id'] = hpg_store['store_id'].map(store_id_map['air_store_id']).fillna(hpg_store['store_id'])

In [23]:
hpg_store.rename(columns={'hpg_genre_name':'air_genre_name', 'hpg_area_name':'air_area_name'}, inplace=True)

In [24]:
hpg_store.head()

Unnamed: 0,store_id,air_genre_name,air_area_name,latitude,longitude
0,hpg_6622b62385aec8bf,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
1,hpg_e9e068dd49c5fa00,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
2,hpg_2976f7acb4b3a3bc,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
3,hpg_e51a522e098f024c,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221
4,hpg_e3d0e1519894f275,Japanese style,Tōkyō-to Setagaya-ku Taishidō,35.643675,139.668221


In [26]:
data = pd.concat([air_visit, submission]).copy()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [28]:
data['dow'] = pd.to_datetime(data['visit_date']).dt.dayofweek

In [29]:
date_info['holiday_flg2'] = pd.to_datetime(date_info['visit_date']).dt.dayofweek
date_info['holiday_flg2'] = ((date_info['holiday_flg2'] > 4) | (date_info['holiday_flg'] == 1)).astype(int)

In [30]:
date_info.head()

Unnamed: 0,visit_date,holiday_flg,holiday_flg2
0,2016-01-01,1,1
1,2016-01-02,1,1
2,2016-01-03,1,1
3,2016-01-04,0,0
4,2016-01-05,0,0


In [31]:
air_store['air_area_name0'] = air_store['air_area_name'].apply(lambda x: x.split(' ')[0])

In [32]:
lbl = LabelEncoder()

In [33]:
air_store['air_genre_name'] = lbl.fit_transform(air_store['air_genre_name'])
air_store['air_area_name0'] = lbl.fit_transform(air_store['air_area_name0'])

In [34]:
air_store.head()

Unnamed: 0,store_id,air_genre_name,air_area_name,latitude,longitude,air_area_name0
0,air_0f0cdeee6c9bf3d7,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
1,air_7cc17a324ae5c7dc,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
2,air_fee8dcf4d619598e,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
3,air_a17f0778617c76e2,6,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3
4,air_83db5aff8f50478e,6,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7


In [35]:
data['visitors'] = np.log1p(data['visitors'])

In [36]:
data = data.merge(air_store, on='store_id', how='left')
data = data.merge(date_info[['visit_date', 'holiday_flg', 'holiday_flg2']], on=['visit_date'], how='left')

In [37]:
data.head()

Unnamed: 0,id,store_id,visit_date,visitors,dow,air_genre_name,air_area_name,latitude,longitude,air_area_name0,holiday_flg,holiday_flg2
0,air_ba937bf13d40fb24_2016-01-13,air_ba937bf13d40fb24,2016-01-13,3.258097,2,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
1,air_ba937bf13d40fb24_2016-01-14,air_ba937bf13d40fb24,2016-01-14,3.496508,3,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
2,air_ba937bf13d40fb24_2016-01-15,air_ba937bf13d40fb24,2016-01-15,3.401197,4,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0
3,air_ba937bf13d40fb24_2016-01-16,air_ba937bf13d40fb24,2016-01-16,3.135494,5,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,1
4,air_ba937bf13d40fb24_2016-01-18,air_ba937bf13d40fb24,2016-01-18,1.94591,0,4,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,7,0,0


In [38]:
def concat(L):
    result = None
    for l in L:
        if result is None:
            result = l
        else:
            try:
                result[l.columns.tolist()] = l
            except:
                print(l.head())
    return result

In [39]:
def left_merge(data1, data2, on):
    if not isinstance(on, list):
        on = [on]
    if (set(on) & set(data2.columns)) != set(on):
        data2_temp = data2.reset_index()
    else:
        data2_temp = data2.copy()
    columns = [f for f in data2.columns if f not in on]
    result = data1.merge(data2_temp, on=on, how='left')
    result = result[columns]
    return result

In [40]:
def diff_of_days(day1, day2):
    days = (parse(day1[:10]) - parse(day2[:10])).days
    return days

In [41]:
def date_add_days(start_date, days):
    end_date = parse(start_date[:10]) + timedelta(days=days)
    end_date = end_date.strftime('%Y-%m-%d')
    return end_date

In [42]:
def get_label(end_date, n_day):
    label_end_date = date_add_days(end_date, n_day)
    label = data[(data['visit_date'] < label_end_date) & (data['visit_date'] >= end_date)].copy()
    label['end_date'] = end_date
    label['diff_day'] = label['visit_date'].apply(lambda x: diff_of_days(x, end_date))
    label['month'] = label['visit_date'].str[5:7].astype(int)
    label['year'] = label['visit_date'].str[:4].astype(int)
    for i in [3, 2, 1, -1]:
        date_info_temp = date_info.copy()
        date_info_temp['visit_date'] = date_info_temp['visit_date'].apply(lambda x: date_add_days(x,i))
        date_info_temp.rename(columns={'holiday_flg':'ahead_holiday_{}'.format(i),
                                       'holiday_flg2':'ahead_holiday2_{}'.format(i)},
                             inplace=True)
        label = label.merge(date_info_temp, on=['visit_date'], how='left')
    label = label.reset_index(drop=True)
    return label

In [44]:
def get_store_visitor_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_tmp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result = data_tmp.groupby(['store_id'], as_index=False)['visitors'].agg({'store_min{}'.format(n_day): 'min',
                                                                            'store_mean{}'.format(n_day): 'mean',
                                                                            'store_median{}'.format(n_day): 'median',
                                                                            'store_max{}'.format(n_day): 'max',
                                                                            'store_count{}'.format(n_day): 'count',
                                                                             'store_std{}'.format(n_day): 'std',
                                                                             'store_skew{}'.format(n_day): 'skew'})
    result = left_merge(label, result, on=['store_id']).fillna(0)
    return result

In [45]:
def get_store_exp_visitor_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_tmp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    data_tmp['visit_date'] = data_tmp['visit_date'].apply(lambda x: diff_of_days(key[0], x))
    data_tmp['weight'] = data_tmp['visit_date'].apply(lambda x: .985**x)
    data_tmp['visitors'] = data_tmp['visitors'] * data_tmp['weight']
    result1 = data_tmp.groupby(['store_id'], as_index=False)['visitors'].agg({'store_exp_mean{}'.format(n_day): 'sum'})
    result2 = data_tmp.groupby(['store_id'], as_index=False)['weight'].agg({'store_exp_weight_sum{}'.format(n_day): 'sum'})
    result = result1.merge(result2, on=['store_id'], how='left')
    result['store_exp_mean{}'.format(n_day)] = result['store_exp_mean{}'.format(n_day)] / result['store_exp_weight_sum{}'.format(n_day)]
    result = left_merge(label, result, on=['store_id']).fillna(0)
    return result

In [46]:
def get_store_week_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_tmp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result = data_tmp.groupby(['store_id', 'dow'], as_index=False)['visitors'].agg({'store_dow_min'.format(n_day): 'min',
                                                                                   'store_dow_mean{}'.format(n_day): 'mean',
                                                                                     'store_dow_median{}'.format(n_day): 'median',
                                                                                     'store_dow_max{}'.format(n_day): 'max',
                                                                                     'store_dow_count{}'.format(n_day): 'count',
                                                                                     'store_dow_std{}'.format(n_day): 'std',
                                                                                     'store_dow_skew{}'.format(n_day): 'skew'})
    result = left_merge(label, result, on=['store_id', 'dow']).fillna(0)
    return result

In [65]:
def get_store_week_diff_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_tmp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result = data_tmp.set_index(['store_id', 'visit_date'])['visitors'].unstack()
    result = result.diff(axis=1).iloc[:,1:]
    c = result.columns
    result['store_diff_mean'] = np.abs(result[c]).mean(axis=1)
    result['store_diff_std'] = result[c].std(axis=1)
    result['store_diff_max'] = result[c].max(axis=1)
    result['store_diff_min'] = result[c].min(axis=1)
    result = left_merge(label, result[['store_diff_mean', 'store_diff_std', 'store_diff_max', 'store_diff_min']],
                        on=['store_id']).fillna(0)
    return result

In [48]:
def get_store_all_week_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_tmp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result_tmp = data_tmp.groupby(['store_id', 'dow'], as_index=False)['visitors'].agg({'store_dow_mean{}'.format(n_day): 'mean',
                                                                                        'store_dow_median{}'.format(n_day): 'median',
                                                                                        'store_dow_sum{}'.format(n_day): 'max',
                                                                                        'store_dow_count{}'.format(n_day): 'count'})
    result = pd.DataFrame()
    for i in range(7):
        result_sub = result_tmp[result_tmp['dow'] == i].copy()
        result_sub = result_sub.set_index('store_id')
        result_sub = result_sub.add_prefix(str(i))
        result_sub = left_merge(label, result_sub, on=['store_id']).fillna(0)
    return result

In [49]:
def get_store_week_exp_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_temp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    data_temp['visit_date'] = data_temp['visit_date'].apply(lambda x: diff_of_days(key[0],x))
    data_temp['visitors2'] = data_temp['visitors']
    result = None
    for i in [0.9,0.95,0.97,0.98,0.985,0.99,0.999,0.9999]:
        data_temp['weight'] = data_temp['visit_date'].apply(lambda x: i**x)
        data_temp['visitors1'] = data_temp['visitors'] * data_temp['weight']
        data_temp['visitors2'] = data_temp['visitors2'] * data_temp['weight']
        result1 = data_temp.groupby(['store_id', 'dow'], as_index=False)['visitors1'].agg({'store_dow_exp_mean{}_{}'.format(n_day,i): 'sum'})
        result3 = data_temp.groupby(['store_id', 'dow'], as_index=False)['visitors2'].agg({'store_dow_exp_mean2{}_{}'.format(n_day, i): 'sum'})
        result2 = data_temp.groupby(['store_id', 'dow'], as_index=False)['weight'].agg({'store_dow_exp_weight_sum{}_{}'.format(n_day,i): 'sum'})
        result_temp = result1.merge(result2, on=['store_id', 'dow'], how='left')
        result_temp = result_temp.merge(result3, on=['store_id', 'dow'], how='left')
        result_temp['store_dow_exp_mean{}_{}'.format(n_day,i)] = result_temp['store_dow_exp_mean{}_{}'.format(n_day,i)]/result_temp['store_dow_exp_weight_sum{}_{}'.format(n_day,i)]
        result_temp['store_dow_exp_mean2{}_{}'.format(n_day, i)] = result_temp[ 'store_dow_exp_mean2{}_{}'.format(n_day, i)]/result_temp['store_dow_exp_weight_sum{}_{}'.format(n_day, i)]
        if result is None:
            result = result_temp
        else:
            result = result.merge(result_temp, on=['store_id','dow'], how='left')
    result = left_merge(label, result, on=['store_id', 'dow']).fillna(0)
    return result

In [50]:
def get_store_holiday_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_temp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result1 = data_temp.groupby(['store_id', 'holiday_flg'], as_index=False)['visitors'].agg({'store_holiday_min{}'.format(n_day): 'min',
                                                                                              'store_holiday_mean{}'.format(n_day): 'mean',
                                                                                              'store_holiday_median{}'.format(n_day): 'median',
                                                                                              'store_holiday_max{}'.format(n_day): 'max',
                                                                                              'store_holiday_count{}'.format(n_day): 'count',
                                                                                              'store_holiday_std{}'.format(n_day): 'std',
                                                                                              'store_holiday_skew{}'.format(n_day): 'skew'})
    result1 = left_merge(label, result1, on=['store_id', 'holiday_flg']).fillna(0)
    result2 = data_temp.groupby(['store_id', 'holiday_flg2'], as_index=False)['visitors'].agg({'store_holiday2_min{}'.format(n_day): 'min',
                                                                                               'store_holiday2_mean{}'.format(n_day): 'mean',
                                                                                               'store_holiday2_median{}'.format(n_day): 'median',
                                                                                               'store_holiday2_max{}'.format(n_day): 'max',
                                                                                               'store_holiday2_count{}'.format(n_day): 'count',
                                                                                               'store_holiday2_std{}'.format(n_day): 'std',
                                                                                               'store_holiday2_skew{}'.format(n_day): 'skew'})
    result2 = left_merge(label, result2, on=['store_id', 'holiday_flg2']).fillna(0)
    result = pd.concat([result1, result2], axis=1)
    return result

In [51]:
def get_genre_visitor_feat(label, key, n_day):
    start_date = date_add_days(key[0],-n_day)
    data_temp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result = data_temp.groupby(['air_genre_name'], as_index=False)['visitors'].agg({'genre_min{}'.format(n_day): 'min',
                                                                                    'genre_mean{}'.format(n_day): 'mean',
                                                                                    'genre_median{}'.format(n_day): 'median',
                                                                                    'genre_max{}'.format(n_day): 'max',
                                                                                    'genre_count{}'.format(n_day): 'count',
                                                                                    'genre_std{}'.format(n_day): 'std',
                                                                                    'genre_skew{}'.format(n_day): 'skew'})
    result = left_merge(label, result, on=['air_genre_name']).fillna(0)
    return result

In [52]:
def get_genre_exp_visitor_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_temp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    data_temp['visit_date'] = data_temp['visit_date'].apply(lambda x: diff_of_days(key[0],x))
    data_temp['weight'] = data_temp['visit_date'].apply(lambda x: 0.985**x)
    data_temp['visitors'] = data_temp['visitors'] * data_temp['weight']
    result1 = data_temp.groupby(['air_genre_name'], as_index=False)['visitors'].agg({'genre_exp_mean{}'.format(n_day): 'sum'})
    result2 = data_temp.groupby(['air_genre_name'], as_index=False)['weight'].agg({'genre_exp_weight_sum{}'.format(n_day): 'sum'})
    result = result1.merge(result2, on=['air_genre_name'], how='left')
    result['genre_exp_mean{}'.format(n_day)] = result['genre_exp_mean{}'.format(n_day)]/result['genre_exp_weight_sum{}'.format(n_day)]
    result = left_merge(label, result, on=['air_genre_name']).fillna(0)
    return result

In [53]:
def get_genre_week_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_temp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    result = data_temp.groupby(['air_genre_name', 'dow'], as_index=False)['visitors'].agg({'genre_dow_min{}'.format(n_day): 'min',
                                                                                           'genre_dow_mean{}'.format(n_day): 'mean',
                                                                                           'genre_dow_median{}'.format(n_day): 'median',
                                                                                           'genre_dow_max{}'.format(n_day): 'max',
                                                                                           'genre_dow_count{}'.format(n_day): 'count',
                                                                                           'genre_dow_std{}'.format(n_day): 'std',
                                                                                           'genre_dow_skew{}'.format(n_day): 'skew'})
    result = left_merge(label, result, on=['air_genre_name', 'dow']).fillna(0)
    return result

In [54]:
def get_genre_week_exp_feat(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_temp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    data_temp['visit_date'] = data_temp['visit_date'].apply(lambda x: diff_of_days(key[0],x))
    data_temp['weight'] = data_temp['visit_date'].apply(lambda x: 0.985**x)
    data_temp['visitors'] = data_temp['visitors'] * data_temp['weight']
    result1 = data_temp.groupby(['air_genre_name', 'dow'], as_index=False)['visitors'].agg({'genre_dow_exp_mean{}'.format(n_day): 'sum'})
    result2 = data_temp.groupby(['air_genre_name', 'dow'], as_index=False)['weight'].agg({'genre_dow_exp_weight_sum{}'.format(n_day): 'sum'})
    result = result1.merge(result2, on=['air_genre_name', 'dow'], how='left')
    result['genre_dow_exp_mean{}'.format(n_day)] = result['genre_dow_exp_mean{}'.format(n_day)]/result['genre_dow_exp_weight_sum{}'.format(n_day)]
    result = left_merge(label, result, on=['air_genre_name', 'dow']).fillna(0)
    return result

In [55]:
def get_first_last_time(label, key, n_day):
    start_date = date_add_days(key[0], -n_day)
    data_tmp = data[(data.visit_date < key[0]) & (data.visit_date > start_date)].copy()
    data_tmp = data_tmp.sort_values('visit_date')
    result = data_tmp.groupby('store_id')['visit_date'].agg({'first_time': lambda x: diff_of_days(key[0], np.min(x)),
                                                            'last_time': lambda x: diff_of_days(key[0], np.max(x)),})
    result = left_merge(label, result, on=['store_id']).fillna(0)
    return result

In [56]:
# air_reserve
def get_reserve_feat(label,key):
    label_end_date = date_add_days(key[0], key[1])
    air_reserve_temp = air_reserve[(air_reserve.visit_date >= key[0]) &             # key[0] 是'2017-04-23'
                                   (air_reserve.visit_date < label_end_date) &      # label_end_date 是'2017-05-31'
                                   (air_reserve.reserve_date < key[0])].copy()
    air_reserve_temp = air_reserve_temp.merge(air_store,on='store_id',how='left')
    air_reserve_temp['diff_time'] = (pd.to_datetime(air_reserve['visit_datetime'])-pd.to_datetime(air_reserve['reserve_datetime'])).dt.days
    air_reserve_temp = air_reserve_temp.merge(air_store,on='store_id')
    air_result = air_reserve_temp.groupby(['store_id', 'visit_date'])['reserve_visitors'].agg({'air_reserve_visitors': 'sum',
                                                                                               'air_reserve_count': 'count'})
    air_store_diff_time_mean = air_reserve_temp.groupby(['store_id', 'visit_date'])['diff_time'].agg({'air_store_diff_time_mean': 'mean'})
    air_diff_time_mean = air_reserve_temp.groupby(['visit_date'])['diff_time'].agg({'air_diff_time_mean': 'mean'})
    air_result = air_result.unstack().fillna(0).stack()
    air_date_result = air_reserve_temp.groupby(['visit_date'])['reserve_visitors'].agg({'air_date_visitors': 'sum',
                                                                                        'air_date_count': 'count'})
    hpg_reserve_temp = hpg_reserve[(hpg_reserve.visit_date >= key[0]) & (hpg_reserve.visit_date < label_end_date) & (hpg_reserve.reserve_date < key[0])].copy()
    hpg_reserve_temp['diff_time'] = (pd.to_datetime(hpg_reserve['visit_datetime']) - pd.to_datetime(hpg_reserve['reserve_datetime'])).dt.days
    hpg_result = hpg_reserve_temp.groupby(['store_id', 'visit_date'])['reserve_visitors'].agg({'hpg_reserve_visitors': 'sum',
                                                                                               'hpg_reserve_count': 'count'})
    hpg_result = hpg_result.unstack().fillna(0).stack()
    hpg_date_result = hpg_reserve_temp.groupby(['visit_date'])['reserve_visitors'].agg({'hpg_date_visitors': 'sum',
                                                                                        'hpg_date_count': 'count'})
    hpg_store_diff_time_mean = hpg_reserve_temp.groupby(['store_id', 'visit_date'])['diff_time'].agg({'hpg_store_diff_time_mean': 'mean'})
    hpg_diff_time_mean = hpg_reserve_temp.groupby(['visit_date'])['diff_time'].agg({'hpg_diff_time_mean': 'mean'})
    air_result = left_merge(label, air_result, on=['store_id','visit_date']).fillna(0)
    air_store_diff_time_mean = left_merge(label, air_store_diff_time_mean, on=['store_id', 'visit_date']).fillna(0)
    hpg_result = left_merge(label, hpg_result, on=['store_id', 'visit_date']).fillna(0)
    hpg_store_diff_time_mean = left_merge(label, hpg_store_diff_time_mean, on=['store_id', 'visit_date']).fillna(0)
    air_date_result = left_merge(label, air_date_result, on=['visit_date']).fillna(0)
    air_diff_time_mean = left_merge(label, air_diff_time_mean, on=['visit_date']).fillna(0)
    hpg_date_result = left_merge(label, hpg_date_result, on=['visit_date']).fillna(0)
    hpg_diff_time_mean = left_merge(label, hpg_diff_time_mean, on=['visit_date']).fillna(0)
    result = pd.concat([air_result,
                        hpg_result,
                        air_date_result,
                        hpg_date_result,
                        air_store_diff_time_mean,
                        hpg_store_diff_time_mean,
                        air_diff_time_mean,
                        hpg_diff_time_mean],axis=1)
    return result

In [57]:
# second feature
def second_feat(result):
    result['store_mean_14_28_rate'] = result['store_mean14']/(result['store_mean28']+0.01)
    result['store_mean_28_56_rate'] = result['store_mean28'] / (result['store_mean56'] + 0.01)
    result['store_mean_56_1000_rate'] = result['store_mean56'] / (result['store_mean1000'] + 0.01)
    result['genre_mean_28_56_rate'] = result['genre_mean28'] / (result['genre_mean56'] + 0.01)
    result['sgenre_mean_56_1000_rate'] = result['genre_mean56'] / (result['genre_mean1000'] + 0.01)
    return result

In [58]:
# 制作训练集
def make_feats(end_date,n_day):
    t0 = time.time()
    key = end_date,n_day
    print('data key：{}'.format(key))
    print('add label')
    label = get_label(end_date,n_day)

    print('make features...')
    result = [label]
    result.append(get_store_visitor_feat(label, key, 1000))        # store features
    result.append(get_store_visitor_feat(label, key, 56))          # store features
    result.append(get_store_visitor_feat(label, key, 28))          # store features
    result.append(get_store_visitor_feat(label, key, 14))          # store features
    result.append(get_store_exp_visitor_feat(label, key, 1000))    # store exp features
    result.append(get_store_week_feat(label, key, 1000))           # store dow features
    result.append(get_store_week_feat(label, key, 56))             # store dow features
    result.append(get_store_week_feat(label, key, 28))             # store dow features
    result.append(get_store_week_feat(label, key, 14))             # store dow features
    result.append(get_store_week_diff_feat(label, key, 58))       # store dow diff features
    result.append(get_store_week_diff_feat(label, key, 1000))      # store dow diff features
    result.append(get_store_all_week_feat(label, key, 1000))       # store all week feat
    result.append(get_store_week_exp_feat(label, key, 1000))       # store dow exp feat
    result.append(get_store_holiday_feat(label, key, 1000))        # store holiday feat

    result.append(get_genre_visitor_feat(label, key, 1000))         # genre feature
    result.append(get_genre_visitor_feat(label, key, 56))           # genre feature
    result.append(get_genre_visitor_feat(label, key, 28))           # genre feature
    result.append(get_genre_exp_visitor_feat(label, key, 1000))     # genre feature
    result.append(get_genre_week_feat(label, key, 1000))            # genre dow feature
    result.append(get_genre_week_feat(label, key, 56))              # genre dow feature
    result.append(get_genre_week_feat(label, key, 28))              # genre dow feature
    result.append(get_genre_week_exp_feat(label, key, 1000))        # genre dow exp feature

    result.append(get_reserve_feat(label,key))                      # air_reserve
    result.append(get_first_last_time(label,key,1000))             # first time and last time

    result.append(label)

    print('merge...')
    result = concat(result)

    result = second_feat(result)

    print('data shape：{}'.format(result.shape))
    print('spending {}s'.format(time.time() - t0))
    return result

In [59]:
import datetime

In [60]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [61]:
train_feat = pd.DataFrame()

In [62]:
start_date = '2017-03-12'

In [66]:
for i in range(58):
    train_feat_sub = make_feats(date_add_days(start_date, i*(-7)), 39)
    train_feat = pd.concat([train_feat, train_feat_sub])

data key：('2017-03-12', 39)
add label
make features...


is deprecated and will be removed in a future version
  # This is added back by InteractiveShellApp.init_path()
is deprecated and will be removed in a future version
  if sys.path[0] == '':
is deprecated and will be removed in a future version
  del sys.path[0]
is deprecated and will be removed in a future version
  app.launch_new_instance()
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
  


merge...
data shape：(27728, 186)
spending 79.18613910675049s
data key：('2017-03-05', 39)
add label
make features...
merge...
data shape：(27705, 186)
spending 81.14532995223999s
data key：('2017-02-26', 39)
add label
make features...
merge...
data shape：(27745, 186)
spending 83.7630226612091s
data key：('2017-02-19', 39)
add label
make features...
merge...
data shape：(27696, 186)
spending 79.07429885864258s
data key：('2017-02-12', 39)
add label
make features...
merge...
data shape：(27596, 186)
spending 80.3424620628357s
data key：('2017-02-05', 39)
add label
make features...
merge...
data shape：(27561, 186)
spending 75.00303483009338s
data key：('2017-01-29', 39)
add label
make features...
merge...
data shape：(27511, 186)
spending 75.11559128761292s
data key：('2017-01-22', 39)
add label
make features...
merge...
data shape：(27474, 186)
spending 73.7740409374237s
data key：('2017-01-15', 39)
add label
make features...
merge...
data shape：(27389, 186)
spending 65.32996392250061s
data key：('201

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


data key：('2016-09-11', 39)
add label
make features...
merge...
data shape：(26767, 186)
spending 38.49063301086426s
data key：('2016-09-04', 39)
add label
make features...
merge...
data shape：(26707, 186)
spending 38.93418598175049s
data key：('2016-08-28', 39)
add label
make features...
merge...
data shape：(26806, 186)
spending 36.33412504196167s
data key：('2016-08-21', 39)
add label
make features...
merge...
data shape：(26698, 186)
spending 34.55828785896301s
data key：('2016-08-14', 39)
add label
make features...
merge...
data shape：(26261, 186)
spending 33.20579195022583s
data key：('2016-08-07', 39)
add label
make features...
merge...
data shape：(26168, 186)
spending 31.792171001434326s
data key：('2016-07-31', 39)
add label
make features...
merge...
data shape：(26106, 184)
spending 30.364362001419067s
data key：('2016-07-24', 39)
add label
make features...
merge...
data shape：(26147, 186)
spending 28.827205181121826s
data key：('2016-07-17', 39)
add label
make features...
merge...
data 

In [67]:
for i in range(1, 6):
    train_feat_sub = make_feats(date_add_days(start_date, i*7), 42-i*7)
    train_feat = pd.concat([train_feat, train_feat_sub])

data key：('2017-03-19', 35)
add label
make features...


is deprecated and will be removed in a future version
  # This is added back by InteractiveShellApp.init_path()
is deprecated and will be removed in a future version
  if sys.path[0] == '':
is deprecated and will be removed in a future version
  del sys.path[0]
is deprecated and will be removed in a future version
  app.launch_new_instance()
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
  


merge...
data shape：(24969, 186)
spending 79.93775534629822s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


data key：('2017-03-26', 28)
add label
make features...
merge...
data shape：(20049, 186)
spending 80.9590117931366s
data key：('2017-04-02', 21)
add label
make features...
merge...
data shape：(14999, 186)
spending 81.22624182701111s
data key：('2017-04-09', 14)
add label
make features...
merge...
data shape：(10008, 186)
spending 86.13845014572144s
data key：('2017-04-16', 7)
add label
make features...
merge...
data shape：(5012, 186)
spending 88.1272759437561s


In [68]:
test_feat = make_feats(date_add_days(start_date, 42), 39)

data key：('2017-04-23', 39)
add label
make features...


is deprecated and will be removed in a future version
  # This is added back by InteractiveShellApp.init_path()
is deprecated and will be removed in a future version
  if sys.path[0] == '':
is deprecated and will be removed in a future version
  del sys.path[0]
is deprecated and will be removed in a future version
  app.launch_new_instance()
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
  


merge...
data shape：(32019, 186)
spending 87.44816899299622s


In [69]:
predictors = [f for f in test_feat.columns if f not in ['id', 'store_id', 'visit_date', 'end_date', 'air_area_name', 'visitors', 'month']]

In [70]:
params = {
    'learning_rate': .02,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'sub_feature': .7,
    'num_leaves': 60,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

In [71]:
t0 = time.time()

In [72]:
lgb_train = lgb.Dataset(train_feat[predictors], train_feat['visitors'])

In [73]:
lgb_test = lgb.Dataset(test_feat[predictors], test_feat['visitors'])

In [74]:
gbm = lgb.train(params, lgb_train, 2300)

In [75]:
pred = gbm.predict(test_feat[predictors])

In [76]:
print('Training time: {} secondes'.format(time.time() - t0))

Training time: 1021.3801219463348 secondes


In [77]:
subm = pd.DataFrame({'id': test_feat.store_id + '_' + test_feat.visit_date, 'visitors': np.expm1(pred)})

In [78]:
subm = submission[['id']].merge(subm, on='id', how='left').fillna(0)

In [79]:
subm.to_csv(r'..\sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4f')