In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [1]:
import pandas as pd
import numpy as np

air_reserve = pd.read_csv('./input_data/air_reserve.csv')
air_store_info = pd.read_csv('./input_data/air_store_info.csv')
air_visit_data = pd.read_csv('./input_data/air_visit_data.csv')

hpg_reserve = pd.read_csv('./input_data/hpg_reserve.csv')
hpg_store_info = pd.read_csv('./input_data/hpg_store_info.csv')

date_info = pd.read_csv('./input_data/date_info.csv')
store_id_relation = pd.read_csv('./input_data/store_id_relation.csv')

In [9]:
air_reserve['visit_date'] = air_reserve['visit_datetime'].map(lambda date: str(date)[0:10])
air_reserve['reserve_date'] = air_reserve['reserve_datetime'].map(lambda date: str(date)[0:10])
air_reserve['reserve_delay'] = (pd.to_datetime(air_reserve['visit_datetime']) - pd.to_datetime(air_reserve['reserve_datetime']))
air_reserve['reserve_delay_days'] = air_reserve['reserve_delay'].map(lambda timedelta: timedelta.days)

air_total_reservations = air_reserve. \
                         groupby(['air_store_id', 'visit_date'], as_index=False)['reserve_visitors']. \
                         sum()
air_mean_delay = air_reserve. \
                 groupby(['air_store_id', 'visit_date'], as_index=False)['reserve_delay_days']. \
                 mean()
air_df = air_visit_data.merge(air_total_reservations, on=['air_store_id', 'visit_date'], how='left')

air_df = air_df.merge(air_mean_delay, on=['air_store_id', 'visit_date'], how='left')
air_df = air_df.merge(air_store_info, on=['air_store_id'], how='left')
air_df = air_df.sort_values(['air_store_id', 'visit_date'])
air_df = air_df.fillna(0.0)
print(air_df.shape)
air_df.head(5)
#air_df.to_csv('./air_df.csv', index=False)

(252108, 9)


In [10]:
hpg_reserve['visit_date'] = hpg_reserve['visit_datetime'].map(lambda date: str(date)[0:10])
hpg_reserve['reserve_date'] = hpg_reserve['reserve_datetime'].map(lambda date: str(date)[0:10])
hpg_reserve['reserve_delay'] = (pd.to_datetime(hpg_reserve['visit_datetime']) - pd.to_datetime(hpg_reserve['reserve_datetime']))
hpg_reserve['reserve_delay_days'] = hpg_reserve['reserve_delay'].map(lambda timedelta: timedelta.days)

hpg_mean_delay = hpg_reserve.groupby(['hpg_store_id', 'visit_date'], as_index=False)['reserve_delay_days'].mean()
hpg_total_reservations = hpg_reserve.groupby(['hpg_store_id', 'visit_date'], as_index=False)['reserve_visitors'].sum()

hpg_df = hpg_total_reservations.merge(hpg_store_info, on = 'hpg_store_id', how = 'inner')
hpg_df = hpg_df.merge(hpg_mean_delay, on = ['hpg_store_id', 'visit_date'], how = 'left')
hpg_df = hpg_df.sort_values(['hpg_store_id', 'visit_date'])
print(hpg_df.shape)
hpg_df.head(5)
#hpg_df.to_csv('./hpg_df.csv', index=False)

(561109, 8)


In [4]:
df_sample_submission = pd.read_csv('./sample_submission.csv')
df_sample_submission['store_id'] = df_sample_submission.id.str[0:20]
air_store_ids = df_sample_submission['store_id'].unique()

In [54]:
from statsmodels.tsa.arima_model import ARIMA
from tqdm import tqdm
import datetime

import matplotlib.pyplot as plt
%matplotlib inline


training_start = datetime.date(2016, 1, 1)
training_end = datetime.date(2017, 3, 31)
test_start = datetime.date(2017, 4, 1)
test_end = datetime.date(2017, 4, 22)


def makePredictedActualPlots(df_accuracy, air_store_ids):

    for store_id in [air_store_ids[0]]:
        df = df_accuracy[df_accuracy['air_store_id'] == store_id]
        pred = df['visitors_x']
        actual = df['visitors_y']
        dates = df['visit_date']
        plt.figure()
        plt.title(store_id)
        plt.plot(dates, pred, label='predicted')
        plt.plot(dates, actual, label='actual')
        plt.legend()


def getDataFrameByDates(df, start, end):
    return df.loc[(df['visit_date'] >= start.strftime('%Y-%m-%d')) & (df['visit_date'] <= end.strftime('%Y-%m-%d'))]

train_df = getDataFrameByDates(air_df, training_start, training_end)
test_df = getDataFrameByDates(air_df, test_start, test_end)

def predictDates(input_df, test_start, air_store_ids, days_to_predict = 39, field = 'visitors'):
 
    df_predicted = pd.DataFrame({'id': [], 'air_store_id':[], 'visit_date':[], 'visitors': [] })
    prediction_dates = [(test_start + datetime.timedelta(days=x)).strftime('%Y-%m-%d') \
                        for x in range(0, days_to_predict)]

    for store_id in tqdm(air_store_ids):
        df = input_df[input_df['air_store_id'] == store_id]
        dates = list(df['visit_date'])
        visitors = [float(i) for i in list(df[field])]
        model = ARIMA(endog = visitors, order=(4, 1, 0))
        model_fit = model.fit(disp=0)
        forecast_visits = model_fit.forecast(steps = days_to_predict)
        forecast_visits = forecast_visits[0]
        store_predictions = ["{}_{}".format(a_, b_) for a_, b_ in zip(list([store_id] * (days_to_predict + 1)), prediction_dates)]
  
        df_predicted_temp = pd.DataFrame({'id': store_predictions,
                                     'air_store_id': store_id,
                                     'visit_date': prediction_dates,
                                     'visitors': forecast_visits
                                    })
        df_predicted = df_predicted.append(df_predicted_temp)
    return df_predicted

#df_predicted = predictDates(train_df, test_start, air_store_ids, 7, field = 'visitors')

In [55]:
rmlse_dict = {}
for store_id in range(0, 10):
    store_df = train_df.loc[train_df['air_store_id'] == air_store_ids[store_id]]
    df_predicted = predictDates(store_df, test_start, [air_store_ids[store_id]], 7, field = 'visitors')
    df_accuracy = df_predicted.merge(test_df, on=['air_store_id', 'visit_date'], how = 'left')
    #makePredictedActualPlots(df_accuracy, [air_store_ids[store_id]])
    rmlse_acc = RMLSE(df_accuracy['visitors_x'], df_accuracy['visitors_y'])
    #print(str(store_id) + " : " + str(rmlse_acc))
    rmlse_dict[store_id] = rmlse_acc
np.mean(list(rmlse_dict.values()))

100%|██████████| 1/1 [00:00<00:00,  9.44it/s]
100%|██████████| 1/1 [00:00<00:00, 17.93it/s]
100%|██████████| 1/1 [00:00<00:00, 10.27it/s]
100%|██████████| 1/1 [00:00<00:00, 12.53it/s]
100%|██████████| 1/1 [00:00<00:00, 13.55it/s]
100%|██████████| 1/1 [00:00<00:00, 10.36it/s]
100%|██████████| 1/1 [00:00<00:00, 10.40it/s]
100%|██████████| 1/1 [00:00<00:00, 10.08it/s]
100%|██████████| 1/1 [00:00<00:00, 10.42it/s]
100%|██████████| 1/1 [00:00<00:00, 11.32it/s]


0.53282976138757554

In [34]:
def RMLSE(pred, actual):
    n = len(pred)
    return np.sqrt( (1/n) * np.sum( (np.log1p(pred) - np.log1p(actual) )**2 ) )

0.53217726247098052