### setup

In [1]:
import argparse
import pickle

import numpy as np; np.seterr(invalid='ignore')
import pandas as pd

In [2]:
# Run configuration, exposed to the rest of the notebook as attributes of `args`.
parser = dict(
    # Positional index of the last column of `full` that goes into `train`.
    offset=793,
    # Number of columns following `offset` that are melted into `test`.
    val_len=74,
    # Used to name the output directory under ../intermediate/.
    seed=20170911,
    # Trailing-window lengths (in columns) for the per-page medians.
    windows=[7, 14, 21, 35, 56, 91, 147, 238, 385, 623],
    # Inclusive date range for which placeholder columns are added to `full`.
    forecast_start='2017-09-01',
    forecast_end='2017-11-13',
)
args = argparse.Namespace(**parser)

In [3]:
def smape(y_pred, y_true):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))`` after
    rounding the predictions to the nearest integer.

    Parameters
    ----------
    y_pred : array_like
        Predicted values; rounded with ``np.around`` before scoring.
    y_true : array_like
        Observed values, broadcastable against ``y_pred``.

    Returns
    -------
    float
        The score. Pairs where both values are zero contribute 0, and pairs
        containing NaN are ignored by the mean.
    """
    y_pred = np.around(np.asarray(y_pred, dtype=float))
    y_true = np.asarray(y_true, dtype=float)

    # Use magnitudes so that opposite-sign pairs (e.g. -1 vs 1) are not
    # mistaken for the "both zero" case and scored as a perfect match.
    denominator = np.abs(y_true) + np.abs(y_pred)
    abs_error = np.abs(y_true - y_pred)

    # 0/0 is expected for all-zero pairs; silence it locally instead of
    # depending on a global np.seterr() call made elsewhere.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = np.where(denominator == 0, 0.0, abs_error / denominator)

    return 200 * np.nanmean(ratio)

In [4]:
print("Getting data...")
full = pd.read_csv('../data/wttsf/train_2.csv')

# Fill gaps along each row: carry the previous value forward, then back-fill
# whatever is still missing at the start of the row. Column 0 is skipped
# (presumably the 'Page' identifier -- it is the column used as id later on).
# DataFrame.ffill/bfill replace the deprecated fillna(method=...) spelling.
full.iloc[:, 1:] = full.iloc[:, 1:].ffill(axis=1).bfill(axis=1)

# One zero-filled placeholder column per forecast day, labelled YYYY-MM-DD.
# A label that already exists in the file is overwritten with 0 in place;
# new labels are appended in date order. A single assign() avoids inserting
# columns one at a time, which fragments the frame.
datetime_list = pd.date_range(args.forecast_start, args.forecast_end)
full = full.assign(**{day.date().isoformat(): 0 for day in datetime_list})

print("Constructing test set...")
# Columns offset+1 .. offset+val_len of `full` form the horizon to predict.
first_col = args.offset + 1
last_col = args.offset + args.val_len
horizon_cols = list(full.columns[first_col:last_col + 1])

# Long format: one row per (Page, Date) with the column value as "Visits".
test = pd.melt(
    full[horizon_cols + ['Page']],
    id_vars='Page',
    var_name='Date',
    value_name="Visits",
)
test['Date'] = test['Date'].astype('datetime64[ns]')
# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
test['Weekend'] = test['Date'].dt.dayofweek >= 5

print("Constructing train set...")
# Column 0 plus the first `offset` value columns.
train = full.iloc[:, :args.offset+1]

print("Getting medians...")
for i in args.windows:
    print(i, end=' ')
    val = 'MW'+str(i)
    # Long format of the last `i` columns of `train`, one row per (Page, Date).
    window_long = pd.melt(train[list(train.columns[-i:])+['Page']],
                          id_vars='Page', var_name='Date', value_name=val)
    window_long['Date'] = window_long['Date'].astype('datetime64[ns]')
    window_long['Weekend'] = window_long['Date'].dt.dayofweek >= 5
    # Aggregate ONLY the value column. A frame-wide median() also aggregates
    # 'Date' on pandas versions where numeric_only defaults to False; the
    # key-less merge below would then join on 'Date' too and match nothing.
    window_medians = (window_long.groupby(['Page', 'Weekend'])[val]
                      .median()
                      .reset_index())
    # Explicit keys: each test row receives the median for its page and its
    # weekday/weekend class.
    test = test.merge(window_medians, how='left', on=['Page', 'Weekend'])
print("\n")

print("Getting median of medians...")
# Row-wise median across the window medians. "MW7" appears twice, which makes
# the column count odd and counts the shortest window double -- presumably
# intentional; confirm before "deduplicating" it.
median_columns = [
    "MW7", "MW7", "MW14", "MW21", "MW35", "MW56",
    "MW91", "MW147", "MW238", "MW385", "MW623",
]
test['Predict'] = test[median_columns].median(axis=1)
# Scoring is left disabled here:
#print("Result: ", smape(test['Predict'].values, test['Visits'].values))

Getting data...
Constructing test set...
Constructing train set...
Getting medians...
7 14 21 35 56 91 147 238 385 623 

Getting median of medians...


In [5]:
# Build the Page x Date prediction matrix BEFORE opening the output file, so a
# failure in the pivot/reorder cannot leave a truncated pickle on disk.
# Rows are reordered to follow the page order of `full`.
predict_df = test.pivot(
    index='Page', columns='Date', values='Predict').loc[full["Page"]]

# Only the raw value array is stored; page and date labels are not written.
with open("../intermediate/{}/pred_fib.pkl".format(args.seed), "wb") as f:
    pickle.dump(predict_df.values, f)

In [7]:
# Preview the first five rows of the prediction matrix.
predict_df.head(n=5)

Date,2017-09-01 00:00:00,2017-09-02 00:00:00,2017-09-03 00:00:00,2017-09-04 00:00:00,2017-09-05 00:00:00,2017-09-06 00:00:00,2017-09-07 00:00:00,2017-09-08 00:00:00,2017-09-09 00:00:00,2017-09-10 00:00:00,...,2017-11-04 00:00:00,2017-11-05 00:00:00,2017-11-06 00:00:00,2017-11-07 00:00:00,2017-11-08 00:00:00,2017-11-09 00:00:00,2017-11-10 00:00:00,2017-11-11 00:00:00,2017-11-12 00:00:00,2017-11-13 00:00:00
Page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2NE1_zh.wikipedia.org_all-access_spider,23.0,22.5,22.5,23.0,23.0,23.0,23.0,23.0,22.5,22.5,...,22.5,22.5,23.0,23.0,23.0,23.0,23.0,22.5,22.5,23.0
2PM_zh.wikipedia.org_all-access_spider,21.0,21.5,21.5,21.0,21.0,21.0,21.0,21.0,21.5,21.5,...,21.5,21.5,21.0,21.0,21.0,21.0,21.0,21.5,21.5,21.0
3C_zh.wikipedia.org_all-access_spider,4.0,3.5,3.5,4.0,4.0,4.0,4.0,4.0,3.5,3.5,...,3.5,3.5,4.0,4.0,4.0,4.0,4.0,3.5,3.5,4.0
4minute_zh.wikipedia.org_all-access_spider,16.0,15.0,15.0,16.0,16.0,16.0,16.0,16.0,15.0,15.0,...,15.0,15.0,16.0,16.0,16.0,16.0,16.0,15.0,15.0,16.0
52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,15.0,16.0,16.0,15.0,15.0,15.0,15.0,15.0,16.0,16.0,...,16.0,16.0,15.0,15.0,15.0,15.0,15.0,16.0,16.0,15.0
