In [1]:
# import packages

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, concatenate
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K

from tqdm import tqdm
import gc
import warnings

warnings.filterwarnings('ignore')

In [2]:
# define metric of evaluation

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, returned as a fraction.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Actual and predicted values; they must broadcast against each other.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
        Terms whose denominator is zero count as 0 and NaN terms are skipped
        (``np.nanmean``).
    """
    # Coerce up front so lists, tuples and scalars work, not only ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # Pre-fill with zeros and divide only where the denominator is non-zero:
    # this gives the "0/0 counts as 0" rule without a divide warning and keeps
    # the dtype the plain division would have produced.
    diff = np.zeros(np.broadcast(numerator, denominator).shape,
                    dtype=np.result_type(numerator, denominator))
    np.divide(numerator, denominator, out=diff, where=denominator != 0)
    return np.nanmean(diff)

def smape2D(y_true, y_pred):
    """Flatten both inputs with ``np.ravel`` and score them with ``smape``."""
    flat_true = np.ravel(y_true)
    flat_pred = np.ravel(y_pred)
    return smape(flat_true, flat_pred)

In [3]:
# define variables to be used

# max_size: how many trailing daily columns are kept for each training window
# (original note: number of days in 2015 with 3 days before end)
max_size = 181 # number of days in 2015 with 3 days before end
# offset: factor multiplied into the raw visit counts when they are loaded and
# divided back out of the predictions before they are written
offset = 1/2

In [4]:
# read raw train data

train_all = pd.read_csv('data/web-traffic-forecasting-train.csv')
train_all.head() # date columns run from 2015-07-01 to 2017-09-10, i.e. 803 daily columns

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,19.0,33.0,33.0,18.0,16.0,27.0,29.0,23.0,54.0,38.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,32.0,30.0,11.0,19.0,54.0,25.0,26.0,23.0,13.0,81.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,6.0,6.0,7.0,2.0,4.0,7.0,3.0,4.0,7.0,6.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,7.0,19.0,19.0,9.0,6.0,16.0,19.0,30.0,38.0,4.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,,,,,,,,,,...,16.0,16.0,19.0,9.0,20.0,23.0,28.0,14.0,8.0,7.0


In [5]:
# extract the 'Page' from raw train data for further preparation

# keep the page identifiers on their own, then leave only the scaled daily counts
all_page = train_all['Page'].copy()
train_key = train_all.loc[:, ['Page']].copy()
# column 0 is 'Page'; everything after it is a date column
train_all = train_all.iloc[:, 1:] * offset
train_all.head()

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
0,9.0,5.5,2.5,6.5,7.0,4.5,4.5,11.0,13.0,12.0,...,9.5,16.5,16.5,9.0,8.0,13.5,14.5,11.5,27.0,19.0
1,5.5,7.0,7.5,9.0,5.5,6.5,11.0,5.5,5.0,2.0,...,16.0,15.0,5.5,9.5,27.0,12.5,13.0,11.5,6.5,40.5
2,0.5,0.0,0.5,0.5,0.0,2.0,0.0,1.5,2.0,2.0,...,3.0,3.0,3.5,1.0,2.0,3.5,1.5,2.0,3.5,3.0
3,17.5,6.5,5.0,47.0,2.0,13.0,7.0,4.5,5.5,8.0,...,3.5,9.5,9.5,4.5,3.0,8.0,9.5,15.0,19.0,2.0
4,,,,,,,,,,,...,8.0,8.0,9.5,4.5,10.0,11.5,14.0,7.0,4.0,3.5


In [6]:
# define a function to get the column index of a given date

def get_date_index(date, train_all=train_all):
    """Return the positional index of the column named ``date``.

    Parameters
    ----------
    date : str
        Column label to look for (the date columns are named 'YYYY-MM-DD').
    train_all : DataFrame
        Frame whose columns are searched. The default is bound when this cell
        runs, i.e. to whatever ``train_all`` is at definition time.

    Returns
    -------
    int or None
        Index of the first matching column, or None when no column matches
        (including when the frame has no columns).
    """
    for idx, c in enumerate(train_all.columns):
        if date == c:
            return idx
    # No match: report it explicitly instead of falling through with the
    # index of the last column.
    return None

In [7]:
# make new train_all, train, and test dataset from raw train data

# train_end is exclusive (one past 2016-09-10); the test window opens on 2016-09-13
train_end = 1 + get_date_index('2016-09-10')
test_start = get_date_index('2016-09-13')
print('New train dataset ends at index:', train_end)
print('New test dataset starts at index:', test_start)

# last `max_size` days up to train_end, reversed so column 0 is the most recent day
train = (
    train_all
    .iloc[:, (train_end - max_size):train_end]
    .iloc[:, ::-1]
    .astype('float32')
)
# 63 consecutive days starting at test_start, kept in chronological order
test = train_all.iloc[:, test_start:(test_start + 63)].astype('float32')

# same construction on the very last `max_size` days of the full history
train_all = (
    train_all
    .iloc[:, -max_size:]
    .iloc[:, ::-1]
    .astype('float32')
)

New train dataset ends at index: 438
New test dataset starts at index: 440


In [8]:
train_all.head()

Unnamed: 0,2017-09-10,2017-09-09,2017-09-08,2017-09-07,2017-09-06,2017-09-05,2017-09-04,2017-09-03,2017-09-02,2017-09-01,...,2017-03-23,2017-03-22,2017-03-21,2017-03-20,2017-03-19,2017-03-18,2017-03-17,2017-03-16,2017-03-15,2017-03-14
0,19.0,27.0,11.5,14.5,13.5,8.0,9.0,16.5,16.5,9.5,...,22.0,11.5,11.0,14.0,13.0,15.0,14.0,10.5,9.5,12.0
1,40.5,6.5,11.5,13.0,12.5,27.0,9.5,5.5,15.0,16.0,...,15.5,27.0,13.5,16.5,14.0,14.0,18.0,26.5,22.5,12.0
2,3.0,3.5,2.0,1.5,3.5,2.0,1.0,3.5,3.0,3.0,...,1.5,0.5,4.0,4.0,4.0,3.0,1.5,4.0,1.5,0.5
3,2.0,19.0,15.0,9.5,8.0,3.0,4.5,9.5,9.5,3.5,...,7.5,12.0,12.5,12.5,12.0,13.5,12.0,14.5,7.5,9.5
4,3.5,4.0,7.0,14.0,11.5,10.0,4.5,9.5,8.0,8.0,...,7.0,7.0,10.5,10.0,7.0,11.0,9.5,12.0,6.5,12.0


In [9]:
train.head()

Unnamed: 0,2016-09-10,2016-09-09,2016-09-08,2016-09-07,2016-09-06,2016-09-05,2016-09-04,2016-09-03,2016-09-02,2016-09-01,...,2016-03-23,2016-03-22,2016-03-21,2016-03-20,2016-03-19,2016-03-18,2016-03-17,2016-03-16,2016-03-15,2016-03-14
0,8.0,27.5,9.0,9.5,9.0,11.5,9.5,12.5,10.0,10.5,...,5.5,6.0,5.5,6.5,9.0,3.0,8.5,7.0,5.0,4.5
1,6.5,6.5,17.5,21.5,22.0,9.5,39.0,33.0,8.5,15.5,...,8.5,6.0,8.5,7.5,8.5,23.0,9.0,11.0,8.0,18.0
2,9.5,2.5,5.0,5.5,3.0,3.5,1.0,2.5,3.5,2.5,...,5.0,0.5,0.5,3.5,2.0,1.5,1.0,1.0,2.0,1.0
3,4.5,5.5,6.5,7.0,3.5,5.5,8.0,4.5,7.0,7.0,...,5.0,6.5,5.5,8.5,14.0,4.5,5.5,4.0,6.0,5.0
4,2.5,2.5,2.5,2.5,0.5,1.5,2.0,19.5,1.5,2.0,...,,,,,,,,,,


In [10]:
test.head()

Unnamed: 0,2016-09-13,2016-09-14,2016-09-15,2016-09-16,2016-09-17,2016-09-18,2016-09-19,2016-09-20,2016-09-21,2016-09-22,...,2016-11-05,2016-11-06,2016-11-07,2016-11-08,2016-11-09,2016-11-10,2016-11-11,2016-11-12,2016-11-13,2016-11-14
0,5.5,6.5,10.0,10.5,6.5,12.0,10.0,6.5,16.0,8.0,...,10.0,4.0,33.5,6.5,20.5,5.0,10.5,6.5,4.0,7.5
1,18.5,19.0,11.0,14.0,9.5,23.0,12.0,11.0,21.5,29.0,...,5.5,10.5,7.0,7.0,27.0,2.5,5.0,6.0,5.5,7.0
2,2.0,5.0,1.5,2.0,3.0,1.5,2.0,4.0,5.0,1.5,...,1.0,5.0,1.0,1.0,1.0,3.5,1.5,3.0,2.0,1.0
3,5.5,7.5,14.0,5.0,12.0,4.0,10.0,9.5,6.0,15.5,...,3.5,6.0,6.5,4.5,4.0,10.5,8.0,19.0,6.5,7.0
4,6.5,31.5,1.0,1.0,1.5,3.0,5.0,1.0,4.0,2.0,...,6.5,5.0,11.0,5.5,4.0,2.0,5.0,6.5,5.5,4.0


In [11]:
# separate 'Page' from raw train data into several new features, incorporate them into train_key

# a Page value reads '<title>_<site>_<access>_<agent>'; the title itself may contain '_',
# so the pieces are taken from the right-hand end
data = [name.split('_') for name in tqdm(train_key.Page)]
page = ['_'.join(parts[:-3]) for parts in data]
site = [parts[-3] for parts in data]
access = ['_'.join(parts[-2:]) for parts in data]
train_key['PageTitle'] = page
train_key['Site'] = site
train_key['AccessAgent'] = access
train_key.head()

100%|██████████████████████████████████████████████████████████████████████| 145063/145063 [00:00<00:00, 704127.89it/s]


Unnamed: 0,Page,PageTitle,Site,AccessAgent
0,2NE1_zh.wikipedia.org_all-access_spider,2NE1,zh.wikipedia.org,all-access_spider
1,2PM_zh.wikipedia.org_all-access_spider,2PM,zh.wikipedia.org,all-access_spider
2,3C_zh.wikipedia.org_all-access_spider,3C,zh.wikipedia.org,all-access_spider
3,4minute_zh.wikipedia.org_all-access_spider,4minute,zh.wikipedia.org,all-access_spider
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,52_Hz_I_Love_You,zh.wikipedia.org,all-access_spider


In [12]:
# transform train_all dataset

# log(1 + x) of every scaled count, stored as float32; NaN entries stay NaN
train_all_norm = np.log1p(train_all).astype('float32')
train_all_norm.head()

Unnamed: 0,2017-09-10,2017-09-09,2017-09-08,2017-09-07,2017-09-06,2017-09-05,2017-09-04,2017-09-03,2017-09-02,2017-09-01,...,2017-03-23,2017-03-22,2017-03-21,2017-03-20,2017-03-19,2017-03-18,2017-03-17,2017-03-16,2017-03-15,2017-03-14
0,2.995732,3.332205,2.525729,2.74084,2.674149,2.197225,2.302585,2.862201,2.862201,2.351375,...,3.135494,2.525729,2.484907,2.70805,2.639057,2.772589,2.70805,2.442347,2.351375,2.564949
1,3.725693,2.014903,2.525729,2.639057,2.60269,3.332205,2.351375,1.871802,2.772589,2.833213,...,2.80336,3.332205,2.674149,2.862201,2.70805,2.70805,2.944439,3.314186,3.157,2.564949
2,1.386294,1.504077,1.098612,0.916291,1.504077,1.098612,0.693147,1.504077,1.386294,1.386294,...,0.916291,0.405465,1.609438,1.609438,1.609438,1.386294,0.916291,1.609438,0.916291,0.405465
3,1.098612,2.995732,2.772589,2.351375,2.197225,1.386294,1.704748,2.351375,2.351375,1.504077,...,2.140066,2.564949,2.60269,2.60269,2.564949,2.674149,2.564949,2.74084,2.140066,2.351375
4,1.504077,1.609438,2.079442,2.70805,2.525729,2.397895,1.704748,2.351375,2.197225,2.197225,...,2.079442,2.079442,2.442347,2.397895,2.079442,2.484907,2.351375,2.564949,2.014903,2.564949


In [13]:
# transform train dataset

# log(1 + x) of every scaled count, stored as float32; NaN entries stay NaN
train_norm = np.log1p(train).astype('float32')
train_norm.head()

Unnamed: 0,2016-09-10,2016-09-09,2016-09-08,2016-09-07,2016-09-06,2016-09-05,2016-09-04,2016-09-03,2016-09-02,2016-09-01,...,2016-03-23,2016-03-22,2016-03-21,2016-03-20,2016-03-19,2016-03-18,2016-03-17,2016-03-16,2016-03-15,2016-03-14
0,2.197225,3.349904,2.302585,2.351375,2.302585,2.525729,2.351375,2.60269,2.397895,2.442347,...,1.871802,1.94591,1.871802,2.014903,2.302585,1.386294,2.251292,2.079442,1.791759,1.704748
1,2.014903,2.014903,2.917771,3.113515,3.135494,2.351375,3.688879,3.526361,2.251292,2.80336,...,2.251292,1.94591,2.251292,2.140066,2.251292,3.178054,2.302585,2.484907,2.197225,2.944439
2,2.351375,1.252763,1.791759,1.871802,1.386294,1.504077,0.693147,1.252763,1.504077,1.252763,...,1.791759,0.405465,0.405465,1.504077,1.098612,0.916291,0.693147,0.693147,1.098612,0.693147
3,1.704748,1.871802,2.014903,2.079442,1.504077,1.871802,2.197225,1.704748,2.079442,2.079442,...,1.791759,2.014903,1.871802,2.251292,2.70805,1.704748,1.871802,1.609438,1.94591,1.791759
4,1.252763,1.252763,1.252763,1.252763,0.405465,0.916291,1.098612,3.020425,0.916291,1.098612,...,,,,,,,,,,


In [14]:
# edit column names of test dataset

# columns become 'w<week>_d<weekday>': week counts blocks of 7 from the first test day,
# weekday is (first_day + position) mod 7
first_day = 1 # 2016-09-13 is a Tuesday
test.columns = [f'w{i // 7}_d{(first_day + i) % 7}' for i in range(63)]
test.head()

Unnamed: 0,w0_d1,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w1_d1,w1_d2,w1_d3,...,w7_d5,w7_d6,w7_d0,w8_d1,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0
0,5.5,6.5,10.0,10.5,6.5,12.0,10.0,6.5,16.0,8.0,...,10.0,4.0,33.5,6.5,20.5,5.0,10.5,6.5,4.0,7.5
1,18.5,19.0,11.0,14.0,9.5,23.0,12.0,11.0,21.5,29.0,...,5.5,10.5,7.0,7.0,27.0,2.5,5.0,6.0,5.5,7.0
2,2.0,5.0,1.5,2.0,3.0,1.5,2.0,4.0,5.0,1.5,...,1.0,5.0,1.0,1.0,1.0,3.5,1.5,3.0,2.0,1.0
3,5.5,7.5,14.0,5.0,12.0,4.0,10.0,9.5,6.0,15.5,...,3.5,6.0,6.5,4.5,4.0,10.5,8.0,19.0,6.5,7.0
4,6.5,31.5,1.0,1.0,1.5,3.0,5.0,1.0,4.0,2.0,...,6.5,5.0,11.0,5.5,4.0,2.0,5.0,6.5,5.5,4.0


In [15]:
# add 'Page' from raw train data to test dataset

# missing visit counts become 0, then rows are ordered by page name with a fresh 0..n-1 index
test = test.fillna(0)
test['Page'] = all_page
test = test.sort_values(by='Page').reset_index(drop=True)
test.head()

Unnamed: 0,w0_d1,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w1_d1,w1_d2,w1_d3,...,w7_d6,w7_d0,w8_d1,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0,Page
0,1.5,1.0,2.5,0.5,1.5,2.5,1.0,2.5,0.0,1.5,...,2.5,3.0,3.0,2.0,2.0,2.0,1.0,1.0,2.0,!vote_en.wikipedia.org_all-access_all-agents
1,1.5,1.0,1.0,0.0,1.0,1.5,0.5,1.5,0.0,0.0,...,2.5,2.5,1.5,2.0,1.0,1.0,0.0,1.0,1.5,!vote_en.wikipedia.org_all-access_spider
2,1.5,1.0,1.5,0.5,1.5,2.0,1.0,2.0,0.0,1.5,...,2.0,3.0,3.0,2.0,2.0,1.5,1.0,0.5,2.0,!vote_en.wikipedia.org_desktop_all-agents
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,659.5,"""Awaken,_My_Love!""_en.wikipedia.org_all-access..."
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,"""Awaken,_My_Love!""_en.wikipedia.org_all-access..."


In [16]:
# merge test dataset with train_key

# left join keeps every row of test and brings in PageTitle / Site / AccessAgent by 'Page'
test = test.merge(train_key, how='left', on='Page', copy=False)
test.head()

Unnamed: 0,w0_d1,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w1_d1,w1_d2,w1_d3,...,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0,Page,PageTitle,Site,AccessAgent
0,1.5,1.0,2.5,0.5,1.5,2.5,1.0,2.5,0.0,1.5,...,2.0,2.0,2.0,1.0,1.0,2.0,!vote_en.wikipedia.org_all-access_all-agents,!vote,en.wikipedia.org,all-access_all-agents
1,1.5,1.0,1.0,0.0,1.0,1.5,0.5,1.5,0.0,0.0,...,2.0,1.0,1.0,0.0,1.0,1.5,!vote_en.wikipedia.org_all-access_spider,!vote,en.wikipedia.org,all-access_spider
2,1.5,1.0,1.5,0.5,1.5,2.0,1.0,2.0,0.0,1.5,...,2.0,2.0,1.5,1.0,0.5,2.0,!vote_en.wikipedia.org_desktop_all-agents,!vote,en.wikipedia.org,desktop_all-agents
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,659.5,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...","""Awaken,_My_Love!""",en.wikipedia.org,all-access_all-agents
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,34.0,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...","""Awaken,_My_Love!""",en.wikipedia.org,all-access_spider


In [17]:
# save the date columns of test dataset for further preparation

# the first 63 columns of test are the 'w<week>_d<weekday>' target columns
y_cols = test.columns[:63]
y_cols

Index(['w0_d1', 'w0_d2', 'w0_d3', 'w0_d4', 'w0_d5', 'w0_d6', 'w0_d0', 'w1_d1',
       'w1_d2', 'w1_d3', 'w1_d4', 'w1_d5', 'w1_d6', 'w1_d0', 'w2_d1', 'w2_d2',
       'w2_d3', 'w2_d4', 'w2_d5', 'w2_d6', 'w2_d0', 'w3_d1', 'w3_d2', 'w3_d3',
       'w3_d4', 'w3_d5', 'w3_d6', 'w3_d0', 'w4_d1', 'w4_d2', 'w4_d3', 'w4_d4',
       'w4_d5', 'w4_d6', 'w4_d0', 'w5_d1', 'w5_d2', 'w5_d3', 'w5_d4', 'w5_d5',
       'w5_d6', 'w5_d0', 'w6_d1', 'w6_d2', 'w6_d3', 'w6_d4', 'w6_d5', 'w6_d6',
       'w6_d0', 'w7_d1', 'w7_d2', 'w7_d3', 'w7_d4', 'w7_d5', 'w7_d6', 'w7_d0',
       'w8_d1', 'w8_d2', 'w8_d3', 'w8_d4', 'w8_d5', 'w8_d6', 'w8_d0'],
      dtype='object')

In [18]:
# read key dataset to test_all_id

# key file: one row per (page, date) pair with its submission Id
test_all_id = pd.read_csv('data/web-traffic-forecasting-key.csv')
test_all_id.head()

Unnamed: 0,Page,Id
0,007_スペクター_ja.wikipedia.org_all-access_all-agen...,0b293039387a
1,007_スペクター_ja.wikipedia.org_all-access_all-agen...,7114389dd824
2,007_スペクター_ja.wikipedia.org_all-access_all-agen...,057b02ff1f09
3,007_スペクター_ja.wikipedia.org_all-access_all-agen...,bd2aca21caa3
4,007_スペクター_ja.wikipedia.org_all-access_all-agen...,c0effb42cdd5


In [19]:
# separate 'Date' from 'Page' in test_all_id 

# every key ends in '_YYYY-MM-DD': read the 10-character date first, and only then
# trim those 10 characters plus the joining underscore off the page name
test_all_id['Date'] = [key[-10:] for key in tqdm(test_all_id.Page)]
test_all_id['Page'] = [key[:-11] for key in tqdm(test_all_id.Page)]
test_all_id.head()

100%|███████████████████████████████████████████████████████████████████| 8993906/8993906 [00:02<00:00, 3117469.96it/s]
100%|███████████████████████████████████████████████████████████████████| 8993906/8993906 [00:02<00:00, 3019101.57it/s]


Unnamed: 0,Page,Id,Date
0,007_スペクター_ja.wikipedia.org_all-access_all-agents,0b293039387a,2017-09-13
1,007_スペクター_ja.wikipedia.org_all-access_all-agents,7114389dd824,2017-09-14
2,007_スペクター_ja.wikipedia.org_all-access_all-agents,057b02ff1f09,2017-09-15
3,007_スペクター_ja.wikipedia.org_all-access_all-agents,bd2aca21caa3,2017-09-16
4,007_スペクター_ja.wikipedia.org_all-access_all-agents,c0effb42cdd5,2017-09-17


In [20]:
# make test_all by dropping 'Id' from test_all_id, then add 'Visits_true' to test_all

# Build an all-NaN placeholder frame with one row per page and one column per
# date in the key file. `np.nan` is used because the `np.NaN` alias no longer
# exists in NumPy 2.0. (Scaling the placeholder by `offset` was a no-op on NaN
# and is omitted.)
test_all = test_all_id.drop('Id', axis=1)
test_all['Visits_true'] = np.nan
test_all = (
    test_all
    .pivot(index='Page', columns='Date', values='Visits_true')
    .astype('float32')
    .reset_index()
)

# one extra day appended after the dates taken from the key file
test_all['2017-11-14'] = np.nan
test_all = test_all.sort_values(by='Page').reset_index(drop=True)

test_all.head()

Date,Page,2017-09-13,2017-09-14,2017-09-15,2017-09-16,2017-09-17,2017-09-18,2017-09-19,2017-09-20,2017-09-21,...,2017-11-05,2017-11-06,2017-11-07,2017-11-08,2017-11-09,2017-11-10,2017-11-11,2017-11-12,2017-11-13,2017-11-14
0,!vote_en.wikipedia.org_all-access_all-agents,,,,,,,,,,...,,,,,,,,,,
1,!vote_en.wikipedia.org_all-access_spider,,,,,,,,,,...,,,,,,,,,,
2,!vote_en.wikipedia.org_desktop_all-agents,,,,,,,,,,...,,,,,,,,,,
3,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...",,,,,,,,,,...,,,,,,,,,,
4,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...",,,,,,,,,,...,,,,,,,,,,


In [21]:
# save columns of test_all for further preparation

# every column after 'Page' is a date label; kept to restore the date names at submission time
test_all_columns_date = list(test_all.columns[1:])

In [22]:
# edit column names of test_all dataset

# same 'w<week>_d<weekday>' naming as for test, shifted by the weekday of the first forecast day
first_day = 2 # 2017-09-13 is a Wednesday
test_all_columns_code = [f'w{i // 7}_d{(first_day + i) % 7}' for i in range(63)]
cols = ['Page'] + test_all_columns_code
test_all.columns = cols
test_all.head()

Unnamed: 0,Page,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w0_d1,w1_d2,w1_d3,...,w7_d6,w7_d0,w7_d1,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0,w8_d1
0,!vote_en.wikipedia.org_all-access_all-agents,,,,,,,,,,...,,,,,,,,,,
1,!vote_en.wikipedia.org_all-access_spider,,,,,,,,,,...,,,,,,,,,,
2,!vote_en.wikipedia.org_desktop_all-agents,,,,,,,,,,...,,,,,,,,,,
3,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...",,,,,,,,,,...,,,,,,,,,,
4,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...",,,,,,,,,,...,,,,,,,,,,


In [23]:
# merge test_all dataset with train_key

# left join keeps every row of test_all and brings in PageTitle / Site / AccessAgent by 'Page'
test_all = test_all.merge(train_key, how='left', on='Page')
test_all.head()

Unnamed: 0,Page,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w0_d1,w1_d2,w1_d3,...,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0,w8_d1,PageTitle,Site,AccessAgent
0,!vote_en.wikipedia.org_all-access_all-agents,,,,,,,,,,...,,,,,,,,!vote,en.wikipedia.org,all-access_all-agents
1,!vote_en.wikipedia.org_all-access_spider,,,,,,,,,,...,,,,,,,,!vote,en.wikipedia.org,all-access_spider
2,!vote_en.wikipedia.org_desktop_all-agents,,,,,,,,,,...,,,,,,,,!vote,en.wikipedia.org,desktop_all-agents
3,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...",,,,,,,,,,...,,,,,,,,"""Awaken,_My_Love!""",en.wikipedia.org,all-access_all-agents
4,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...",,,,,,,,,,...,,,,,,,,"""Awaken,_My_Love!""",en.wikipedia.org,all-access_spider


In [24]:
# reset index of test dataset

# the old row index is kept as a new leading 'index' column
test = test.reset_index(drop=False)
print('Size of test dataset:', test.shape)
test.head()

Size of test dataset: (145063, 68)


Unnamed: 0,index,w0_d1,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w1_d1,w1_d2,...,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0,Page,PageTitle,Site,AccessAgent
0,0,1.5,1.0,2.5,0.5,1.5,2.5,1.0,2.5,0.0,...,2.0,2.0,2.0,1.0,1.0,2.0,!vote_en.wikipedia.org_all-access_all-agents,!vote,en.wikipedia.org,all-access_all-agents
1,1,1.5,1.0,1.0,0.0,1.0,1.5,0.5,1.5,0.0,...,2.0,1.0,1.0,0.0,1.0,1.5,!vote_en.wikipedia.org_all-access_spider,!vote,en.wikipedia.org,all-access_spider
2,2,1.5,1.0,1.5,0.5,1.5,2.0,1.0,2.0,0.0,...,2.0,2.0,1.5,1.0,0.5,2.0,!vote_en.wikipedia.org_desktop_all-agents,!vote,en.wikipedia.org,desktop_all-agents
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,659.5,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...","""Awaken,_My_Love!""",en.wikipedia.org,all-access_all-agents
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,34.0,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...","""Awaken,_My_Love!""",en.wikipedia.org,all-access_spider


In [25]:
# reset index of test_all dataset

# the old row index is kept as a new leading 'index' column
test_all = test_all.reset_index(drop=False)
print('Size of test_all dataset:', test_all.shape)
# put the columns in exactly the same order as test
test_all = test_all.loc[:, test.columns].copy()
test_all.head()

Size of test_all dataset: (145063, 68)


Unnamed: 0,index,w0_d1,w0_d2,w0_d3,w0_d4,w0_d5,w0_d6,w0_d0,w1_d1,w1_d2,...,w8_d2,w8_d3,w8_d4,w8_d5,w8_d6,w8_d0,Page,PageTitle,Site,AccessAgent
0,0,,,,,,,,,,...,,,,,,,!vote_en.wikipedia.org_all-access_all-agents,!vote,en.wikipedia.org,all-access_all-agents
1,1,,,,,,,,,,...,,,,,,,!vote_en.wikipedia.org_all-access_spider,!vote,en.wikipedia.org,all-access_spider
2,2,,,,,,,,,,...,,,,,,,!vote_en.wikipedia.org_desktop_all-agents,!vote,en.wikipedia.org,desktop_all-agents
3,3,,,,,,,,,,...,,,,,,,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...","""Awaken,_My_Love!""",en.wikipedia.org,all-access_all-agents
4,4,,,,,,,,,,...,,,,,,,"""Awaken,_My_Love!""_en.wikipedia.org_all-access...","""Awaken,_My_Love!""",en.wikipedia.org,all-access_spider


In [26]:
# check whether the 'Page' is the same for test and test_all

# row i of test and row i of test_all must describe the same page: the label
# arrays computed on test are later copied onto test_all purely by position
all(test[:test_all.shape[0]].Page == test_all.Page)

True

In [27]:
# edit column names of train_norm & train_all_norm

# positional names d_0, d_1, ...; the frames were column-reversed earlier, so d_0 is the latest day
train_cols = [f'd_{i}' for i in range(len(train_norm.columns))]
train_norm.columns = train_cols
train_all_norm.columns = train_cols
train_norm.head()

Unnamed: 0,d_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_171,d_172,d_173,d_174,d_175,d_176,d_177,d_178,d_179,d_180
0,2.197225,3.349904,2.302585,2.351375,2.302585,2.525729,2.351375,2.60269,2.397895,2.442347,...,1.871802,1.94591,1.871802,2.014903,2.302585,1.386294,2.251292,2.079442,1.791759,1.704748
1,2.014903,2.014903,2.917771,3.113515,3.135494,2.351375,3.688879,3.526361,2.251292,2.80336,...,2.251292,1.94591,2.251292,2.140066,2.251292,3.178054,2.302585,2.484907,2.197225,2.944439
2,2.351375,1.252763,1.791759,1.871802,1.386294,1.504077,0.693147,1.252763,1.504077,1.252763,...,1.791759,0.405465,0.405465,1.504077,1.098612,0.916291,0.693147,0.693147,1.098612,0.693147
3,1.704748,1.871802,2.014903,2.079442,1.504077,1.871802,2.197225,1.704748,2.079442,2.079442,...,1.791759,2.014903,1.871802,2.251292,2.70805,1.704748,1.871802,1.609438,1.94591,1.791759
4,1.252763,1.252763,1.252763,1.252763,0.405465,0.916291,1.098612,3.020425,0.916291,1.098612,...,,,,,,,,,,


In [28]:
# make a copy of all elements in train_key 'Site'

# distinct Site values in order of first appearance; reused as one-hot column names
sites = train_key.Site.unique()
sites

array(['zh.wikipedia.org', 'fr.wikipedia.org', 'en.wikipedia.org',
       'commons.wikimedia.org', 'ru.wikipedia.org', 'www.mediawiki.org',
       'de.wikipedia.org', 'ja.wikipedia.org', 'es.wikipedia.org'],
      dtype=object)

In [29]:
# encode 'Site' in test & test_all dataset

# integer code per Site value, computed on test and copied onto test_all by row position
test_site = pd.factorize(test.Site)[0]
test_all['Site_label'] = test_site[:len(test_all)]
test['Site_label'] = test_site

In [30]:
# make a copy of all elements in train_key 'AccessAgent'

# distinct AccessAgent values in order of first appearance; reused as one-hot column names
accesses = train_key.AccessAgent.unique()
accesses

array(['all-access_spider', 'desktop_all-agents', 'mobile-web_all-agents',
       'all-access_all-agents'], dtype=object)

In [31]:
# encode 'AccessAgent' in test & test_all datasets

# integer code per AccessAgent value, computed on test and copied onto test_all by row position
test_access = pd.factorize(test.AccessAgent)[0]
test_all['Access_label'] = test_access[:len(test_all)]
test['Access_label'] = test_access

In [32]:
test.shape

(145063, 70)

In [33]:
test_all.shape

(145063, 70)

In [34]:
# make a copy of original test & test_all datasets

# snapshots taken before feature columns are added; the feature cell starts again from these
test0 = test.copy()
test_all0 = test_all.copy()

In [35]:
# column names for test & test_all datasets

# for each target column: '<col>_norm' holds the normalised target, '<col>_pred' the prediction
y_norm_cols = [f'{c}_norm' for c in y_cols]
y_pred_cols = [f'{c}_pred' for c in y_cols]

In [36]:
# all visits are median

def add_median(test, train,
               train_key, periods, max_periods, first_train_weekday):
    """Attach median-based features and normalised targets to ``test``.

    Parameters
    ----------
    test : DataFrame
        Frame with 'Page', 'Site', 'AccessAgent' and the ``y_cols`` columns.
    train : DataFrame
        Per-page history; only the first ``7 * max_periods`` columns are used.
    train_key : DataFrame
        Supplies the 'Page' column the medians are attached to.
        NOTE(review): medians are assigned by index alignment, so ``train`` and
        ``train_key`` presumably share the same row index - verify.
    periods : iterable of (int, int)
        Week ranges ``(w1, w2)``; each yields a 'median_<w1>_<w2>' column.
    max_periods : int
        Number of 7-column blocks of ``train`` to keep.
    first_train_weekday : int
        Not used by this function.

    Returns
    -------
    DataFrame
        A merged copy of ``test`` with 'AllVisits', one indicator column per
        entry of the globals ``sites`` and ``accesses``, the period medians
        (relative to 'AllVisits') and the ``y_norm_cols`` targets.
    """
    history = train.iloc[:, :7*max_periods]

    # overall median per page is the baseline everything else is measured against
    baseline = train_key[['Page']].copy()
    baseline['AllVisits'] = history.median(axis=1).fillna(0)
    test = test.merge(baseline, how='left', on='Page', copy=False)
    test['AllVisits'] = test['AllVisits'].fillna(0).astype('float32')

    # 0/1 indicator columns for site and access type
    for site_name in sites:
        test[site_name] = (1 * (test.Site == site_name)).astype('float32')
    for access_name in accesses:
        test[access_name] = (1 * (test.AccessAgent == access_name)).astype('float32')

    # median of each week range, expressed as a difference from the baseline
    for (w1, w2) in periods:
        col = 'median_%d_%d' % (w1, w2)
        window = train_key[['Page']].copy()
        window[col] = history.iloc[:, 7*w1:7*w2].median(axis=1, skipna=True)
        test = test.merge(window, how='left', on='Page', copy=False)
        test[col] = (test[col] - test.AllVisits).fillna(0).astype('float32')

    # targets: log1p of the raw value minus the same baseline
    for c_norm, c in zip(y_norm_cols, y_cols):
        test[c_norm] = (np.log1p(test[c]) - test.AllVisits).astype('float32')

    gc.collect()

    return test

# 16 weeks of history feed the overall median; the 8 most recent single weeks
# each contribute one 'median_<w1>_<w2>' feature
max_periods = 16
periods = [(0,1), (1,2), (2,3), (3,4),
           (4,5), (5,6), (6,7), (7,8)]

site_cols = list(sites)
access_cols = list(accesses)

# start again from the untouched snapshots
test, test_all = test0.copy(), test_all0.copy()

# empty prediction columns, filled in by the training cell.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
for c in y_pred_cols:
    test[c] = np.nan
    test_all[c] = np.nan

# the last positional argument (first_train_weekday) is not used by add_median
test1 = add_median(test, train_norm,
                   train_key, periods, max_periods, 3)

test_all1 = add_median(test_all, train_all_norm,
                       train_key, periods, max_periods, 5)

In [37]:
# modeling

# names of the numeric feature columns produced by add_median, one per week range
num_cols = [f'median_{w1}_{w2}' for (w1, w2) in periods]

def smape_error(y_true, y_pred):
    """Keras loss: absolute error clipped to [0, 1], averaged over the last axis."""
    abs_err = K.abs(y_pred - y_true)
    capped = K.clip(abs_err, 0.0, 1.0)
    return K.mean(capped, axis=-1)

def get_model(input_dim, num_sites, num_accesses, output_dim):
    """Build and compile the three-input feed-forward network.

    Parameters
    ----------
    input_dim : int
        Width of the numeric feature input ('main_input').
    num_sites : int
        Width of the site indicator input ('site_input').
    num_accesses : int
        Width of the access indicator input ('access_input').
    output_dim : int
        Number of linear output units.

    Returns
    -------
    Model
        Compiled with ``smape_error`` as loss and the 'adam' optimizer.
    """
    dropout = 0.5
    regularizer = 0.00004

    def dense(units, activation='relu'):
        # every Dense layer uses the same initializer and its own L2 penalty object
        return Dense(units, activation=activation,
                     kernel_initializer='lecun_uniform',
                     kernel_regularizer=regularizers.l2(regularizer))

    main_input = Input(shape=(input_dim,), dtype='float32', name='main_input')
    site_input = Input(shape=(num_sites,), dtype='float32', name='site_input')
    access_input = Input(shape=(num_accesses,), dtype='float32', name='access_input')

    x0 = concatenate([main_input, site_input, access_input])

    x = dense(200)(x0)
    x = Dropout(dropout)(x)

    # skip connection: the raw inputs are fed again next to the first hidden layer
    x = concatenate([x0, x])
    x = dense(200)(x)
    x = BatchNormalization(beta_regularizer=regularizers.l2(regularizer),
                           gamma_regularizer=regularizers.l2(regularizer))(x)
    x = Dropout(dropout)(x)

    x = dense(100)(x)
    x = Dropout(dropout)(x)

    x = dense(200)(x)
    x = Dropout(dropout)(x)

    x = dense(output_dim, activation='linear')(x)

    model = Model(inputs=[main_input, site_input, access_input], outputs=[x])
    model.compile(loss=smape_error, optimizer='adam')
    return model

# one integer id per distinct Page value, passed to GroupKFold as the grouping key
group = pd.factorize(test1.Page)[0]

# n_bag is both the number of folds and the number of models
n_bag = 20
kf = GroupKFold(n_bag)
batch_size=4096

test2 = test1
test_all2 = test_all1
# numeric features, site indicators, access indicators and normalised targets as plain arrays
X, Xs, Xa, y = test2[num_cols].values, test2[site_cols].values, test2[access_cols].values, test2[y_norm_cols].values
X_all, Xs_all, Xa_all, y_all = test_all2[num_cols].values, test_all2[site_cols].values, test_all2[access_cols].values, test_all2[y_norm_cols].fillna(0).values

# un-normalised targets used for scoring
y_true = test2[y_cols]
y_all_true = test_all2[y_cols]

# one model per fold; models[fold] is reused (and keeps training) on every outer pass below
models = [get_model(len(num_cols), len(site_cols), len(access_cols), len(y_cols)) for bag in range(n_bag)]

print('offset:', offset)
print('batch size:', batch_size)

# any real score is below 100, so the first outer pass always saves
best_score = 100
best_all_score = 100

# NOTE(review): these two names are never read again in the visible code
save_pred = 0
saved_pred_all = 0

# Outer loop: each pass trains every fold's model a little further, rebuilds the
# out-of-fold predictions for `test` and the per-fold predictions for `test_all`,
# scores them, and keeps the predictions of the best-scoring pass.
# NOTE(review): the banner prints n_epoch (20, 40, ..., 200) while each pass calls
# fit with epochs=10 per model - confirm which figure is intended.
for n_epoch in range(20, 201, 20):
    print('\n=================== start %d epochs ===================' % n_epoch)

    # y_pred0: out-of-fold predictions, one row per row of test
    # y_all_pred0: one full set of test_all predictions per fold
    y_pred0 = np.zeros((y.shape[0], y.shape[1]))
    y_all_pred0 = np.zeros((n_bag, y_all.shape[0], y_all.shape[1]))
    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y, group)):
        print('train fold', fold, end='\t')    
        
        model = models[fold]
        X_train, Xs_train, Xa_train, y_train = X[train_idx,:], Xs[train_idx,:], Xa[train_idx,:], y[train_idx,:]
        X_test, Xs_test, Xa_test, y_test = X[test_idx,:], Xs[test_idx,:], Xa[test_idx,:], y[test_idx,:]

        model.fit([ X_train, Xs_train, Xa_train],  y_train, 
                  epochs=10, batch_size=batch_size, verbose=0, shuffle=True, 
                  validation_data=([X_test, Xs_test, Xa_test],  y_test)
                 )
        
        y_pred = model.predict([ X_test, Xs_test, Xa_test], batch_size=batch_size)
        y_all_pred = model.predict([X_all, Xs_all, Xa_all], batch_size=batch_size)

        # store the raw (still normalised) predictions first: the lines below
        # transform y_pred / y_all_pred in place for the per-fold report only
        y_pred0[test_idx,:] = y_pred
        y_all_pred0[fold,:,:]  = y_all_pred

        # undo the normalisation: add the baseline back, invert log1p, zero out
        # tiny values, then score before and after rounding to multiples of offset
        y_pred += test2.AllVisits.values[test_idx].reshape((-1,1))
        y_pred = np.expm1(y_pred)
        y_pred[y_pred < 0.5 * offset] = 0
        res = smape2D(test2[y_cols].values[test_idx, :], y_pred)
        y_pred = offset*((y_pred / offset).round())
        res_round = smape2D(test2[y_cols].values[test_idx, :], y_pred)

        # same steps for test_all
        # NOTE(review): the recorded run prints nan for both "LB" figures
        y_all_pred += test_all2.AllVisits.values.reshape((-1,1))
        y_all_pred = np.expm1(y_all_pred)
        y_all_pred[y_all_pred < 0.5 * offset] = 0
        res_all = smape2D(test_all2[y_cols], y_all_pred)
        y_all_pred = offset*((y_all_pred / offset).round())
        res_all_round = smape2D(test_all2[y_cols], y_all_pred)
        print('smape train: %0.5f' % res, 'round: %0.5f' % res_round,
              'smape LB: %0.5f' % res_all, 'round: %0.5f' % res_all_round)
        
        del X_train, Xs_train, Xa_train, y_train
        del X_test, Xs_test, Xa_test, y_test
        
    # combine the per-fold test_all predictions with an element-wise median
    y_all_pred0  = np.nanmedian(y_all_pred0, axis=0)

    # undo the normalisation on the out-of-fold predictions and score them
    y_pred0 += test2.AllVisits.values.reshape((-1,1))
    y_pred0 = np.expm1(y_pred0)
    y_pred0[y_pred0 < 0.5 * offset] = 0
    res = smape2D(y_true, y_pred0)
    print('smape train: %0.5f' % res, end=' ')
    y_pred0 = offset*((y_pred0 / offset).round())
    res_round = smape2D(y_true, y_pred0)
    print('round: %0.5f' % res_round, end=' ')

    # same for the combined test_all predictions
    y_all_pred0 += test_all2.AllVisits.values.reshape((-1,1))
    y_all_pred0 = np.expm1(y_all_pred0)
    y_all_pred0[y_all_pred0 < 0.5 * offset] = 0
    res_all = smape2D(y_all_true, y_all_pred0)
    print('smape LB: %0.5f' % res_all, end=' ')
    y_all_pred0 = offset*((y_all_pred0 / offset).round())
    res_all_round = smape2D(y_all_true, y_all_pred0)
    print('round: %0.5f' % res_all_round)
    # keep this pass's rounded predictions only when the rounded out-of-fold score improved
    if res_round < best_score:
        print('saving')
        best_score = res_round
        best_all_score = res_all_round
        test.loc[:, y_pred_cols] = y_pred0
        test_all.loc[:, y_pred_cols] = y_all_pred0
    else:
        print()
    print('==================== end %d epochs ====================' % n_epoch)
    
    del y_pred0, y_all_pred0
    gc.collect()
    
print('\nbest saved LB score:', best_all_score)

offset: 0.5
batch size: 4096

train fold 0	smape train: 0.47951 round: 0.47864 smape LB: nan round: nan
train fold 1	smape train: 0.48024 round: 0.47943 smape LB: nan round: nan
train fold 2	smape train: 0.47371 round: 0.47290 smape LB: nan round: nan
train fold 3	smape train: 0.48484 round: 0.48397 smape LB: nan round: nan
train fold 4	smape train: 0.47702 round: 0.47610 smape LB: nan round: nan
train fold 5	smape train: 0.48097 round: 0.48005 smape LB: nan round: nan
train fold 6	smape train: 0.47135 round: 0.47058 smape LB: nan round: nan
train fold 7	smape train: 0.48010 round: 0.47911 smape LB: nan round: nan
train fold 8	smape train: 0.47491 round: 0.47420 smape LB: nan round: nan
train fold 9	smape train: 0.48285 round: 0.48190 smape LB: nan round: nan
train fold 10	smape train: 0.47753 round: 0.47658 smape LB: nan round: nan
train fold 11	smape train: 0.47857 round: 0.47748 smape LB: nan round: nan
train fold 12	smape train: 0.47380 round: 0.47289 smape LB: nan round: nan
train

train fold 19	smape train: 0.44793 round: 0.44652 smape LB: nan round: nan
smape train: 0.44580 round: 0.44447 smape LB: nan round: nan
saving

train fold 0	smape train: 0.44621 round: 0.44469 smape LB: nan round: nan
train fold 1	smape train: 0.43931 round: 0.43807 smape LB: nan round: nan
train fold 2	smape train: 0.44306 round: 0.44181 smape LB: nan round: nan
train fold 3	smape train: 0.44990 round: 0.44873 smape LB: nan round: nan
train fold 4	smape train: 0.44297 round: 0.44169 smape LB: nan round: nan
train fold 5	smape train: 0.44105 round: 0.43966 smape LB: nan round: nan
train fold 6	smape train: 0.43653 round: 0.43532 smape LB: nan round: nan
train fold 7	smape train: 0.44549 round: 0.44400 smape LB: nan round: nan
train fold 8	smape train: 0.44559 round: 0.44441 smape LB: nan round: nan
train fold 9	smape train: 0.44572 round: 0.44421 smape LB: nan round: nan
train fold 10	smape train: 0.44256 round: 0.44112 smape LB: nan round: nan
train fold 11	smape train: 0.44568 round:

train fold 17	smape train: 0.43797 round: 0.43654 smape LB: nan round: nan
train fold 18	smape train: 0.44158 round: 0.44022 smape LB: nan round: nan
train fold 19	smape train: 0.44601 round: 0.44453 smape LB: nan round: nan
smape train: 0.44100 round: 0.43965 smape LB: nan round: nan
saving

best saved LB score: nan


In [38]:
# submission

filename = 'keras_simple'

# prediction columns in the order of the coded names, followed by 'Page'
test_all_columns_save = [c + '_pred' for c in test_all_columns_code] + ['Page']

# wide table: swap the coded prediction names back to their date labels and save it
test_all_save = test_all[test_all_columns_save]
test_all_save.columns = test_all_columns_date + ['Page']
test_all_save.to_csv('%s_test_all_save.csv' % filename, index=False)

# long table: leave out the last date column, then one row per (Page, Date)
test_all_save_columns = test_all_columns_date[:-1] + ['Page']
test_all_save = pd.melt(
    test_all_save[test_all_save_columns],
    id_vars=['Page'],
    var_name='Date',
    value_name='Visits',
)

# attach the submission Ids and undo the offset scaling
test_all_sub = test_all_id.merge(test_all_save, how='left', on=['Page','Date'])
test_all_sub['Visits'] = (test_all_sub['Visits'] / offset).round()

# written twice: once ordered by Id, once in key-file order
test_all_sub_sorted = test_all_sub[['Id', 'Visits']].sort_values(by='Id')
test_all_sub_sorted[['Id', 'Visits']].to_csv('%s_test_sorted.csv' % filename, index=False)

test_all_sub[['Id', 'Visits']].to_csv('%s_test.csv' % filename, index=False)