### setup

In [2]:
import os
import argparse
import pickle
import time

import numpy as np; np.seterr(invalid='ignore')
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Run configuration. Kept as a plain dict and wrapped in an argparse.Namespace
# so the rest of the notebook can use attribute access (args.future, ...).
parser = {
    'data_path': '../data/wttsf/',          # directory holding the CSV inputs
    'train_file': 'train_2.csv',            # wide table: one row per page
    'key_file': 'key_2.csv',                # maps "<page>_<date>" to an Id
    'intermediate_path': '../intermediate/',  # cache / checkpoint root
    'n_epoch': 6,                           # passes of the training loop
    'future': 73,                           # loss-window length and forecast horizon
    'batch_size': 128,                      # batch size used by train()
    'hidden_size': 256,                     # hidden units per LSTM cell
    'log_every': 10,                        # print the batch loss every N batches
    'read_from_file': False,                # True: load cached pickles in get_data()
    'train': True,                          # False: load `model_name` instead
    'model_name': '',                       # checkpoint file name used when train=False
    'forecast': False,                      # True: run the forecast/submission cell
    # NOTE(review): the forecast cell measures these dates relative to the
    # last date column of `train_file`; confirm they fall after that date
    # for the configured data set.
    'forecast_start': '2017-01-01',
    'forecast_end': '2017-03-01',
    'cuda': True,                           # request GPU; downgraded below if absent
    'seed': 20170907,
}
args = argparse.Namespace(**parser)

# Only use CUDA when it was requested *and* a device is actually available.
args.cuda = args.cuda and torch.cuda.is_available()

# Seed every RNG the notebook draws from: torch initialises the model weights,
# while train() uses numpy for the row permutation and the loss-window position.
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Intermediate artefacts are grouped in a per-seed sub-directory.
args.intermediate_path = os.path.join(args.intermediate_path, str(args.seed))

### model

In [4]:
class DenseLSTMForecast(nn.Module):
    """Three stacked LSTM cells with dense (skip) connections and a linear head.

    At every time step the current series value is concatenated with the
    static per-series features. Each LSTM cell receives that step input
    together with the hidden states of all cells below it, and the linear
    head sees the step input plus all three hidden states.

    Args:
        hidden_size: number of hidden units in each LSTM cell.
        input_size: width of the per-step input (series value plus static
            features). Defaults to 15, the value that used to be hard-coded.
    """

    def __init__(self, hidden_size, input_size=15):
        super(DenseLSTMForecast, self).__init__()
        # Layer creation order is kept so a given seed yields the same weights.
        self.lstm1 = nn.LSTMCell(input_size, hidden_size)
        self.lstm2 = nn.LSTMCell(hidden_size + input_size, hidden_size)
        self.lstm3 = nn.LSTMCell(2 * hidden_size + input_size, hidden_size)
        self.linear = nn.Linear(3 * hidden_size + input_size, 1)
        self.hidden_size = hidden_size
        self.input_size = input_size

    def _init_state(self, x):
        """Return a zero (batch, hidden_size) state with the type/device of `x`."""
        # Deriving the state from the input keeps it on the same device as the
        # data instead of relying on a global CUDA flag.
        zeros = x.data.new(x.size(0), self.hidden_size).zero_()
        return Variable(zeros)

    def _step(self, step_input, state):
        """Advance all three cells by one time step.

        Returns the (batch, 1) prediction and the updated state.
        """
        (h1, c1), (h2, c2), (h3, c3) = state
        h1, c1 = self.lstm1(step_input, (h1, c1))
        h2, c2 = self.lstm2(torch.cat([step_input, h1], dim=1), (h2, c2))
        h3, c3 = self.lstm3(torch.cat([step_input, h1, h2], dim=1), (h3, c3))
        out = self.linear(torch.cat([step_input, h1, h2, h3], dim=1))
        return out, ((h1, c1), (h2, c2), (h3, c3))

    def forward(self, x, feature, future=1):
        """Run the network over `x`, then roll forward on its own predictions.

        Args:
            x: input series of shape (batch, seq_len, 1).
            feature: static features of shape (batch, input_size - 1).
            future: total number of steps to predict beyond the input; the
                last input step already yields the first of them, so
                `future - 1` extra autoregressive steps are run.

        Returns:
            Tensor of shape (batch, seq_len + future - 1, 1).
        """
        state = tuple((self._init_state(x), self._init_state(x))
                      for _ in range(3))
        outputs = []

        # Teacher-forced pass over the observed sequence.
        for x_t in x.chunk(x.size(1), dim=1):
            step_input = torch.cat([x_t.squeeze(dim=1), feature], dim=1)
            o_t, state = self._step(step_input, state)
            outputs.append(o_t)

        # Autoregressive pass: feed the previous prediction back in.
        for _ in range(future - 1):
            step_input = torch.cat([o_t, feature], dim=1)
            o_t, state = self._step(step_input, state)
            outputs.append(o_t)

        return torch.stack(outputs, dim=1)

### utils

In [5]:
def smape(y_pred, y_true):
    """Symmetric mean absolute percentage error on rounded visit counts.

    Both arguments hold log1p-transformed values. They are mapped back with
    ``exp(.) - 1`` and rounded; predictions are clipped at zero first.
    Entries where both counts are zero (0/0) contribute 0 to the mean.

    Returns:
        The mean error scaled to the range [0, 200].
    """
    visits_pred = np.exp(y_pred) - 1
    visits_pred = np.around(np.clip(visits_pred, 0, None))
    visits_true = np.around(np.exp(y_true) - 1)

    numerator = np.abs(visits_true - visits_pred)
    denominator = np.abs(visits_true) + np.abs(visits_pred)
    # 0/0 yields NaN, which is scored as a perfect match.
    ratio = np.nan_to_num(numerator / denominator)
    return np.mean(ratio) * 200

In [6]:
def get_data():
    """Build (or load from cache) the scaled series, the scaler and the features.

    When ``args.read_from_file`` is false the training CSV is read, missing
    values are filled along each row, the series are log1p-transformed and
    standardised per page, and the three artefacts are pickled under
    ``args.intermediate_path``. Otherwise the pickles are loaded back.

    Returns:
        tuple ``(scaled_data, scaler, features)`` where ``scaled_data`` has
        one row per page, ``scaler`` is the fitted ``StandardScaler`` and
        ``features`` is the float32 one-hot matrix of agent/access/project.
    """
    scaled_data_file = os.path.join(args.intermediate_path,
                                    'scaled_data.pkl')
    scaler_file = os.path.join(args.intermediate_path, 'scaler.pkl')
    features_file = os.path.join(args.intermediate_path, 'features.pkl')

    if not args.read_from_file:
        data_df = pd.read_csv(os.path.join(args.data_path, args.train_file),
                              index_col='Page')
        # Fill gaps forward along each row, then backward for leading gaps.
        data_df = data_df.ffill(axis=1).bfill(axis=1)

        # The last three '_'-separated pieces of the page name are
        # project, access and agent; split once and reuse.
        page_parts = data_df.index.str.rsplit('_', n=3)
        meta_df = pd.DataFrame(index=data_df.index)
        meta_df["agent"] = page_parts.str.get(-1)
        meta_df["access"] = page_parts.str.get(-2)
        meta_df["project"] = page_parts.str.get(-3)
        features = pd.get_dummies(
            meta_df[["agent", "access", "project"]],
            columns=["agent", "access", "project"]).values.astype('float32')

        # Rows that are entirely NaN survive the fills above; zero them.
        raw_data = np.nan_to_num(data_df.values.astype('float32'))
        data = np.log1p(raw_data)

        # StandardScaler normalises columns, so transpose to (time, page) to
        # standardise every page over its own history.
        # NOTE(review): the scaler is fit on the full history, including the
        # final args.future steps that train() reports its validation score on.
        scaler = StandardScaler()
        scaler.fit(np.swapaxes(data, 0, 1))
        scaled_data = scaler.transform(np.swapaxes(data, 0, 1))
        scaled_data = np.swapaxes(scaled_data, 0, 1)

        # The per-seed cache directory is not created anywhere else.
        if not os.path.isdir(args.intermediate_path):
            os.makedirs(args.intermediate_path)
        with open(scaled_data_file, 'wb') as f:
            pickle.dump(scaled_data, f)
        with open(scaler_file, 'wb') as f:
            pickle.dump(scaler, f)
        with open(features_file, 'wb') as f:
            pickle.dump(features, f)
    else:
        # SECURITY: pickle.load can execute arbitrary code; only point
        # args.intermediate_path at files produced by this notebook.
        with open(scaled_data_file, 'rb') as f:
            scaled_data = pickle.load(f)
        with open(scaler_file, 'rb') as f:
            scaler = pickle.load(f)
        with open(features_file, 'rb') as f:
            features = pickle.load(f)
    return scaled_data, scaler, features

In [12]:
# NOTE(review): this cell carries execution count 12 but sits above the cell
# that assigns `scaled_data` (the `get_data()` call in the "prepare" section);
# on a fresh kernel it has to be run after that cell.
scaled_data.shape

(145063, 793)

In [7]:
def _loss_value(loss):
    """Return a scalar loss as a Python number on old and new PyTorch."""
    # Tensor.item() exists from PyTorch 0.4 on, where indexing a 0-dim tensor
    # with [0] raises; older releases only support the .data[0] form.
    if hasattr(loss, 'item'):
        return loss.item()
    return loss.data[0]


def train(scaled_data, scaler, features, model, criterion, optimizer,
          epoch=None):
    """Run one training epoch and report SMAPE on the last ``args.future`` steps.

    Rows are visited in a fresh random order. For every batch the model
    predicts each next step of the sequence, and the loss is taken over one
    randomly placed window of ``args.future`` steps.

    Args:
        scaled_data: array with one scaled series per row.
        scaler: fitted scaler used to undo the scaling for validation.
        features: static feature matrix, row-aligned with ``scaled_data``.
        model: network called as ``model(inputs, features)``.
        criterion: loss applied to the selected window.
        optimizer: optimizer stepping the model parameters.
        epoch: epoch number shown in the summary line. When omitted, the
            notebook-level ``epoch`` variable is used if it exists.

    Returns:
        The SMAPE of the one-step-ahead outputs on the last ``args.future``
        steps of every series.
    """
    if epoch is None:
        # Resolve the label up front so a missing global cannot raise a
        # NameError after a whole epoch of work.
        epoch = globals().get('epoch', '?')

    p = np.random.permutation(scaled_data.shape[0])
    inverse_p = np.argsort(p)

    # Inputs are steps [0, T-1); targets are the same series shifted by one.
    input_tensor = torch.from_numpy(scaled_data[p, :-1]).unsqueeze(2)
    target_tensor = torch.from_numpy(scaled_data[p, 1:]).unsqueeze(2)
    features_tensor = torch.from_numpy(features[p, :])
    dataset = TensorDataset(input_tensor, target_tensor)
    data_loader = DataLoader(dataset, args.batch_size)

    train_loss = 0
    val_output_list = []
    init_time = time.time()
    for i, (inputt, target) in enumerate(data_loader):
        # The loader is not shuffled and uses args.batch_size, so batch i
        # covers rows [i*batch_size, i*batch_size + len(batch)).
        batch_start = i * args.batch_size
        feature = features_tensor[batch_start:batch_start + inputt.size(0)]
        if args.cuda:
            inputt = inputt.cuda()
            target = target.cuda()
            feature = feature.cuda()
        inputt = Variable(inputt)
        target = Variable(target)
        feature = Variable(feature)

        output = model(inputt, feature)
        # NOTE(review): 554 is hard-coded, so the loss window always ends at
        # or before step 554 regardless of the sequence length. Presumably a
        # leftover from a shorter data set or an intentional hold-out of the
        # later steps; confirm before changing.
        pos = np.random.randint(args.future,
                                554-args.future+1)
        loss = criterion(output[:, pos:pos+args.future],
                         target[:, pos:pos+args.future])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = _loss_value(loss)
        train_loss += batch_loss * inputt.size(0)
        val_output_list.append(output[:, -args.future:]
                               .data.squeeze(2).cpu().numpy())

        if i % args.log_every == 0:
            print("   % Time: {:4.0f}s | Batch: {:4} | "
                  "Train loss: {:.4f}".format(
                      time.time()-init_time, i+1, batch_loss))

    # Undo the permutation so rows line up with scaled_data again.
    val_output_all = np.concatenate(val_output_list, axis=0)[inverse_p]
    prediction = np.swapaxes(scaler.inverse_transform(
            np.swapaxes(val_output_all, 0, 1)), 0, 1)
    var_target = np.swapaxes(scaler.inverse_transform(
            np.swapaxes(scaled_data[:, -args.future:], 0, 1)), 0, 1)

    train_loss /= scaled_data.shape[0]
    val_loss = smape(prediction, var_target)
    print("="*10)
    print("   % Epoch: {} | Time: {:4.0f}s | "
          "Train loss: {:.4f} | Val loss: {:.4f}"
          .format(epoch, time.time()-init_time, train_loss , val_loss))
    print("="*10)
    return val_loss

In [8]:
def forecast(scaled_data, scaler, features, model):
    """Predict ``args.future`` steps past the end of every series.

    Args:
        scaled_data: array with one scaled series per row.
        scaler: fitted scaler used to map predictions back to log1p space.
        features: static feature matrix, row-aligned with ``scaled_data``.
        model: network called as ``model(inputs, features, future)``.

    Returns:
        Array with one row per series and ``args.future`` columns holding
        rounded, non-negative visit counts.
    """
    input_tensor = torch.from_numpy(scaled_data).unsqueeze(2)
    features_tensor = torch.from_numpy(features)
    # Batch inputs and features through the same loader so they stay
    # row-aligned whatever the batch size is. Slicing the features by
    # args.batch_size while the loader used 256 paired every batch after the
    # first with the wrong feature rows.
    dataset = TensorDataset(input_tensor, features_tensor)
    data_loader = DataLoader(dataset, 256)

    output_list = []
    for inputt, feature in data_loader:
        if args.cuda:
            inputt = inputt.cuda()
            feature = feature.cuda()
        inputt = Variable(inputt)
        feature = Variable(feature)
        output = model(inputt, feature, args.future)
        # The last args.future outputs are the steps beyond the input.
        output_list.append(output.data.squeeze(2).cpu().numpy()
                           [:, -args.future:])

    output_all = np.concatenate(output_list, axis=0)
    prediction = np.swapaxes(scaler.inverse_transform(
            np.swapaxes(output_all, 0, 1)), 0, 1)
    # Back from log1p space to counts; clip negatives and round.
    prediction = np.around(np.clip(np.exp(prediction) - 1, 0, None))
    return prediction

In [9]:
def save_model(model, epoch, loss):
    """Write the model's state dict to a checkpoint under ``args.intermediate_path``.

    The file name encodes the seed, the epoch and the loss (four decimals).

    Args:
        model: module whose ``state_dict()`` is saved.
        epoch: epoch number embedded in the file name.
        loss: loss value embedded in the file name.
    """
    # The per-seed directory may not exist yet (e.g. when the data was
    # loaded from a cache located elsewhere).
    if not os.path.isdir(args.intermediate_path):
        os.makedirs(args.intermediate_path)
    file_name = "model_{}_epoch{}_loss{:.4f}.pth".format(
        args.seed, epoch, loss)
    model_file = os.path.join(args.intermediate_path, file_name)
    torch.save(model.state_dict(), model_file)

### prepare

In [10]:
# Build the model inputs (or load them from the cache, depending on
# args.read_from_file): scaled series, the fitted scaler, static features.
scaled_data, scaler, features = get_data()

In [10]:
# Instantiate the network and move it to the GPU when CUDA is enabled.
model = DenseLSTMForecast(args.hidden_size)
model = model.cuda() if args.cuda else model
# Mean absolute error between predicted and target steps.
criterion = nn.L1Loss()

In [11]:
# Adam with its default hyper-parameters over all model weights.
optimizer = optim.Adam(params=model.parameters())
# Learning-rate schedule with a single milestone at epoch index 3.
scheduler = MultiStepLR(optimizer=optimizer, milestones=[3])

### train

In [12]:
if args.train:
    for epoch in range(1, args.n_epoch+1):
        # NOTE(review): the scheduler is stepped before the epoch's optimizer
        # updates; newer PyTorch releases expect the opposite order, so the
        # milestone may take effect one epoch earlier there. Confirm against
        # the installed version.
        scheduler.step()
        # Read the rate the optimizer will actually use. scheduler.get_lr()
        # is not meant to be called outside step() on newer PyTorch and can
        # report a wrong value at a milestone.
        current_lr = optimizer.param_groups[0]['lr']
        print("=> EPOCH {} with lr {}".format(epoch, current_lr))
        val_loss = train(scaled_data, scaler, features,
                         model, criterion, optimizer)
        # One checkpoint per epoch, tagged with its validation score.
        save_model(model, epoch, val_loss)
else:
    if not args.model_name:
        raise ValueError("args.model_name must name a checkpoint file "
                         "when args.train is False")
    model_file = os.path.join(args.intermediate_path, args.model_name)
    # Load onto the CPU first so a checkpoint written on a GPU machine can be
    # restored anywhere; load_state_dict copies into the model's own device.
    state_dict = torch.load(model_file,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict)

=> EPOCH 1 with lr 0.001
   % Time:    1s | Batch:    1 | Train loss: 0.7267
   % Time:    9s | Batch:   11 | Train loss: 0.5070
   % Time:   17s | Batch:   21 | Train loss: 0.4808
   % Time:   26s | Batch:   31 | Train loss: 0.4506
   % Time:   34s | Batch:   41 | Train loss: 0.4256
   % Time:   42s | Batch:   51 | Train loss: 0.4560
   % Time:   50s | Batch:   61 | Train loss: 0.4716
   % Time:   58s | Batch:   71 | Train loss: 0.4322
   % Time:   66s | Batch:   81 | Train loss: 0.4597
   % Time:   75s | Batch:   91 | Train loss: 0.4703
   % Time:   83s | Batch:  101 | Train loss: 0.5155
   % Time:   91s | Batch:  111 | Train loss: 0.4576
   % Time:   99s | Batch:  121 | Train loss: 0.4472
   % Time:  107s | Batch:  131 | Train loss: 0.4761
   % Time:  115s | Batch:  141 | Train loss: 0.4563
   % Time:  124s | Batch:  151 | Train loss: 0.4509
   % Time:  132s | Batch:  161 | Train loss: 0.4419
   % Time:  140s | Batch:  171 | Train loss: 0.4044
   % Time:  148s | Batch:  181 | Train 

   % Time: 1257s | Batch: 1581 | Train loss: 0.3595
   % Time: 1264s | Batch: 1591 | Train loss: 0.3596
   % Time: 1272s | Batch: 1601 | Train loss: 0.3673
   % Time: 1280s | Batch: 1611 | Train loss: 0.4112
   % Time: 1288s | Batch: 1621 | Train loss: 0.3853
   % Time: 1296s | Batch: 1631 | Train loss: 0.3468
   % Time: 1304s | Batch: 1641 | Train loss: 0.4241
   % Time: 1312s | Batch: 1651 | Train loss: 0.3835
   % Time: 1320s | Batch: 1661 | Train loss: 0.4319
   % Time: 1328s | Batch: 1671 | Train loss: 0.4027
   % Time: 1336s | Batch: 1681 | Train loss: 0.3714
   % Time: 1343s | Batch: 1691 | Train loss: 0.3410
   % Time: 1351s | Batch: 1701 | Train loss: 0.3537
   % Time: 1359s | Batch: 1711 | Train loss: 0.3742
   % Time: 1367s | Batch: 1721 | Train loss: 0.3649
   % Time: 1375s | Batch: 1731 | Train loss: 0.3861
   % Time: 1383s | Batch: 1741 | Train loss: 0.4092
   % Time: 1391s | Batch: 1751 | Train loss: 0.3785
   % Time: 1399s | Batch: 1761 | Train loss: 0.4136
   % Time: 1

   % Time:  674s | Batch:  871 | Train loss: 0.3332
   % Time:  682s | Batch:  881 | Train loss: 0.3441
   % Time:  690s | Batch:  891 | Train loss: 0.3530
   % Time:  698s | Batch:  901 | Train loss: 0.3685
   % Time:  706s | Batch:  911 | Train loss: 0.3685
   % Time:  713s | Batch:  921 | Train loss: 0.3949
   % Time:  721s | Batch:  931 | Train loss: 0.3914
   % Time:  728s | Batch:  941 | Train loss: 0.4190
   % Time:  736s | Batch:  951 | Train loss: 0.4196
   % Time:  744s | Batch:  961 | Train loss: 0.3509
   % Time:  752s | Batch:  971 | Train loss: 0.3704
   % Time:  759s | Batch:  981 | Train loss: 0.3621
   % Time:  767s | Batch:  991 | Train loss: 0.3967
   % Time:  775s | Batch: 1001 | Train loss: 0.3586
   % Time:  782s | Batch: 1011 | Train loss: 0.3905
   % Time:  790s | Batch: 1021 | Train loss: 0.3417
   % Time:  798s | Batch: 1031 | Train loss: 0.3836
   % Time:  805s | Batch: 1041 | Train loss: 0.3825
   % Time:  813s | Batch: 1051 | Train loss: 0.3794
   % Time:  

   % Time:  126s | Batch:  161 | Train loss: 0.4184
   % Time:  134s | Batch:  171 | Train loss: 0.3839
   % Time:  142s | Batch:  181 | Train loss: 0.3754
   % Time:  150s | Batch:  191 | Train loss: 0.3884
   % Time:  157s | Batch:  201 | Train loss: 0.3900
   % Time:  165s | Batch:  211 | Train loss: 0.4048
   % Time:  173s | Batch:  221 | Train loss: 0.3143
   % Time:  180s | Batch:  231 | Train loss: 0.4186
   % Time:  188s | Batch:  241 | Train loss: 0.4004
   % Time:  196s | Batch:  251 | Train loss: 0.3735
   % Time:  204s | Batch:  261 | Train loss: 0.3468
   % Time:  212s | Batch:  271 | Train loss: 0.3527
   % Time:  220s | Batch:  281 | Train loss: 0.4036
   % Time:  227s | Batch:  291 | Train loss: 0.3786
   % Time:  235s | Batch:  301 | Train loss: 0.4017
   % Time:  243s | Batch:  311 | Train loss: 0.3926
   % Time:  251s | Batch:  321 | Train loss: 0.3614
   % Time:  259s | Batch:  331 | Train loss: 0.4085
   % Time:  267s | Batch:  341 | Train loss: 0.3879
   % Time:  

   % Time: 1339s | Batch: 1741 | Train loss: 0.3794
   % Time: 1347s | Batch: 1751 | Train loss: 0.4205
   % Time: 1355s | Batch: 1761 | Train loss: 0.3718
   % Time: 1363s | Batch: 1771 | Train loss: 0.3656
   % Time: 1370s | Batch: 1781 | Train loss: 0.3819
   % Time: 1378s | Batch: 1791 | Train loss: 0.4100
   % Time: 1386s | Batch: 1801 | Train loss: 0.4017
   % Time: 1394s | Batch: 1811 | Train loss: 0.3221
   % Time: 1401s | Batch: 1821 | Train loss: 0.4319
   % Time: 1409s | Batch: 1831 | Train loss: 0.3773
   % Time: 1417s | Batch: 1841 | Train loss: 0.4076
   % Time: 1425s | Batch: 1851 | Train loss: 0.3475
   % Time: 1433s | Batch: 1861 | Train loss: 0.3193
   % Time: 1441s | Batch: 1871 | Train loss: 0.3219
   % Time: 1449s | Batch: 1881 | Train loss: 0.3812
   % Time: 1457s | Batch: 1891 | Train loss: 0.4112
   % Time: 1465s | Batch: 1901 | Train loss: 0.3805
   % Time: 1472s | Batch: 1911 | Train loss: 0.3883
   % Time: 1480s | Batch: 1921 | Train loss: 0.3818
   % Time: 1

   % Time:  790s | Batch: 1031 | Train loss: 0.3952
   % Time:  798s | Batch: 1041 | Train loss: 0.3612
   % Time:  806s | Batch: 1051 | Train loss: 0.3735
   % Time:  814s | Batch: 1061 | Train loss: 0.4095
   % Time:  822s | Batch: 1071 | Train loss: 0.3898
   % Time:  830s | Batch: 1081 | Train loss: 0.3397
   % Time:  838s | Batch: 1091 | Train loss: 0.3780
   % Time:  846s | Batch: 1101 | Train loss: 0.3657
   % Time:  853s | Batch: 1111 | Train loss: 0.3586
   % Time:  861s | Batch: 1121 | Train loss: 0.3822
   % Time:  869s | Batch: 1131 | Train loss: 0.4108
   % Time:  877s | Batch: 1141 | Train loss: 0.3730
   % Time:  885s | Batch: 1151 | Train loss: 0.3816
   % Time:  893s | Batch: 1161 | Train loss: 0.3823
   % Time:  901s | Batch: 1171 | Train loss: 0.3428
   % Time:  909s | Batch: 1181 | Train loss: 0.3990
   % Time:  917s | Batch: 1191 | Train loss: 0.4203
   % Time:  924s | Batch: 1201 | Train loss: 0.3770
   % Time:  933s | Batch: 1211 | Train loss: 0.3309
   % Time:  

   % Time:  251s | Batch:  321 | Train loss: 0.3903
   % Time:  258s | Batch:  331 | Train loss: 0.3988
   % Time:  266s | Batch:  341 | Train loss: 0.3731
   % Time:  274s | Batch:  351 | Train loss: 0.3640
   % Time:  281s | Batch:  361 | Train loss: 0.3889
   % Time:  289s | Batch:  371 | Train loss: 0.3474
   % Time:  297s | Batch:  381 | Train loss: 0.3876
   % Time:  304s | Batch:  391 | Train loss: 0.3685
   % Time:  312s | Batch:  401 | Train loss: 0.3918
   % Time:  320s | Batch:  411 | Train loss: 0.3810
   % Time:  327s | Batch:  421 | Train loss: 0.4009
   % Time:  335s | Batch:  431 | Train loss: 0.3767
   % Time:  343s | Batch:  441 | Train loss: 0.3567
   % Time:  350s | Batch:  451 | Train loss: 0.4135
   % Time:  359s | Batch:  461 | Train loss: 0.4067
   % Time:  367s | Batch:  471 | Train loss: 0.4197
   % Time:  375s | Batch:  481 | Train loss: 0.3328
   % Time:  382s | Batch:  491 | Train loss: 0.3981
   % Time:  390s | Batch:  501 | Train loss: 0.3920
   % Time:  

   % Time: 1460s | Batch: 1901 | Train loss: 0.4014
   % Time: 1468s | Batch: 1911 | Train loss: 0.4233
   % Time: 1476s | Batch: 1921 | Train loss: 0.3907
   % Time: 1483s | Batch: 1931 | Train loss: 0.4052
   % Time: 1491s | Batch: 1941 | Train loss: 0.4062
   % Time: 1499s | Batch: 1951 | Train loss: 0.3176
   % Time: 1506s | Batch: 1961 | Train loss: 0.3664
   % Time: 1514s | Batch: 1971 | Train loss: 0.3542
   % Time: 1521s | Batch: 1981 | Train loss: 0.3849
   % Time: 1529s | Batch: 1991 | Train loss: 0.3617
   % Time: 1537s | Batch: 2001 | Train loss: 0.3583
   % Time: 1544s | Batch: 2011 | Train loss: 0.3381
   % Time: 1552s | Batch: 2021 | Train loss: 0.3645
   % Time: 1560s | Batch: 2031 | Train loss: 0.3804
   % Time: 1567s | Batch: 2041 | Train loss: 0.4283
   % Time: 1575s | Batch: 2051 | Train loss: 0.3823
   % Time: 1583s | Batch: 2061 | Train loss: 0.3322
   % Time: 1590s | Batch: 2071 | Train loss: 0.3426
   % Time: 1598s | Batch: 2081 | Train loss: 0.3663
   % Time: 1

   % Time:  908s | Batch: 1191 | Train loss: 0.3284
   % Time:  916s | Batch: 1201 | Train loss: 0.4092
   % Time:  923s | Batch: 1211 | Train loss: 0.3933
   % Time:  931s | Batch: 1221 | Train loss: 0.4096
   % Time:  939s | Batch: 1231 | Train loss: 0.3272
   % Time:  946s | Batch: 1241 | Train loss: 0.3274
   % Time:  954s | Batch: 1251 | Train loss: 0.4061
   % Time:  962s | Batch: 1261 | Train loss: 0.3778
   % Time:  969s | Batch: 1271 | Train loss: 0.4012
   % Time:  977s | Batch: 1281 | Train loss: 0.4063
   % Time:  984s | Batch: 1291 | Train loss: 0.3485
   % Time:  992s | Batch: 1301 | Train loss: 0.3892
   % Time: 1000s | Batch: 1311 | Train loss: 0.4077
   % Time: 1007s | Batch: 1321 | Train loss: 0.3635
   % Time: 1015s | Batch: 1331 | Train loss: 0.3745
   % Time: 1023s | Batch: 1341 | Train loss: 0.3830
   % Time: 1030s | Batch: 1351 | Train loss: 0.3694
   % Time: 1038s | Batch: 1361 | Train loss: 0.3176
   % Time: 1046s | Batch: 1371 | Train loss: 0.4014
   % Time: 1

### test

In [13]:
if args.forecast:
    prediction = forecast(scaled_data, scaler, features, model)

    # The training table is re-read for its page index and last date column.
    data_df = pd.read_csv(os.path.join(args.data_path, args.train_file),
                          index_col='Page')
    key_df = pd.read_csv(os.path.join(args.data_path, args.key_file))
    # Key entries look like "<page>_<YYYY-MM-DD>": split off the 10-char date
    # first, then drop the date and its separator from the page name.
    key_df['Date'] = key_df['Page'].str[-10:]
    key_df['Page'] = key_df['Page'].str[:-11]

    # Column offsets into `prediction`, whose column 0 is the first day after
    # the last training date.
    last_date = pd.Timestamp(data_df.columns[-1])
    future_start = (pd.Timestamp(args.forecast_start) - last_date).days - 1
    future_end = (pd.Timestamp(args.forecast_end) - last_date).days
    future_period = future_end - future_start

    # Negative or out-of-range offsets would silently slice the wrong days.
    if not 0 <= future_start < future_end <= prediction.shape[1]:
        raise ValueError(
            "forecast window [{}, {}] maps to prediction columns [{}, {}) "
            "but only columns [0, {}) exist after {}".format(
                args.forecast_start, args.forecast_end, future_start,
                future_end, prediction.shape[1], data_df.columns[-1]))
    # The loop below assumes each page occupies one contiguous run of
    # `future_period` key rows.
    if key_df.shape[0] % future_period != 0:
        raise ValueError(
            "key file has {} rows, not a multiple of the {}-day forecast "
            "window".format(key_df.shape[0], future_period))

    key_pages = key_df['Page'].values
    visits = np.zeros(key_df.shape[0])
    for i in range(0, len(visits), future_period):
        page_index = data_df.index.get_loc(key_pages[i])
        visits[i:(i+future_period)] = prediction[page_index,
                                                 future_start:future_end]

    key_df['Visits'] = visits
    submission_file = os.path.join(args.intermediate_path,
                                   'submission_{}.csv'.format(args.seed))
    key_df[['Id', 'Visits']].to_csv(submission_file, float_format='%.0f',
                                    index=False)