### setup

In [1]:
import os
import argparse
import pickle
import time

import numpy as np; np.seterr(invalid='ignore')
import pandas as pd
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Run configuration. A plain dict wrapped in a Namespace so the rest of the
# notebook can use attribute access (args.xxx), as it would with argparse.
parser = {
    'data_path': '../data/wttsf/',
    'train_file': 'train_2.csv',
    'key_file': 'key_2.csv',
    'intermediate_path': '../intermediate/',
    'train_len': 91,        # length of the window the training loss is taken on
    'train_skip': 91,       # earliest allowed start of that window
    'val_len': 74,          # number of future steps produced by forecast()
    'offset': 793,          # number of leading time steps used as history
    'batch_size': 256,      # training batch size
    'hidden_size': 256,     # hidden units per LSTM cell
    'log_every': 10,        # print the batch loss every N batches
    'read_from_file': False,  # True: reuse the pickles written by get_data()
    'train': True,          # False: load 'model_name' instead of training
    'model_name': 'model_20170910_epoch6_loss39.2927.pth',
    'forecast': True,
    'cuda': True,
    'seed': 20170911,
}
args = argparse.Namespace(**parser)

# Only use the GPU when one is actually present.
args.cuda = args.cuda and torch.cuda.is_available()

# Seed every RNG the notebook draws from. train() shuffles the series and
# picks the loss window with np.random, so seeding torch alone is not enough
# to make a run reproducible.
torch.manual_seed(args.seed)
np.random.seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed_all(args.seed)

# Keep the artefacts of each seed in their own sub-directory.
args.intermediate_path = os.path.join(args.intermediate_path, str(args.seed))

### model

In [3]:
class DenseLSTMForecast(nn.Module):
    """Three stacked LSTM cells with dense (concatenative) skip connections.

    Every layer receives the raw input of the current step concatenated with
    the hidden states of all layers below it; the linear head sees the input
    plus all three hidden states and emits one value per step.
    """

    def __init__(self, hidden_size):
        """Build the cells; `hidden_size` is the width of each LSTM cell."""
        super(DenseLSTMForecast, self).__init__()
        self.lstm1 = nn.LSTMCell(1, hidden_size)
        self.lstm2 = nn.LSTMCell(hidden_size+1, hidden_size)
        self.lstm3 = nn.LSTMCell(2*hidden_size+1, hidden_size)
        self.linear = nn.Linear(3*hidden_size+1, 1)
        self.hidden_size = hidden_size

    def _init_state(self, x):
        """Return zeroed (h, c) pairs for the three cells.

        The tensors are allocated from `x` itself, so they live on the same
        device and have the same tensor type as the input. This removes the
        need to consult a global CUDA flag inside the module.
        """
        def zeros():
            return Variable(x.data.new(x.size(0), self.hidden_size).zero_())
        return [(zeros(), zeros()) for _ in range(3)]

    def _step(self, x_t, state):
        """Advance all three cells by one time step.

        `x_t` has shape (batch, 1). Returns the step output of shape
        (batch, 1) together with the updated list of (h, c) pairs.
        """
        (h1, c1), (h2, c2), (h3, c3) = state
        h1, c1 = self.lstm1(x_t, (h1, c1))
        h2, c2 = self.lstm2(torch.cat([x_t, h1], dim=1), (h2, c2))
        h3, c3 = self.lstm3(torch.cat([x_t, h1, h2], dim=1), (h3, c3))
        o_t = self.linear(torch.cat([x_t, h1, h2, h3], dim=1))
        return o_t, [(h1, c1), (h2, c2), (h3, c3)]

    def forward(self, x, future=0):
        """Run the network over `x` and optionally keep forecasting.

        Args:
            x: tensor of shape (batch, seq_len, 1) with seq_len >= 1.
            future: number of extra steps to generate after the observed
                sequence by feeding each output back in as the next input.

        Returns:
            Tensor of shape (batch, seq_len + future, 1): one output per
            observed step followed by the `future` generated steps.

        Raises:
            ValueError: if `x` contains no time steps.
        """
        if x.size(1) == 0:
            raise ValueError("input sequence must contain at least one step")

        state = self._init_state(x)
        outputs = []

        # Teacher-forced pass over the observed sequence.
        for x_t in x.chunk(x.size(1), dim=1):
            o_t, state = self._step(x_t.squeeze(dim=1), state)
            outputs.append(o_t)

        # Free-running pass: the previous prediction is the next input.
        for _ in range(future):
            o_t, state = self._step(o_t, state)
            outputs.append(o_t)

        return torch.stack(outputs, dim=1)

### utils

In [4]:
def smape(y_pred, y_true):
    """Symmetric mean absolute percentage error, in percent (0..200).

    Predictions are rounded to the nearest integer before scoring. Pairs
    where both the prediction and the target are zero contribute an error of
    0, and NaN targets are ignored by the mean.

    Args:
        y_pred: array-like of predictions.
        y_true: array-like of targets with the same shape; may contain NaN.

    Returns:
        200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|)).
    """
    y_true = np.asarray(y_true)
    y_pred = np.around(np.asarray(y_pred))
    # Absolute values in the denominator: without them a pair such as
    # (-1, 1) sums to zero and would be scored as a perfect prediction.
    denominator = np.abs(y_true) + np.abs(y_pred)
    # 0/0 is expected here and handled just below, so silence it locally
    # instead of depending on a global np.seterr() call.
    with np.errstate(divide='ignore', invalid='ignore'):
        diff = np.abs(y_true - y_pred) / denominator
    diff = np.where(denominator == 0, 0.0, diff)
    return 200 * np.nanmean(diff)

In [5]:
def get_data():
    """Load the training series and return them raw and standardised.

    When `args.read_from_file` is false the CSV is read, gaps are filled
    forward then backward along each row, values are passed through log1p,
    and each row (page) is standardised over time. The three results are
    pickled into `args.intermediate_path`. Otherwise those pickles are read
    back instead.

    Returns:
        (raw_data, scaled_data, scaler): the untouched values from the CSV
        (NaNs preserved), the log1p + standardised float array of the same
        shape, and the fitted StandardScaler needed to undo the scaling.
    """
    raw_data_file = os.path.join(args.intermediate_path, 'raw_data.pkl')
    scaled_data_file = os.path.join(args.intermediate_path,
                                    'scaled_data.pkl')
    scaler_file = os.path.join(args.intermediate_path, 'scaler.pkl')

    if not args.read_from_file:
        data_df = pd.read_csv(os.path.join(args.data_path, args.train_file),
                              index_col='Page')
        raw_data = data_df.values.copy()

        # Fill gaps along the time axis. ffill()/bfill() replace the
        # deprecated fillna(method=...) spelling and behave the same.
        filled_df = data_df.ffill(axis=1).bfill(axis=1)
        # Rows that are entirely NaN survive both fills; map them to 0.
        data = np.nan_to_num(filled_df.values.astype('float32'))
        data = np.log1p(data)

        # StandardScaler works column-wise, so transpose to put one page per
        # column, scale, and transpose back.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(data.T).T

        # The per-seed directory does not exist on a fresh checkout.
        if not os.path.isdir(args.intermediate_path):
            os.makedirs(args.intermediate_path)

        with open(raw_data_file, 'wb') as f:
            pickle.dump(raw_data, f)
        with open(scaled_data_file, 'wb') as f:
            pickle.dump(scaled_data, f)
        with open(scaler_file, 'wb') as f:
            pickle.dump(scaler, f)
    else:
        # NOTE: pickle.load executes arbitrary code; only point
        # intermediate_path at files written by this notebook.
        with open(raw_data_file, 'rb') as f:
            raw_data = pickle.load(f)
        with open(scaled_data_file, 'rb') as f:
            scaled_data = pickle.load(f)
        with open(scaler_file, 'rb') as f:
            scaler = pickle.load(f)
    return raw_data, scaled_data, scaler

In [6]:
def train(raw_data, scaled_data, scaler, model, criterion, optimizer,
          epoch=None):
    """Train `model` for one pass over all series and return the val loss.

    The series are shuffled, then fed in batches of `args.batch_size`. The
    input is the first `args.offset - 1` steps and the target is the same
    span shifted one step ahead. For each batch the loss is taken on one
    random window of `args.train_len` steps starting no earlier than
    `args.train_skip`.

    Validation is currently disabled, so the returned value is always 0 and
    `raw_data` / `scaler` are unused. To restore it, forecast `args.val_len`
    steps per batch, undo the shuffle with `np.argsort` of the permutation,
    inverse-transform, and score with `smape` against `raw_data`.

    Args:
        raw_data: unscaled series (unused while validation is disabled).
        scaled_data: 2-D float array, one row per series.
        scaler: fitted scaler (unused while validation is disabled).
        model: network to train; called as `model(batch)`.
        criterion: loss function taking (output, target).
        optimizer: optimizer over `model.parameters()`.
        epoch: epoch number to show in the summary line. When omitted, a
            notebook-level `epoch` variable is used if one exists.

    Returns:
        The validation loss (0 while validation is disabled).
    """
    if epoch is None:
        # Earlier callers relied on a global loop variable named `epoch`;
        # keep honouring it, but do not crash after a full pass without it.
        epoch = globals().get('epoch', '?')

    def scalar(t):
        # 0-dim tensors (newer PyTorch) need .item(); 1-element Variables
        # (older PyTorch) are read through .data[0].
        return t.item() if hasattr(t, 'item') else t.data[0]

    # Prefer the in-place spelling where available; the old name is
    # deprecated in newer PyTorch releases.
    clip_grad = (getattr(nn.utils, 'clip_grad_norm_', None)
                 or nn.utils.clip_grad_norm)

    p = np.random.permutation(scaled_data.shape[0])

    input_tensor = torch.from_numpy(
        scaled_data[p, :(args.offset-1)]).unsqueeze(2)
    target_tensor = torch.from_numpy(
        scaled_data[p, 1:args.offset]).unsqueeze(2)
    dataset = TensorDataset(input_tensor, target_tensor)
    data_loader = DataLoader(dataset, args.batch_size)

    train_loss = 0
    init_time = time.time()
    for i, (inputt, target) in enumerate(data_loader):
        if args.cuda:
            inputt = inputt.cuda()
            target = target.cuda()
        inputt = Variable(inputt)
        target = Variable(target)

        output = model(inputt)
        # One random loss window per batch.
        pos = np.random.randint(args.train_skip,
                                inputt.size(1)-args.train_len+1)
        loss = criterion(output[:, pos:pos+args.train_len],
                         target[:, pos:pos+args.train_len])
        optimizer.zero_grad()
        loss.backward()
        # Clip on the max-norm (largest absolute gradient entry) at 3.
        clip_grad(model.parameters(), 3, float('inf'))
        optimizer.step()

        batch_loss = scalar(loss)
        # Weight by batch size so the final average is per series.
        train_loss += batch_loss * inputt.size(0)

        if i % args.log_every == 0:
            print("   % Time: {:4.0f}s | Batch: {:4} | "
                  "Train loss: {:.4f}".format(
                      time.time()-init_time, i+1, batch_loss))

    train_loss /= scaled_data.shape[0]
    val_loss = 0  # placeholder while validation is disabled
    print("="*10)
    print("   % Epoch: {} | Time: {:4.0f}s | "
          "Train loss: {:.4f} | Val loss: {:.4f}"
          .format(epoch, time.time()-init_time, train_loss, val_loss))
    print("="*10)
    return val_loss

In [7]:
def forecast(raw_data, scaled_data, scaler, model):
    """Predict `args.val_len` steps past the history for every series.

    The first `args.offset` steps of each scaled series are fed to the model,
    which then generates `args.val_len` further steps on its own. The outputs
    are mapped back to the original scale by undoing the standardisation and
    the log1p transform, then clipped at zero.

    Args:
        raw_data: unscaled series (not used here).
        scaled_data: 2-D float array, one row per series.
        scaler: the fitted scaler that produced `scaled_data`.
        model: network called as `model(batch, n_future)`.

    Returns:
        Array of shape (n_series, args.val_len) of non-negative predictions.
    """
    input_tensor = torch.from_numpy(
        scaled_data[:, :args.offset]).unsqueeze(2)
    # TensorDataset is given a dummy target so batches unpack as pairs.
    target_tensor = torch.zeros(input_tensor.size(0))
    dataset = TensorDataset(input_tensor, target_tensor)
    data_loader = DataLoader(dataset, 128)

    # NOTE(review): on PyTorch versions that provide torch.no_grad(), wrapping
    # this loop in it should avoid building an autograd graph - confirm the
    # targeted version before adding it.
    output_list = []
    for inputt, _ in data_loader:
        if args.cuda:
            inputt = inputt.cuda()
        output = model(Variable(inputt), args.val_len)
        # Keep only the generated horizon, and slice before the device to
        # host copy so only those steps are transferred.
        horizon = output.data[:, -args.val_len:].squeeze(2)
        output_list.append(horizon.cpu().numpy())

    output_all = np.concatenate(output_list, axis=0)
    # The scaler was fitted with one series per column, hence the transposes.
    prediction = scaler.inverse_transform(output_all.T).T

    # expm1 is the exact inverse of the log1p applied when building the
    # scaled data, and is more accurate than exp(x) - 1 near zero.
    prediction = np.clip(np.expm1(prediction), 0, None)
    return prediction

In [8]:
def save_model(model, epoch, loss):
    """Write the model's state_dict under `args.intermediate_path`.

    The file is named ``model_<seed>_epoch<epoch>_loss<loss>.pth`` with the
    loss formatted to four decimals.

    Args:
        model: module whose `state_dict()` is saved.
        epoch: epoch number embedded in the file name.
        loss: loss value embedded in the file name.
    """
    # The directory may not exist yet when nothing has been cached before.
    if not os.path.isdir(args.intermediate_path):
        os.makedirs(args.intermediate_path)
    model_file = os.path.join(args.intermediate_path,
                              "model_{}_epoch{}_loss{:.4f}.pth"
                              .format(args.seed, epoch, loss))
    torch.save(model.state_dict(), model_file)

### prepare

In [9]:
# Build (or reload from the pickle cache, depending on args.read_from_file)
# the raw series, their scaled version and the fitted scaler.
raw_data, scaled_data, scaler = get_data()

In [10]:
# Loss: mean absolute error between predicted and target (scaled) values.
criterion = nn.L1Loss()

# Network, moved to the GPU when one is in use.
model = DenseLSTMForecast(args.hidden_size)
if args.cuda:
    model = model.cuda()

In [11]:
# RMSprop over all model parameters; the scheduler changes the learning rate
# when its epoch counter reaches each of the listed milestones.
optimizer = optim.RMSprop(params=model.parameters(), lr=1e-3)
scheduler = MultiStepLR(optimizer=optimizer, milestones=[2, 4, 6])

### train

In [12]:
if args.train:
    # Six epochs; a checkpoint is written after each one.
    for epoch in range(1, 7):
        # NOTE(review): newer PyTorch expects scheduler.step() after the
        # epoch's optimizer steps; called first, the milestones may take
        # effect one epoch earlier - confirm against the installed version.
        scheduler.step()
        # Report the learning rate the optimizer will actually use, read
        # straight from its parameter groups.
        print("=> EPOCH {} with lr {}".format(
            epoch, [group['lr'] for group in optimizer.param_groups]))
        val_loss = train(raw_data, scaled_data, scaler,
                         model, criterion, optimizer)
        save_model(model, epoch, val_loss)
else:
    model_file = os.path.join(args.intermediate_path, args.model_name)
    # Without a GPU, remap CUDA tensors in the checkpoint to the CPU so a
    # model trained on GPU can still be loaded.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(
        model_file,
        map_location=None if args.cuda else (lambda storage, loc: storage)))

=> EPOCH 1 with lr [0.001]
   % Time:    2s | Batch:    1 | Train loss: 0.7122
   % Time:   14s | Batch:   11 | Train loss: 0.5575
   % Time:   25s | Batch:   21 | Train loss: 0.4695
   % Time:   37s | Batch:   31 | Train loss: 0.4285
   % Time:   48s | Batch:   41 | Train loss: 0.4239
   % Time:   60s | Batch:   51 | Train loss: 0.4024
   % Time:   71s | Batch:   61 | Train loss: 0.4192
   % Time:   83s | Batch:   71 | Train loss: 0.4230
   % Time:   94s | Batch:   81 | Train loss: 0.4306
   % Time:  106s | Batch:   91 | Train loss: 0.3910
   % Time:  118s | Batch:  101 | Train loss: 0.4088
   % Time:  129s | Batch:  111 | Train loss: 0.4025
   % Time:  141s | Batch:  121 | Train loss: 0.3983
   % Time:  152s | Batch:  131 | Train loss: 0.4136
   % Time:  164s | Batch:  141 | Train loss: 0.4246
   % Time:  176s | Batch:  151 | Train loss: 0.4049
   % Time:  187s | Batch:  161 | Train loss: 0.4102
   % Time:  199s | Batch:  171 | Train loss: 0.4140
   % Time:  210s | Batch:  181 | Trai

   % Time:  451s | Batch:  391 | Train loss: 0.3619
   % Time:  462s | Batch:  401 | Train loss: 0.3591
   % Time:  473s | Batch:  411 | Train loss: 0.3942
   % Time:  485s | Batch:  421 | Train loss: 0.3665
   % Time:  496s | Batch:  431 | Train loss: 0.4043
   % Time:  507s | Batch:  441 | Train loss: 0.3950
   % Time:  518s | Batch:  451 | Train loss: 0.4138
   % Time:  530s | Batch:  461 | Train loss: 0.3893
   % Time:  541s | Batch:  471 | Train loss: 0.3685
   % Time:  552s | Batch:  481 | Train loss: 0.3553
   % Time:  563s | Batch:  491 | Train loss: 0.3767
   % Time:  575s | Batch:  501 | Train loss: 0.3686
   % Time:  586s | Batch:  511 | Train loss: 0.3877
   % Time:  597s | Batch:  521 | Train loss: 0.3813
   % Time:  608s | Batch:  531 | Train loss: 0.3773
   % Time:  619s | Batch:  541 | Train loss: 0.3620
   % Time:  630s | Batch:  551 | Train loss: 0.3522
   % Time:  642s | Batch:  561 | Train loss: 0.3885
   % Epoch: 3 | Time:  648s | Train loss: 54824.9193 | Val loss:

   % Time:  199s | Batch:  181 | Train loss: 0.3591
   % Time:  210s | Batch:  191 | Train loss: 0.3927
   % Time:  221s | Batch:  201 | Train loss: 0.3748
   % Time:  232s | Batch:  211 | Train loss: 0.3818
   % Time:  243s | Batch:  221 | Train loss: 0.3438
   % Time:  254s | Batch:  231 | Train loss: 0.3867
   % Time:  265s | Batch:  241 | Train loss: 0.3784
   % Time:  276s | Batch:  251 | Train loss: 0.3767
   % Time:  287s | Batch:  261 | Train loss: 0.3743
   % Time:  298s | Batch:  271 | Train loss: 0.3963
   % Time:  309s | Batch:  281 | Train loss: 0.3700
   % Time:  320s | Batch:  291 | Train loss: 0.3791
   % Time:  331s | Batch:  301 | Train loss: 0.3592
   % Time:  342s | Batch:  311 | Train loss: 0.3740
   % Time:  353s | Batch:  321 | Train loss: 0.3298
   % Time:  364s | Batch:  331 | Train loss: 0.3848
   % Time:  375s | Batch:  341 | Train loss: 0.3650
   % Time:  386s | Batch:  351 | Train loss: 0.3860
   % Time:  397s | Batch:  361 | Train loss: 0.3497
   % Time:  

### test

In [13]:
if args.forecast:
    # Forecast every series and store the result next to the other
    # intermediate files.
    prediction = forecast(raw_data, scaled_data, scaler, model)
    # Scoring against held-out data is currently switched off:
    #    print("SMAPE: {}".format(smape(prediction, raw_data[:,
    #        args.offset:args.offset+args.val_len])))
    pred_file = os.path.join(args.intermediate_path, "pred_rnn.pkl")
    with open(pred_file, "wb") as f:
        pickle.dump(prediction, f)