In [1]:
import sys
import numpy as np
import pandas as pd
from numba import jit

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

torch.cuda.empty_cache()

In [2]:
data_med = pd.read_feather('./data_clean.feat')
y_true = pd.read_feather('./true_values.feat')
pages = pd.read_csv('./pages.csv',header=None)[0]
data_test = pd.read_feather('./data_test.feat')
test_pages = pd.read_csv('./test_pages.csv',header=None)[0]

## Building Training Data

In [None]:
data = pd.read_csv('./train_2.csv', encoding='utf-8')
y_true = pd.read_csv('./solution_11_15.csv', encoding='utf-8')
sample_solution = pd.read_csv('./sample_submission_2.csv', encoding='utf-8')
keys = pd.read_csv('./key_2.csv', encoding='utf-8')

keys = dict(zip(keys.Id.values, keys.Page.values))

In [None]:
# load and clean y_true
y_true = pd.read_csv('./solution_11_15.csv', encoding='utf-8')
y_true.Id = y_true.Id.apply(lambda x : keys[x])
y_true['Date'] = y_true.Id.apply(lambda x : x[-10:])
y_true.Id = y_true.Id.apply(lambda x : x[:-11])
y_true = y_true.pivot('Id','Date','Visits')
y_true['Id']=y_true.index
y_true.reset_index(drop=True,inplace=True)

In [None]:
data_med =  data.iloc[:,1:].apply(lambda x : x.fillna(x.dropna().median()), axis=1)
pages = data.Page
data_med = data_med.astype(np.float32)

del data

In [None]:
y_true.iloc[:,:62] = y_true.iloc[:,:62].astype(np.float32)
data_med['Id'] = pages
data_test = data_med.merge(y_true, 'left', on='Id')
test_pages = data_test.Id
data_test.drop('Id',axis=1,inplace=True)
data_test = data_test.iloc[:,62:]

In [None]:
data_med.to_feather('./data_clean.feat')
y_true.to_feather('./true_values.feat')
pages.to_csv('./pages.csv',index=False)
data_test.to_feather('./data_test.feat')
test_pages.to_csv('./test_pages.csv')

## Building Models

For an initial approach, I fit two CNN models with slightly different architecture.  These models generated the entire range of the prediction (62 values) at once.

In [3]:
@jit(nopython=True)
def smape_batch(y_true, y_pred):
    return 100 * np.mean( np.abs(y_true - y_pred) / ( (np.abs(y_true) + np.abs(y_pred)) / 2 ) )

In [4]:
class TimeSeriesData(Dataset):
    def __init__(self,data,page,pred_len=62):
        self.data = data
        self.page = page
        self.length = data.shape[0]
        self.pred_len = pred_len
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, index):
        out = np.array(self.data.loc[index])
        return out[:-self.pred_len], out[-self.pred_len:],self.page.loc[index]

In [5]:
class TimeSeriesCNN(nn.Module):
    def __init__(self,pred_len):
        super(TimeSeriesCNN, self).__init__()
        self.conv_year = nn.Conv1d(1, out_channels=100, kernel_size=365)
        self.conv_year2 = nn.Conv1d(100, 10, 1)
        self.conv_mon = nn.Conv1d(1, out_channels=20, kernel_size=31)
        self.conv_day = nn.Conv1d(1, out_channels=2, kernel_size=7)
        self.fc = nn.Linear(1220, pred_len)
        
    def forward(self, x):
        x = x.unsqueeze(1)
        y = F.relu(self.conv_year(x))
        y2 = F.relu(self.conv_mon(x))
        y3 = F.relu(self.conv_day(x))
        y = nn.AdaptiveAvgPool1d(10)(y)
        y2 = nn.AdaptiveAvgPool1d(10)(y2)
        y3 = nn.AdaptiveAvgPool1d(10)(y3)
        y = y.view(y.size(0),-1)
        y2 = y2.view(y2.size(0),-1)
        y3 = y3.view(y3.size(0),-1)
        #y = self.bn(y)
        #y2 = self.bn2(y2)
        #y3 = self.bn3(y3)
        out = torch.cat([y,y2,y3], dim=1)
        out = F.dropout(out, .2)
        out = self.fc(out)
        
        return out

In [6]:
class TimeSeriesCNN_v2(nn.Module):
    def __init__(self,pred_len):
        super(TimeSeriesCNN_v2, self).__init__()
        self.pred_len = pred_len
        self.conv_1 = nn.Conv1d(1, out_channels=8, kernel_size=7)
        self.conv_2 = nn.Conv1d(8, out_channels=16, kernel_size=5)
        self.conv_3 = nn.Conv1d(16, out_channels=32, kernel_size=12)
        self.conv_4 = nn.Conv1d(32, out_channels=64, kernel_size=10)
        self.conv_5 = nn.Conv1d(64, out_channels=1, kernel_size=1)
        self.fc = nn.Linear(831,pred_len)
        
    def forward(self, x):
        x = x.unsqueeze(1)
        y = F.relu(self.conv_1(x))
        max1 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_2(y))
        max2 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_3(y))
        max3 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_4(y))
        max4 = nn.MaxPool1d(kernel_size = y.shape[2])(y)
        y = self.conv_5(y)
        y = y.view(y.size(0),-1)
        max1 = max1.view(max1.size(0),-1)
        max2 = max2.view(max2.size(0),-1)
        max3 = max3.view(max3.size(0),-1)
        max4 = max4.view(max4.size(0),-1)
        y = F.dropout(y, .1)
        y = torch.cat([max1,max2,max3,max4,y],dim=1)
        y = self.fc(y)
        
        return y

In [7]:
def train_epochs(model, epochs=10, lr=0.01):
    model.train()
    output = list('....................')
    best_smape = 200
    best_epoch = 0
    for i in range(epochs):
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = torch.optim.Adam(parameters, lr=lr)
        y_pred = np.array([])
        y_true = np.array([])
        for x,y,name in train_loader:
            size = y.shape[0]
            x = torch.Tensor(x).cuda()
            y = torch.Tensor(y).cuda()
            y_hat = model(x)
            loss = F.l1_loss(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # calculating total loss
            y_pred = np.append(y_pred,y_hat.cpu().detach().numpy())
            y_true = np.append(y_true,y.cpu().detach().numpy())
        
        # building output
        curr_smape = smape_batch(y_pred, y_true)
        if curr_smape < best_smape:
            best_smape = curr_smape
            best_epoch = i
        output[int(i//(epochs / 20))] = 'x'
        sys.stdout.write('\r' + 'Epoch %s: ' % i +\
                         ''.join(output) +\
                         ' SMAPE: %.2f' % (smape_batch(y_pred,y_true)) +\
                         ' Best: %.2f on epoch %s' % (best_smape, best_epoch)
                        )
        sys.stdout.flush()

In [8]:
def predict(model, test_loader):
    model.eval()
    y_pred = np.array([])
    y_true = np.array([])
    for x,y,name in test_loader:
        x = torch.Tensor(x).cuda()
        y = torch.Tensor(y).cuda()
        y_hat = model(x)
        # calculating total loss
        y_pred = np.append(y_pred,y_hat.cpu().detach().numpy())
        y_true = np.append(y_true,y.cpu().detach().numpy())
    
    print('Test SMAPE: %s' % smape_batch(y_true,y_pred))

## Training

This first block tests the performance of guessing the median for the validation range, which turns out to be around 46%.

In [9]:
raw_data = data_med.drop('Id',axis=1).values
median_vals = np.median(raw_data[:,:-62],axis=1)
med_pred = np.tile(median_vals, (62,1)).transpose()
test = raw_data[:,-62:]

out = np.abs(test-med_pred) / ((np.abs(test) + np.abs(med_pred)) / 2)
out[np.isnan(out)] = 0
np.mean(out)

  


0.4678351

These models do not out perform the median.

In [10]:
train_loader = DataLoader(TimeSeriesData(data_med.drop('Id',axis=1),pages,62),batch_size=2048,shuffle=True,num_workers=4)
model = TimeSeriesCNN(62)
model.cuda()
train_epochs(model, 1000, 1e-4)


Epoch 999: xxxxxxxxxxxxxxxxxxxx SMAPE: 46.40 Best: 45.49 on epoch 974

In [11]:
torch.save(model,'./v1.model')
del model; torch.cuda.empty_cache()

  "type " + obj.__name__ + ". It won't be checked "


In [12]:
train_loader = DataLoader(TimeSeriesData(data_med.drop('Id',axis=1),pages,62),batch_size=2048,shuffle=True,num_workers=3)
model_v2 = TimeSeriesCNN_v2(62)
model_v2.cuda()
train_epochs(model_v2, 1000, 1e-4)


Epoch 999: xxxxxxxxxxxxxxxxxxxx SMAPE: 43.74 Best: 42.43 on epoch 732

In [13]:
torch.save(model_v2,'./v2.model')
del model_v2; torch.cuda.empty_cache()

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
y_true.iloc[:,:62] = y_true.iloc[:,:62].astype(np.float32)
data_med['Id'] = pages
data_test = data_med.merge(y_true, 'left', on='Id')
test_pages = data_test.Id
data_test.drop('Id',axis=1,inplace=True)
data_test = data_test.iloc[:,62:]
test_loader = DataLoader(TimeSeriesData(data_test,pages,62),batch_size=2048,num_workers=3)

## Models with Attention

Per the winning solution on kaggle, these models are-implemented but with some modification.  New features will be created that are callbacks to data at previous time intervals $x_{t-90}$ and $x_{t-365}$.  This model scores around 40%.

In [None]:
model_attn = torch.load('./model_attn.model')

In [None]:
class TimeSeriesCNN_v2(nn.Module):
    def __init__(self,pred_len):
        super(TimeSeriesCNN_v2, self).__init__()
        self.pred_len = pred_len
        self.conv_1 = nn.Conv1d(1, out_channels=8, kernel_size=7)
        self.conv_2 = nn.Conv1d(8, out_channels=16, kernel_size=5)
        self.conv_3 = nn.Conv1d(16, out_channels=32, kernel_size=12)
        self.conv_4 = nn.Conv1d(32, out_channels=64, kernel_size=10)
        self.conv_5 = nn.Conv1d(64, out_channels=1, kernel_size=1)
        self.fc = nn.Linear(831,pred_len)
        
    def forward(self, x):
        x = x.unsqueeze(1)
        y = F.relu(self.conv_1(x))
        max1 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_2(y))
        max2 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_3(y))
        max3 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_4(y))
        max4 = nn.MaxPool1d(kernel_size = y.shape[2])(y)
        y = self.conv_5(y)
        y = y.view(y.size(0),-1)
        max1 = max1.view(max1.size(0),-1)
        max2 = max2.view(max2.size(0),-1)
        max3 = max3.view(max3.size(0),-1)
        max4 = max4.view(max4.size(0),-1)
        y = F.dropout(y, .1)
        y = torch.cat([max1,max2,max3,max4,y],dim=1)
        y = self.fc(y)
        
        return y

In [26]:
class TimeSeriesCNNAttn(nn.Module):
    def __init__(self,pred_len):
        super(TimeSeriesCNNAttn, self).__init__()
        self.pred_len = pred_len
        self.conv_1 = nn.Conv1d(1, out_channels=8, kernel_size=7)
        self.conv_2 = nn.Conv1d(8, out_channels=16, kernel_size=5)
        self.conv_3 = nn.Conv1d(16, out_channels=32, kernel_size=12)
        self.conv_4 = nn.Conv1d(32, out_channels=64, kernel_size=10)
        self.conv_5 = nn.Conv1d(64, out_channels=1, kernel_size=1)
        self.fc = nn.Linear(831,pred_len)
        self.fc2 = nn.Linear(pred_len, pred_len)
        self.fc3 = nn.Linear(pred_len, pred_len)
        self.fc4 = nn.Linear(pred_len * 3, pred_len)

    def forward(self, x):
        x = x.unsqueeze(1)
        y = F.relu(self.conv_1(x))
        max1 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_2(y))
        max2 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_3(y))
        max3 = nn.MaxPool1d(y.shape[2])(y)
        y = F.relu(self.conv_4(y))
        max4 = nn.MaxPool1d(kernel_size = y.shape[2])(y)
        y = F.relu(self.conv_5(y))
        y = y.view(y.size(0),-1)
        max1 = max1.view(max1.size(0),-1)
        max2 = max2.view(max2.size(0),-1)
        max3 = max3.view(max3.size(0),-1)
        max4 = max4.view(max4.size(0),-1)
        y = F.dropout(y, .2)
        y = torch.cat([max1,max2,max3,max4,y],dim=1)
        y = self.fc(y)
        
        x = x.view(x.size(0),-1)
        out = torch.cat([y,x[:,-90:(-90+62)],x[:,-365:(-365+62)]],dim=1)
        #out = F.dropout(out, .2)
        out = self.fc4(out)
        
        return out

In [27]:
model_attn = TimeSeriesCNNAttn(62)

In [None]:
train_loader = DataLoader(TimeSeriesData(data_med.drop('Id',axis=1),pages,62),batch_size=1024,shuffle=True,num_workers=3)
model_attn.cuda()
train_epochs(model_attn, 1500, 1e-4)

Epoch 1480: xxxxxxxxxxxxxxxxxxxx SMAPE: 39.53 Best: 39.26 on epoch 1474

In [None]:
torch.save(model_attn, './model_attn.model')

In [None]:
data_test

In [None]:
data_med