# Model

## Setup

In [1]:
import numpy as np
import torch
import torch.utils.data
import torch.nn as nn
from tensorboardX import SummaryWriter

data = np.load('./dataset/preprocessed.npz')
train_x = torch.from_numpy(data['train_x']).float()
train_y = torch.from_numpy(data['train_y']).float()
test_x = torch.from_numpy(data['test_x']).float()

train_len = int(len(train_x) * 0.8)
eval_len = len(train_x) - train_len
train_data, eval_data = torch.utils.data.random_split(
    torch.utils.data.TensorDataset(train_x, train_y),
    [train_len, eval_len]
)
test_data = torch.utils.data.TensorDataset(test_x)

print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)
print('test_x shape:', test_x.shape)

train_x shape: torch.Size([3000, 20083])
train_y shape: torch.Size([3000])
test_x shape: torch.Size([4398, 20083])


## Define Model

In [44]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=20082, out_features=8192),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=8192, out_features=2048),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=2048, out_features=512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=512, out_features=128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128, out_features=1),
            nn.ReLU()
        )
        self.loss_func = nn.MSELoss()
        
    def forward(self, in_data):
        in_data[torch.isnan(in_data)] = 0 # set nan to 0
        pred = self.net(in_data[:,1:]) # skip id
        return pred

    def loss(self, pred, truth):
        loss = torch.sqrt(self.loss_func(torch.log1p(pred), torch.log1p(truth))) #RMSLE
        return loss

## Runner function

In [45]:
def run(model, loaders, optimizer, writer, num_epoch=10, device='cpu'):
    def run_epoch(mode):
        epoch_loss = 0.0
        for i, batch in enumerate(loaders[mode], 0):
            in_data, truth = batch
            in_data, truth = in_data.to(device), truth.to(device)

            if mode == 'train':
                optimizer.zero_grad()

            pred = model(in_data)
            batch_loss = model.loss(pred, truth)
            

            epoch_loss += batch_loss.item()
            if mode == 'train':
                batch_loss.backward()
                optimizer.step()

        # sum of all batchs / num of batches
        epoch_loss /= i + 1
        print('epoch %d %s loss %.4f' % (epoch, mode, epoch_loss))
        # log to tensorboard
        if not (writer is None):
            writer.add_scalars('%s_loss' % model.__class__.__name__,
                         tag_scalar_dict={mode: epoch_loss}, 
                         global_step=epoch)
    for epoch in range(num_epoch):
        run_epoch('train')
        run_epoch('eval')

## Training

In [46]:
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
run(
    model=model,
    loaders={
        'train': torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True),
        'eval': torch.utils.data.DataLoader(eval_data, batch_size=128, shuffle=True)
    },
    optimizer=optimizer, 
    writer=SummaryWriter('./logs/'), 
    num_epoch=10, 
    device='cpu'
)

epoch 0 train loss 6.9072
epoch 0 eval loss 5.4217
epoch 1 train loss 4.7072
epoch 1 eval loss 4.3277
epoch 2 train loss 4.0016
epoch 2 eval loss 3.8220
epoch 3 train loss 3.6754
epoch 3 eval loss 3.6627
epoch 4 train loss 3.5177
epoch 4 eval loss 3.5070
epoch 5 train loss 3.3969
epoch 5 eval loss 3.4733
epoch 6 train loss 3.3783
epoch 6 eval loss 3.3725
epoch 7 train loss 3.3577
epoch 7 eval loss 3.4255
epoch 8 train loss 3.3098
epoch 8 eval loss 3.4242
epoch 9 train loss 3.3238
epoch 9 eval loss 3.4065


## Export test output

In [61]:
test_pred_tensor = model(test_x)
print('test loss: ', model.loss(test_pred_tensor, test_x).item())
test_pred = test_pred_tensor.detach().cpu().numpy()
output = np.concatenate((np.expand_dims(data['test_x'][:,0], axis=1), test_pred), axis=1)
np.savetxt('./test_out.csv', output, header='id,revenue', delimiter=',', fmt='%i', comments='')

test loss:  16.116472244262695
