In [1]:
# Re-import edited project modules automatically before each cell runs.
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

In [63]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms as T
from torchvision import datasets

In [4]:
# Make GPU index 1 the default CUDA device for this notebook.
# NOTE(review): the index is hard-coded; presumably this fails on a machine
# with fewer than two GPUs — confirm before running elsewhere.
torch.cuda.set_device(1)

In [5]:
# MNIST training split stored under ~/data/MNIST; download=True requests the
# files if they are not there yet. ToTensor() converts each image to a tensor.
trn_ds = datasets.MNIST(root=Path.home()/'data'/'MNIST',
                        train=True,
                        transform=T.ToTensor(),
                        download=True)

In [6]:
# MNIST test split from the same root directory. No download=True is passed
# here, so the files must already be on disk (the training-set cell requests
# the download).
tst_ds = datasets.MNIST(root=Path.home()/'data'/'MNIST',
                        train=False,
                        transform=T.ToTensor())

In [7]:
# Sanity check: sizes of the raw training images and their labels.
trn_ds.data.size(), trn_ds.targets.size()

(torch.Size([60000, 28, 28]), torch.Size([60000]))

In [8]:
# Sanity check: sizes of the raw test images and their labels.
tst_ds.data.size(), tst_ds.targets.size()

(torch.Size([10000, 28, 28]), torch.Size([10000]))

In [9]:
class RNNModel(nn.Module):
    """Sequence classifier: a ReLU RNN followed by a linear layer.

    Only the RNN output at the last time step is fed to the linear layer.

    Args:
        input_dim: number of features per time step.
        hidden_dim: number of units in each recurrent layer.
        layer_dim: number of stacked recurrent layers.
        output_dim: size of the output (one logit per class).
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim,
                          batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Compute logits for a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim); the RNN is built
                with batch_first=True.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # Allocate the zero initial state on the same device and with the same
        # dtype as the input, so the model works on CPU as well as on any GPU
        # instead of assuming CUDA. The state is a constant, so it does not
        # need gradients.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
        out, _ = self.rnn(x, h0)
        # Classify from the hidden state of the last time step only.
        return self.fc(out[:, -1, :])

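We interpret each MNIST image as a sequence of observations: every one of the 28 rows is a time step, and the 28 pixels in that row are its input features.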

In [None]:
# Train the RNN classifier on MNIST and report test accuracy every 500 steps.
input_dim = 28    # MNIST image width (input features per time step)
hidden_dim = 100  # number of units per hidden layer of RNN
layer_dim = 1     # number of hidden layers
output_dim = 10   # number of MNIST classes
seq_dim = 28      # MNIST image height (time steps to unroll)

model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)
model = model.cuda()
lr = 0.01
opt = torch.optim.SGD(model.parameters(), lr=lr)

bs = 100
n_iters = 3000
# Turn the iteration budget into whole epochs: one epoch is len(trn_ds) / bs
# batches.
n_epochs = int(n_iters / (len(trn_ds) / bs))
trn_loader = torch.utils.data.DataLoader(
    dataset=trn_ds, batch_size=bs, shuffle=True)
tst_loader = torch.utils.data.DataLoader(
    dataset=tst_ds, batch_size=bs, shuffle=False)

c = 0  # number of optimisation steps taken so far
criterion = nn.CrossEntropyLoss()

for epoch in range(n_epochs):
    for images, labels in trn_loader:
        model.train()
        # Each image becomes a sequence of seq_dim rows with input_dim pixels.
        # Inputs do not need gradients; only the model parameters are trained.
        images = images.view(-1, seq_dim, input_dim)
        opt.zero_grad()
        outputs = model(images.cuda())
        loss = criterion(outputs, labels.cuda())
        loss.backward()
        opt.step()

        c += 1

        if c % 500 == 0:
            model.eval()
            correct, total = 0, 0
            # No autograd graph is needed while measuring accuracy.
            with torch.no_grad():
                for images, labels in tst_loader:
                    images = images.view(-1, seq_dim, input_dim)
                    outputs = model(images.cuda())
                    _, predicted = torch.max(outputs, 1)
                    total += labels.size(0)
                    # .item() keeps the counter a plain Python int, so the
                    # accuracy below is a float rather than a CUDA tensor.
                    correct += (predicted == labels.cuda()).sum().item()
            acc = 100 * correct / total
            # The loss shown is that of the most recent training batch.
            print(f'Iteration: {c}. Loss: {loss.item():.4f}. Acc.: {acc}')

Applying the same approach to the competition's dataset

In [11]:
from multiprocessing import cpu_count

In [12]:
from utils import from_feather
from torch_helpers import create_loaders
from torch_helpers import create_datasets, create_test_dataset

In [26]:
class LSTMModel(nn.Module):
    """Sequence classifier: a stacked LSTM followed by a linear layer.

    Only the LSTM output at the last time step is fed to the linear layer.

    Args:
        input_dim: number of features per time step.
        hidden_dim: number of units in each LSTM layer.
        layer_dim: number of stacked LSTM layers.
        output_dim: size of the output (one logit per class).
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Not used by the model itself; kept so code that reads these
        # attributes keeps working.
        self.batch_size = None
        self.hidden = None

    def forward(self, x):
        """Compute logits for a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim); the LSTM is built
                with batch_first=True.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        h0, c0 = self.init_hidden(x)
        out, _ = self.rnn(x, (h0, c0))
        # Classify from the output of the last time step only.
        return self.fc(out[:, -1, :])

    def init_hidden(self, x):
        """Return fresh zero hidden and cell states sized for batch ``x``.

        The states are created on the same device and with the same dtype as
        ``x`` rather than unconditionally on CUDA, so the model also runs on
        CPU.

        Returns:
            List ``[h0, c0]``, each of shape
            (layer_dim, batch, hidden_dim).
        """
        shape = (self.layer_dim, x.size(0), self.hidden_dim)
        return [x.new_zeros(shape), x.new_zeros(shape)]

In [32]:
# Fetch the training features, training targets and test features by name
# (presumably stored as feather files — see utils.from_feather).
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

# Build train/validation datasets from the 'surface' target column. `enc` is
# used later to map predicted class indices back to labels through
# enc.inverse_transform. NOTE: this rebinds trn_ds, which held the MNIST data.
trn_ds, val_ds, enc = create_datasets(x_trn, y_trn['surface'])

bs = 128  # mini-batch size
# jobs=cpu_count() presumably sets one loader worker per CPU core — verify in
# torch_helpers.create_loaders.
trn_dl, val_dl = create_loaders(trn_ds, val_ds, bs, jobs=cpu_count())

In [59]:
from torch.optim.lr_scheduler import _LRScheduler

class CyclicLR(_LRScheduler):
    """Learning-rate scheduler driven by a user-supplied callable.

    ``schedule(step, base_lr)`` is evaluated once per base learning rate,
    where ``step`` is the scheduler's ``last_epoch`` counter, and must return
    the learning rate to use.
    """

    def __init__(self, optimizer, schedule, last_epoch=-1):
        assert callable(schedule)
        # Stored before the base initialiser runs, since that may already
        # call get_lr().
        self.schedule = schedule
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the scheduled learning rate for every base learning rate."""
        step = self.last_epoch
        rates = []
        for base_lr in self.base_lrs:
            rates.append(self.schedule(step, base_lr))
        return rates

In [60]:
def cosine(t_max, eta_min=0):
    """Build a cosine schedule that restarts every ``t_max`` steps.

    The returned callable takes ``(epoch, base_lr)`` and returns a value that
    starts at ``base_lr`` at the beginning of each cycle and decays towards
    ``eta_min`` along half a cosine period.
    """

    def scheduler(epoch, base_lr):
        # Position inside the current cycle, mapped onto [0, pi).
        phase = np.pi * (epoch % t_max) / t_max
        span = base_lr - eta_min
        return eta_min + span * (1 + np.cos(phase)) / 2

    return scheduler

In [71]:
# Train the LSTM classifier with a cosine learning-rate schedule, evaluate on
# the validation set after every epoch, checkpoint each new best model and
# stop early once accuracy has not improved for `patience` epochs.
input_dim = 10
hidden_dim = 256
layer_dim = 3
output_dim = 9
seq_dim = 128     # not referenced in this cell
lr = 0.0005
n_iters = 3000    # not referenced in this cell

n_epochs = 1000
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model = model.cuda()
opt = torch.optim.RMSprop(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
iterations_per_epoch = len(trn_dl)
# The scheduler is stepped once per batch, so one cosine cycle spans two epochs.
sched = CyclicLR(opt, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/100))
patience, trials = 100, 0
best_acc = 0

for epoch in range(1, n_epochs + 1):

    model.train()
    for x_batch, y_batch in trn_dl:
        x_batch = x_batch.cuda()
        y_batch = y_batch.cuda()
        opt.zero_grad()
        out = model(x_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        opt.step()
        # Step the scheduler after the optimizer: stepping it first skips the
        # first value of the schedule and triggers a PyTorch warning.
        sched.step()

    model.eval()
    correct, total = 0, 0
    # No autograd graph is needed while measuring validation accuracy.
    with torch.no_grad():
        for x_val, y_val in val_dl:
            x_val, y_val = [t.cuda() for t in (x_val, y_val)]
            out = model(x_val)
            # argmax over the logits picks the same class as argmax over
            # log-softmax, so the extra normalisation is skipped.
            preds = out.argmax(dim=1)
            total += y_val.size(0)
            correct += (preds == y_val).sum().item()

    acc = correct / total

    if epoch % 5 == 0:
        # The loss shown is that of the last training batch of the epoch.
        print(f'Epoch: {epoch:3d}. Loss: {loss.item():.4f}. Acc.: {acc:2.2%}')

    if acc > best_acc:
        trials = 0
        best_acc = acc
        filename = f'lstm_best_{acc:0.4f}.pth'
        torch.save(model.state_dict(), filename)
        print(f'Epoch {epoch} best model saved: {filename}')
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break

Epoch 1 best model saved: lstm_best_0.2100.pth
Epoch 2 best model saved: lstm_best_0.2651.pth
Epoch:   5. Loss: 1.8100. Acc.: 27.82%
Epoch 5 best model saved: lstm_best_0.2782.pth
Epoch 6 best model saved: lstm_best_0.3202.pth
Epoch 9 best model saved: lstm_best_0.3333.pth
Epoch:  10. Loss: 1.6520. Acc.: 37.80%
Epoch 10 best model saved: lstm_best_0.3780.pth
Epoch 12 best model saved: lstm_best_0.4042.pth
Epoch 13 best model saved: lstm_best_0.4383.pth
Epoch 14 best model saved: lstm_best_0.4409.pth
Epoch:  15. Loss: 1.4330. Acc.: 44.88%
Epoch 15 best model saved: lstm_best_0.4488.pth
Epoch 18 best model saved: lstm_best_0.4514.pth
Epoch:  20. Loss: 1.2123. Acc.: 47.51%
Epoch 20 best model saved: lstm_best_0.4751.pth
Epoch 21 best model saved: lstm_best_0.5118.pth
Epoch 22 best model saved: lstm_best_0.5512.pth
Epoch:  25. Loss: 1.3888. Acc.: 53.02%
Epoch 26 best model saved: lstm_best_0.5801.pth
Epoch 28 best model saved: lstm_best_0.5906.pth
Epoch:  30. Loss: 1.0516. Acc.: 61.42%
Epo

In [72]:
# Restore model weights from a saved checkpoint.
# NOTE(review): the file name is hard-coded; presumably it is the best
# checkpoint written by the training loop (which saves files named
# lstm_best_<acc>.pth) — confirm the file exists before running.
model.load_state_dict(torch.load('lstm_best_0.8215.pth'))

In [73]:
# Wrap the competition test features in a dataset for inference.
# NOTE: this rebinds tst_ds, which earlier held the MNIST test split.
tst_ds = create_test_dataset(x_tst)

In [74]:
# Put the model in evaluation mode before running inference.
model.eval()

LSTMModel(
  (rnn): LSTM(10, 256, num_layers=3, batch_first=True)
  (fc): Linear(in_features=256, out_features=9, bias=True)
)

In [75]:
# Predict a class index for every test sample, batch by batch.
test_results = []
# Inference only: disable autograd so no graph is built or kept in memory.
with torch.no_grad():
    for x, _ in DataLoader(tst_ds, batch_size=256, shuffle=False):
        # Swap axes 1 and 2 before feeding the model. Presumably the test
        # dataset yields (batch, features, seq) while the model expects
        # (batch, seq, features) — TODO confirm against create_test_dataset.
        x = x.permute(0, 2, 1)
        out = model(x.cuda())
        # argmax over the logits picks the same class as argmax over
        # log-softmax, so the extra normalisation is skipped.
        y_hat = out.argmax(dim=1)
        test_results.extend(y_hat.tolist())

In [76]:
import pandas as pd
from basedir import SAMPLE

In [77]:
# Fill the sample submission with the predicted labels and upload it.
submit = pd.read_csv(SAMPLE)
# Map predicted class indices back to the original surface labels.
submit['surface'] = enc.inverse_transform(test_results)
submit.to_csv('submit.csv', index=None)
# Submits to the competition through the Kaggle command-line tool.
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LSTM"

100%|██████████████████████████████████████| 53.1k/53.1k [00:00<00:00, 48.3kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 