In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

In [3]:
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import transforms as T
from torchvision import datasets

In [4]:
torch.cuda.set_device(1)

In [5]:
trn_ds = datasets.MNIST(root=Path.home()/'data'/'MNIST',
                        train=True,
                        transform=T.ToTensor(),
                        download=True)

In [6]:
tst_ds = datasets.MNIST(root=Path.home()/'data'/'MNIST',
                        train=False,
                        transform=T.ToTensor())

In [7]:
trn_ds.data.size(), trn_ds.targets.size()

(torch.Size([60000, 28, 28]), torch.Size([60000]))

In [8]:
tst_ds.data.size(), tst_ds.targets.size()

(torch.Size([10000, 28, 28]), torch.Size([10000]))

In [9]:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim,
                          batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        h0.requires_grad_()
        h0 = h0.cuda()
        out, hn = self.rnn(x, h0.detach())
        out = self.fc(out[:, -1, :])
        return out

We interpret MNIST images as sequence of observations.

In [10]:
# input_dim = 28    # MNIST image width (input features)
# hidden_dim = 100  # number of units per hidden layer of RNN
# layer_dim = 1     # number of hidden layers
# output_dim = 10   # number of MNIST classes
# seq_dim = 28      # MNIST image height (timestaps to unroll)

# model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)
# model = model.cuda()
# lr = 0.01
# opt = torch.optim.SGD(model.parameters(), lr=lr)

# bs = 100
# n_iters = 3000
# n_epochs = int(n_iters / (len(trn_ds) / bs))
# trn_loader = torch.utils.data.DataLoader(
#     dataset=trn_ds, batch_size=bs, shuffle=True)
# tst_loader = torch.utils.data.DataLoader(
#     dataset=tst_ds, batch_size=bs, shuffle=False)

# c = 0
# criterion = nn.CrossEntropyLoss()

# for epoch in range(n_epochs):
#     for i, (images, labels) in enumerate(trn_loader):
#         model.train()
#         images = images.view(-1, seq_dim, input_dim).requires_grad_()
#         opt.zero_grad()
#         outputs = model(images.cuda())
#         loss = criterion(outputs, labels.cuda())
#         loss.backward()
#         opt.step()
        
#         c += 1
        
#         if c % 500 == 0:
#             model.eval()
#             correct, total = 0, 0
#             for images, labels in tst_loader:
#                 images = images.view(-1, seq_dim, input_dim)
#                 outputs = model(images.cuda())
#                 _, predicted = torch.max(outputs.data, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels.cuda()).sum()
#             acc = 100 * correct / total
#             print(f'Iteration: {c}. Loss: {loss.item():.4f}. Acc.: {acc}')

Applying the same approach to the competition's dataset

In [11]:
from multiprocessing import cpu_count

In [12]:
from utils import from_feather
from torch_helpers import create_loaders
from torch_helpers import create_datasets, create_test_dataset

In [13]:
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

In [14]:
bs = 128
trn_ds, val_ds, enc = create_datasets(x_trn, y_trn['surface'])
trn_dl, val_dl = create_loaders(trn_ds, val_ds, bs, jobs=cpu_count())

In [18]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        h0.requires_grad_()
        h0 = h0.cuda()
        out, hn = self.rnn(x, h0.detach())
        out = self.fc(out[:, -1, :])
        return out

In [19]:
input_dim = 10    
hidden_dim = 100
layer_dim = 3
output_dim = 9
seq_dim = 128
lr = 0.0001
n_iters = 3000

n_epochs = int(n_iters / (len(trn_ds) / bs))
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model = model.cuda()
opt = torch.optim.RMSprop(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

step = 0

for epoch in range(n_epochs):
    for i, (x_batch, y_batch) in enumerate(trn_dl):
        model.train()
        x_batch = x_batch.cuda()
        y_batch = y_batch.cuda()

        opt.zero_grad()
        out = model(x_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        opt.step()
        
        step += 1
        
        if step % 250 == 0:
            model.eval()
            correct, total = 0, 0
            for x_val, y_val in val_dl:
                x_val, y_val = [t.cuda() for t in (x_val, y_val)]
                out = model(x_val)
                preds = F.softmax(out, dim=1).argmax(dim=1)
                total += y_val.size(0)
                correct += (preds == y_val).sum()
    
            acc = correct / total
            print(f'Step: {step:4d}. Loss: {loss.item():.4f}. Acc.: {acc:2.2%}')

RuntimeError: Expected hidden[0] size (3, 128, 100), got (128, 100)