In [3]:
from pathlib import Path

In [4]:
import torch
from torch import nn
from torchvision import transforms as T
from torchvision import datasets

In [6]:
trn_ds = datasets.MNIST(root=Path.home()/'data'/'MNIST',
                        train=True,
                        transform=T.ToTensor(),
                        download=True)

In [8]:
tst_ds = datasets.MNIST(root=Path.home()/'data'/'MNIST',
                        train=False,
                        transform=T.ToTensor())

In [10]:
trn_ds.data.size(), trn_ds.targets.size()

(torch.Size([60000, 28, 28]), torch.Size([60000]))

In [11]:
tst_ds.data.size(), tst_ds.targets.size()

(torch.Size([10000, 28, 28]), torch.Size([10000]))

In [32]:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim,
                          batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        h0.requires_grad_()
        h0 = h0.cuda()
        out, hn = self.rnn(x, h0.detach())
        out = self.fc(out[:, -1, :])
        return out

We interpret MNIST images as sequence of observations.

In [16]:
torch.cuda.set_device(0)

In [33]:
input_dim = 28    # MNIST image width (input features)
hidden_dim = 100  # number of units per hidden layer of RNN
layer_dim = 1     # number of hidden layers
output_dim = 10   # number of MNIST classes
seq_dim = 28      # MNIST image height (timestaps to unroll)

model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)
model = model.cuda()
lr = 0.01
opt = torch.optim.SGD(model.parameters(), lr=lr)

bs = 100
n_iters = 3000
n_epochs = int(n_iters / (len(trn_ds) / bs))
trn_loader = torch.utils.data.DataLoader(
    dataset=trn_ds, batch_size=bs, shuffle=True)
tst_loader = torch.utils.data.DataLoader(
    dataset=tst_ds, batch_size=bs, shuffle=False)

c = 0
criterion = nn.CrossEntropyLoss()

for epoch in range(n_epochs):
    for i, (images, labels) in enumerate(trn_loader):
        model.train()
        images = images.view(-1, seq_dim, input_dim).requires_grad_()
        opt.zero_grad()
        outputs = model(images.cuda())
        loss = criterion(outputs, labels.cuda())
        loss.backward()
        opt.step()
        
        c += 1
        
        if c % 500 == 0:
            model.eval()
            correct, total = 0, 0
            for images, labels in tst_loader:
                images = images.view(-1, seq_dim, input_dim)
                outputs = model(images.cuda())
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            acc = 100 * correct / total
            print(f'Iteration: {c}. Loss: {loss.item()}. Acc.: {acc}')

RuntimeError: Expected object of backend CUDA but got backend CPU for argument #2 'other'