In [1]:
import gzip
import pickle
from time import time

import torch
import torch.nn.functional as F
from torch import nn, optim
import torch.utils.data as data

In [2]:
# Source of the pickled MNIST archive:
# https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
# The context manager closes the gzip handle once the three splits are loaded.
with gzip.open('data/mnist.pkl.gz') as mnist_file:
    # encoding='latin1': presumably the archive was pickled under Python 2 -- TODO confirm.
    train, valid, test = pickle.load(mnist_file, encoding='latin1')

In [3]:
# Shapes of: train inputs, train labels, validation inputs, test inputs.
tuple(arr.shape for arr in (train[0], train[1], valid[0], test[0]))

((50000, 784), (50000,), (10000, 784), (10000, 784))

In [4]:
def normalize(x, mean, std):
    """Standardize x: shift it by `mean`, then rescale it by `std`.

    If `mean` and `std` are the statistics of x itself, the result has
    mean 0 and standard deviation 1.
    """
    centered = x - mean
    return centered / std

def denorm(x, mean=None, std=None):
    """Undo normalize(): multiply by the std, then add the mean back.

    Args:
        x: normalized value(s).
        mean: mean that was subtracted by normalize(). Defaults to the
            module-level `train_mean` when omitted.
        std: std that normalize() divided by. Defaults to the module-level
            `train_std` when omitted.

    Returns:
        x * std + mean.
    """
    # Resolve the globals lazily so explicit arguments work even before
    # train_mean / train_std have been computed.
    if mean is None:
        mean = train_mean
    if std is None:
        std = train_std
    return x * std + mean

# Statistics are taken over the training inputs only; normalize() and
# denorm() are driven by these two values.
train_mean = train[0].mean()
train_std = train[0].std()
# Display the raw value range next to the statistics.
(train[0].min(), train[0].max(), train_mean, train_std)

(0.0, 0.99609375, 0.13044983, 0.3072898)

In [5]:
# Standardize the inputs of every split with the *training* mean/std; labels
# are carried over untouched.
# NOTE: each name is rebound from its own previous value, so running this
# cell a second time normalizes the data a second time.
train, valid, test = (
    (normalize(split[0], train_mean, train_std), split[1])
    for split in (train, valid, test)
)
# Mean and std of the inputs, in the order train, valid, test.
print(*(
    stat
    for split in (train, valid, test)
    for stat in (split[0].mean(), split[0].std())
))

-3.1638146e-07 0.99999934 -0.005850922 0.99243325 0.005034822 1.0064359


In [6]:
# Sanity check: min/max of the normalized training inputs, followed by the
# min/max after denorm() maps them back.
print(*(
    extreme
    for arr in (train[0], denorm(train[0]))
    for extreme in (arr.min(), arr.max())
))

-0.42451727 2.8170278 0.0 0.99609375


In [7]:
# Network dimensions read off the test split:
#   n_in  -- length of one input vector
#   n_out -- number of output classes (assumes labels run 0..max)
n_in, n_out = test[0].shape[1], test[1].max() + 1
n_in, n_out

(784, 10)

In [8]:
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
dev = 'cuda' if torch.cuda.is_available() else 'cpu'
dev

'cpu'

In [10]:
# Training hyperparameters, read as globals by the cells below.
n_epochs = 10      # full passes over the training set
batch_size = 100   # samples per mini-batch
lr = 0.1           # SGD learning rate

In [11]:
class Resize(nn.Module):
    """Reshape a batch of flat vectors into single-channel 28x28 images."""

    def forward(self, x):
        # -1 lets reshape infer the batch dimension from the element count.
        return x.reshape(-1, 1, 28, 28)

class Flatten(nn.Module):
    """Collapse everything after the first dimension into one axis."""

    def forward(self, x):
        n_rows = len(x)  # size of the leading (batch) dimension
        return x.reshape(n_rows, -1)

def get_model():
    """Build the CNN classifier: three stride-2 conv+ReLU stages and a linear head.

    Takes flat 784-long inputs and returns `n_out` scores per sample
    (`n_out` is read from module scope).
    """
    layers = [Resize()]  # 784 -> 1x28x28
    in_ch = 1
    # (out_channels, kernel_size, padding); each stage uses stride 2, so the
    # spatial size goes 28 -> 14 -> 7 -> 4.
    for out_ch, kernel, pad in ((8, 5, 2), (16, 3, 1), (32, 3, 1)):
        layers += [nn.Conv2d(in_ch, out_ch, kernel, padding=pad, stride=2), nn.ReLU()]
        in_ch = out_ch
    # 32 channels of 4x4 feature maps feed the linear classifier.
    layers += [Flatten(), nn.Linear(32 * 4 * 4, n_out)]
    return nn.Sequential(*layers)

def get_data_loader(x, y, shuffle=False, first_n=None):
    """Wrap inputs `x` and targets `y` in a batched DataLoader.

    Args:
        x, y: inputs and targets; both are converted with torch.tensor().
        shuffle: passed through to the DataLoader.
        first_n: if truthy, keep only the first `first_n` samples
            (None or 0 keeps everything).

    The tensors are created on the module-level device `dev`, and batches
    hold the module-level `batch_size` samples.
    """
    if first_n:
        x = x[:first_n]
        y = y[:first_n]
    tensors = [torch.tensor(arr, device=dev) for arr in (x, y)]
    dataset = data.TensorDataset(*tensors)
    return data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation and batch shuffling repeat across "Restart & Run All".
# (GPU kernels may still introduce small run-to-run differences.)
torch.manual_seed(0)

train_dl = get_data_loader(*train, shuffle=True)
valid_dl = get_data_loader(*valid)
model = get_model().to(device=dev)
opt = optim.SGD(model.parameters(), lr)
t0 = time()
for epoch in range(n_epochs):
    # --- training pass ---
    model.train()  # explicit mode switch; matters once dropout/batch-norm is added
    totloss = 0.
    for xb, yb in train_dl:
        loss = F.cross_entropy(model(xb), yb)
        # cross_entropy averages over the batch; weight by the batch size so
        # the epoch figure below is a true per-sample mean.
        totloss += loss.item() * len(xb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    print('train loss', totloss / len(train[0]))

    # --- validation pass (no gradients needed) ---
    model.eval()
    with torch.no_grad():
        loss = 0.      # running sum of per-sample validation loss
        accuracy = 0.  # running count of correct predictions
        for xb, yb in valid_dl:
            yhat = model(xb)
            accuracy += (yhat.argmax(dim=1) == yb).sum().item()
            loss += F.cross_entropy(yhat, yb).item() * len(xb)
        # Prints: validation accuracy, mean validation loss.
        print(accuracy / len(valid[0]), loss / len(valid[0]))
print(time() - t0, 's')

train loss 0.455674238152802
0.957 0.14547233759425582
train loss 0.12231811636313795
0.9728 0.09727870895061642
train loss 0.08752219603955745
0.9762 0.0827061158476863
train loss 0.069766287188977
0.9795 0.07259693318745121
train loss 0.05853476858651265
0.9764 0.07723196109058335
train loss 0.05088327271072194
0.9806 0.06519232205173467
train loss 0.045280935860704634
0.9816 0.0637406308442587
train loss 0.04107590311765671
0.9811 0.06570773883518996
train loss 0.03685951347602531
0.9826 0.059405840126564725
train loss 0.03324395830044523
0.9833 0.060406671670789364
1749.6517124176025 s
