# A basic training loop

## MNIST data setup

In [3]:
from pathlib import Path

DATA_PATH = Path('data')
PATH = DATA_PATH/'mnist'

PATH.mkdir(parents=True, exist_ok=True) # generate folder for variable PATH with mkdir

In [4]:
import requests # HTTP requests, see http://docs.python-requests.org/en/master/

URL='http://deeplearning.net/data/mnist/'
FILENAME='mnist.pkl.gz'

if not (PATH/FILENAME).exists():
    content = requests.get(URL+FILENAME).content
    (PATH/FILENAME).open('wb').write(content) # wb = write binary

In [5]:
import pickle, gzip # pickle to load serialized data, see https://docs.python.org/3/library/pickle.html

with gzip.open(PATH/FILENAME, 'rb') as f: # rb = read binary
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

In [6]:
import torch

x_train,y_train,x_valid,y_valid = map(torch.tensor, (x_train,y_train,x_valid,y_valid)) # apply torch.tensor to args with map function

In [7]:
n,c = x_train.shape # get number of training examples (n) and number of features (c = 28 x 28 pixels = 784)
x_train, x_train.shape, y_train.min(), y_train.max()

(tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 torch.Size([50000, 784]),
 tensor(0),
 tensor(9))

## Basic model and training loop

In [8]:
import math

weights = torch.randn(784,10)/math.sqrt(784) # initialise weights for basic model with number of in- and output features divided by the sqrt of 784 for Xavier initialisation (see https://stats.stackexchange.com/questions/326710/why-is-weight-initialized-as-1-sqrt-of-hidden-nodes-in-neural-networks).
weights.requires_grad_() # set gradients on weights to True
bias = torch.zeros(10, requires_grad=True) # initialise bias to zeros with gradients True

In [17]:
import torch.nn.functional as F

def model(xb): # define basic model as python function
    xb = (xb @ weights) + bias # @ for the matrix multiplication, dimensions: [bs,784] @ [784,10] + [10] (See general: https://upload.wikimedia.org/wikipedia/commons/thumb/1/18/Matrix_multiplication_qtl1.svg/440px-Matrix_multiplication_qtl1.svg.png, for bs=1: https://upload.wikimedia.org/wikipedia/commons/thumb/4/4d/Matrix_multiplication_qtl7.svg/440px-Matrix_multiplication_qtl7.svg.png)
    return F.log_softmax(xb, dim=-1) # return log softmax for the 10 classes for batch. Log softmax is needed for the NLL loss function (see https://pytorch.org/docs/stable/nn.html#torch.nn.Softmax).

In [12]:
bs=64 # batch size

In [13]:
x_train[0:bs].shape

torch.Size([64, 784])

In [14]:
weights.shape

torch.Size([784, 10])

In [15]:
bias.shape

torch.Size([10])

In [18]:
preds = model(x_train[0:bs]) # feed first batch into model
preds[0], preds.shape

(tensor([-2.1797, -2.5219, -2.2624, -2.7747, -2.2827, -1.9406, -2.8328,
         -2.6674, -1.9858, -2.0481]), torch.Size([64, 10]))

In [19]:
# look at the log preds 
import numpy as np

np.exp(preds[0].detach().numpy()) # exp log preds and transform to np array

array([ 0.11307222,  0.08030371,  0.10410269,  0.06236996,  0.10201021,
        0.14361274,  0.0588472 ,  0.06943085,  0.13727042,  0.12898001], dtype=float32)

In [52]:
# check if softmax predictions sum up to (approx.) 1 (for an explanation if its not exactly 1 see https://docs.python.org/3.6/tutorial/floatingpoint.html)
np.exp(preds[0].detach().numpy()).sum()

1.0

In [20]:
loss_fn = F.nll_loss # define negative log likelihood loss function with https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss

In [21]:
loss_fn(preds, y_train[0:bs])

tensor(2.3033)

In [22]:
lr = 0.5 # learning rate
epochs = 2 # number of epochs = cycles through the training data set

In [23]:
from IPython.core.debugger import set_trace

In [24]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1): # n-1 to get indices, bs + 1 to get last batch with fewer elements than bs.
        #set_trace() # optional hard-coded breakpoint for debugging
        start_i = i*bs # start index of batch
        end_i = start_i+bs # end index of batch
        xb = x_train[start_i:end_i] # x values current batch
        yb = y_train[start_i:end_i] # y vallues current batch
        pred = model(xb) # get predictions of the basic model with the current x batch
        loss = loss_fn(pred, yb) # get NLL loss

        loss.backward() # "loss.backward() computes dloss/dx for every parameter x which has requires_grad=True. These are accumulated into x.grad for every parameter x." (See https://discuss.pytorch.org/t/what-does-the-backward-function-do/9944/2)
        with torch.no_grad(): # torch.no_grad() temporally sets all the requires_grad flag to false for gradient updates (https://datascience.stackexchange.com/questions/32651/what-is-the-use-of-torch-no-grad-in-pytorch).
            weights -= weights.grad * lr # update parameters by multiplying the old parameter by the learning rate
            bias -= bias.grad * lr
            weights.grad.zero_() # Set the gradients to zero. ("The gradient backward() function accumulates gradients, and you don’t want to mix up gradients between minibatches, you have to zero them out at the start of a new minibatch." See https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903/4.)
            bias.grad.zero_()

In [26]:
loss_fn(model(x_train[0:bs]), y_train[0:bs]) # print loss after training the basic model for the first minibatch

tensor(0.2293)

In [27]:
loss_fn(model(x_train), y_train) # print loss for the entire data set

tensor(0.3074)

## Refactor using nn.Module

In [16]:
from torch import nn

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784,10)/math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        xb = (xb @ self.weights) + self.bias
        return F.log_softmax(xb, dim=-1)

In [17]:
model = Mnist_Logistic()

In [18]:
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(2.3110, grad_fn=<NllLossBackward>)

In [19]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        with torch.no_grad():
            for p in model.parameters(): p -= p.grad * lr
            model.zero_grad()

In [20]:
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(0.2300, grad_fn=<NllLossBackward>)

## Refactor using nn.Linear

In [21]:
class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Linear(784,10)

    def forward(self, xb):
        return F.log_softmax(self.lin(xb), dim=-1)

In [22]:
model = Mnist_Logistic()
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(2.2636, grad_fn=<NllLossBackward>)

In [23]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        with torch.no_grad():
            for p in model.parameters(): p -= p.grad * lr
            model.zero_grad()

In [24]:
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(0.2256, grad_fn=<NllLossBackward>)

## Refactor using optim

In [25]:
from torch import optim

In [26]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(2.3184, grad_fn=<NllLossBackward>)

In [27]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [28]:
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(0.2272, grad_fn=<NllLossBackward>)

## Refactor using Dataset

In [29]:
from torch.utils.data import TensorDataset

In [30]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=1.)

In [31]:
train_ds = TensorDataset(x_train, y_train)

In [32]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        xb,yb = train_ds[i*bs : i*bs+bs]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [33]:
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(0.2200, grad_fn=<NllLossBackward>)

## Refactor using DataLoader

In [34]:
from torch.utils.data import DataLoader

In [35]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

In [36]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs)

In [37]:
for epoch in range(epochs):
    for xb,yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [38]:
loss_fn(model(x_train[0:bs]), y_train[0:bs])

tensor(0.2276, grad_fn=<NllLossBackward>)

# Add validation

## First try

In [39]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

In [40]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid)
valid_dl = DataLoader(valid_ds, batch_size=bs*2)

In [41]:
loss_fn(model(x_valid[0:bs]), y_valid[0:bs])

tensor(2.3835, grad_fn=<NllLossBackward>)

In [42]:
for epoch in range(epochs):
    model.train()
    for xb,yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()
        
    model.eval()
    with torch.no_grad():
        valid_loss = sum(loss_fn(model(xb), yb)
                         for xb,yb in valid_dl)

    print(epoch, valid_loss/len(valid_dl))

0 tensor(0.2931)
1 tensor(0.2919)


## Create fit() and get_data()

In [43]:
def loss_batch(model, xb, yb, opt=None):
    loss = loss_fn(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
        
    return loss.item(), len(xb)

In [44]:
import numpy as np

def fit(epochs, model, loss_fn, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb,yb in train_dl: loss_batch(model, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            losses,nums = zip(*[loss_batch(model, xb, yb)
                                for xb,yb in valid_dl])
        val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)

        print(epoch, val_loss)

In [45]:
def get_data(train_ds, valid_ds, bs):
    return (DataLoader(train_ds, batch_size=bs, shuffle=True),
            DataLoader(valid_ds, batch_size=bs*2))

train_dl,valid_dl = get_data(train_ds, valid_ds, bs)

In [47]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

In [48]:
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.32389578552246095
1 0.3008082551002502


# Switch to CNN

## First try

In [49]:
class Mnist_CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,  16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(16, 10,  kernel_size=3, stride=2, padding=1)

    def forward(self, xb):
        xb = xb.view(-1,1,28,28)
        xb = F.relu(self.conv1(xb))
        xb = F.relu(self.conv2(xb))
        xb = F.relu(self.conv3(xb))
        xb = F.avg_pool2d(xb, 4)
        xb = xb.view(-1,xb.size(1))
        return F.log_softmax(xb, dim=-1)

In [50]:
lr=0.2

In [51]:
model = Mnist_CNN()
opt = optim.SGD(model.parameters(), lr=lr)

In [52]:
loss_fn(model(x_valid[0:bs]), y_valid[0:bs])

tensor(2.3029, grad_fn=<NllLossBackward>)

In [53]:
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.9714147342681885
1 0.42008232226371767


## nn.Sequential

In [54]:
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func=func
        
    def forward(self, x): return self.func(x)

In [55]:
model = nn.Sequential(
    Lambda(lambda x: x.view(-1,1,28,28)),
    nn.Conv2d(1,  16,  kernel_size=3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 16,  kernel_size=3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 10,  kernel_size=3, stride=2, padding=1), nn.ReLU(),
    nn.AvgPool2d(4),
    Lambda(lambda x: x.view(x.size(0),-1)),
    nn.LogSoftmax(-1),
)

In [56]:
opt = optim.SGD(model.parameters(), lr=lr)

In [57]:
loss_fn(model(x_valid[0:bs]), y_valid[0:bs])

tensor(2.3019, grad_fn=<NllLossBackward>)

In [58]:
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 1.1196507553100585
1 0.7486707862377167
