# Minibatch


In [1]:
#| default_exp minibatch

In [2]:
import torch
from torch import nn
import math

# Get the MNIST dataset
import gzip, pickle
import urllib.request
from pathlib import Path

MNIST_URL = "https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true"
data_path = Path("../data")
data_path.mkdir(parents=True, exist_ok=True)
data_gz = data_path/"mnist.pkl.gz"

# Get the data
if not data_gz:
    urllib.request.urlretrieve(MNIST_URL, data_path/"mnist.pkl.gz")
    
# Destructuring
with gzip.open(data_gz, mode='rb') as unzip_data:
    obj = pickle.load(unzip_data, encoding="latin-1")
    ((x_train, y_train), (x_valid, y_valid), _) = obj

# To tensors
x_train, y_train, x_valid, y_valid = map(torch.tensor, (x_train, y_train, x_valid, y_valid))

In [3]:
n, m = x_train.shape
c = int(y_train.max()+1)
nh = 50
n, m, c, nh

(50000, 784, 10, 50)

In [4]:
# Our model architecture from 03_backpropagation.ipynb

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [
            nn.Linear(n_in, nh), # [784,50]
            nn.ReLU(),
            nn.Linear(nh, n_out) # [50,10]
        ]
        
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [5]:
model = Model(m, nh, c)
preds = model(x_train)
preds.shape # We are now going to use 10 categories for output

torch.Size([50000, 10])

## Implement `Cross Entropy Loss` function

Compared to the last notebook, here, we are going to predict 10 categogies for the output. 

So, the outputs -- from the model -- will be probabilities of the predictions for the MNIST numbers.

The targets will be `on-hot-encoded` vectors with the index of a 1 representing the actual number.

That means that we need to change our `loss` function from `MSE` to `Cross Entropy Loss`.

`Cross Entropy Loss` typically serves multi-class and multi-label classifications.

`Cross Entropy Loss = log(SoftMax(i))`

First, we will need to compute the softmax of our activations. This is defined by:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$

or more concisely:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum\limits_{0 \leq j \lt n} e^{x_{j}}}$$ 

In practice, we will need the log of the softmax when we calculate the loss:

In [6]:
def softmax(x):
    return (x.exp()/(x.exp().sum(dim=-1, keepdim=True))).log()

Note that the formula 

$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$ 

gives a simplification when we compute the log softmax:

In [7]:
#log(softmax(x)):
def log_softmax(x):
    return x - x.exp().sum(-1,keepdim=True).log()

In [8]:
log_softmax_preds = log_softmax(preds)
log_softmax_preds

tensor([[-2.1322, -2.3039, -2.1242,  ..., -2.2312, -2.5709, -2.4597],
        [-2.1817, -2.3757, -2.2375,  ..., -2.1507, -2.6064, -2.3893],
        [-2.2826, -2.3179, -2.3498,  ..., -2.3222, -2.3067, -2.4729],
        ...,
        [-2.2142, -2.2712, -2.2377,  ..., -2.2306, -2.4570, -2.3492],
        [-2.1171, -2.1884, -2.3784,  ..., -2.3294, -2.3905, -2.3748],
        [-2.2644, -2.2546, -2.2944,  ..., -2.2859, -2.3665, -2.3277]],
       grad_fn=<SubBackward0>)

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum x\, \log p(x) $$

In `PyTorch` this is known as : negative log likelihood loss == `nll`

But since our $x$s are 1-hot encoded (actually, they're just the integer indices), this can be rewritten as $-\log(p_{i})$ where i is the index of the desired target.

In [9]:
# First 3 numbers
y_train[0:3]

tensor([5, 0, 4])

In [10]:
log_softmax_preds.shape

torch.Size([50000, 10])

In [11]:
# The prababilities of the categories from above are
log_softmax_preds[0, 5], log_softmax_preds[1, 0], log_softmax_preds[2, 4]

(tensor(-2.2618, grad_fn=<SelectBackward0>),
 tensor(-2.1817, grad_fn=<SelectBackward0>),
 tensor(-2.3327, grad_fn=<SelectBackward0>))

In [12]:
log_softmax_preds[0, y_train[0]], log_softmax_preds[1, y_train[1]], log_softmax_preds[2, y_train[2]]

(tensor(-2.2618, grad_fn=<SelectBackward0>),
 tensor(-2.1817, grad_fn=<SelectBackward0>),
 tensor(-2.3327, grad_fn=<SelectBackward0>))

In [13]:
log_softmax_preds[[0,1,3], y_train[:3]]

tensor([-2.2618, -2.1817, -2.3817], grad_fn=<IndexBackward0>)

So, we have the `log(softmax(p))`. Now we need to implement the Cross entropy loss from above as the sum of those.

In [14]:
def nll(inp, target):
    """ Cross Entropy Loss """
    return -inp[range(target.shape[0]), target].mean()

In [15]:
loss = nll(log_softmax_preds, y_train)
loss

tensor(2.3204, grad_fn=<NegBackward0>)

### Going to PyTorch

`PyTorch` already has a function for `Cross Entropy Loss`, so we can use that.

In [16]:
import torch.nn.functional as F

# nll = negative log likelihood loss
F.nll_loss(F.log_softmax(preds, dim=1), y_train)

tensor(2.3204, grad_fn=<NllLossBackward0>)

The above combination of the 2 functions can be combined into 1:

In [17]:
F.cross_entropy(preds, y_train)

tensor(2.3204, grad_fn=<NllLossBackward0>)

## Basic training loop

1) get the predictions from the model
2) calculate the loss from the predictions (based on y_train)
3) calculate the gradients of the loss with respect to every parameter of the model
4) adjust the parameters according to their gradients and learning rate

In [18]:
loss_func = F.cross_entropy

In [19]:
bs = 64
xb = x_train[:bs] # minibatch from x
preds = model(xb)
preds[0], preds.shape

(tensor([ 0.1658, -0.0059,  0.1738, -0.0087, -0.2291,  0.0362,  0.0821,  0.0668,
         -0.2729, -0.1617], grad_fn=<SelectBackward0>),
 torch.Size([64, 10]))

In [20]:
yb = y_train[:64]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1,
        1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7, 6, 1, 8, 7, 9, 3, 9, 8, 5,
        9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0])

In [21]:
loss_func(preds, yb)

tensor(2.3166, grad_fn=<NllLossBackward0>)

We need to grab the highest probabilities of the predicted nubmers.

In [22]:
preds.argmax(dim=1)

tensor([2, 7, 5, 5, 0, 2, 0, 5, 0, 0, 0, 0, 3, 5, 0, 2, 5, 0, 5, 0, 5, 0, 2, 5,
        7, 0, 0, 6, 7, 5, 3, 2, 2, 6, 0, 0, 2, 7, 0, 5, 0, 0, 0, 6, 0, 0, 0, 0,
        0, 0, 0, 3, 6, 5, 6, 0, 7, 2, 3, 5, 6, 5, 5, 5])

And we define the accuracy:

In [23]:
#|export
def accuracy(preds, targs):
    """ The average of the correctly predicted numbers """
    return (preds.argmax(dim=1)==targs).float().mean()

In [24]:
accuracy(preds, yb)

tensor(0.0781)

In [25]:
#|export
def report(epoch, preds, targs, loss):
    """ Print a report after each epoch of training """
    print(f"epoch:{epoch} \t accuracy:{accuracy(preds, targs).item():.3f} \t loss:{loss.item():.5f}")

We have an accuracy of ~10% which is expected because of the random weights.

Let's setup the basic training loop:

In [26]:
# Recreate the model before running the training
model = Model(m, nh, c)

In [27]:
batch_size = 64
epochs = 3
lr = 0.5

for epoch in range(epochs):

    for i in range(0, x_train.shape[0], batch_size):
        
        # Create xb, yb batch - this operation is way slower than slice
        if i+i+batch_size > x_train.shape[0]:
            xb = x_train[i:]
            yb = y_train[i:]
        else:
            xb = x_train[i:i+batch_size] # x batch
            yb = y_train[i:i+batch_size] # y batch
        
        preds = model(xb)
        loss = F.cross_entropy(preds, yb)
        loss.backward()
        
        if i == 0:
            # End of each epoch
            print(f"epoch:{epoch} \t accuracy:{accuracy(preds, yb).item()} \t loss:{loss.item()}")
        
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, "weight"):
                    l.weight -= l.weight.grad * lr
                    l.bias -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()

epoch:0 	 accuracy:0.078125 	 loss:2.2974798679351807
epoch:1 	 accuracy:0.96875 	 loss:0.0978163555264473
epoch:2 	 accuracy:0.984375 	 loss:0.06935223191976547


### Using `slice`

The `slice()` function returns a `slice object` that is used to slice any sequence (string, tuple, list, range, or bytes). So, we can use it to slice the training tensors.

In [28]:
model = Model(m, nh, c)

In [29]:
bs = 64
epochs = 3
lr = 0.5
n_inp = x_train.shape[0]

for epoch in range(epochs):
    for i in range(0, n_inp, 64):
        s = slice(i, min(n_inp, i + bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = F.cross_entropy(preds, yb)
        loss.backward()
        
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, "weight"):
                    l.weight -= l.weight.grad * lr
                    l.bias -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
    report(epoch, preds, yb, loss)

epoch:0 	 accuracy:0.938 	 loss:0.31061
epoch:1 	 accuracy:1.000 	 loss:0.07262
epoch:2 	 accuracy:1.000 	 loss:0.04642


In [30]:
s = slice(0,10)

In [31]:
x_train[s].shape

torch.Size([10, 784])