<a href="https://colab.research.google.com/github/daspartho/fastai-part2/blob/main/minibatch_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Get the data

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from urllib.request import urlretrieve
from fastcore.test import test_close
# Fix PyTorch's RNG seed so randomly initialised values are repeatable across runs.
torch.manual_seed(42)

# Display settings: grayscale colormap for images, and compact 2-decimal
# printing (no scientific notation for tensors) with wide lines.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Remote location of the gzipped, pickled MNIST dataset and the local cache path for it.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

# Download only when the file is not already cached, so re-running the cell is cheap.
if not path_gz.exists():
    urlretrieve(MNIST_URL, path_gz)

# NOTE(security): pickle.load can execute arbitrary code from the file it reads;
# only run this against a source you trust.
# The pickle unpacks into (inputs, labels) pairs for training and validation;
# a third element is discarded.
with gzip.open(path_gz, 'rb') as f: 
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the four loaded objects to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

### Initial Setup

#### Data

In [2]:
# n = number of training rows, m = number of columns (features) in x_train.
n,m = x_train.shape
# Number of classes, taken as the largest label + 1 (assumes labels are
# 0-based integers); note this is a 0-dim tensor, not a Python int.
c = y_train.max()+1
# Width of the hidden layer passed to Model below.
nh=50
n,m,c

(50000, 784, tensor(10))

In [3]:
from torch import nn

class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.

    Attributes:
        layers: the three layers, in application order. Iterable and
            indexable like a list.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers each layer
        # as a submodule, so parameters(), state_dict(), to(), train()/eval()
        # all see the Linear layers' weights and biases.
        self.layers = nn.ModuleList([
            nn.Linear(n_in, nh),
            nn.ReLU(),
            nn.Linear(nh, n_out),
        ])

    def forward(self, x):
        """Apply each layer in order to `x` and return the final output."""
        # Implementing forward() instead of overriding __call__ keeps
        # nn.Module's own __call__ (and its hook machinery) in charge;
        # `model(x)` still works exactly as before.
        for l in self.layers:
            x = l(x)
        return x

In [4]:
# Build the network (m inputs, nh hidden units, 10 outputs) and run the whole
# training set through it in a single forward pass.
model = Model(m, nh, 10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

#### Cross entropy loss

In [5]:
def log_softmax(x):
    """Naive log-softmax over the last dimension: x - log(sum(exp(x))).

    The exponentials are taken directly, so very large entries in `x` can
    overflow to inf. Returns a tensor with the same shape as `x`.
    """
    # keepdim=True leaves a trailing size-1 axis so the result broadcasts against x.
    log_norm = x.exp().sum(-1, keepdim=True).log()
    return x - log_norm

In [6]:
# Log-probabilities for every training example, computed from the model's raw outputs.
log_softmax(pred)

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<SubBackward0>)

In [7]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Uses the identity log(sum(exp(x))) = a + log(sum(exp(x - a))) with `a` the
    maximum along the last dimension, so the largest exponent is exp(0) = 1
    and cannot overflow.

    Args:
        x: tensor of any rank >= 1.

    Returns:
        Tensor with the shape of `x` minus its last dimension.
    """
    # keepdim=True keeps a trailing size-1 axis, so `x - a` broadcasts
    # correctly for any rank (a fixed unsqueeze(dim=1) only suits 2-D input).
    a = x.max(-1, keepdim=True)[0]
    return a.squeeze(-1) + (x - a).exp().sum(-1).log()

In [8]:
from fastcore.test import test_close
test_close(logsumexp(pred), pred.logsumexp(-1)) # our version must match PyTorch's built-in logsumexp over the last dim

In [9]:
def log_softmax(x):
    """Log-softmax over the last dimension, built on `logsumexp`.

    Subtracts each row's log-sum-exp from that row; the result has the same
    shape as `x`.
    """
    row_lse = logsumexp(x)
    # Restore the reduced last axis so the subtraction broadcasts per row.
    return x - row_lse.unsqueeze(dim=-1)

In [10]:
# Recompute the log-probabilities with the logsumexp-based log_softmax and keep them for the loss below.
sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<SubBackward0>)

In [11]:
# Labels of the first three training examples.
y_train[:3]

tensor([5, 0, 4])

In [12]:
# For each of those three examples, read the log-probability in the column of its label (5, 0, 4).
sm_pred[0,5],sm_pred[1,0],sm_pred[2,4]

(tensor(-2.40, grad_fn=<SelectBackward0>),
 tensor(-2.37, grad_fn=<SelectBackward0>),
 tensor(-2.14, grad_fn=<SelectBackward0>))

In [13]:
# Same selection in one step: pair row indices [0,1,2] with the label columns via integer-array indexing.
sm_pred[[0,1,2],y_train[:3]]

tensor([-2.40, -2.37, -2.14], grad_fn=<IndexBackward0>)

In [14]:
def nll(inp, targ):
    """Mean negative log-likelihood.

    Args:
        inp: 2-D tensor indexed as [row, class]; the calls in this notebook
            pass log-probabilities.
        targ: 1-D tensor of integer class indices, one per row of `inp`.

    Returns:
        0-dim tensor: minus the mean of the entries of `inp` selected by `targ`.
    """
    rows = range(targ.shape[0])
    # Pair each row index with its target column to pull one value per row.
    picked = inp[rows, targ]
    return -picked.mean()

In [15]:
# Mean negative log-likelihood of the training labels under the model's log-probabilities.
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [16]:
from torch.nn import functional as F
# PyTorch's log_softmax followed by nll_loss should match our loss to within 1e-3.
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)

In [17]:
# F.cross_entropy applied directly to the raw predictions should give the same loss.
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)