<a href="https://colab.research.google.com/github/daspartho/fastai-part2/blob/main/backprop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from urllib.request import urlretrieve
from fastcore.test import test_close
# Seed torch's RNG so the random weights drawn below are the same on every run.
torch.manual_seed(42)

# Default matplotlib colormap for images: grayscale.
mpl.rcParams['image.cmap'] = 'gray'
# Compact printing: 2 decimals, wide lines; no scientific notation for tensors.
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

if not path_gz.exists():
    # Download under a temporary name and rename only once the transfer has
    # finished. Writing straight to `path_gz` would leave a truncated archive
    # behind if the download is interrupted, and every later run would then
    # skip the download and fail while reading the broken file.
    path_tmp = path_gz.with_name(path_gz.name + '.part')
    urlretrieve(MNIST_URL, path_tmp)
    os.replace(path_tmp, path_gz)

# NOTE: pickle can execute arbitrary code while loading. This archive comes
# from the network, so only run this against a source you trust.
with gzip.open(path_gz, 'rb') as f:
    # The archive holds three (inputs, labels) pairs; the third is ignored here.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the loaded arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

### Foundations version

#### Basic architecture

In [2]:
# n: number of training rows, m: values per row (x_train is 2-D).
n,m = x_train.shape
# c: largest label + 1. Note this stays a 0-d tensor rather than a Python int.
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [3]:
nh=50 # number of hidden units (width of the hidden layer)

In [4]:
# Layer 1 maps m inputs to nh hidden activations; layer 2 maps nh to a single output.
# Weights are drawn from an unscaled standard normal; biases start at zero.
# The order of the randn calls matters: it fixes which seeded values each weight gets.
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [5]:
def lin(x, w, b):
    """Affine (linear) layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [6]:
# Sanity-check the first layer on the validation set and look at the output shape.
t = lin(x_valid, w1, b1)
t, t.shape

(tensor([[ -0.09,  11.87, -11.39,  ...,   5.48,   2.14,  15.30],
         [  5.38,  10.21, -14.49,  ...,   0.88,   0.08,  20.23],
         [  3.31,   0.12,   3.10,  ...,  16.89,  -6.05,  24.74],
         ...,
         [  4.01,  10.35, -11.25,  ...,   0.23,  -5.30,  18.28],
         [ 10.62,  -4.27,  10.72,  ...,  -2.87,  -2.87,  18.23],
         [  2.84,  -0.22,   1.43,  ...,  -3.91,   5.75,   2.12]]),
 torch.Size([10000, 50]))

In [7]:
def relu(x):
    """Rectified linear unit: values below zero become zero, the rest pass through."""
    return x.clamp(min=0.)

In [8]:
# Rebind `t` to its activated version: negative entries become 0, the rest are unchanged.
t = relu(t)
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [9]:
def model(x):
    """Two-layer network: linear (w1, b1) -> ReLU -> linear (w2, b2).

    Uses the module-level parameters `w1`, `b1`, `w2` and `b2`.
    """
    hidden = relu(lin(x, w1, b1))
    return lin(hidden, w2, b2)

In [10]:
# Predictions of the untrained model on the validation set: one output column per row.
res = model(x_valid)
res, res.shape

(tensor([[  25.75],
         [ -13.06],
         [-114.79],
         ...,
         [ -67.44],
         [ -74.48],
         [ -60.19]]), torch.Size([10000, 1]))

#### Loss function: MSE

(Of course, MSE is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use MSE for now to keep things simple.)

In [11]:
# Predictions have a trailing unit dimension that the targets lack; compare the two shapes.
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [12]:
# Squeezing the predictions first makes the subtraction element-wise (one difference per row).
(res.squeeze()-y_valid).shape

torch.Size([10000])

In [13]:
def mse(output, targ):
    """Mean squared error between `output` (unit dimensions squeezed away) and `targ`."""
    err = output.squeeze() - targ
    squared = err.pow(2)
    return squared.mean()

In [14]:
# Loss of the untrained model on the validation set.
mse(res, y_valid)

tensor(4154.01)

### Gradients and backward pass

In [15]:
def lin_grad(inp, out, w, b):
    """Backward pass of the linear layer `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores the gradients with
    respect to the layer's input, weights and bias as `.g` attributes on
    `inp`, `w` and `b`. Returns nothing.

    Expects 2-D tensors: `inp` is (rows, in_features), `w` is
    (in_features, out_features) and `out.g` is (rows, out_features).
    """
    inp.g = out.g @ w.t()
    # Sum over rows of the outer products inp[i] x out.g[i], written as one
    # matmul. The broadcast form
    #   (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
    # gives the same values but first builds a (rows, in_features, out_features)
    # tensor, which for a 50000x784 input and 50 outputs is about 2e9 floats.
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)

In [16]:
def forward_backward(inp, targ):
    """Run the two-layer model forward, then back-propagate the MSE loss by hand.

    Uses the module-level parameters `w1`, `b1`, `w2` and `b2`. Gradients are
    left as `.g` attributes on those parameters (and on `inp` and the
    intermediate activations). Nothing is returned.
    """
    # ---- forward ----
    pre_act = lin(inp, w1, b1)
    hidden = relu(pre_act)
    out = lin(hidden, w2, b2)
    err = out.squeeze() - targ
    loss = err.pow(2).mean()  # computed for reference only; the backward pass does not use it

    # ---- backward ----
    n_rows = inp.shape[0]
    # Gradient of mean(err**2) w.r.t. `out`, reshaped to match out's trailing unit dimension.
    out.g = 2.*err.unsqueeze(1) / n_rows
    lin_grad(hidden, out, w2, b2)
    # ReLU passes gradient only where its input was positive.
    pre_act.g = (pre_act>0).float() * hidden.g
    lin_grad(inp, pre_act, w1, b1)

In [17]:
# One forward/backward pass over the whole training set; gradients are stored as `.g` attributes.
forward_backward(x_train, y_train)

In [18]:
# Gradient of the loss with respect to w2: one entry per hidden unit.
w2.g

tensor([[ -869.79],
        [ -622.49],
        [ -165.30],
        [ -468.34],
        [ -837.93],
        [ -687.30],
        [ -559.73],
        [ -637.38],
        [ -176.24],
        [  -10.81],
        [ -993.01],
        [ -533.38],
        [ -730.86],
        [  -54.01],
        [ -451.91],
        [ -156.28],
        [ -567.42],
        [  -61.53],
        [  -15.62],
        [ -463.00],
        [  -79.85],
        [ -582.16],
        [  -11.21],
        [-1023.28],
        [ -140.96],
        [-1357.97],
        [-1326.60],
        [ -115.92],
        [ -125.99],
        [   -4.39],
        [ -601.42],
        [ -508.06],
        [ -505.85],
        [ -301.20],
        [ -549.87],
        [  -13.01],
        [ -900.70],
        [ -179.80],
        [ -117.92],
        [-1531.71],
        [ -295.88],
        [ -327.67],
        [ -642.84],
        [   -7.32],
        [-1333.88],
        [  -10.38],
        [ -198.15],
        [ -561.56],
        [  -53.83],
        [-1659.88]])

### Refactoring

#### Layers as classes

In [20]:
class Relu():
    """ReLU layer that remembers its input and output so `backward` can use them."""

    def __call__(self, inp):
        # Keep both tensors: backward() reads the upstream gradient from `self.out.g`.
        self.inp, self.out = inp, inp.clamp(min=0.)
        return self.out

    def backward(self):
        # Gradient flows only through positions where the input was positive.
        passthrough = (self.inp > 0).float()
        self.inp.g = passthrough * self.out.g