# The forward and backward passes

In [None]:
from pathlib import Path
import pickle
import gzip

from fastcore.test import test_close
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch

# Seed PyTorch's RNG so the random weight draws further down are repeatable.
torch.manual_seed(1103)

# Display preferences: grayscale colormap for images, compact two-decimal printing.
matplotlib.rcParams["image.cmap"] = "gray"
np.set_printoptions(precision=2, linewidth=160)
torch.set_printoptions(precision=2, linewidth=160, sci_mode=False)

# Load the gzipped pickle holding (train, valid, <ignored third split>).
# NOTE: unpickling runs code embedded in the file -- only load a trusted copy.
data_path = Path("data")
mnist_path = data_path / "mnist.pkl.gz"
with gzip.open(mnist_path) as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="bytes")

# Re-bind all four splits as torch tensors.
x_train, y_train, x_valid, y_valid = (
    torch.tensor(split) for split in (x_train, y_train, x_valid, y_valid)
)

# Foundations version

## Basic architecture

In [None]:
x_train.shape

torch.Size([50000, 784])

In [None]:
nh = 50
c = y_train.max().item() + 1
c

10

In [None]:
# Parameters of the two-layer net: input columns -> nh hidden units -> 1 output.
# The input width is taken from x_train (784 columns) instead of being
# hard-coded, so the weights always match the data that is fed through them.
w1 = torch.randn(x_train.shape[1], nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [None]:
def lin(x, w, b):
    """Affine map: matrix-multiply `x` by `w`, then add `b`."""
    product = x.matmul(w)
    return product + b

In [None]:
l1_out = lin(x_train, w1, b1)
l1_out.shape

torch.Size([50000, 50])

In [None]:
def relu(x):
    """Rectified linear unit: every entry of `x` below zero becomes zero."""
    return x.clamp(min=0.0)

In [None]:
a1_out = relu(l1_out)
a1_out.shape, (a1_out < 0).sum()

(torch.Size([50000, 50]), tensor(0))

In [None]:
def model(xb):
    """Forward pass linear -> ReLU -> linear, using the global w1/b1/w2/b2."""
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [None]:
out = model(x_train)
out.shape

torch.Size([50000, 1])

## Loss function: MSE

In [None]:
out.shape, y_train.shape

(torch.Size([50000, 1]), torch.Size([50000]))

In [None]:
def mse(output, target):
    """Mean squared error between `output` and `target`.

    `target` gains a new axis at position 1 before the subtraction, so a 1-D
    target lines up against a single-column `output`.
    """
    residual = output - target.unsqueeze(1)
    return residual.pow(2).mean()

In [None]:
mse(out, y_train)

tensor(849.00)

## Gradients and backward pass

In [None]:
import sympy

In [None]:
x, y = sympy.symbols("x y")
sympy.diff(x**2+y, x)

2*x

In [None]:
sympy.diff((x-y)**2, x)

2*x - 2*y

In [None]:
for i in (("out", l1_out.shape), ("inp", x_train.shape), ("w", w1.shape), ("b", b1.shape)):
    print(i)

('out', torch.Size([50000, 50]))
('inp', torch.Size([50000, 784]))
('w', torch.Size([784, 50]))
('b', torch.Size([50]))


In [None]:
x_train.unsqueeze(-1).shape, l1_out.unsqueeze(1).shape

(torch.Size([50000, 784, 1]), torch.Size([50000, 1, 50]))

In [None]:
l1_out.shape, w1.T.shape

(torch.Size([50000, 50]), torch.Size([50, 784]))

In [None]:
# Backward pass for out = inp @ w + b, where `out.g` holds dloss/dout:
#   dloss/db_j    = sum_k dloss/dout_kj
#   dloss/dw_ij   = sum_k inp_ki * dloss/dout_kj    (i.e. inp.T @ out.g)
#   dloss/dinp_ki = sum_j dloss/dout_kj * w_ij      (i.e. out.g @ w.T)
def lin_grad(inp, out, w, b):
    """Store the gradients of a linear layer as `.g` attributes on its tensors.

    Reads `out.g` (gradient of the loss w.r.t. the layer output) and writes
    `b.g`, `w.g` and `inp.g`. Returns nothing.
    """
    b.g = out.g.sum(0)
    # A matmul performs the same sum over samples as broadcasting
    # out.g.unsqueeze(1) * inp.unsqueeze(-1) and summing over axis 0, but
    # never materialises the (n_samples, n_in, n_out) intermediate.
    w.g = inp.T @ out.g
    inp.g = out.g @ w.T

In [None]:
def forward_and_backward(inp, targ):
    """Run the two-layer net forward, then backpropagate the MSE loss by hand.

    Nothing is returned: gradients are left as `.g` attributes on `inp`, on the
    intermediate activations, and on the global w1, b1, w2, b2.
    """
    # Forward pass.
    hidden_pre = lin(inp, w1, b1)
    hidden = relu(hidden_pre)
    pred = lin(hidden, w2, b2)

    # Backward pass, walking the layers in reverse.
    # Gradient of mean((pred - targ)**2) w.r.t. pred: 2 * (pred - targ) / n_samples.
    n_samples = inp.shape[0]
    err = pred[:, 0] - targ
    pred.g = 2*err[:, None] / n_samples
    lin_grad(hidden, pred, w2, b2)
    # ReLU lets gradient through only where its input was positive.
    hidden_pre.g = (hidden_pre > 0) * hidden.g
    lin_grad(inp, hidden_pre, w1, b1)

In [None]:
forward_and_backward(x_train, y_train)

In [None]:
def get_grad(x):
    """Return an independent copy of the gradient stored on `x` as `x.g`."""
    stored = x.g
    return stored.clone()

inter = (b2, w2, b1, w1, x_train)
grads = (b2g, w2g, b1g, w1g, x_traing) = list(map(get_grad, inter))

In [None]:
b2g.shape, b2.shape

(torch.Size([1]), torch.Size([1]))

In [None]:
def mkgrad(x):
    """Return a copy of `x` with requires_grad switched on, leaving `x` untouched."""
    tracked = x.clone()
    tracked.requires_grad_(True)
    return tracked

In [None]:
gradst = (b2t, w2t, b1t, w1t, x_traint) = list(map(mkgrad, inter))

In [None]:
def forward(inp, targ):
    """Return the MSE loss of the net built from the autograd copies w1t/b1t/w2t/b2t."""
    hidden = relu(lin(inp, w1t, b1t))
    pred = lin(hidden, w2t, b2t)
    return mse(pred, targ)

In [None]:
loss = forward(x_traint, y_train)
loss

tensor(849.00, grad_fn=<MeanBackward0>)

In [None]:
loss.backward()

In [None]:
for g, t in zip(grads, gradst):
    test_close(g, t.grad, eps=1e-3)

## Refactor model

### Layers as classes

In [None]:
class Relu:
    """ReLU layer that caches its input and output so it can backpropagate."""

    def __call__(self, inp):
        """Clamp negatives in `inp` to zero, remembering both tensors."""
        self.inp, self.out = inp, inp.clamp(min=0)
        return self.out

    def backward(self):
        """Set `inp.g` to `out.g`, zeroed wherever the input was not positive."""
        positive = self.inp > 0
        self.inp.g = positive * self.out.g

In [None]:
class Lin:
    """Linear layer `inp @ w + b` with a hand-written backward pass."""

    def __init__(self, w, b):
        """Keep references to the weight matrix `w` and bias vector `b`."""
        self.w = w
        self.b = b

    def __call__(self, inp):
        """Compute the layer output, caching input and output for `backward`."""
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out

    def backward(self):
        """Write `b.g`, `w.g` and `inp.g` from the gradient stored in `out.g`."""
        self.b.g = self.out.g.sum(0)
        # inp.T @ out.g is the same per-sample outer product summed over the
        # batch as (out.g.unsqueeze(1) * inp.unsqueeze(-1)).sum(0), but without
        # building the (n_samples, n_in, n_out) intermediate.
        self.w.g = self.inp.T @ self.out.g
        self.inp.g = self.out.g @ self.w.T

In [None]:
class Mse:
    """Mean-squared-error loss with a hand-written backward pass."""

    def __call__(self, inp, targ):
        """Return mean((inp - targ[:, None])**2), caching the arguments and result."""
        self.inp, self.targ = inp, targ
        residual = inp - targ.unsqueeze(1)
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        """Set `inp.g` to 2 * (inp[:, 0] - targ) / n_rows, as a single column."""
        n_rows = self.inp.shape[0]
        residual = self.inp[:, 0] - self.targ
        self.inp.g = residual.unsqueeze(1) * (2 / n_rows)

In [None]:
class Model:
    """Two-layer net (Lin -> Relu -> Lin) with an MSE loss, all hand-differentiated."""

    def __init__(self, w1, b1, w2, b2):
        """Build the layer stack from the two weight/bias pairs."""
        self.loss = Mse()
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]

    def __call__(self, x, targ):
        """Push `x` through every layer in order and return the loss against `targ`."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        """Backpropagate: loss first, then the layers from last to first."""
        self.loss.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
model(x_train, y_train)

tensor(849.00)

In [None]:
model.backward()

In [None]:
for g, gc in zip(grads, inter):
    test_close(g, gc.g, eps=1e-3)

### Module.forward()

In [None]:
class Module:
    """Base class for layers: subclasses implement `forward` and `bwd`.

    Calling an instance stores the call arguments in `self.args` and the result
    in `self.out`; `backward()` then hands both to `bwd`.
    """

    def __call__(self, *args):
        """Run `forward(*args)`, remembering the arguments and the output."""
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the layer output; must be overridden.

        Accepts arbitrary arguments so that calling a subclass that forgot to
        override it raises NotImplementedError rather than a TypeError about
        the argument count.
        """
        raise NotImplementedError

    def backward(self):
        """Invoke `bwd` with the cached output followed by the cached inputs."""
        self.bwd(self.out, *self.args)

    def bwd(self, *args):
        """Compute gradients from `(out, *inputs)`; must be overridden."""
        raise NotImplementedError

In [None]:
class Relu(Module):
    """ReLU expressed through the Module forward/bwd protocol."""

    def forward(self, inp):
        """Return `inp` with negatives clamped to zero."""
        return inp.clamp(min=0)

    def bwd(self, out, inp):
        """Set `inp.g` to `out.g`, zeroed wherever `inp` was not positive."""
        positive = inp > 0
        inp.g = positive * out.g

In [None]:
class Lin(Module):
    """Linear layer `inp @ w + b` expressed through the Module forward/bwd protocol."""

    def __init__(self, w, b):
        """Keep references to the weight matrix `w` and bias vector `b`."""
        self.w = w
        self.b = b

    def forward(self, inp):
        """Return `inp @ w + b`."""
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        """Write `inp.g`, `w.g` and `b.g` from the gradient stored in `out.g`."""
        inp.g = out.g @ self.w.T
        # inp.T @ out.g equals (out.g.unsqueeze(1) * inp.unsqueeze(-1)).sum(0)
        # but avoids allocating the (n_samples, n_in, n_out) intermediate.
        self.w.g = inp.T @ out.g
        self.b.g = out.g.sum(0)

In [None]:
class Mse(Module):
    """Mean-squared-error loss expressed through the Module forward/bwd protocol."""

    def forward(self, inp, targ):
        """Return mean((inp - targ[:, None])**2)."""
        residual = inp - targ.unsqueeze(1)
        return residual.pow(2).mean()

    def bwd(self, out, inp, targ):
        """Set `inp.g` to 2 * (inp[:, 0] - targ) / n_rows, as a single column."""
        n_rows = inp.shape[0]
        residual = inp[:, 0] - targ
        inp.g = residual.unsqueeze(1) * (2 / n_rows)

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
model(x_train, y_train)

tensor(849.00)

In [None]:
model.backward()

In [None]:
for g, gc in zip(grads, inter):
    test_close(g, gc.g, eps=1e-3)

### Autograd

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Linear(nn.Module):
    """Linear layer `x @ w + b` whose weights are tracked by autograd."""

    def __init__(self, n_in, n_out):
        """Create an (n_in, n_out) weight and an (n_out,) bias, both drawn with torch.randn.

        The tensors are wrapped in nn.Parameter so that the module registers
        them: they show up in `parameters()` / `state_dict()` and follow the
        module through `.to(...)`. Parameters require grad by default, so
        `.grad` is populated after `backward()` exactly as before.
        """
        super().__init__()
        self.w = nn.Parameter(torch.randn((n_in, n_out)))
        self.b = nn.Parameter(torch.randn((n_out,)))

    def forward(self, x):
        """Return `x @ w + b`."""
        return x @ self.w + self.b

In [None]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network that returns its MSE loss directly."""

    def __init__(self, n_in, n_h, n_out):
        """Build the layer stack for `n_in` inputs, `n_h` hidden units, `n_out` outputs."""
        super().__init__()
        # nn.ModuleList (rather than a plain list) registers the sub-layers
        # with this module; indexing and iteration work the same as a list.
        self.layers = nn.ModuleList(
            [Linear(n_in, n_h), nn.ReLU(), Linear(n_h, n_out)]
        )
        self.loss = F.mse_loss

    def forward(self, x, target):
        """Run `x` through every layer and return the MSE against `target[:, None]`.

        Defined as `forward` instead of overriding `__call__`, so calls go
        through nn.Module's own `__call__`; `model(x, target)` is unchanged.
        """
        for layer in self.layers:
            x = layer(x)
        return self.loss(x, target[:, None])

In [None]:
model = Model(x_train.shape[1], nh, 1)

In [None]:
loss = model(x_train, y_train.float())

In [None]:
loss.backward()

In [None]:
model.layers[0].b.grad

tensor([  6.52, -16.68,  -4.91,  -3.79,   3.37,  -2.61,  71.67,  25.06,   2.80,   7.05,  10.40,   1.85,   0.48,  14.24,  -2.76,  41.66,   9.50, -34.44, -18.58,
        -23.73,  -5.00,  -5.86, -20.66,  -1.03,  12.93, -17.48, -19.33,  40.95,  -1.49,  47.68,   1.25, -30.78,  72.18,  20.41, -30.57,  -0.11,  -4.71,   0.64,
         -1.50,  -4.80,  -4.95,  85.04,  19.27,   2.65,  -6.14, -15.47,  -2.61,   4.11,  -2.93,   6.86])