In [None]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
torch.manual_seed(42)

mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'

In [54]:
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## initialize params

In [55]:
n,m = x_train.shape
c = y_train.max() + 1
n,m,c

(50000, 784, tensor(10))

In [142]:
nh = 200

In [143]:
w1=torch.randn(m,nh)
b1=torch.zeros(nh)
w2=torch.randn(nh, 1)
b2 = torch.zeros(1)

## code for model's layers, loss, and gradients

In [58]:
def lin(x, w, b):
    return x@w + b

In [60]:
def relu(x):
    return x.clamp_min(0.)

In [62]:
y_train,y_valid=map(lambda t: t.float(), (y_train,y_valid))
y_train,y_valid

(tensor([5., 0., 4.,  ..., 8., 4., 8.]),
 tensor([3., 8., 6.,  ..., 5., 6., 8.]))

In [63]:
def mse(output, targ):
    return (output[:,0]-targ).pow(2).mean()

In [64]:
def lin_grad(inp, out, w, b):
    inp.g = out.g @ w.t()
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)

In [65]:
def forward_and_backward(xb, yb):
    # forward
    l1 = lin(xb, w1, b1)
    l2 = relu(l1)
    preds = lin(l2, w2, b2)
    loss = mse(preds, yb)

    # backpropagation
    n=xb.shape[0]
    '''
    preds is shape (batch_size, num_activation_features) = (batch_size, 1), aka a column vector
    yb shape is (batch_size) and needs to be transformed into column vector aka (batch_size, 1)
    hence yb[:,None], in order to do elementwise subtraction
    '''
    preds.g = (2./n) * (preds-yb[:,None]) 
    lin_grad(l2, preds, w2, b2)
    l1.g = (l1 > 0).float() * l2.g
    lin_grad(xb, l1, w1, b1)

In [66]:
forward_and_backward(x_train, y_train)

## comparing gradients to pytorch computed gradients based on same (cloned) params and same (duplicated) forward pass

In [67]:
def get_homemade_grad(t):
    return t.g.clone()

In [68]:
tensors = w1, b1, w2, b2, x_train
my_grads = tuple(map(get_homemade_grad, tensors))

In [69]:
def clone_tensor_with_grads(t):
    return t.clone().requires_grad_(True)

In [73]:
pt_tensors = tuple(map(clone_tensor_with_grads, tensors))
w1c, b1c, w2c, b2c, x_trainc = pt_tensors

In [74]:
def forward_only(inp, targ):
    l1 = lin(inp, w1c, b1c)
    l2 = relu(l1)
    out = lin(l2, w2c, b2c)
    return mse(out, targ)

In [75]:
loss = forward_only(x_trainc, y_train)
loss.backward()

In [81]:
for my_grad,their_tensor in zip(my_grads, pt_tensors):
    test_close(my_grad, their_tensor.grad, eps=0.01)

tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)


## Refactor model into object oriented

In [84]:
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp_min(0.)
        return self.out

    def backward(self):
        self.inp.g = (self.inp > 0).int() * self.out.g

In [85]:
class Lin():
    def __init__(self, w, b):
        self.w = w
        self.b = b
    def __call__(self, inp):
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out

    def backward(self):
        self.w.g = self.inp.t() @ self.out.g
        self.b.g = self.out.g.sum(0)
        self.inp.g = self.out.g @ self.w.t()

In [108]:
class Mse():
    def __call__(self, preds, targ):
        self.preds = preds
        self.diff = preds - targ[:,None]
        self.out = (self.diff).pow(2).mean()
        return self.out

    def backward(self):
        self.preds.g = (2. / self.diff.shape[0]) * self.diff

In [91]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [
            Lin(w1, b1),
            Relu(),
            Lin(w2, b2)
        ]
        self.loss = Mse()

    def __call__(self, x, y):
        for l in self.layers:
            x = l(x)
        loss = self.loss(x, y)
        return (x, loss)

    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers):
            l.backward()


In [106]:
model = Model(w1, b1, w2, b2)
preds,loss = model(x_train, y_train)

In [146]:
w1,w2,b1,b2,x_train,y_train = tuple(map(lambda t: t.to('cuda'), (w1,w2,b1,b2,x_train,y_train,)))

In [131]:
def acc(preds, y):
    return (preds[:,0].round() == y).half().mean()

In [144]:
pw1, pb1, pw2, pb2 = w1.clone(), b1.clone(), w2.clone(), b2.clone()

In [140]:
w1, b1, w2, b2= pw1, pb1, pw2, pb2

In [None]:
lr = 0.05
for i in range(1000000):
    model = Model(w1, b1, w2, b2)
    preds,loss = model(x_train, y_train)
    model.backward()
    if i % 1000 == 0:
        print(i, loss,acc(preds, y_train))
    w1 -= lr * w1.g
    b1 -= lr * b1.g
    w2 -= lr * w2.g
    b2 -= lr * b2.g

0 tensor(7615.22, device='cuda:0') tensor(0.00, device='cuda:0', dtype=torch.float16)
1000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
2000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
3000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
4000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
5000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
6000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
7000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
8000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
9000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)
10000 tensor(8.36, device='cuda:0') tensor(0.10, device='cuda:0', dtype=torch.float16)


In [133]:
preds[:5],y_train[:5]

(tensor([[3.72],
         [0.18],
         [3.33],
         [1.47],
         [8.40]], device='cuda:0'),
 tensor([5., 0., 4., 1., 9.], device='cuda:0'))

In [132]:
acc(preds, y_train)

tensor(0.35, device='cuda:0', dtype=torch.float16)