<a href="https://colab.research.google.com/github/ghlai9665/course-v3/blob/master/colab_gary_study_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Beginning after the matrix multiplication lesson, which has decent notes, but should take notes in a completely new Jupyter Notebook for better organization and retention.


# Forward Pass

## Imports

In [1]:
import operator

def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"

def test_eq(a,b): test(a,b,operator.eq,'==')

from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
import torch.nn.functional as F

def near(a,b): return torch.allclose(a, b, rtol=1e-3, atol=1e-5)
def test_near(a,b): test(a,b,near)

# Make MNIST data work on Google Colab
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

def get_data():
    import os
    import torchvision.datasets as datasets
    root = '../data'
    if not os.path.exists(root):
        os.mkdir(root)
    train_set = datasets.MNIST(root=root, train=True, download=True)
    test_set = datasets.MNIST(root=root, train=False, download=True)
    x_train, x_valid = train_set.train_data.split([50000, 10000])
    y_train, y_valid = train_set.train_labels.split([50000, 10000])
    return (x_train.view(50000, -1) / 256.0), y_train.float(), (x_valid.view(10000, -1))/ 256.0, y_valid.float()

# The geographic intuition for this is picturing x's around a horizontal line (mean), bring that mean down to 0,
# then scale x's by dividing them by the standard deviation
def normalize(x, mean, std): return (x-mean)/std

In [3]:
x_train, y_train, x_valid, y_valid = get_data()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw
Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [4]:
!pip show torchvision

Name: torchvision
Version: 0.8.2+cu101
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /usr/local/lib/python3.7/dist-packages
Requires: pillow, torch, numpy
Required-by: fastai


## Normalization

- We want mean to be 0 and standard deviation to be 1 for easier convergence, so we normalize. 
- Notice how we use train_mean and train_std to normalize valid data as well - that's because we don't want validation dataset to be in a different scale

In [5]:
# before normalization
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [6]:
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [7]:
# after normalization
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(3.9162e-08), tensor(1.))

In [8]:
def assert_is_near_zero(a, threshold=1e-3): assert a.abs() < threshold, f"{a} is not near zero"
assert_is_near_zero(train_mean)




## Get shapes

In [9]:
num_samples, image_size = x_train.shape 
num_classes = y_train.max() + 1 
nh = 50

n = num_samples
m = image_size
c = num_classes

n, m, c, nh

(50000, 784, tensor(10.), 50)

## Intialization

- Initialization is *extremely* important. In 2019, they wrote a paper "Fixup Initialization: Residual Learning Without Normalization" in which they trained a 10,000 layer neural net WITHOUT normalization just by initializing everything carefully.

### Xavier Initialization

- To perform Standard Xavier Initialization, you just divide input by the sqrt(num_input_units), which would give you a mean of 0, and standard deviation of 1 / sqrt(m)

In [10]:
def lin(x, w, b): return x@w + b

In [11]:
# Forward pass without Initialization
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh)

t = lin(x_valid, w1, b1)
t.mean(), t.std() # terrible, you want ~ (0,1) (mean,std)

(tensor(3.1541), tensor(26.8629))

In [12]:
# Forward pass with Standard Xavier Init
w1 = torch.randn(m, nh) * math.sqrt(1/m)
b1 = torch.zeros(nh)

t = lin(x_valid, w1, b1)
t.mean(), t.std() # good

(tensor(0.1336), tensor(0.9939))

In [13]:
assert_is_near_zero(w1.mean())
assert_is_near_zero(w1.std() - 1/math.sqrt(m))

### Vanishing Activation/Gradient Problem

- Remember after performing the matrix multiplication, you have to pass it through relu, but each time you do that, you cut all activation values that are below 0 to 0 and thereby reduces the standard deviation. If your network is very deep, your standard deviation will keep getting reduced (possibly down to 0)

![Screen Shot 2021-03-02 at 9.20.03 PM.png](attachment:a3f59b8e-febb-4866-8494-d2338a2b0374.png)

In [14]:
# clamp_min(n) means replace everything below n with n, in this case, relu means replacing everything negative with 0
# always try to use PyTorch function because they're generally implemented in C for you
def relu(x): return x.clamp_min(0.) 

t = relu(lin(x_valid, w1, b1)) 
t.mean(),t.std()

(tensor(0.4636), tensor(0.6180))

### Kaiming Initialization
- The problem with Xavier Initialization is that it doesn't combat the vanishing gradient problem very well. 
- Kaiming initialization is almost identical to Xavier initialization but with a 2 on top; it will keep the std around 

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}$$

- This was introduced in the paper that described the Imagenet-winning approach from *He et al*: [Delving Deep into Rectifiers](https://arxiv.org/abs/1502.01852), which was also the first paper that claimed "super-human performance" on Imagenet (and, most importantly, it introduced resnets as well as Kaiming He initialization!) 

- So papers by competition winners are very good because they introduce MANY good ideas instead of just one tiny tweak.

In [15]:
# Forward pass with Kaiming Initialization
torch.manual_seed(42)
w1 = torch.randn(m, nh) * math.sqrt(2/m)
b1 = torch.zeros(nh)

t = relu(lin(x_valid, w1, b1))
t.mean(), t.std() 

(tensor(0.6624), tensor(0.9097))

In [16]:
# Forward pass with PyTorch's Kaiming Initialization, same thing
from torch.nn.init import kaiming_normal_

w1 = torch.empty(m, nh)
b1 = torch.zeros(nh)

torch.manual_seed(42)
kaiming_normal_(w1, mode='fan_out')
t = relu(lin(x_valid, w1, b1))
t.mean(), t.std() 

(tensor(0.6624), tensor(0.9097))

- Note: Kaiming initialization is very good but notice the mean is still not zero - we have good reasons to want them to be. So we can define our own new_relu to see if it helps with normalizing the mean. It's an intuitive thing to do and papers are written from these minor tweaks. Maybe it'll help a lot in practice

In [17]:
def new_relu(x): return x.clamp_min(0.) - 0.5

In [18]:
# The new_relu seems to help!
torch.manual_seed(42)
w1 = torch.randn(m,nh) * math.sqrt(2./m)
t1 = new_relu(lin(x_valid, w1, b1))
t1.mean(), t1.std()

(tensor(0.1624), tensor(0.9097))

## Train a Model

In [19]:
torch.manual_seed(42)

w1 = torch.empty(m, nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

def model(x):
    l1 = lin(x, w1, b1)
    l2 = relu(l1)
    l3 = lin(l2, w2, b2)
    return l3

In [20]:
%timeit -n 10 _=model(x_valid)

10 loops, best of 5: 7.46 ms per loop


In [21]:
assert model(x_valid).shape == torch.Size((x_valid.shape[0],1))

## Loss Function

- We wrongly use the MSE for now just for simplicity's sake

In [22]:
def mse(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()

In [23]:
y_train, y_valid = y_train.float(), y_valid.float()

In [24]:
pred = model(x_train)

In [25]:
y_train.shape

torch.Size([50000])

In [26]:
pred.shape # not the exact shape, need to squeeze in mse

torch.Size([50000, 1])

In [27]:
mse(pred, y_train)

tensor(inf)

# Backward Pass

- During backward pass, you calculate the gradient of every w1, b1, w2, b2 with respect to the loss
- For each of the function below, we take the derivative of each layer in terms of loss, storing the result in thatlayer's .g -- in other words, x.grad stores the result of dloss/dx. Note x is the denominator, the layer.
- DON'T RUN THIS FUNCTION LOCALLY ON CPU. IT REQUIRES GPU!

In [28]:
def mse_grad(inp, targ):
    # gradient of loss with respect to the previous layer, so it's pred.grad == dloss/dpred
    inp.grad = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]

In [29]:
def relu_grad(inp, out):
    # inp.grad == dout/dinp
    inp.grad = (inp > 0).float() * out.grad





In [30]:
def lin_grad(inp, out, w, b):
    # dloss / dl
    inp.grad = out.grad @ w.t() 
    # dloss / dw
    w.grad = (inp.unsqueeze(-1) * out.grad.unsqueeze(1)).sum(0)
    # dloss / db
    b.grad = out.grad.sum(0)

# Full Pass: Forward + Backward 


In [31]:
from torch.nn import init
torch.manual_seed(42)

# Our forward + backward loop

w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out')
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

def forward_and_backward(inp, target):
    # forward
    l1 = lin(inp, w1, b1)
    l2 = relu(l1)
    pred = lin(l2, w2, b2)
    loss = mse(pred, target)    
    
    # backward
    # pred.grad = dloss/dpred
    mse_grad(pred, target) 
    # l2.grad = dloss/dl2 = dloss/dpred * dpred/dl2
    # w2.grad = dloss/dw2 = dloss/dpred * dpred/dw2
    # b2.grad = dloss/db2 = dloss/dpred * dpred/db2
    lin_grad(l2, pred, w2, b2) 
    # l1.grad = dloss/dl1 = dloss/dl2 * dl2/dl1
    relu_grad(l1, l2)
    # x.grad = dloss/dx = dloss/dl1 * dl1/dx
    # w1.grad = dloss/dw1 = dloss/dl1 * dl1/dw1
    # b1.grad = dloss/db1 = dloss/dl1 * dl1/db1
    lin_grad(inp, l1, w1, b1)

forward_and_backward(x_train, y_train)

In [32]:
# pytorch's forward + backward loop
w1_2 = w1.clone().requires_grad_(True)
w2_2 = w2.clone().requires_grad_(True)
b1_2 = b1.clone().requires_grad_(True)
b2_2 = b2.clone().requires_grad_(True)
x_train_2 = x_train.clone().requires_grad_(True)

def forward(inp, targ):
    # forward pass:
    l1 = lin(inp, w1_2, b1_2)
    l2 = relu(l1)
    pred = lin(l2, w2_2, b2_2)
    # we don't actually need the loss in backward!
    return mse(pred, targ) 

loss = forward(x_train_2, y_train)
loss.backward()

In [33]:
test_near(w1_2.grad, w1.grad)
test_near(w2_2.grad, w2.grad)
test_near(b1_2.grad, b1.grad)
test_near(b2_2.grad, b2.grad)
test_near(x_train_2.grad, x_train.grad)

# Checkpoint: https://course19.fast.ai/videos/?lesson=8 , 1:58:04