# Tutorial 4

* Implementation of Backpropagation on toy example
* Vanishing/exploding gradients demonstration


## Toy example 

In this example we will build and train a neural-net for binary classification using only basic tensor operations.

### Model Description:
The network consists of 2 linear layers, each followed by a Sigmoid activation. The loss function is binary cross-entropy.
<a href="https://drive.google.com/uc?export=view&id=1Q8ucKs76yeiloZd-LgwNUw50BRJy2xSq">
    <img src="https://drive.google.com/uc?export=view&id=1Q8ucKs76yeiloZd-LgwNUw50BRJy2xSq"
    style="width: 400px; max-width: 100%; height: auto"
    title="Click for the larger version." />
</a>



$N = $ batch size <br>
$X \in \mathbb{R}^{\text{N} \times 2}$

$ W_1 \in \mathbb{R}^{2 \times \text{hidden}}$

$ W_2 \in \mathbb{R}^{\text{hidden} \times 1}$


$$L(y, \hat{y}) = -\frac{1}{N}\sum_{i=1}^{N} \left[ y_i\log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i) \right]$$

##### Forward pass:
$Z_1 = XW_1 + b_1$ $ \quad \quad \left[Z_1 \in \mathbb{R}^{\text{N} \times \text{hidden}}\right]$<br>
$H = \sigma(Z_1)$<br>
$Z_2 = HW_2 + b_2 \quad \quad \left[Z_2 \in \mathbb{R}^{\text{N} \times 1}\right]$<br>
$\hat{Y} = \sigma(Z_2)$<br>

##### Backward pass:
We are interested in $\frac{\partial L}{\partial W_1}$, $\frac{\partial L}{\partial b_1}$, $\frac{\partial L}{\partial W_2}$ and $\frac{\partial L}{\partial b_2}$

---------
$\large \frac{\partial L}{\partial W_1} = \frac{\partial L}{\partial \hat{Y}} \frac{\partial \hat{Y}}{\partial Z_2} \frac{\partial Z_2}{\partial H} \frac{\partial H}{\partial Z_1} \frac{\partial Z_1}{\partial W_1}$

$\large \frac{\partial L}{\partial b_1} = \frac{\partial L}{\partial \hat{Y}} \frac{\partial \hat{Y}}{\partial Z_2} \frac{\partial Z_2}{\partial H} \frac{\partial H}{\partial Z_1} \frac{\partial Z_1}{\partial b_1}$

$\large \frac{\partial L}{\partial W_2} = \frac{\partial L}{\partial \hat{Y}} \frac{\partial \hat{Y}}{\partial Z_2} \frac{\partial Z_2}{\partial W_2}$

$\large \frac{\partial L}{\partial b_2} = \frac{\partial L}{\partial \hat{Y}} \frac{\partial \hat{Y}}{\partial Z_2} \frac{\partial Z_2}{\partial b_2}$

---------


$\large \frac{\partial L}{\partial \hat{Y}} = -\frac{1}{N}\left(\frac{Y}{\hat{Y}}-\frac{1-Y}{1-\hat{Y}}\right) = \frac{1}{N}\frac{\hat{Y}-Y}{\hat{Y}(1-\hat{Y})}$ $\quad \quad \left[\frac{\partial L}{\partial \hat{Y}} \in \mathbb{R}^{\text{N} \times 1}\right] $

$\large \frac{\partial L}{\partial Z_2} = \frac{\partial L}{\partial \hat{Y}} \odot \left(\sigma(Z_2) \odot (1-\sigma(Z_2))\right)$ $\quad \quad$ ($\odot$ denotes element-wise multiplication)

$\large \frac{\partial L}{\partial W_2} = H^T \frac{\partial L}{\partial Z_2}$

$\large \frac{\partial L}{\partial b_2} = (\frac{\partial L}{\partial Z_2})^T1$

$\large \frac{\partial L}{\partial H} = \frac{\partial L}{\partial Z_2}W_2^T $

$\large \frac{\partial L}{\partial Z_1} = \frac{\partial L}{\partial H} \odot \left(\sigma(Z_1) \odot (1-\sigma(Z_1))\right)$ $\quad \quad$ (element-wise products)

$\large \frac{\partial L}{\partial W_1} = X^T \frac{\partial L}{\partial Z_1}$

$\large \frac{\partial L}{\partial b_1} = (\frac{\partial L}{\partial Z_1})^T1$

In [8]:
import torch
import torch.nn as nn
from matplotlib import pyplot as plt  
import math

In [139]:
# Sanity check: evaluate the hand-written cross-entropy and PyTorch's
# nn.CrossEntropyLoss on the same random scores and labels.
torch.manual_seed(42)  # fixed seed so the comparison is reproducible
batch_size = 100
num_classes = 10
x = torch.rand((batch_size, num_classes))
y = torch.randint(0, num_classes, (batch_size,))
print("myloss", cross_entropy_loss(x, y))

# Reference value from the built-in loss (displayed as the cell result).
loss = nn.CrossEntropyLoss()
loss(x, y)

tensor([[0.8823, 0.9150, 0.3829, 0.9593, 0.3904, 0.6009, 0.2566, 0.7936, 0.9408,
         0.1332],
        [0.9346, 0.5936, 0.8694, 0.5677, 0.7411, 0.4294, 0.8854, 0.5739, 0.2666,
         0.6274],
        [0.2696, 0.4414, 0.2969, 0.8317, 0.1053, 0.2695, 0.3588, 0.1994, 0.5472,
         0.0062],
        [0.9516, 0.0753, 0.8860, 0.5832, 0.3376, 0.8090, 0.5779, 0.9040, 0.5547,
         0.3423],
        [0.6343, 0.3644, 0.7104, 0.9464, 0.7890, 0.2814, 0.7886, 0.5895, 0.7539,
         0.1952],
        [0.0050, 0.3068, 0.1165, 0.9103, 0.6440, 0.7071, 0.6581, 0.4913, 0.8913,
         0.1447],
        [0.5315, 0.1587, 0.6542, 0.3278, 0.6532, 0.3958, 0.9147, 0.2036, 0.2018,
         0.2018],
        [0.9497, 0.6666, 0.9811, 0.0874, 0.0041, 0.1088, 0.1637, 0.7025, 0.6790,
         0.9155],
        [0.2418, 0.1591, 0.7653, 0.2979, 0.8035, 0.3813, 0.7860, 0.1115, 0.2477,
         0.6524],
        [0.6057, 0.3725, 0.7980, 0.8399, 0.1374, 0.2331, 0.9578, 0.3313, 0.3227,
         0.0162],
        [0

tensor(2.3436)

In [133]:
##### Utils functions 
def sigmoid(s):
    """Apply the logistic sigmoid 1 / (1 + exp(-s)) element-wise.

    s: tensor of pre-activation values.
    Returns a tensor with the same shape as s.
    """
    denominator = torch.exp(-s) + 1
    return 1 / denominator

def sigmoidPrime(s):
    """Derivative of the sigmoid, written in terms of the sigmoid's output.

    s: the sigmoid output (the activation), not the pre-activation value.
    Returns s * (1 - s), computed element-wise.
    """
    complement = 1 - s
    return s * complement

def tanh(t):
    """Hyperbolic tangent, element-wise, built from basic tensor operations.

    Uses the identity
        tanh(t) = sign(t) * (1 - exp(-2|t|)) / (1 + exp(-2|t|))
    so that only non-positive values are exponentiated. The direct form
    (exp(t) - exp(-t)) / (exp(t) + exp(-t)) overflows to inf / inf = NaN
    once |t| is large.

    t: tensor of pre-activation values.
    Returns a tensor with the same shape as t, with values in [-1, 1].
    """
    decay = torch.exp(-2 * torch.abs(t))  # always in (0, 1], cannot overflow
    return torch.sign(t) * (1 - decay) / (1 + decay)

def tanhPrime(t):
    """Derivative of tanh, written in terms of the tanh output.

    t: the tanh output (the activation), not the pre-activation value.
    Returns 1 - t^2, computed element-wise.
    """
    squared = t * t
    return 1 - squared

def softmax(x):
    """Softmax over the last dimension of x.

    The maximum of each row is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps every exponent <= 0, so
    large scores no longer overflow to inf and produce NaN.

    x: tensor of scores; normalisation is along the last axis.
    Returns a tensor with the same shape as x whose last axis sums to 1.
    """
    if x.numel() == 0:
        # Nothing to normalise; max() over an empty dimension would raise.
        return x.exp()
    shifted = x - x.max(-1, keepdim=True)[0]
    exps = shifted.exp()  # computed once and reused for the normaliser
    return exps / exps.sum(-1, keepdim=True)

def cross_entropy_loss(x, y):
    """Mean cross-entropy between raw scores x and integer class labels y.

    x: tensor of scores with one row per sample and one column per class.
    y: class index of each sample; its first dimension must match x's.
    Returns a scalar tensor: the average negative log-probability that the
    softmax of each row assigns to that row's true class.
    """
    assert x.shape[0] == y.shape[0] # make sure batch sizes are the same
    # Compute log-softmax via log-sum-exp. Taking log(softmax(x)) directly
    # gives -inf (or NaN) as soon as a probability underflows or exp overflows.
    log_probs = x - torch.logsumexp(x, dim=-1, keepdim=True)
    rows = torch.arange(x.shape[0], device=x.device)
    labels = torch.as_tensor(y, dtype=torch.long, device=x.device)
    # Pick the true-class log-probability of every sample in one indexing step.
    return -log_probs[rows, labels].sum() / x.shape[0]

def cross_entropy_loss_prime(x, y):
    """Batch-averaged (softmax(x) - one_hot(y)), summed over the samples.

    x: tensor of scores whose first axis is the batch and last axis the
       classes; singleton axes in between are flattened away.
    y: class index of each sample.
    Returns a 1-D tensor with one entry per class.
    """
    batch = x.shape[0]
    # Reshape to (batch, classes) instead of calling squeeze(): squeeze()
    # also removes a batch axis of size 1, which broke the per-row indexing
    # below for single-sample batches.
    ret = softmax(x).reshape(batch, x.shape[-1])
    rows = torch.arange(batch, device=ret.device)
    labels = torch.as_tensor(y, dtype=torch.long, device=ret.device)
    # Subtract 1 at each sample's true class (softmax minus one-hot target).
    ret[rows, labels] -= 1
    return ret.sum(0) / batch
        

class Neural_Network:
    """Two-layer fully connected network with hand-written backpropagation.

    Forward pass:
        z1 = X W1 + b1,  h = tanh(z1),  z2 = h W2 + b2,  y_hat = sigmoid(z2)
    """

    def __init__(self, input_size=784, hidden_size=32, output_size=10):
        """Create the parameters.

        input_size: number of input features per sample.
        hidden_size: number of hidden units.
        output_size: number of output units.
        """
        # parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # weights: standard-normal matrices, zero biases
        self.W1 = torch.randn(self.input_size, self.hidden_size)
        self.b1 = torch.zeros(self.hidden_size)

        self.W2 = torch.randn(self.hidden_size, self.output_size)
        self.b2 = torch.zeros(self.output_size)

    def forward(self, X):
        """Run the forward pass and cache z1, h and z2 on the instance.

        X: input tensor whose last dimension equals input_size.
        Returns sigmoid(z2).
        """
        self.z1 = torch.matmul(X, self.W1) + self.b1
        self.h = tanh(self.z1)
        self.z2 = torch.matmul(self.h, self.W2) + self.b2
        return sigmoid(self.z2)

    def backward(self, X, y, y_hat, lr=.1):
        """Backpropagate and apply one gradient-descent step in place.

        Must be called after forward(), which caches the hidden activation.

        X: the 2-D input batch that was passed to forward().
        y: targets; assumed to have the same shape as y_hat -- TODO confirm
           against callers.
        y_hat: the output returned by forward().
        lr: learning rate.
        """
        batch_size = y.size(0)
        # For a sigmoid output with binary cross-entropy the sigmoid
        # derivative cancels, leaving dL/dz2 = (y_hat - y) / N.
        dl_dz2 = (1/batch_size)*(y_hat - y)

        # Use W2 before it is updated below.
        dl_dh = torch.matmul(dl_dz2, torch.t(self.W2))
        # The hidden activation is tanh (see forward), so its derivative is
        # 1 - h^2; the previous sigmoid derivative h * (1 - h) was wrong.
        dl_dz1 = dl_dh * tanhPrime(self.h)

        self.W1 -= lr*torch.matmul(torch.t(X), dl_dz1)
        # Bias gradients are the per-unit sums over the batch.
        self.b1 -= lr*dl_dz1.sum(0)
        self.W2 -= lr*torch.matmul(torch.t(self.h), dl_dz2)
        self.b2 -= lr*dl_dz2.sum(0)

    def train(self, X, y):
        """Do one training step: a forward pass followed by backward()."""
        o = self.forward(X)
        self.backward(X, y, o)

In [52]:
import torchvision.datasets as dsets
import torchvision.transforms as transforms
# MNIST Dataset (Images and Labels)

# Number of samples per mini-batch for both loaders.
batch_size = 100

# Convert each image to a tensor, then reshape it into rows of 28*28 values.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1, 28*28)),
])

# Training split (downloaded into ./data if requested here).
train_dataset = dsets.MNIST(root='./data', train=True, transform=transform, download=True)

# Test split, read from the same root directory.
test_dataset = dsets.MNIST(root='./data', train=False, transform=transform)

# Dataset loaders (input pipeline): shuffle only the training data.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [172]:
# Run a freshly initialised network on the first training batch only and
# print the resulting cross-entropy loss.
net = Neural_Network()
for batch_ndx, batch in enumerate(train_loader):
    # batch[0] holds the inputs, batch[1] the labels.
    output = net.forward(batch[0])
    output = output.squeeze()  # drop singleton dimensions from the output
    loss = cross_entropy_loss(output, batch[1])
    print(loss)
    break  # a single batch is enough for this check

tensor(2.3631)
