In [2]:
import torch
from torch import nn
from tests_backpropagation import main_test

torch.manual_seed(123)
torch.set_default_dtype(torch.double)

## Class ``MyNet``

Read carefully how ``MyNet`` is implemented in the cell below. In particular:  
- ``n_hid`` is a list of integer, representing the number of hidden units in each hidden layer.   
-  ``MyNet([2, 3, 2]) = MiniNet()`` where ``MiniNet`` is the neural network defined in the fourth tutorial, in which notations are also clarified.     
- ``model.L`` is the number of hidden layers, ``L``   
- ``model.f[l]`` is the activation function of layer ``l``, $f^{[l]}$ (here ``torch.tanh``)   
- ``model.df[l]`` is the derivative of the activation function, $f'^{[l]}$   
- ``model.a[l]``  is the tensor $A^{[l]}$, (shape: ``(1, n(l))``)   
- ``model.z[l]``  is the tensor $Z^{[l]}$, (shape: ``(1, n(l))``)  
- Weights $W^{[l]}$ (shape: ``(n(l+1), n(l))``) and biases $\mathbf{b}^{[l]}$ (shape: ``(n(l+1))``) can be accessed as follows:
```
weights = model.fc[str(l)].weight.data
bias = model.fc[str(l)].bias.data
```

In [3]:
class MyNet(nn.Module):
    def __init__(self, n_l = [2, 3, 2]):
        super().__init__() 
        
        
        # number of layers in our network (following Andrew's notations)
        self.L = len(n_l)-1
        self.n_l = n_l
        
        # Where we will store our neuron values
        # - z: before activation function 
        # - a: after activation function (a=f(z))
        self.z = {i : None for i in range(1, self.L+1)}
        self.a = {i : None for i in range(self.L+1)}

        # Where we will store the gradients for our custom backpropagation algo
        self.dL_dw = {i : None for i in range(1, self.L+1)}
        self.dL_db = {i : None for i in range(1, self.L+1)}

        # Our activation functions
        self.f = {i : lambda x : torch.tanh(x) for i in range(1, self.L+1)}

        # Derivatives of our activation functions
        self.df = {
            i : lambda x : (1 / (torch.cosh(x)**2)) 
            for i in range(1, self.L+1)
        }
        
        # fully connected layers
        # We have to use nn.ModuleDict and to use strings as keys here to 
        # respect pytorch requirements (otherwise, the model does not learn)
        self.fc = nn.ModuleDict({str(i): None for i in range(1, self.L+1)})
        for i in range(1, self.L+1):
            self.fc[str(i)] = nn.Linear(in_features=n_l[i-1], out_features=n_l[i])
        
    def forward(self, x):
        # Input layer
        self.a[0] = torch.flatten(x, 1)
        
        # Hidden layers until output layer
        for i in range(1, self.L+1):

            # fully connected layer
            self.z[i] = self.fc[str(i)](self.a[i-1])
            # activation
            self.a[i] = self.f[i](self.z[i])

        # return output
        return self.a[self.L]

## Tasks

Write a function ``backpropagation(model, y_true, y_pred)`` that computes:

- $\frac{\partial L}{\partial w^{[l]}_{i,j}}$ and store them in ``model.dL_dw[l][i,j]`` for $l \in [1 .. L]$ 
- $\frac{\partial L}{\partial b^{[l]}_{j}}$ and store them in ``model.dL_db[l][j]`` for $l \in [1 .. L]$ 

assuming ``model`` is an instance of the ``MyNet`` class.

A vectorized implementation would be appreciated.

In [303]:
def backpropagation(model, y_true, y_pred):
    with torch.no_grad():
        
        #From last layer to the first
        for l in range(model.L,0,-1):
            
            #If current layer is the last, then dz is the loss, else we need dh to compute dz.
            if l == model.L:
                #The partial derivative of loss with respect to last ouput with formula.
                #The dimensions of y_true and y_pred is transposed so it has shape 2x1
                dz = ((y_pred - y_true)*2).T
            else:
                dh = torch.matmul(model.fc[str(l+1)].weight.data.T, dz)
                dz = (dh*model.df[l](model.z[l]).T)

            db = dz.T[0]
            dw = torch.matmul(dz, model.a[l-1])
            model.dL_db[l] = db
            model.dL_dw[l] = dw


            
            
        
    
    return None

In [35]:
def backpropagation(model, y_true, y_pred):
    with torch.no_grad():
        
        #From last layer to the first
        for l in range(model.L,0,-1):
            
            #If current layer is the last, then dz is the loss, else we need dh to compute dz.
            if l == model.L:
                #The partial derivative of loss with respect to last ouput with formula.
                #The dimensions of y_true and y_pred is transposed so it has shape 2x1
                dz = (-1*(y_true - y_pred)).T
            else:
                dh = torch.matmul(model.fc[str(l+1)].weight.data.T, dz)
                dz = (dh*model.df[l](model.z[l]).T)

            db = dz.T[0]
            dw = torch.matmul(dz, model.a[l-1])
            model.dL_db[l] = db
            model.dL_dw[l] = dw


            
            
        
    
    return None

## Run the cells below, and check the output

- In the 1st cell, we use a toy dataset and the same architecture as the MiniNet class of the fourth tutorial. 
- In the 2nd cell, we use a few samples of the MNIST dataset with a consistent model architecture (``24x24`` black and white cropped images as input and ``10`` output classes). 

You can set ``verbose`` to ``True`` if you want more details about your computations versus what is expected.

In [36]:
model = MyNet([2, 3, 2])

main_test(backpropagation, model, verbose=True, data='toy')


 __________________________________________________________________ 
                          Check gradients                             
 __________________________________________________________________ 


 -------- Gradcheck with finite differences  --------- 
 residual error:
 [0.3331, 0.3325, 0.2764, 0.2289, 0.46, 0.5775, 0.8795, 0.9033, 0.9321, 0.9415]

 --------- Comparing with autograd values  ----------- 

 ******* fc['1'].weight.grad ******* 
  Our computation:
 tensor([[6.9394e-04, 6.2638e-04],
        [5.9461e+00, 5.3672e+00],
        [2.2708e-04, 2.0497e-04]])

  Autograd's computation:
 tensor([[2.2061e-05, 1.9914e-05],
        [1.0659e-01, 9.6214e-02],
        [5.9805e-06, 5.3982e-06]])

 ********* fc['1'].bias.grad ******* 
  Our computation:
 tensor([8.1589e-05, 6.9910e-01, 2.6698e-05])
  Autograd's computation:
 tensor([2.5938e-06, 1.2532e-02, 7.0314e-07])

 ------------------- relative error ------------------ 
(fc[1].weight.grad, model.dL_dw[1]):   54.7836
(fc[1

In [377]:

model = MyNet([24*24, 16, 10])
main_test(backpropagation, model, verbose=False, data='mnist')



 __________________________________________________________________ 
                          Check gradients                             
 __________________________________________________________________ 


 -------- Gradcheck with finite differences  --------- 
 residual error:
 [0.0247, 0.9944, 0.9894, 0.9937, 0.9935]

 ------------------- relative error ------------------ 
(fc[1].weight.grad, model.dL_dw[1]):   312.6307
(fc[1].bias.grad,   model.dL_db[1]):   312.6307
(fc[2].weight.grad, model.dL_dw[2]):   275.5594
(fc[2].bias.grad,   model.dL_db[2]):   275.5594

 TEST FAILED: Gradients NOT consistent with autograd's computations.

 TEST FAILED: Gradients NOT consistent with finite differences computations.

 __________________________________________________________________ 
                 Check that weights have been updated               
 __________________________________________________________________ 

 TEST PASSED: Weights have been updated.

 ________________________

In [29]:
torch.div(torch.tensor([[ 1.6216e-04,  1.8000e-04],
        [-1.3800e-02, -1.5318e-02],
        [ 1.2963e-05,  1.4389e-05]]), torch.tensor([[ 2.8287e-06,  3.1399e-06],
        [-2.4065e-04, -2.6713e-04],
        [ 2.2656e-07,  2.5148e-07]]))

tensor([[57.3267, 57.3267],
        [57.3447, 57.3429],
        [57.2166, 57.2173]])

In [388]:
1.3989e-04/1.4983e-04, 1.1068e-04/1.1854e-04

(0.9336581458986851, 0.9336932680951577)

In [181]:
def show_layers_param():
    for l in range(1,model.L+1):
        print('\nLayer: ', l)
        print('weights: ', model.fc[str(l)].weight.data)
        print('bias: ', model.fc[str(l)].bias.data)
    

In [312]:
show_layers_param()


Layer:  1
weights:  tensor([[ 0.1147, -0.2399],
        [ 0.0448, -0.6335],
        [-0.1786, -0.6912]])
bias:  tensor([-0.0807, -0.2581, -0.5455])

Layer:  2
weights:  tensor([[-0.1520, -0.3967,  0.1105],
        [ 0.5335, -0.3719, -0.1566]])
bias:  tensor([ 0.2745, -0.1143])


In [185]:
model.df[1](model.fc[str(2)].weight.data), model.df[1](model.fc[str(2)].bias.data)

(tensor([[1.0000, 0.9760, 0.9745],
         [0.9784, 0.7312, 0.9986]]),
 tensor([0.9458, 0.9909]))

In [None]:
model.dL_db

In [155]:
model.df[1](model.fc[str(1)].bias.data)

tensor([0.9857, 0.9996, 0.8158])

In [23]:
model.z[2]

tensor([[ 3.5002, -3.5263]], grad_fn=<AddmmBackward0>)

In [32]:
for l in range(model.L,0,-1):
    print(l)

2
1


In [156]:
model.df[1](model.z[2])

tensor([[0.7809, 0.8896]], grad_fn=<MulBackward0>)

In [157]:
model.a

{0: tensor([[-0.6030, -0.7212]]),
 1: tensor([[-0.2602,  0.0526, -0.1992]], grad_fn=<TanhBackward0>),
 2: tensor([[-0.4681, -0.3323]], grad_fn=<TanhBackward0>)}

In [158]:
model.df[2](model.z[2]).shape

torch.Size([1, 2])

In [159]:
model.z[2]* model.df[2](model.z[2])

tensor([[-0.3964, -0.3073]], grad_fn=<MulBackward0>)

In [72]:
model.a[2].shape

torch.Size([1, 2])

In [77]:
model.a[2]* (model.df[2](model.z[2]))

tensor([[ 0.0036, -0.0034]], grad_fn=<MulBackward0>)

In [332]:
model.dL_db

{1: tensor([0.4302]), 2: tensor([0.6775])}

In [146]:
model.fc

ModuleDict(
  (1): Linear(in_features=2, out_features=3, bias=True)
  (2): Linear(in_features=3, out_features=2, bias=True)
)

In [157]:
model.fc[str(1)].bias.data.dim(0)

TypeError: _TensorBase.dim() takes no arguments (1 given)

In [174]:
torch.matmul()

TypeError: matmul() missing 2 required positional argument: "input", "other"

In [203]:
model.a

{0: tensor([[ 0.7015, -2.4287]]),
 1: tensor([[-0.5260,  0.1538, -0.2480]], grad_fn=<TanhBackward0>),
 2: tensor([[ 0.3936, -0.5398]], grad_fn=<TanhBackward0>)}

In [297]:
model.z

{1: tensor([[ 0.3640, -0.2106,  0.0463]], grad_fn=<AddmmBackward0>),
 2: tensor([[0.0940, 0.2982]], grad_fn=<AddmmBackward0>)}

In [162]:
model.fc[str(2)].weight.data.shape

torch.Size([2, 3])

In [380]:
model.dL_dw

{1: tensor([[-0.0161, -0.0172],
         [ 0.0050,  0.0053],
         [-0.2129, -0.2269]]),
 2: tensor([[-16.0031,  16.0033, -15.9934],
         [ 16.0031, -16.0034,  15.9934]])}

In [9]:
model = MyNet([2, 3, 2])

In [12]:
show_layers_param()


Layer:  1
weights:  tensor([[ 0.0472, -0.6182],
        [ 0.6941,  0.2990],
        [ 0.5138, -0.0120]])
bias:  tensor([-0.2873,  0.5716, -0.0682])

Layer:  2
weights:  tensor([[-0.3801, -0.5443,  0.2979],
        [-0.2486, -0.4878, -0.2317]])
bias:  tensor([0.0515, 0.5273])


In [13]:
model.forward(torch.ones(1,2))

tensor([[-0.0612,  0.1574]], grad_fn=<TanhBackward0>)

In [14]:
model.a

{0: tensor([[1., 1.]]),
 1: tensor([[-0.6953,  0.9162,  0.4082]], grad_fn=<TanhBackward0>),
 2: tensor([[-0.0612,  0.1574]], grad_fn=<TanhBackward0>)}

In [15]:
model.z

{1: tensor([[-0.8582,  1.5647,  0.4335]], grad_fn=<AddmmBackward0>),
 2: tensor([[-0.0612,  0.1587]], grad_fn=<AddmmBackward0>)}

In [16]:
model.df[2](model.z[2])

tensor([[0.9963, 0.9752]], grad_fn=<MulBackward0>)

In [17]:
model.z[1][0]

tensor([-0.8582,  1.5647,  0.4335], grad_fn=<SelectBackward0>)

In [273]:
model.fc[str(2)].weight.data.shape

torch.Size([2, 3])

In [61]:
dh2 = torch.matmul(model.fc[str(2)].weight.data.T, model.z[2])
dh2

RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x2 and 1x2)

In [37]:
model.df[2](model.z[1]).T

tensor([[0.5165],
        [0.1606],
        [0.8333]], grad_fn=<PermuteBackward0>)

In [50]:
dz2 = torch.mul(dh2, model.df[1](model.z[1]))
dz2

tensor([[-0.0082, -0.0025, -0.0132],
        [-0.0225, -0.0070, -0.0362],
        [-0.0282, -0.0088, -0.0456]], grad_fn=<MulBackward0>)

In [51]:
dw2 = torch.matmul(dz2, model.a[2].T)
dw2

RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x3 and 2x1)

In [103]:
model.a[1].shape

torch.Size([1, 3])

In [130]:
torch.tensor([-5.8622e-06,  1.6592e-06, -7.5508e-05]).shape

torch.Size([3])

In [346]:
a = torch.tensor([1,0,3])
c = torch.reshape(a, (3,1))
a,c

(tensor([1, 0, 3]),
 tensor([[1],
         [0],
         [3]]))

In [347]:
b = model.df[1](a)
b

tensor([0.4200, 1.0000, 0.0099])

In [350]:
a*b

tensor([0.4200, 0.0000, 0.0296])