In [29]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (9,9)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
np.random.seed(0)

![simpleresnet.png](simpleresnet.png)

This exercise uses a simple implementation of a deep neural network to explore the vanishing gradient problem.

In [291]:
# Scalar "deep network" used to inspect how the gradient of the loss with
# respect to each layer's weight changes with depth.

# Choose an activation function
activation = torch.tanh

# Choose a number of iterations (one hidden layer per iteration)
n = 4

# Toggle the residual (skip) connection.
#   False -> plain network:    z_i = activation(a_i)
#   True  -> residual network: z_i = activation(a_i) + z_prev
use_skip_connection = False

# Store the feed-forward steps
w_list = []
z_list = []
a_list = []

# Make up some data
z_obs = torch.tensor([1.0])

# Initial value
x = torch.tensor([10.], requires_grad=True)
z_prev = x

# Loop over a number of hidden layers
for i in range(n):
    # New weight; created as a leaf tensor so backward() populates w_i.grad
    w_i = torch.tensor([1.0], requires_grad=True)

    # Linear transform
    a_i = z_prev * w_i

    # Activation
    zprime_i = activation(a_i)

    # Layer output, with or without the skip connection
    z_i = zprime_i + z_prev if use_skip_connection else zprime_i

    # Store forward model stuff
    # (z_list holds the activation output, i.e. the value before any skip is added)
    w_list.append(w_i)
    z_list.append(zprime_i)
    a_list.append(a_i)

    # output of layer i becomes input for layer i+1
    z_prev = z_i

# Objective function
L = 0.5 * (z_i - z_obs)**2

# Reverse-mode AD
L.backward()

# Print each weight's gradient
print([w_.grad for w_ in w_list])


[tensor([-0.]), tensor([-0.0727]), tensor([-0.1319]), tensor([-0.1892])]


We have now seen how adding skip connections seemingly solves the vanishing gradient problem; that is all the paper was saying.
<br>
<br>
This is a simple example of an image processing problem where adding a skip connection would be helpful.

In [286]:
# basic net class
class Net(nn.Module):
    """Small CNN classifier: one 3x3 convolution, ReLU, then a linear layer.

    The linear layer expects 5*26*26 features, which is what a 3x3
    convolution with default stride/padding yields for 1x28x28 inputs
    (5 feature maps of 26x26).

    Args:
        num_input_images: Nominal batch size. It is stored on the instance
            for backward compatibility, but ``forward`` does not rely on it
            and accepts batches of any size.
    """

    def __init__(self, num_input_images):
        # Initialise nn.Module first so attribute registration is set up
        super(Net, self).__init__()
        self.num_input_images = num_input_images
        self.conv1 = nn.Conv2d(1, 5, 3)
        self.linearization = nn.Linear(5*26*26, 10)

    def forward(self, x):
        """Return the 10 class scores for each image in the batch ``x``."""
        # convolution
        x = self.conv1(x)
        # activation
        x = F.relu(x)
        # Flatten each image's feature maps for the linear layer, using the
        # batch dimension actually present rather than the size given at
        # construction time
        x = x.view(x.size(0), -1)
        # linear read-out to class scores
        x = self.linearization(x)
        return x

In [289]:
# Experiment configuration
num_input_images = 100
num_epochs = 100
num_classes = 10  # Net's output layer has 10 units, so keep these in sync
learning_rate = 1e-3  # Step size passed to the optimizer

# Everyone's playing with the same seed, same data.
# The order of the random draws below (train data, test data, labels, then
# the network's initial weights) determines the values produced, so keep it.
torch.manual_seed(0)
rand_train_data = torch.randn(num_input_images, 1, 28, 28)
rand_test_data = torch.randn(num_input_images, 1, 28, 28)
# Integer labels drawn from [0, num_classes)
rand_labels = torch.empty(num_input_images, dtype=torch.long).random_(0, num_classes)

# net class
net = Net(num_input_images)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [290]:
# Full-batch training: every epoch is a single optimizer update on the whole
# training tensor, so num_batches counts the updates performed.
num_batches = 0
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Clear the gradients left over from the previous step
    outputs = net(rand_train_data)  # Forward pass: compute class scores for the training images
    loss = criterion(outputs, rand_labels)  # Compute the loss between the class scores and the given labels
    loss.backward()  # Backward pass: compute gradients of the loss w.r.t. the parameters
    optimizer.step()  # Update the parameters using those gradients
    num_batches += 1
    print(epoch, loss.item())

0 2.334779739379883
1 2.080829381942749
2 1.861109733581543
3 1.6222866773605347
4 1.4008594751358032
5 1.2021803855895996
6 1.0175918340682983
7 0.8464715480804443
8 0.6945523619651794
9 0.5652304291725159
10 0.45722895860671997
11 0.3678813874721527
12 0.2950149476528168
13 0.23656459152698517
14 0.1902393400669098
15 0.1537216305732727
16 0.12497119605541229
17 0.10233154892921448
18 0.08448765426874161
19 0.07039088010787964
20 0.059205152094364166
21 0.05027180165052414
22 0.04308139905333519
23 0.037246380001306534
24 0.032474275678396225
25 0.028543196618556976
26 0.02528352662920952
27 0.02256440930068493
28 0.020283177495002747
29 0.018358616158366203
30 0.016725942492485046
31 0.015333104878664017
32 0.014138107188045979
33 0.01310708001255989
34 0.012212511152029037
35 0.01143216248601675
36 0.01074786577373743
37 0.010144847445189953
38 0.009610974229872227
39 0.009136296808719635
40 0.008712565526366234
41 0.008332919329404831
42 0.007991624064743519
43 0.00768382847309112

**Questions**

1. What is the vanishing gradient problem, and what is its primary cause?

2. What are 4 limitations to optimizing a deep convolutional neural network?

3. In terms of how a given block of a network is "fitted", what is the key difference between using skip connections and traditional blocks?

4. In the context of model hyper-parameters, what additional parameters are added in the res-net implementation?

5. How do skip connections resolve the "vanishing gradient" problem?

6. Give an appropriate analogy for how kernels are used to extract features from images (e.g., sanding wood).

7. Max's questions: Was this a good paper when it was released? Is it a good paper now? What has changed between now and its initial release? What other methods are there of solving the vanishing gradient problem?