In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (9,9)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
np.random.seed(0)

![simpleresnet.png](simpleresnet.png)

This exercies uses a simple implementation of a deep neural network to explore the vanishing gradient problem

In [2]:
# Choose an activation function
activation = torch.tanh

# Choose a number of iterations
n = 4

# Store the feed-forward steps
w_list = []
z_list = []
a_list = []

# Make up some data
z_obs = torch.tensor([1.0])

# Initial value
x = torch.tensor([10.],requires_grad=True)
z_prev = x

# Loop over a number of hidden layers
for i in range(n):
    # New weight
    w_i = torch.tensor([1.0],requires_grad=True)

    # Linear transform
    a_i = z_prev*w_i

    # Activation
    zprime_i = activation(a_i)

    # Without skip connection
    z_i = zprime_i
    
    # With skip connection
    #z_i = zprime_i + z_prev

    # Store forward model stuff
    w_list.append(w_i)
    z_list.append(zprime_i)
    a_list.append(a_i)

    # output of layer i becomes input for layer i+1
    z_prev = z_i

# Objective function
L = 0.5*(z_i - z_obs)**2

# Reverse-mode AD
L.backward()

# Print each weight's gradient
print([w_.grad for w_ in w_list])


[tensor([-0.]), tensor([-0.0727]), tensor([-0.1319]), tensor([-0.1892])]


Now that we have seen how implementing skip connections seemingly solve the problem of vanishing gradients,
lets apply this idea to something a bit fancier, like classifying handwritten images.

In [4]:
train_dataset = dsets.MNIST(root='./data',
                           train=True,
                           transform=transforms.ToTensor(),
                           download=True)

test_dataset = dsets.MNIST(root='./data',
                           train=False,
                           transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=100,
                                          shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100,
                                          shuffle=False)

In [3]:
class Net(nn.Module):
    def __init__(self,layers, input_size, num_class,batch_size):
        super(Net, self).__init__()
        self.bs = batch_size
        self.input = input_size
        self.classes = num_class
        self.layers = layers
        self.conv1 = nn.Conv2d(self.input,self.bs,2)
        self.conv2 = nn.Conv2d(self.input,self.bs,2)
        self.linears = nn.ModuleList([nn.Linear(self.input,self.input)])
        self.linears.extend([nn.Linear(self.input, self.input) for i in range(1, self.layers-1)])
        self.linears.append(nn.Linear(self.input, self.classes))
        
    def forward(self, x):
        temp = None
        i = 0
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        for layer in self.linears:
            if i%3 == 0:
                x = torch.sigmoid(layer(x))
                i += 1
            else:
                x = layer(x)
                i += 1

        return x    

In [5]:
input_size = 784       # The image size = 28 x 28 = 784
num_classes = 10       # The number of output classes. In this case, from 0 to 9
num_epochs = 10         # The number of times entire dataset is trained
batch_size = 100       # The size of input data took for one iteration
learning_rate = 1e-3  # The speed of convergence
net = Net(10, input_size,num_classes,batch_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn.ParameterList(net.parameters()), lr=learning_rate)

In [14]:
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):   # Load a batch of images with its (index, data, class)
        images = Variable(images.view(-1,28*28))         # Convert torch tensor to Variable: change image from a vector of size 784 to a matrix of 28 x 28
        labels = Variable(labels)
        optimizer.zero_grad()                             # Intialize the hidden weight to all zeros
        outputs = net(images)                             # Forward pass: compute the output class given a image
        loss = criterion(outputs, labels)                 # Compute the loss: difference between the output class and the pre-given label
        loss.backward()                                   # Backward pass: compute the weight
        optimizer.step()
    total=0
    correct=0
    # Loop over all the test examples and accumulate the number of correct results in each batch
    for d,t in test_loader:
        outputs = net(d.view(-1,28*28))
        _, predicted = torch.max(outputs.data,1)
        total += Variable(t).size(0)
        correct += (predicted==t).sum()
        
    # Print the epoch, the training loss, and the test set accuracy.
    print(epoch,loss.item(),(100.*correct/total).item())

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [100, 784, 2, 2], but got 2-dimensional input of size [100, 784] instead

**Questions**

1. What is the vanishing gradient problem, and what is its primary cause?

2. What are 4 limitations to optimizing a deep convolutional neural network?

3. In terms of how a given block of a network is "fitted", what is the key difference between using skip connections and traditional blocks?

4. In the context of model hyper-parameters, what additional parameters is added in the res-net implementation?

5. How do skip connections resolve the "vanishing gradient" problem?

6. Give an appropriate anology for how kernals are used to extract features from images (i.e. sanding wood)
