In [68]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (9,9)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
np.random.seed(0)
from torch.utils.data import TensorDataset

![simpleresnet.png](simpleresnet.png)

This exercise uses a simple implementation of a deep neural network to explore the vanishing gradient problem.

In [3]:
# Nonlinearity applied after every linear transform
activation = torch.tanh

# Depth of the toy network (number of hidden layers)
n = 4

# Per-layer records of the forward pass
w_list, z_list, a_list = [], [], []

# Synthetic observation the final output is compared against
z_obs = torch.tensor([1.0])

# Network input; it also seeds the running layer output
x = torch.tensor([10.], requires_grad=True)
z_prev = x

# Forward pass: one hidden layer per iteration
for i in range(n):
    # Fresh scalar weight for this layer, tracked by autograd
    w_i = torch.ones(1, requires_grad=True)

    # Linear transform followed by the activation
    a_i = w_i * z_prev
    zprime_i = activation(a_i)

    #TODO: replace the line below with one that would add a skip connection
    z_i = zprime_i

    # Record this layer's weight, activation output and pre-activation
    w_list.append(w_i)
    z_list.append(zprime_i)
    a_list.append(a_i)

    # This layer's output is the next layer's input
    z_prev = z_i

# Squared-error objective on the last layer's output
L = (z_i - z_obs).pow(2) * 0.5

# Reverse-mode AD fills in .grad on every leaf weight
L.backward()

# Print each weight's gradient
print([w_.grad for w_ in w_list])


[tensor([-0.]), tensor([-0.0727]), tensor([-0.1319]), tensor([-0.1892])]


Now that we have seen how implementing skip connections seemingly solves the problem of vanishing gradients, we've learned all we can from the paper. Let's look at some applications.

------------

Below is a simple example of an image processing problem where the vanishing gradient problem becomes an issue (no need to show it this time).

For the training and testing data, I generated random images for a training set and a test set. If the small problems are too easy, feel free to increase the size of the datasets to make the problems more challenging.

After you are done with the conceptual questions below, feel free to change the architecture of the net below. Make 3 changes to the architecture, record the difference in loss after 100 iterations, and come up with a justification for that difference.

In [90]:
def _every_twentieth(dataset):
    """Return every 20th image and label of ``dataset`` as a pair of tensors.

    Images are cast to float and get a new axis at position 1; labels get a
    new axis at position 1 as well, so each label is a length-1 row.
    """
    images = dataset.data[::20].unsqueeze(1).float()
    labels = dataset.targets[::20].unsqueeze(1)
    return images, labels


# MNIST training split (downloaded into ./data if it is not there yet)
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# MNIST test split, read from the same root directory
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

# NOTE(review): the tensors below are read straight from `.data`, which
# presumably bypasses the `transform` given above, so the images would keep
# their raw pixel scale -- confirm whether normalisation is wanted.
train_data, train_targets = _every_twentieth(train_dataset)
test_data, test_targets = _every_twentieth(test_dataset)

# Pair images with labels so a DataLoader can batch them together
training_data = TensorDataset(train_data, train_targets)
testing_data = TensorDataset(test_data, test_targets)

# Training batches are reshuffled on every pass; test batches keep their order
train_loader = torch.utils.data.DataLoader(
    dataset=training_data, batch_size=100, shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    dataset=testing_data, batch_size=100, shuffle=False
)

In [91]:
# Quick look at the data prepared above. Only the last bare expression of a
# cell is displayed, so the two shape lookups below produce no visible output.
train_dataset.data.shape
train_dataset.data[::20].shape
# Displays the full subsampled image tensor (see the recorded output below).
train_data.data

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0.

In [92]:
# basic net class
class Net(nn.Module):
    """Small CNN: one 3x3 convolution applied repeatedly, then a conv + linear head.

    The same ``conv1`` layer (and therefore the same weights) is applied
    ``num_layers`` times, each application followed by a ReLU. ``convout``
    then maps the single channel to 5 channels with an unpadded 3x3 kernel,
    and ``linearization`` maps the flattened ``5*26*26`` features to 10
    outputs. The ``26`` implies 28x28 single-channel inputs.

    Args:
        num_input_images: Nominal batch size. Stored for backward
            compatibility; ``forward`` reads the batch size from its input.
        num_layers: Number of times ``conv1`` + ReLU is applied.
    """

    def __init__(self, num_input_images, num_layers):
        # Initialise nn.Module before touching attributes so that sub-module
        # registration is set up.
        super().__init__()
        self.num_input_images = num_input_images
        self.num_layers = num_layers
        # Layers are created in the original order so that a seeded
        # initialisation draws the same random numbers as before.
        self.conv1 = nn.Conv2d(1, 1, 3, padding=1)
        self.linearization = nn.Linear(5*26*26, 10)
        self.convout = nn.Conv2d(1, 5, 3)

    def forward(self, x):
        """Map a batch of ``(B, 1, 28, 28)`` images to ``(B, 10)`` scores."""
        # NOTE(review): `zprev` is never read; presumably a placeholder for
        # adding a skip connection -- confirm before removing.
        zprev = x
        # Repeated convolution + activation (padding=1 keeps the spatial size)
        for _ in range(self.num_layers):
            x = F.relu(self.conv1(x))
        x = self.convout(x)
        # Flatten each image's feature maps for the linear layer. The batch
        # size comes from the tensor itself, so a batch that is smaller or
        # larger than `num_input_images` no longer raises in `view`.
        x = x.view(x.size(0), -1)
        # Linear read-out on the flattened features
        return self.linearization(x)

In [93]:
# Experiment settings
num_input_images = 100  # batch size handed to Net
num_epochs = 1500
num_classes = 10
num_layers = 5  # number of conv + ReLU repetitions inside Net

# Everyone's playing with the same seed, same data
torch.manual_seed(0)

# Random images and integer labels in [0, 10). The four draws happen in the
# same order as before so the seeded random stream is consumed identically.
image_shape = (num_input_images, 1, 28, 28)
rand_train_data = torch.randn(*image_shape)
rand_train_labels = torch.empty(num_input_images, dtype=torch.long).random_(0, 10)
rand_test_data = torch.randn(*image_shape)
rand_test_labels = torch.empty(num_input_images, dtype=torch.long).random_(0, 10)

# Step size for the optimizer (the speed of convergence)
learning_rate = 1e-3

# Network instance
net = Net(num_input_images, num_layers)

# Loss function and optimizer bound to this network's parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Use the first GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device("cpu")

In [120]:
def train_model(epic):
    """Train a fresh ``Net`` on ``train_loader`` and return it.

    Relies on the notebook globals ``Net``, ``num_input_images``,
    ``num_layers``, ``device`` and ``train_loader``.

    Every 10th epoch a progress line is printed: the epoch index, the loss of
    that epoch's last batch, and the accuracy accumulated over the epoch's
    batches (each batch is scored with the weights in use at that step).

    Args:
        epic: Number of epochs, i.e. full passes over ``train_loader``.

    Returns:
        The trained model, left on ``device``.
    """
    model = Net(num_input_images, num_layers)
    model.to(device)

    # Cross-entropy, since the task is set up as a multi-class classification
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(epic):
        model.train()
        # Only every 10th epoch is scored and reported
        report = epoch % 10 == 0
        correct = 0
        total = 0

        for d, t in train_loader:
            # Batches must live on the same device as the model; without this
            # the forward pass fails whenever `device` is a GPU.
            d = d.to(device)
            labels = t[:, 0].to(device)

            # Zero out the optimizer's gradient buffer
            optimizer.zero_grad()
            # Make a prediction based on the model
            outputs = model(d)
            # Compute the loss
            loss = criterion(outputs, labels)
            # Backpropagate to get the derivative of the loss w.r.t. the parameters
            loss.backward()
            # Use the derivative information to update the parameters
            optimizer.step()

            if report:
                _, predicted = torch.max(outputs.detach(), 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        if report:
            print(epoch,loss.item(), 'Accuracy = ', correct/total*100)

    return model
    

In [125]:
net = train_model(100) #train the model for a hundred epochs

# Evaluation only reads the model: switch it to eval mode and find out which
# device its parameters live on so the batches can be sent there.
net.eval()
eval_device = next(net.parameters()).device

correct = 0
total = 0

# No gradients are needed for scoring, and no optimizer is involved (the
# previous `optimizer.zero_grad()` call acted on an optimizer bound to a
# different, untrained network).
with torch.no_grad():
    for d, t in test_loader:
        labels = t[:, 0].to(eval_device)
        # Make a prediction based on the model
        outputs = net(d.to(eval_device))
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print('Test Accuracy = ', correct/total*100)

0 1.6370714902877808 Accuracy =  31.6
10 0.3135767877101898 Accuracy =  88.66666666666667
20 0.2120419293642044 Accuracy =  92.43333333333334
30 0.17267699539661407 Accuracy =  95.66666666666667
40 0.06949880719184875 Accuracy =  97.26666666666667
50 0.11266805976629257 Accuracy =  98.76666666666667
60 0.06865020096302032 Accuracy =  98.93333333333332
70 0.017717797309160233 Accuracy =  99.83333333333333
80 0.015102163888514042 Accuracy =  100.0
90 0.005821489728987217 Accuracy =  100.0
Test Accuracy =  80.60000000000001


**Questions**

1. What is the vanishing gradient problem, and what is its primary cause?

2. What are 4 limitations to optimizing a deep convolutional neural network?

3. In terms of how a given block of a network is "fitted", what is the key difference between using skip connections and traditional blocks?

4. In the context of model hyper-parameters, what additional parameters are added in the ResNet implementation?

5. How do skip connections resolve the "vanishing gradient" problem?

6. Give an appropriate analogy for how kernels are used to extract features from images (e.g. sanding wood).

7. Max's questions: Was this a good paper when it was released? Is it a good paper now? What has changed between its initial release and now? What other methods are there for solving the vanishing gradient problem?