We begin with the usual setup that we've seen in the previous PyTorch code.

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
import random

# Load the MNIST train and test splits, downloading them into ./data if needed.
trainset = datasets.MNIST(root='./data', train=True, download=True, transform = transforms.ToTensor())
testset = datasets.MNIST(root='./data', train=False, download=True, transform = transforms.ToTensor())

# Divide the raw image data by 256 and subtract 0.5 (for 0-255 pixel values this gives roughly [-0.5, 0.5)).
# NOTE(review): this reads .data directly, so the ToTensor transform above is presumably not applied here - confirm.
x_train = (trainset.data / 256) - 0.5
y_train = trainset.targets

# Same scaling for the test split, so train and test inputs are on the same scale.
x_test = (testset.data / 256) - 0.5
y_test = testset.targets

def confusion_matrix(model, x, y, num_classes = 10):
  """Evaluate a classifier and tabulate actual-vs-predicted class counts.

  Args:
    model: module whose forward pass returns a (logits, probabilities) pair,
      with probabilities indexed as [sample, class]. It is switched to eval
      mode and left in eval mode afterwards.
    x: batch of inputs passed to the model as-is.
    y: integer class labels, one per row of x (tensor, array or list).
    num_classes: number of classes, i.e. the side length of the matrix.

  Returns:
    (identification_counts, accuracy) where identification_counts[a, p] is the
    number of samples with actual class a that were predicted as class p, and
    accuracy is the fraction of samples on the diagonal (nan for empty input).
  """
  model.eval()

  # Evaluation needs no gradients; skipping the autograd graph saves memory on large inputs.
  with torch.no_grad():
    logits, probabilities = model( x ) # Note the assumption here that model will be returning logits and probabilities, so we need to build our models accordingly
    predicted_classes = torch.argmax( probabilities, dim = 1 )

  actual = torch.as_tensor( y ).detach().cpu().numpy()
  predicted = predicted_classes.cpu().numpy()

  identification_counts = np.zeros( shape = (num_classes, num_classes), dtype = np.int32 )
  # np.add.at accumulates repeated (actual, predicted) pairs, unlike plain fancy-index +=
  np.add.at( identification_counts, (actual, predicted), 1 )

  # The diagonal elements of the confusion matrix are the correct classifications
  total_tried = int( identification_counts.sum() )
  if total_tried == 0:
    return identification_counts, float( "nan" )

  accuracy = np.trace( identification_counts ) / total_tried # Accuracy is total correct / total tried
  return identification_counts, accuracy

def get_batch( x, y, batch_size ):
  """Draw a random mini-batch of matching rows from x and y.

  Args:
    x: indexable collection of inputs with a .shape attribute (first dim = samples).
    y: labels indexable by the same row indices as x.
    batch_size: number of distinct rows to draw.

  Returns:
    (x_batch, y_batch), the selected rows of x and y in the same order.

  Raises:
    ValueError: if batch_size is larger than the number of rows in x.
  """
  n = x.shape[0]
  # Sample directly from range(n): same draws as sampling a list, without building n indices per call
  batch_indices = random.sample( range(n), k = batch_size )

  return x[ batch_indices ], y[ batch_indices ]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 11.5MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 349kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 3.18MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.22MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [None]:
# Cross-entropy loss shared by every model below; the training loops pass it the models' logits (not the softmax probabilities).
loss_function = nn.CrossEntropyLoss()

In [None]:
class MNISTNetwork(nn.Module):
  """Fully connected classifier: 784 inputs -> 170 sigmoid hidden nodes -> 10 classes.

  The forward pass returns a (logits, probabilities) pair, where the
  probabilities are the softmax of the logits across the class dimension.
  """

  def __init__(self):
    super().__init__()

    # The hidden width of 170 is an arbitrary choice for the middle layer
    self.layer_1 = nn.Linear( 784, 170 )
    self.layer_2 = nn.Linear( 170, 10 )

  def forward(self, input_tensor):
    # Keep dim 0 and collapse all remaining dims into one feature vector per sample
    features = torch.flatten( input_tensor, start_dim = 1 )

    activations = torch.sigmoid( self.layer_1( features ) )
    logits = self.layer_2( activations )

    return logits, torch.softmax( logits, dim = 1 )

class CNN_MNISTNetwork(nn.Module):
  """One convolutional layer followed by a linear softmax layer.

  The forward pass returns a (logits, probabilities) pair.
  """

  def __init__(self):
    super().__init__()

    # A 3x3 window at stride 1 fits at 26 positions along each side of a 28 x 28 image,
    # so we get 26 x 26 'local nodes', each computing 20 features from the pixels there.
    # Layer output : 26 x 26 x 20
    self.conv_1 = nn.Conv2d( 1, 20, kernel_size = 3, stride = 1, bias = True )

    # All computed features are flattened and treated as input to a softmax layer.
    self.linear = nn.Linear( 26 * 26 * 20, 10, bias = True )

  def forward(self, input_tensor):
    # [ N x 28 x 28 ] -> [ N x 1 x 28 x 28 ], adding the single input channel
    images = input_tensor.reshape( -1, 1, 28, 28 )

    # Convolution gives an [ N x 20 x 26 x 26 ] block; sigmoid makes the features nonlinear
    feature_maps = torch.sigmoid( self.conv_1( images ) )

    # [ N x (20*26*26) ]
    features = torch.flatten( feature_maps, start_dim = 1 )

    logits = self.linear( features )
    return logits, torch.softmax( logits, dim = 1 )

In [None]:
# One instance of each architecture, trained side by side in the loop below.
model = MNISTNetwork()
cnn_model = CNN_MNISTNetwork()

# Each model gets its own plain SGD optimizer with the same learning rate.
optimizer = optim.SGD( model.parameters(), lr = 0.01 )
cnn_optimizer = optim.SGD( cnn_model.parameters(), lr = 0.01 )

In the training loop below, we perform 10 passes through the data, simultaneously updating the vanilla NN and the CNN. Losses are printed simultaneously for easy comparison.

In [None]:
# Train the vanilla network and the CNN together on identical batches so their losses are directly comparable.
batch_size = 1024
model.train()
cnn_model.train()

for epochs in range(10):
  # Running sums of the per-batch losses for this epoch
  total_loss = 0
  cnn_total_loss = 0

  # 60000 // batch_size batches per epoch; get_batch samples each batch at random,
  # so an "epoch" is that many random batches rather than an exact pass over every sample.
  for batch in range( 60000 // batch_size ):
    x_batch, y_batch = get_batch( x_train, y_train, batch_size )

    # Clear gradients left over from the previous step
    optimizer.zero_grad()
    cnn_optimizer.zero_grad()

    # Both models see exactly the same batch
    logits, probabilities = model( x_batch )
    cnn_logits, cnn_probabilities = cnn_model( x_batch )

    # The loss is computed from the logits; the returned probabilities are not used here
    loss = loss_function( logits, y_batch )
    cnn_loss = loss_function( cnn_logits, y_batch )

    loss.backward()
    cnn_loss.backward()

    optimizer.step()
    cnn_optimizer.step()

    total_loss += loss.item()
    cnn_total_loss += cnn_loss.item()

  # Report the mean of the per-batch losses for this epoch
  print("Average Loss per Data Point (Reg/CNN):", total_loss / ( 60000 // batch_size ), cnn_total_loss / ( 60000 // batch_size ) )

Average Loss per Data Point (Reg/CNN): 2.3038760382553627 4.558437980454544
Average Loss per Data Point (Reg/CNN): 2.2778756043006636 3.812012425784407
Average Loss per Data Point (Reg/CNN): 2.2634251035492996 2.813050613321107
Average Loss per Data Point (Reg/CNN): 2.2477537311356643 2.026924421047342
Average Loss per Data Point (Reg/CNN): 2.2319094559241988 1.5435694969933609
Average Loss per Data Point (Reg/CNN): 2.215120928040866 1.2005054467710956
Average Loss per Data Point (Reg/CNN): 2.1974128238086044 1.009103757554087
Average Loss per Data Point (Reg/CNN): 2.179364829227842 0.8453050379095406
Average Loss per Data Point (Reg/CNN): 2.1574009862439385 0.7575626630207588
Average Loss per Data Point (Reg/CNN): 2.136398747049529 0.7137375463699472


We see from the above that while both losses decrease steadily, the CNN's loss drops far more quickly, and it rapidly outperforms the vanilla NN.

The reason for this is that the features that are relevant in computer vision /are/ local and spatially invariant. The CNN does not have to waste time learning this, as we have hard-coded it to learn those kinds of features.

In [None]:
# Confusion matrix and accuracy of the fully connected network on the test split.
confusion_matrix(model, x_test, y_test)

(array([[ 931,   22,    6,   10,    0,    0,    8,    2,    1,    0],
        [   0, 1131,    1,    2,    0,    0,    1,    0,    0,    0],
        [  50,  380,  558,   18,    3,    0,    5,   15,    3,    0],
        [  27,  245,   23,  703,    0,    0,    0,   11,    1,    0],
        [  25,  335,    6,   17,  279,    0,   32,  287,    1,    0],
        [ 120,  324,    4,  352,    0,    3,   17,   66,    0,    6],
        [  88,  207,    8,    7,    1,    0,  640,    7,    0,    0],
        [   6,  232,    5,    2,    0,    0,    0,  782,    1,    0],
        [  63,  550,   14,  143,    2,    0,    9,   31,  160,    2],
        [  44,  287,    9,   33,   15,    0,    6,  498,    1,  116]],
       dtype=int32),
 0.5303)

In [None]:
# Confusion matrix and accuracy of the one-layer CNN on the test split.
confusion_matrix(cnn_model, x_test, y_test)

(array([[ 938,    0,    1,   11,    3,    0,   20,    1,    6,    0],
        [   0, 1078,    3,   14,    3,    0,    5,    1,   31,    0],
        [  20,   27,  762,   70,   33,    0,   34,   32,   54,    0],
        [   4,    6,   12,  932,    2,    2,    5,   23,   24,    0],
        [   1,   11,    2,    4,  914,    1,   31,    5,    9,    4],
        [  23,   42,    1,  255,   61,  380,   36,   40,   54,    0],
        [  19,    9,   10,    4,   20,    9,  882,    1,    4,    0],
        [   3,   35,   18,    6,   18,    0,    1,  933,   13,    1],
        [  13,   20,    4,   85,   16,    4,   21,   18,  792,    1],
        [  20,   17,    6,   31,  469,    3,    8,  182,   49,  224]],
       dtype=int32),
 0.7835)

Evaluating on the test data, we see a significantly improved accuracy, even with the exact same number of passes through the data.

In [None]:
# named_parameters() yields (name, tensor) pairs; print each parameter's name and shape.
for parameter in cnn_model.conv_1.named_parameters():
  print( parameter[0] )
  print( parameter[1].shape )

weight
torch.Size([20, 1, 3, 3])
bias
torch.Size([20])


The purpose of the above is simply to illustrate what the CNN is doing - it applies weights on a 3x3 window, and it has 20 such sets of weights (one per output feature), giving a total 'weight block' of shape 20x1x3x3 (the 1 is the single input channel). This block of weights is applied at multiple locations, so that the same features are computed at different locations in the input.

Note that the bias is a vector with 20 values - each value representing the bias constant for that specific feature.

In [None]:
class CNN_MNISTNetwork_Big(nn.Module):
  """Two convolutional layers with ReLU, followed by a linear softmax layer.

  The forward pass returns a (logits, probabilities) pair.
  """

  def __init__(self):
    super().__init__()

    # Shape progression:
    #   input                     [ N x 1 x 28 x 28 ]
    #   conv_1 (3x3, stride 1) -> [ N x 20 x 26 x 26 ], 26 places to put a 3x3 window at stride 1
    #   conv_2 (2x2, stride 2) -> [ N x 20 x 13 x 13 ], 13 places to put a 2x2 window at stride 2
    self.conv_1 = nn.Conv2d( 1, 20, kernel_size = 3, stride = 1, bias = True )
    self.conv_2 = nn.Conv2d( 20, 20, kernel_size = 2, stride = 2, bias = True ) # The second convolutional layer

    # Dump all the features into a linear layer to get the logits
    self.linear = nn.Linear( 13 * 13 * 20, 10, bias = True )

  def forward(self, input_tensor):
    images = input_tensor.reshape( -1, 1, 28, 28 )

    # NOTE: Based on results in class, ReLU gave us a significant speed up and performance improvement
    first_maps = torch.relu( self.conv_1( images ) )
    second_maps = torch.relu( self.conv_2( first_maps ) )

    features = torch.flatten( second_maps, start_dim = 1 )

    logits = self.linear( features )
    return logits, torch.softmax( logits, dim = 1 )

In [None]:
# Rebind cnn_model to the two-convolution network defined above.
cnn_model = CNN_MNISTNetwork_Big()

# This model is trained with Adam rather than the plain SGD used for the earlier models.
cnn_optimizer = optim.Adam( cnn_model.parameters(), lr = 0.01 )

In [None]:
# Train only the CNN currently bound to cnn_model, with the same batch size and epoch count as before.
batch_size = 1024
cnn_model.train()

for epochs in range(10):
  # Running sum of the per-batch losses for this epoch
  cnn_total_loss = 0

  # 60000 // batch_size randomly sampled batches per epoch
  for batch in range( 60000 // batch_size ):
    x_batch, y_batch = get_batch( x_train, y_train, batch_size )

    # Clear gradients left over from the previous step
    cnn_optimizer.zero_grad()

    cnn_logits, cnn_probabilities = cnn_model( x_batch )

    # The loss is computed from the logits; the returned probabilities are not used here
    cnn_loss = loss_function( cnn_logits, y_batch )

    cnn_loss.backward()

    cnn_optimizer.step()

    cnn_total_loss += cnn_loss.item()

  # Report the mean of the per-batch losses for this epoch
  print("Average Loss per Data Point (CNN):", cnn_total_loss / ( 60000 // batch_size ) )

Average Loss per Data Point (CNN): 0.4564738065518182
Average Loss per Data Point (CNN): 0.0983628458504019
Average Loss per Data Point (CNN): 0.06006036332712091
Average Loss per Data Point (CNN): 0.050112428111505916
Average Loss per Data Point (CNN): 0.04772377556896415
Average Loss per Data Point (CNN): 0.03795483663421253
Average Loss per Data Point (CNN): 0.032978989761965026
Average Loss per Data Point (CNN): 0.025611180635490293
Average Loss per Data Point (CNN): 0.025306754801743502
Average Loss per Data Point (CNN): 0.019449021827814907


In [None]:
# Confusion matrix and accuracy of the two-convolution CNN on the test split.
confusion_matrix(cnn_model, x_test, y_test)

(array([[ 968,    0,    3,    2,    0,    1,    5,    0,    1,    0],
        [   0, 1129,    1,    1,    1,    0,    2,    0,    1,    0],
        [   0,    3, 1017,    6,    0,    0,    1,    4,    1,    0],
        [   0,    0,    2, 1005,    0,    2,    0,    1,    0,    0],
        [   0,    0,    0,    0,  964,    0,    5,    1,    2,   10],
        [   2,    0,    2,   14,    0,  868,    4,    0,    1,    1],
        [   8,    2,    1,    1,    1,    1,  944,    0,    0,    0],
        [   0,    1,   12,    2,    0,    0,    0, 1005,    3,    5],
        [   5,    0,    4,    4,    1,    0,    2,    2,  953,    3],
        [   3,    1,    1,    7,    5,    4,    0,    4,    1,  983]],
       dtype=int32),
 0.9836)

In [None]:
class CNN_MNISTNetwork(nn.Module):
  """Single 3x3 convolution (20 features) with sigmoid, then a linear layer to 10 logits.

  The forward pass returns a (logits, probabilities) pair.
  """

  def __init__(self):
    super().__init__()
    self.conv_1 = nn.Conv2d( 1, 20, kernel_size = 3, stride = 1, bias = True )
    # The linear layer consumes the flattened 26 x 26 x 20 convolution output
    self.linear = nn.Linear( 26 * 26 * 20, 10, bias = True )

  def forward(self, input_tensor):
    # Add the single channel dimension expected by the convolution
    images = input_tensor.reshape( -1, 1, 28, 28 )
    feature_maps = torch.sigmoid( self.conv_1( images ) )
    logits = self.linear( torch.flatten( feature_maps, start_dim = 1 ) )
    return logits, torch.softmax( logits, dim = 1 )

In [None]:
# Fresh instance of the one-layer CNN, created here to list its parameter shapes.
cnn_model = CNN_MNISTNetwork()

# Print the shape of every parameter tensor in the model
for parameter in cnn_model.parameters():
  print( parameter.shape )

torch.Size([20, 1, 3, 3])
torch.Size([20])
torch.Size([10, 13520])
torch.Size([10])


In [None]:
# Number of inputs to the linear layer: 26 x 26 window positions x 20 features each
26*26*20

13520

In [None]:
# Total parameters = conv weights + conv biases + linear weights + linear biases
20*1*3*3 + 20 + 10*13520 + 10

135410

We see from the above that the total number of parameters in this one-layer CNN is 135,410.

In [None]:
class MNISTNetwork(nn.Module):
  """Two-layer fully connected network: 784 -> 170 (sigmoid) -> 10.

  The forward pass returns a (logits, probabilities) pair.
  """

  def __init__(self):
    super().__init__()

    self.layer_1 = nn.Linear( 784, 170, bias = True )
    self.layer_2 = nn.Linear( 170, 10, bias = True )

  def forward(self, input_tensor):
    # Keep dim 0 and merge the remaining dims into a single feature axis
    flat_input = torch.flatten( input_tensor, start_dim = 1 )

    hidden = torch.sigmoid( self.layer_1( flat_input ) )

    logits = self.layer_2( hidden )
    return logits, torch.softmax( logits, dim = 1 )

In [None]:
# Fresh instance of the fully connected network, created here to list its parameter shapes.
model = MNISTNetwork()

# Print the shape of every parameter tensor in the model
for parameter in model.parameters():
  print( parameter.shape )

torch.Size([170, 784])
torch.Size([170])
torch.Size([10, 170])
torch.Size([10])


In [None]:
# Total parameters = layer_1 weights + layer_1 biases + layer_2 weights + layer_2 biases
170*784 + 170 + 10*170 + 10

135160

We see from this that the total number of parameters in the 1 hidden layer network (with 170 nodes) is 135,160, which is approximately the total number of parameters in the CNN network (hence why I chose 170 as the total number of nodes in the hidden layer). This means that the comparison between the two networks is 'fair' - one doesn't have many more parameters than the other, giving it an advantage in what it can fit. We just see that the convolutional network is better suited to learning the kinds of features that are relevant to this task.