In [None]:
# Filename: 2.0-LeNet-5-MNIST.ipynb
# Author: Eyosyas Dagnachew
# Description: Train LeNet-5 model on MNIST dataset. Remember, the purpose of this implementation is not to create the most optimal network to classify the MNIST dataset.
#              Instead, it is to learn how to read a machine learning research paper and implement it (and in the process have a better understanding of the paper and its methodology).

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [None]:
# Device configuration: run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Parameters/Hyperparameters
NUM_CLASSES: int = 10    # number of output classes passed to the model constructor

In [None]:
# MNIST dataset
# Both splits share the same root directory and the same preprocessing (PIL image -> tensor).
DATA_ROOT = "../../data"
to_tensor = transforms.Compose([transforms.ToTensor()])

# download=True only fetches the files when they are missing from DATA_ROOT, so this cell
# also works on a fresh checkout instead of raising because the dataset cannot be found.
train_dataset = torchvision.datasets.MNIST(root=DATA_ROOT,
                                           train=True,
                                           transform=to_tensor,
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root=DATA_ROOT,
                                          train=False,
                                          transform=to_tensor,
                                          download=True)
train_dataset, test_dataset

In [None]:
# Data loaders: same batch size for both splits; only the training split is shuffled
BATCH_SIZE = 100

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# LeNet-5 model
class LeNet5(nn.Module):
    '''
    LeNet-5 convolutional network (LeCun et al., 1998), adapted to 28x28 MNIST images.

    Input: 32x32 pixel image in the paper, but 28x28 in the dataset.
           The paper mentions that "[32x32] is significantly larger than the largest character in the
           database (at most 20x20 pixels centered in a 28x28 field)." This might explain why the
           images in this dataset have been cropped to 28x28.

           Using 2D convolution because the input is technically a 3D (32x32x1) image.

    Output: one raw score (logit) per class, i.e. a (batch_size, num_classes) tensor.

    Args:
        num_classes: number of units in the output layer (10 for the MNIST digits).
    '''

    def __init__(self, num_classes=10):
        super(LeNet5, self).__init__()

        # Layer C1: conv layer with 6 28x28 feature maps with 5x5 kernels
        #           parameters and connections: 156 trainable parameters, 122,304 connections (28*28*156)
        #           notes: padding=2 because the original images were 32x32 but this dataset contains 28x28 images (2 pixels
        #                  removed from all sides), so we make up for the removed pixels by adding a padding of 2 on all sides
        #           in: 32x32x1 (28x28x1 + padding), out: 28x28x6
        self.c1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2)

        # Layer S2: sub-sampling (pooling) layer with 6 14x14 feature maps with 2x2 kernels; results are passed to a sigmoidal function
        #           parameters and connections: 12 trainable parameters, 5,880 connections (in the paper);
        #                                       0 trainable parameters, 5,880 connections (in this implementation, read notes below)
        #           notes: "The four inputs to a unit in S2 are added, then multiplied by a trainable coefficient,
        #                  and added to a trainable bias." This is where the difference between subsampling and pooling comes into play.
        #                  Subsampling, as described in the paper, is average pooling with a learnable weight and bias per feature map.
        #                  In the Lua implementation of Torch, there is nn.SpatialSubSampling() but there is no such implementation
        #                  for PyTorch, so plain average pooling, i.e. AvgPool2d(), is used instead.
        #           in: 28x28x6, out: 14x14x6
        self.s2 = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2, stride=2),    # sub-sampling
                    nn.Sigmoid()                              # sigmoidal function
        )

        # Layer C3: conv layer with 16 10x10 feature maps with 5x5 kernels
        #           parameters and connections: 1,516 trainable parameters and 151,600 connections (in the paper);
        #                                       2,416 trainable parameters and 241,600 connections (in this implementation, read notes below)
        #           notes: "Each unit in each feature map is connected to several 5x5 neighborhoods at identical locations in a subset
        #                  of S2's feature maps. Table I shows the set of S2 feature maps combined by each C3 feature maps." Instead of
        #                  connecting each output channel with each input channel, they connect certain output channels with certain input
        #                  channels (detailed in Table 1). This is done for two reasons: (1) To reduce the number of connections,
        #                  and (2) To force a break of symmetry in the network so that "different feature maps are forced to extract
        #                  different (hopefully complementary) features because they get different sets of input." That sparse
        #                  connection table is not reproduced here; the default full connectivity (groups=1) is used.
        #           in: 14x14x6, out: 10x10x16
        self.c3 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0, groups=1)

        # Layer S4: sub-sampling (pooling) layer with 16 5x5 feature maps with 2x2 kernels
        #           parameters and connections: 32 trainable parameters, 2,000 connections (in the paper)
        #                                       0 trainable parameters, 2,000 connections (in this implementation, read Layer S2 notes)
        #           notes: Connections between input and output are made in a similar way as C1 and S2, so the pooled values go
        #                  through the same sigmoidal function as in S2. Without it C3, S4 and C5 would collapse into one linear map.
        #           in: 10x10x16, out: 5x5x16
        self.s4 = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2, stride=2),    # sub-sampling
                    nn.Sigmoid()                              # sigmoidal function
        )

        # Layer C5: conv layer with 120 1x1 feature maps with 5x5 kernels
        #           parameters and connections: 48,120 trainable connections (in the paper)
        #                                       48,120 trainable connections (in this implementation)
        #           notes: Since the size of S4 is the same as the size of the kernels of C5 (5x5), the size of C5's feature maps is 1x1.
        #                  This makes the connection between S4 and C5 a full connection. However, C5 is labelled as a conv layer instead
        #                  of a fully-connected layer because "if LeNet-5 were made bigger and everything else kept constant, the feature
        #                  map dimensions would be larger than 1x1."
        #           in: 5x5x16, out: 1x1x120
        self.c5 = nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1, padding=0)

        # Layer F6: fully-connected layer with 84 units
        #           parameters and connections: 10,164 trainable parameters (120*84 weights + 84 biases)
        #           in: 120, out: 84
        self.f6 = nn.Linear(in_features=120, out_features=84)

        # Output layer: one unit per class
        #           notes: The paper uses Euclidean RBF units here. This implementation uses a plain linear layer that returns
        #                  raw scores (logits), which is the input expected by losses such as nn.CrossEntropyLoss.
        #           in: 84, out: num_classes
        self.output = nn.Linear(in_features=84, out_features=num_classes)


    def forward(self, x):
        '''
        Run a batch of images through the network.

        Args:
            x: float tensor of shape (batch_size, 1, 28, 28).

        Returns:
            Tensor of shape (batch_size, num_classes) holding unnormalized class scores.
        '''
        x = self.s2(self.c1(x))                 # (batch, 6, 14, 14)
        x = self.s4(self.c3(x))                 # (batch, 16, 5, 5)
        # The sigmoid keeps the squashing function consistent with S2/S4.
        # NOTE(review): the paper's squashing function is a scaled tanh - swap it in here if closer fidelity is wanted.
        x = torch.sigmoid(self.c5(x))           # (batch, 120, 1, 1)
        x = torch.flatten(x, start_dim=1)       # (batch, 120): drop the 1x1 spatial dims for the linear layers
        x = torch.sigmoid(self.f6(x))           # (batch, 84)
        return self.output(x)                   # (batch, num_classes)

In [None]:
# Build the LeNet-5 network and move it to the selected device
model = LeNet5(num_classes=NUM_CLASSES)
model = model.to(device)
print(model)

# Print the element count of every parameter tensor, in registration order.
# The first value is C1's weight tensor: 150 = 6 filters x (5x5 = 25) shared weights per filter.
for param_count in (p.numel() for p in model.parameters()):
    print(param_count)

In [None]:
# Initialize loss function and optimizer

In [None]:
# Train the model

In [None]:
# Test the model

In [None]:
# Save the model checkpoint