In [None]:
# Filename: 2.0-LeNet-5-MNIST.ipynb
# Author: Eyosyas Dagnachew
# Description: Train LeNet-5 model on MNIST dataset. 

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [None]:
# Device configuration
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Parameters/Hyperparameters
# Number of output classes; passed to the LeNet5 constructor as `num_classes`
# (MNIST labels are the ten digits 0-9).
NUM_CLASSES: int = 10

In [None]:
# MNIST dataset
# Both splits share the same storage root and the same preprocessing, so each is
# defined once. ToTensor converts every image to a float tensor; further
# transforms (e.g. normalization) can be appended to the Compose list.
MNIST_ROOT = "../../data"
mnist_transform = transforms.Compose([transforms.ToTensor()])

# download=True only fetches the files when they are missing under MNIST_ROOT,
# so the cell works on a fresh checkout and is a no-op once the data is present.
train_dataset = torchvision.datasets.MNIST(root=MNIST_ROOT,
                                           train=True,
                                           transform=mnist_transform,
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root=MNIST_ROOT,
                                          train=False,
                                          transform=mnist_transform,
                                          download=True)
train_dataset, test_dataset

In [None]:
# Data loader
# Mini-batches of 100 images for both splits. Only the training split is
# reshuffled each epoch; the test split keeps its original order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False)

In [None]:
# LeNet-5 model
class LeNet5(nn.Module):
    '''
    LeNet-5 style convolutional network for MNIST digit classification.

    Input: 32x32 pixel image in the paper, but 28x28 in the dataset.
           The paper mentions that "[32x32] is significantly larger than the largest character in the
           database (at most 20x20 pixels centered in a 28x28 field)". This might explain why the
           images in this dataset have been cropped to 28x28.

           Using 2D convolution because the input is technically a 3D (32x32x1) image.

           forward() expects a float tensor of shape (batch, 1, 28, 28); C1 pads it back to an
           effective 32x32 field. Other spatial sizes do not reduce to the 1x1 map that F6 requires.

    Output: tensor of shape (batch, num_classes) holding raw, unnormalized class scores (logits).

    Args:
        num_classes: number of output units of the final layer (default 10).
    '''

    def __init__(self, num_classes=10):
        super(LeNet5, self).__init__()
        # Layer C1: conv layer with 6 28x28 feature maps with 5x5 kernels
        #           parameters and connections: 156 trainable parameters, 122304 connections
        #           notes: padding=2 because the original images were 32x32 but this dataset contains 28x28 images
        #                  (2 pixels removed from all sides), so we make up for the removed pixels by adding a
        #                  padding of 2 on all sides
        #           in: 28x28x1 (32x32x1 after padding), out: 28x28x6
        self.c1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2)

        # Layer S2: sub-sampling (pooling) layer with 6 14x14 feature maps with 2x2 kernels; results are passed
        #           to a sigmoidal function
        #           parameters and connections: 12 trainable parameters, 5880 connections (in the paper);
        #                                       0 trainable parameters (in this implementation, see notes)
        #           notes: "The four inputs to a unit in S2 are added, then multiplied by a trainable coefficient,
        #                  and added to a trainable bias." Sub-sampling, as described in the paper, is average
        #                  pooling with a learnable coefficient and bias per feature map. Lua Torch had
        #                  nn.SpatialSubSampling() for this; PyTorch has no direct equivalent, so plain average
        #                  pooling (AvgPool2d) is used instead.
        #           in: 28x28x6, out: 14x14x6
        self.s2 = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2, stride=2),    # sub-sampling
                    nn.Sigmoid()                              # sigmoidal function
        )

        # Layer C3: conv layer with 16 10x10 feature maps with 5x5 kernels
        #           parameters: 16*6*5*5 + 16 = 2416 trainable parameters in this implementation
        #           notes: every C3 map is connected to all 6 S2 maps here. The paper uses a sparse
        #                  connection table between S2 and C3 (fewer parameters); nn.Conv2d has no
        #                  built-in way to express that table, so full connectivity is used.
        #           in: 14x14x6, out: 10x10x16
        self.c3 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)

        # Layer S4: sub-sampling layer with 16 5x5 feature maps with 2x2 kernels, built like S2
        #           (average pooling followed by the sigmoidal function, no trainable parameters)
        #           in: 10x10x16, out: 5x5x16
        self.s4 = nn.Sequential(
                    nn.AvgPool2d(kernel_size=2, stride=2),    # sub-sampling
                    nn.Sigmoid()                              # sigmoidal function
        )

        # Layer C5: conv layer with 120 1x1 feature maps with 5x5 kernels. Because S4 maps are 5x5, each
        #           C5 unit sees all of S4, i.e. this acts as a fully connected layer.
        #           parameters: 120*16*5*5 + 120 = 48120 trainable parameters
        #           notes: a non-linearity follows so that C5 and F6 do not collapse into one linear map
        #           in: 5x5x16, out: 1x1x120
        self.c5 = nn.Sequential(
                    nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1),
                    nn.Sigmoid()
        )

        # Layer F6: fully connected layer with 84 units followed by the sigmoidal function
        #           parameters: 120*84 + 84 = 10164 trainable parameters
        #           in: 120, out: 84
        self.f6 = nn.Sequential(
                    nn.Linear(in_features=120, out_features=84),
                    nn.Sigmoid()
        )

        # Output layer: one unit per class, producing raw scores (logits)
        #           parameters: 84*num_classes + num_classes trainable parameters (850 for 10 classes)
        #           notes: the paper uses Euclidean RBF output units; a linear layer is used here so the
        #                  scores can be fed directly to a softmax / cross-entropy loss
        #           in: 84, out: num_classes
        self.output = nn.Linear(in_features=84, out_features=num_classes)

    def forward(self, x):
        '''
        Run a batch of images through the network.

        Args:
            x: float tensor of shape (batch, 1, 28, 28).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized class scores.
        '''
        x = self.c1(x)                # (batch, 6, 28, 28)
        x = self.s2(x)                # (batch, 6, 14, 14)
        x = self.c3(x)                # (batch, 16, 10, 10)
        x = self.s4(x)                # (batch, 16, 5, 5)
        x = self.c5(x)                # (batch, 120, 1, 1)
        x = x.flatten(start_dim=1)    # (batch, 120) -- drop the 1x1 spatial dims for the linear layers
        x = self.f6(x)                # (batch, 84)
        return self.output(x)         # (batch, num_classes)

In [None]:
# Initialize LeNet model
# Build the network first, then move its parameters onto the selected device.
model = LeNet5(num_classes=NUM_CLASSES)
model = model.to(device)
print(model)

# Print the number of parameters
# One line per parameter tensor, giving its element count. For example, C1's
# weight tensor holds 6 filters x 1 input channel x 5x5 kernel = 150 shared
# weights, and its bias tensor holds 6 values (one per filter).
param_sizes = [tensor.numel() for tensor in model.parameters()]
for size in param_sizes:
    print(size)

In [None]:
# Initialize loss function and optimizer

In [None]:
# Train the model

In [None]:
# Test the model

In [None]:
# Save the model checkpoint