# Fully connected vs Convolutional neural networks for MNIST classification
### Task 1
In this section I create a simple neural network class, SimpleFCNN. This class has two linear layers that map the flattened 28 x 28 input (784 values) through 784 -> 64 -> 10 units. ReLU is applied to the output of the first layer.

I then retrieve and organise the MNIST data into separate sets, define a function that returns the accuracy and average loss on the training and test sets, and then train an instance of the SimpleFCNN class. The accuracy and average loss of this instance after each epoch are then plotted. The instance is trained on a batched training set for 100 epochs. The resulting graphs showcase how the neural network overfits the training data.

In [None]:
# imports
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms


# note: mnist must be in grayscale and (batch_size, 1, 28, 28)
# use torchvision.transforms to do this
class SimpleFCNN(nn.Module):
    """Two-layer fully connected classifier mapping 784 -> 64 -> 10.

    The input is reshaped into rows of 784 values, passed through a hidden
    linear layer with ReLU, and then through a linear output layer that
    returns one raw score per digit class.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 64)  # hidden layer
        self.fc2 = nn.Linear(64, 10)  # one output per digit

    def forward(self, x):
        """Return the 10 class scores for every 784-value row of ``x``."""
        flat = x.view(-1, 784)  # flatten, assuming 28x28 inputs
        hidden = F.relu(self.fc1(flat))
        # The output layer has no activation; raw scores are returned.
        return self.fc2(hidden)

    
# Cross-entropy criterion shared by the training and evaluation code below.
loss_function = nn.CrossEntropyLoss()

# retrieve and organise data
# Each image is converted to a tensor and then normalised with mean 0, std 1.
to_tensor = transforms.ToTensor()
normalise = transforms.Normalize(mean=[0.0], std=[1.0,])
transform_list = transforms.Compose([to_tensor, normalise])

# Training split first, then the test split; both are downloaded into ./data.
mnist_trainset, mnist_testset = (
    datasets.MNIST(root='./data', train=is_train, download=True, transform=transform_list)
    for is_train in (True, False)
)

# The first 4000 training samples form the small training set.
mnist_trainset_small = [mnist_trainset[idx] for idx in range(4000)]


def _evaluate_loader(loader, network, criterion):
    """Return ``(accuracy_percent, mean_loss_per_sample)`` of ``network`` on ``loader``."""
    n_correct = 0
    n_samples = 0
    loss_sum = 0.0
    with torch.no_grad():  # evaluation only, no gradients are needed
        for images, labels in loader:
            outputs = network(images)
            batch_size = labels.size(0)
            # The criterion is assumed to return the batch *mean* (the default
            # for nn.CrossEntropyLoss). Weighting by the batch size turns it
            # back into a sum, so the division below is a true per-sample
            # mean even when the last batch is smaller than the others.
            loss_sum += criterion(outputs, labels).item() * batch_size
            predicted = outputs.argmax(dim=1)
            n_correct += (predicted == labels).sum().item()
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("cannot evaluate on an empty data loader")
    return 100 * n_correct / n_samples, loss_sum / n_samples


def getAccuracyAndMeanLoss(trainloader, testloader, network, criterion=None):
    """Evaluate ``network`` on the training and test loaders.

    Args:
        trainloader: iterable yielding ``(images, labels)`` training batches.
        testloader: iterable yielding ``(images, labels)`` test batches.
        network: callable mapping a batch of images to per-class scores.
        criterion: loss callable returning the mean loss of a batch. When
            omitted, the module-level ``loss_function`` is used.

    Returns:
        ``(train_accuracy, test_accuracy, train_loss, test_loss)``, where the
        accuracies are percentages and the losses are means per sample.

    Raises:
        ValueError: if either loader yields no samples.
    """
    if criterion is None:
        criterion = loss_function

    train_accuracy, train_loss = _evaluate_loader(trainloader, network, criterion)
    test_accuracy, test_loss = _evaluate_loader(testloader, network, criterion)
    return train_accuracy, test_accuracy, train_loss, test_loss

In [None]:
# Batched loaders over the small training set and the MNIST test set.
trainloader = torch.utils.data.DataLoader(mnist_trainset_small, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(mnist_testset, batch_size=32, shuffle=True)

nn1 = SimpleFCNN()
optimizer = torch.optim.Adam(nn1.parameters(), lr=0.0001)

# One entry is appended to each history list after every epoch.
test_accuracy_over_time, train_accuracy_over_time = [], []
test_loss_over_time, train_loss_over_time = [], []

for epoch in range(100):
    current_loss = 0.0
    n_mini_batches = 0

    for images, labels in trainloader:
        optimizer.zero_grad()

        loss = loss_function(nn1(images), labels)
        loss.backward()
        optimizer.step()

        current_loss += loss.item()
        n_mini_batches += 1

    # Running training loss, averaged over this epoch's mini-batches.
    print('Epoch %d loss: %.3f' % (epoch + 1, current_loss / n_mini_batches))

    epoch_metrics = getAccuracyAndMeanLoss(trainloader, testloader, nn1)
    train_accuracy, test_accuracy, train_loss, test_loss = epoch_metrics

    test_accuracy_over_time.append(test_accuracy)
    train_accuracy_over_time.append(train_accuracy)
    test_loss_over_time.append(test_loss)
    train_loss_over_time.append(train_loss)
    

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
accuracy_ax, loss_ax = axs

# Left panel: accuracy recorded after each epoch.
accuracy_ax.plot(test_accuracy_over_time, label='Test Accuracy')
accuracy_ax.plot(train_accuracy_over_time, label='Train Accuracy')
accuracy_ax.set(
    xlabel='Epoch',
    ylabel='Accuracy of estimations',
    title='Network accuracy against training epoch',
)
accuracy_ax.legend()

# Right panel: average loss recorded after each epoch.
loss_ax.plot(test_loss_over_time, label='Test Loss')
loss_ax.plot(train_loss_over_time, label='Train Loss')
loss_ax.set(
    xlabel='Epoch',
    ylabel='Average Loss on estimations',
    title='Network loss against training epoch',
)
loss_ax.legend()

plt.show()

#### Graph commentary
Both graphs clearly indicate that the neural network has overfitted to the training set. The test set plateaus at approximately 90% accuracy and an average loss of 0.01, whereas the training set approaches 100% accuracy and an average loss of 0.00.

#### Task 2 - Discussion
For this task, I have chosen to test different hyperparameter values for L2 regularisation. I use np.linspace to generate 25 values between 0.00000001 and 0.01 and use these (with the Adam optimiser and a learning rate of 0.0001, the same as in the previous work) to train 25 instances of the SimpleFCNN. The final average loss and accuracy for these networks are then plotted and compared. The best performing hyperparameter is then chosen based on these results and used in later work.

In [None]:
# Train one fresh SimpleFCNN per L2 (weight decay) value and record the
# accuracy and average loss it reaches after the final epoch.
interval_number = 25
weight_decay_values = np.linspace(0.00000001, 0.01, num=interval_number)

trainloader = torch.utils.data.DataLoader(mnist_trainset_small, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(mnist_testset, batch_size=32, shuffle=True)

# Final metrics, one entry per value, in the order of weight_decay_values.
test_accuracy_values = []
train_accuracy_values = []
test_loss_values = []
train_loss_values = []

for net_count, weight_decay in enumerate(weight_decay_values, start=1):
    print("Training neural network %d / %d" %(net_count, interval_number))

    nn1 = SimpleFCNN()
    optimizer = torch.optim.Adam(nn1.parameters(), lr=0.0001, weight_decay=weight_decay)

    for epoch in range(100):
        # No running loss is tracked here: nothing reports it in this cell,
        # only the metrics measured after the last epoch are kept.
        for images, labels in trainloader:
            optimizer.zero_grad()

            loss = loss_function(nn1(images), labels)
            loss.backward()
            optimizer.step()

    train_accuracy, test_accuracy, train_loss, test_loss = getAccuracyAndMeanLoss(trainloader, testloader, nn1)
    train_accuracy_values.append(train_accuracy)
    test_accuracy_values.append(test_accuracy)
    train_loss_values.append(train_loss)
    test_loss_values.append(test_loss)

print("Training complete.")

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
accuracy_ax, loss_ax = axs

# Both panels share the same x axis: one tick per weight-decay value,
# labelled in scientific notation.
tick_positions = range(len(weight_decay_values))
tick_labels = ['{:.2e}'.format(value) for value in weight_decay_values]

accuracy_ax.plot(test_accuracy_values, label='Test Accuracy')
accuracy_ax.plot(train_accuracy_values, label='Train Accuracy')
accuracy_ax.set_ylabel('Accuracy of estimations')
accuracy_ax.set_title('Network accuracy against L2 regularization value')

loss_ax.plot(test_loss_values, label='Test Loss')
loss_ax.plot(train_loss_values, label='Train Loss')
loss_ax.set_ylabel('Average Loss on estimations')
loss_ax.set_title('Network loss against L2 regularization value')

for panel in axs:
    panel.set_xticks(tick_positions, tick_labels, rotation=90)
    panel.set_xlabel('Weight Decay (L2)')
    panel.legend()

plt.show()

In [None]:
# Locate the first entry with the highest test accuracy once and reuse it.
best_accuracy = max(test_accuracy_values)
best_index = test_accuracy_values.index(best_accuracy)
best_weight_decay = weight_decay_values[best_index]

print("From these results, the regularization value with the best test data accuracy is {:.2e}"
      .format(best_weight_decay))
print("Accuracy value is:", best_accuracy)
print("Loss value is:", test_loss_values[best_index])

# Weight decay reused for the SimpleFCNN in the training-size experiment.
first_NN_weight_deacy = best_weight_decay

#### Result discussion
The optimum L2 regularization hyperparameter value is the value that has the highest accuracy and lowest loss. However, the value 1.00e-08 provides the best accuracy and 1.67e-03 provides the best average loss. In this instance, I have chosen to favour accuracy: accuracy measures the proportion of predictions that are correct, whereas the loss also reflects how far off the incorrect predictions are.

#### Task 3 - Discussion
Using the L2 regularisation parameter chosen in Task 2, I have trained 8 instances of the SimpleFCNN on different training set sizes: 500, 1000, 2000, 4000, 8000, 16000, 32000 and 60000. The accuracy and average loss (on the test set) have been plotted against the training set size. These graphs use log-log axes to ease comparison (the ideal improvement rate is proportional to 1/sqrt(N), which appears as a straight line with a slope of -1/2 on a log-log graph).

In [None]:
# Train one fresh SimpleFCNN per training-set size and record the test
# accuracy and average test loss it reaches after the final epoch.
training_set_sizes = [500, 1000, 2000, 4000, 8000, 16000, 32000, 60000]
# use the same testloader as instantiated beforehand.

test_NN_accuracy_values = []
test_NN_loss_values = []
weight_decay = first_NN_weight_deacy

for net_count, set_size in enumerate(training_set_sizes, start=1):
    # define trainset: the first set_size samples of the MNIST training set
    mnist_trainsubset = [mnist_trainset[idx] for idx in range(set_size)]
    trainloader = torch.utils.data.DataLoader(mnist_trainsubset, batch_size=32, shuffle=True)

    print("Training neural network %d / %d" %(net_count, len(training_set_sizes)))

    nn1 = SimpleFCNN()
    optimizer = torch.optim.Adam(nn1.parameters(), lr=0.0001, weight_decay=weight_decay)

    for epoch in range(100):
        # No running loss is tracked here: nothing reports it in this cell,
        # only the metrics measured after the last epoch are kept.
        for images, labels in trainloader:
            optimizer.zero_grad()

            loss = loss_function(nn1(images), labels)
            loss.backward()
            optimizer.step()

    # Only the test-set metrics are kept for this experiment.
    _, test_accuracy, _, test_loss = getAccuracyAndMeanLoss(trainloader, testloader, nn1)
    test_NN_accuracy_values.append(test_accuracy)
    test_NN_loss_values.append(test_loss)

print("Training complete.")

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
accuracy_ax, loss_ax = axs

# Left panel: test accuracy against training-set size on log-log axes.
accuracy_ax.loglog(training_set_sizes, test_NN_accuracy_values, label='Test Accuracy')
accuracy_ax.set(
    xlabel='Training size',
    ylabel='Accuracy of estimations',
    title='Network accuracy against Training size',
)
accuracy_ax.legend()

# Right panel: test loss against training-set size on log-log axes.
loss_ax.loglog(training_set_sizes, test_NN_loss_values, label='Test Loss')

def approx_func(x):
    """Evaluate the reference curve y = 1/sqrt(x^1.3) + 0.002 at ``x``."""
    decaying_term = 1 / np.sqrt(x ** 1.3)
    return decaying_term + 0.002

# Reference curve sampled over the same range as the training-set sizes.
x_func = np.linspace(500, 60000, 100)
# Draw on the loss panel explicitly rather than through pyplot's implicit
# "current axes", so the curve cannot land on another panel if the figure
# layout changes.
axs[1].loglog(x_func, approx_func(x_func), linestyle='--', color='red', label='y = 1/(sqrt(x^1.3)) + 0.002')


axs[1].set_xlabel('Training size')
axs[1].set_ylabel('Average Loss on estimations')

axs[1].set_title('Network loss against Training size')
axs[1].legend()

plt.show()

print("After some experimentation, the loss curve appears to approximately follows y = 1/(sqrt(x^1.3)) + 0.002")

#### Result discussion

The accuracy and average loss plots are close to linear on these log-log graphs. The loss plot is compared to a modified version of the 1/sqrt(x) curve.

#### Task 4 - Discussion
In this task I create a new neural network class, CNN. This neural network has a sequence of convolutional layers followed by fully connected layers. 25 instances of this network class are trained with different L2 regularization hyperparameters. The resulting average loss and accuracy results are plotted and the hyperparameter value for regularization is chosen.

Following this, separate instances of the neural network class are trained on increasing training set sizes (using the previously chosen L2 parameter) and the resulting accuracy and average loss on the test set are plotted on a log-log graph. The results from the SimpleFCNN experiment are included, as well as a 1/sqrt(x) curve for comparison. The graph shows that the CNN plot has a gradient much closer to that of 1/sqrt(x).

In [None]:
# convolutional neural network
class CNN(nn.Module):
    """Convolutional classifier: two conv/ReLU/max-pool stages, then two linear layers.

    The convolutional stack maps 1 input channel to 32 and then 64 channels.
    Its output is flattened per sample and passed through a 128-unit hidden
    layer to 10 output scores.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order in which they are applied.
        conv_stack = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.conv_layers = nn.Sequential(*conv_stack)

        dense_stack = [
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.fc_layers = nn.Sequential(*dense_stack)

    def forward(self, x):
        """Return the 10 class scores for each sample in the batch ``x``."""
        features = self.conv_layers(x)
        # Keep the batch dimension and flatten everything after it.
        flat = features.flatten(start_dim=1)
        return self.fc_layers(flat)

In [None]:
import sys

# Train one fresh CNN per L2 (weight decay) value and record the accuracy
# and average loss it reaches after the final epoch.
interval_number = 25
weight_decay_values = np.linspace(0.00000001, 0.01, num=interval_number)

trainloader = torch.utils.data.DataLoader(mnist_trainset_small, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(mnist_testset, batch_size=32, shuffle=True)

# Final metrics, one entry per value, in the order of weight_decay_values.
test_accuracy_values = []
train_accuracy_values = []
test_loss_values = []
train_loss_values = []

n_epochs = 100

for net_count, weight_decay in enumerate(weight_decay_values, start=1):
    print("\nTraining neural network %d / %d" %(net_count, interval_number))

    nn1 = CNN()
    optimizer = torch.optim.Adam(nn1.parameters(), lr=0.0001, weight_decay=weight_decay)

    for epoch in range(n_epochs):
        # No running loss is tracked here: nothing reports it in this cell,
        # only the metrics measured after the last epoch are kept.
        for images, labels in trainloader:
            optimizer.zero_grad()

            loss = loss_function(nn1(images), labels)
            loss.backward()
            optimizer.step()

        # Overwrite a single console line with the percentage of epochs
        # completed, computed from n_epochs so it stays a percentage if the
        # epoch count changes.
        sys.stdout.write('\rProgress: {:.2f}%'.format(100 * (epoch + 1) / n_epochs))
        sys.stdout.flush()

    train_accuracy, test_accuracy, train_loss, test_loss = getAccuracyAndMeanLoss(trainloader, testloader, nn1)
    train_accuracy_values.append(train_accuracy)
    test_accuracy_values.append(test_accuracy)
    train_loss_values.append(train_loss)
    test_loss_values.append(test_loss)

print("\nTraining complete.")

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
accuracy_ax, loss_ax = axs

# Both panels share the same x axis: one tick per weight-decay value,
# labelled in scientific notation.
tick_positions = range(len(weight_decay_values))
tick_labels = ['{:.2e}'.format(value) for value in weight_decay_values]

accuracy_ax.plot(test_accuracy_values, label='Test Accuracy')
accuracy_ax.plot(train_accuracy_values, label='Train Accuracy')
accuracy_ax.set_ylabel('Accuracy of estimations')
accuracy_ax.set_title('Network accuracy against L2 regularization value')

loss_ax.plot(test_loss_values, label='Test Loss')
loss_ax.plot(train_loss_values, label='Train Loss')
loss_ax.set_ylabel('Average Loss on estimations')
loss_ax.set_title('Network loss against L2 regularization value')

for panel in axs:
    panel.set_xticks(tick_positions, tick_labels, rotation=90)
    panel.set_xlabel('Weight Decay (L2)')
    panel.legend()

plt.show()

In [None]:
# Locate the first entry with the highest test accuracy once and reuse it.
best_accuracy = max(test_accuracy_values)
best_index = test_accuracy_values.index(best_accuracy)
best_weight_decay = weight_decay_values[best_index]

print("From these results, the regularization value with the best test data accuracy is {:.2e}"
      .format(best_weight_decay))
print("Accuracy value is:", best_accuracy)
print("Loss value is:", test_loss_values[best_index])

# Weight decay reused for the CNN in the training-size experiment.
second_NN_weight_decay = best_weight_decay

In [None]:
# Train one fresh CNN per training-set size and record the test accuracy
# and average test loss it reaches after the final epoch.
training_set_sizes = [500, 1000, 2000, 4000, 8000, 16000, 32000, 60000]
# use the same testloader as instantiated beforehand.

test_CNN_accuracy_values = []
test_CNN_loss_values = []
weight_decay = second_NN_weight_decay

n_epochs = 100

for net_count, set_size in enumerate(training_set_sizes, start=1):
    # define trainset: the first set_size samples of the MNIST training set
    mnist_trainsubset = [mnist_trainset[idx] for idx in range(set_size)]
    trainloader = torch.utils.data.DataLoader(mnist_trainsubset, batch_size=32, shuffle=True)

    print("\nTraining neural network %d / %d" %(net_count, len(training_set_sizes)))

    nn1 = CNN()
    optimizer = torch.optim.Adam(nn1.parameters(), lr=0.0001, weight_decay=weight_decay)

    for epoch in range(n_epochs):
        # No running loss is tracked here: nothing reports it in this cell,
        # only the metrics measured after the last epoch are kept.
        for images, labels in trainloader:
            optimizer.zero_grad()

            loss = loss_function(nn1(images), labels)
            loss.backward()
            optimizer.step()

        # Overwrite a single console line with the percentage of epochs
        # completed, computed from n_epochs so it stays a percentage if the
        # epoch count changes.
        sys.stdout.write('\rProgress: {:.2f}%'.format(100 * (epoch + 1) / n_epochs))
        sys.stdout.flush()

    # Only the test-set metrics are kept for this experiment.
    _, test_accuracy, _, test_loss = getAccuracyAndMeanLoss(trainloader, testloader, nn1)
    test_CNN_accuracy_values.append(test_accuracy)
    test_CNN_loss_values.append(test_loss)

print("\nTraining complete.")

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
accuracy_ax, loss_ax = axs

# Left panel: test accuracy of both networks against training-set size.
accuracy_ax.loglog(training_set_sizes, test_CNN_accuracy_values, label='CNN Test Accuracy')
accuracy_ax.loglog(training_set_sizes, test_NN_accuracy_values, label='First NN Test Accuracy')
accuracy_ax.set(
    xlabel='Training size',
    ylabel='Accuracy of estimations',
    title='Network accuracy against Training size',
)
accuracy_ax.legend()

# Right panel: test loss of both networks against training-set size.
loss_ax.loglog(training_set_sizes, test_CNN_loss_values, label='CNN Test Loss')
loss_ax.loglog(training_set_sizes, test_NN_loss_values, label='First NN Test Loss')

def approx_func_2(x):
    """Evaluate the reference curve y = 1/sqrt(x) at ``x``."""
    root = np.sqrt(x ** 1)  # exponent of 1 kept explicit, as in 1/sqrt(x^1)
    return 1 / root

# Reference curve sampled over the same range as the training-set sizes.
x_func = np.linspace(500, 60000, 100)
# Draw on the loss panel explicitly rather than through pyplot's implicit
# "current axes", so the curve cannot land on another panel if the figure
# layout changes.
axs[1].loglog(x_func, approx_func_2(x_func), linestyle='--', color='red', label='y = 1/(sqrt(x))')


axs[1].set_xlabel('Training size')
axs[1].set_ylabel('Average Loss on estimations')

axs[1].set_title('Network loss against Training size')
axs[1].legend()

plt.show()

#### Result discussion
The CNN and SimpleFCNN results are plotted on both graphs. In both, the accuracy and average loss are better for the CNN. Additionally, the CNN plot for average loss has a smoother gradient than that of the SimpleFCNN, and is a better fit to 1/sqrt(x).