In [1]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchsummary import summary
import numpy as np
import matplotlib.pyplot as plt

### Loading Data

Load train and test partition of the MNIST dataset.

Prepare the training by splitting the training partition into a training and validation.

In [2]:
training_data = datasets.MNIST(root="data", train=True, download=True, transform=ToTensor())
test_data = datasets.MNIST(root="data", train=False, download=True, transform=ToTensor())

In [3]:
training_size = int(len(training_data)*0.8)
test_size = len(training_data)-training_size

In [4]:
# Partition into train and validate
#from sklearn.model_selection import train_test_split

### YOUR CODE START ###

training_set, validation_set = random_split(training_data,[training_size,test_size])

### YOUR CODE END ###

### MLP

Implement an MLP model that can be configured with a an arbitrary number of layers and units per layer.

To that end, implement a suitable sub-class of `torch.nn.Module` with a constructor that accepts the following arguments:
* `units`: list of integers that specify the number of units in the different layers. The first element corresponds to the number of units in the input layer (layer '0'), the last element is the number of output units, i.e. the number of classes the classifier is designed for (10 for an MNIST classifier). Hence, MLP will have $n$ hidden layers if `units` has $n+1$ elements. 
* `activation_class`: Class name of the activation function layer to be used (such as `torch.nn.ReLU`). Instances can be created by `activation_class()` and added to the succession of layers defined by the model. 

Alternatively, you can implement a utility method that creates a `torch.nn.Sequential` model accordingly. 


In [5]:
### YOUR CODE START ###

class MLP(torch.nn.Module):
    
    def __init__(self, units, activation_class = None):
        self.layers = []
        super(MLP,self).__init__()
        self.nlayers = len(units)
        #self.layers.append(torch.nn.Flatten()) 
        self.layernames = []
        self.__setattr__('flatten_1', torch.nn.Flatten())    ##### <==== see here
        self.layernames.append('flatten_1')
        idx = 2
        for i in range(self.nlayers-2):
            
            self.__setattr__(f'linear_{idx}', torch.nn.Linear(units[i], units[i+1]))
            self.layernames.append(f'linear_{idx}')
            
            if activation_class is None:
                self.__setattr__(f'actv_{idx+1}', torch.nn.ReLU())
                self.layernames.append(f'actv_{idx+1}')
            else:
                self.__setattr__(f'actv_{idx+1}', activation_class())
                self.layernames.append(f'actv_{idx+1}')
            idx += 2
        self.__setattr__(f'linear_{idx}', torch.nn.Linear(units[len(units)-2], units[len(units)-1]))
        self.layernames.append(f'linear_{idx}')
        #print([layer for layer in self.layers])
 
        
    def forward(self, x):
        z = x.clone()
        
        #z = self.flatten(x) ##### <==== see here
        for layer in self.layernames:
            
            z = self.__getattr__(layer)(z)
        return z  

### YOUR CODE END ###

In [6]:
model = MLP([28*28,300, 100, 10])

from torchsummary import summary
summary(model, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 300]         235,500
              ReLU-3                  [-1, 300]               0
            Linear-4                  [-1, 100]          30,100
              ReLU-5                  [-1, 100]               0
            Linear-6                   [-1, 10]           1,010
Total params: 266,610
Trainable params: 266,610
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 1.02
Estimated Total Size (MB): 1.03
----------------------------------------------------------------


### Training Loop

For training, implement a method with the arguments:
* `model`: Model to be trained
* `lr`: Learning rate
* `nepochs`: Number of epochs
* `batchsize`: Batch size
* `training_data`: Training set (subclassed of `Dataset`)
* `validation_data`: Validation set (subclassed of `Dataset`)

Remember the training and validation cost and accuracy, respectively for monitoring the progress of the training. <br>
Note that for the training cost and accuracy you can use the per batch quantities averaged over an epoch. 

Furthermore, you can use the SGD optimizer of pytorch (`torch.optim.SGD`) - but without momentum.

In [7]:
def train_eval(model, lr, nepochs, nbatch, training_set, validation_set):
    # finally return the sequence of per epoch values
    cost_hist = []
    cost_hist_valid = []
    acc_hist = []
    acc_hist_valid = []

    cost_ce = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    ### YOUR CODE START ###
    
    # epoch: current epoch
    # cost, cost_valid, acc, acc_valid: cost and acurracy (for training, validation set) per epoch     
    
    training_loader = DataLoader(training_set, batch_size=nbatch)
    validation_loader = DataLoader(validation_set, batch_size=nbatch)
    
    for epoch in range(nepochs):

        training_cost = 0
        correct = 0
        for inputs, targets in training_loader:
            optimizer.zero_grad()
            predictions = model(inputs)
            cost = cost_ce(predictions, targets)
            cost.backward()
            optimizer.step()
            training_cost += cost.item()
            correct += (torch.argmax(predictions, dim=1) == targets).sum()
        
        cost = training_cost / len(training_set)
        acc = correct / len(training_set)

        validation_cost = 0
        correct = 0
        for inputs, targets in validation_loader:
            predictions = model(inputs)
            cost = cost_ce(predictions, targets)
            validation_cost += cost.item()
            correct += (torch.argmax(predictions, dim=1) == targets).sum()

        cost_valid = validation_cost / len(validation_set)
        acc_valid = correct / len(validation_set)
        
        print("Epoch %i: %f, %f, %f, %f"%(epoch, cost, acc, cost_valid, acc_valid))

        ### YOUR CODE END ###
        
        cost_hist.append(cost.data)
        cost_hist_valid.append(cost_valid)
        acc_hist.append(acc)
        acc_hist_valid.append(acc_valid)
    return cost_hist, cost_hist_valid, acc_hist, acc_hist_valid


### Exploration

Now use this functionality to explore different layer configurations: 
* Number of layers
* Number of units per layer
* Suitable learning rate
* Suitable number of epochs.

Use a batchsize of 64.

Make sure that you choose a sufficinetly large number of epochs so that the learning has more or less stabilizes (converged). 

In [8]:
model = MLP([28*28,10, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.005, 20, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 1.526094, 0.387750, 0.025471, 0.580917
Epoch 1: 0.909135, 0.707979, 0.014619, 0.774083
Epoch 2: 0.701220, 0.810479, 0.010538, 0.822167
Epoch 3: 0.623750, 0.844771, 0.008847, 0.844500
Epoch 4: 0.584618, 0.863708, 0.007906, 0.859083
Epoch 5: 0.562121, 0.874875, 0.007297, 0.870333
Epoch 6: 0.547863, 0.883125, 0.006871, 0.878167
Epoch 7: 0.537547, 0.888625, 0.006561, 0.880917
Epoch 8: 0.529595, 0.892042, 0.006325, 0.884250
Epoch 9: 0.522818, 0.894917, 0.006139, 0.886500
Epoch 10: 0.516079, 0.897396, 0.005987, 0.888667
Epoch 11: 0.509194, 0.899562, 0.005860, 0.890833
Epoch 12: 0.502777, 0.901208, 0.005752, 0.892750
Epoch 13: 0.496686, 0.902667, 0.005657, 0.895000
Epoch 14: 0.491055, 0.904479, 0.005573, 0.896833
Epoch 15: 0.485565, 0.905854, 0.005497, 0.897917
Epoch 16: 0.480343, 0.907292, 0.005429, 0.899333
Epoch 17: 0.475610, 0.908708, 0.005365, 0.901083
Epoch 18: 0.471775, 0.909917, 0.005307, 0.902333
Epoch 19: 0.469319, 0.911000, 0.005253, 0.903417
train_accuracy 0.9110000133514

In [9]:
model = MLP([28*28, 300, 100, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 30, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.470590, 0.800750, 0.005415, 0.900083
Epoch 1: 0.394060, 0.916792, 0.004164, 0.923417
Epoch 2: 0.316332, 0.935542, 0.003347, 0.938000
Epoch 3: 0.248632, 0.948625, 0.002797, 0.948250
Epoch 4: 0.204454, 0.957417, 0.002412, 0.955500
Epoch 5: 0.171525, 0.964104, 0.002131, 0.960000
Epoch 6: 0.152941, 0.969333, 0.001927, 0.962500
Epoch 7: 0.138211, 0.973521, 0.001776, 0.965667
Epoch 8: 0.131266, 0.977146, 0.001660, 0.968167
Epoch 9: 0.126039, 0.979708, 0.001568, 0.969667
Epoch 10: 0.122767, 0.982542, 0.001494, 0.971833
Epoch 11: 0.117698, 0.984625, 0.001438, 0.972583
Epoch 12: 0.112344, 0.986875, 0.001394, 0.973250
Epoch 13: 0.104434, 0.988500, 0.001359, 0.973750
Epoch 14: 0.096911, 0.990062, 0.001328, 0.974500
Epoch 15: 0.092030, 0.991500, 0.001310, 0.974833
Epoch 16: 0.084524, 0.992521, 0.001292, 0.975333
Epoch 17: 0.076341, 0.993813, 0.001283, 0.975750
Epoch 18: 0.069496, 0.995000, 0.001277, 0.976083
Epoch 19: 0.062835, 0.995729, 0.001270, 0.976667
Epoch 20: 0.058253, 0.996708, 

In [10]:
model = MLP([28*28, 500,300, 100, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 30, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.443315, 0.687000, 0.005704, 0.893667
Epoch 1: 0.339967, 0.914875, 0.003787, 0.929417
Epoch 2: 0.244934, 0.943313, 0.002775, 0.946750
Epoch 3: 0.177933, 0.957750, 0.002232, 0.957917
Epoch 4: 0.127428, 0.966500, 0.001889, 0.964417
Epoch 5: 0.099255, 0.973750, 0.001662, 0.968333
Epoch 6: 0.078899, 0.979021, 0.001514, 0.972417
Epoch 7: 0.066396, 0.983167, 0.001433, 0.973417
Epoch 8: 0.056571, 0.986479, 0.001391, 0.974583
Epoch 9: 0.047236, 0.989021, 0.001368, 0.975167
Epoch 10: 0.040297, 0.991354, 0.001369, 0.975333
Epoch 11: 0.037941, 0.993688, 0.001363, 0.976000
Epoch 12: 0.034309, 0.995354, 0.001372, 0.976250
Epoch 13: 0.029889, 0.996458, 0.001391, 0.976833
Epoch 14: 0.027230, 0.997667, 0.001403, 0.976750
Epoch 15: 0.022056, 0.998354, 0.001438, 0.976417
Epoch 16: 0.019677, 0.998833, 0.001466, 0.976750
Epoch 17: 0.017375, 0.999229, 0.001488, 0.976667
Epoch 18: 0.013451, 0.999500, 0.001476, 0.976667
Epoch 19: 0.013004, 0.999646, 0.001412, 0.977333
Epoch 20: 0.013151, 0.999792, 

In [13]:
model = MLP([28*28, 100, 500, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 10, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.438900, 0.821875, 0.005144, 0.905750
Epoch 1: 0.338543, 0.919875, 0.004014, 0.926750
Epoch 2: 0.272688, 0.937542, 0.003295, 0.940083
Epoch 3: 0.221854, 0.948417, 0.002807, 0.948583
Epoch 4: 0.185421, 0.956979, 0.002464, 0.953917
Epoch 5: 0.158095, 0.962625, 0.002214, 0.957917
Epoch 6: 0.130408, 0.967875, 0.002001, 0.962083
Epoch 7: 0.119372, 0.971583, 0.001848, 0.964250
Epoch 8: 0.107013, 0.974688, 0.001717, 0.966333
Epoch 9: 0.099006, 0.977771, 0.001628, 0.967750
train_accuracy 0.9777708053588867, val_accuracy 0.9677500128746033


In [14]:
model = MLP([28*28, 100, 50, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 10, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.485014, 0.783646, 0.005698, 0.894750
Epoch 1: 0.377636, 0.911729, 0.004482, 0.917250
Epoch 2: 0.306156, 0.929021, 0.003731, 0.930500
Epoch 3: 0.241460, 0.942500, 0.003191, 0.940750
Epoch 4: 0.194938, 0.951604, 0.002789, 0.948083
Epoch 5: 0.172424, 0.958229, 0.002500, 0.952333
Epoch 6: 0.154281, 0.963208, 0.002283, 0.956833
Epoch 7: 0.143050, 0.967188, 0.002103, 0.960917
Epoch 8: 0.130611, 0.971000, 0.001957, 0.963667
Epoch 9: 0.125209, 0.973875, 0.001861, 0.965250
train_accuracy 0.9738749861717224, val_accuracy 0.9652500152587891


### Summary

Summarize your findings with the different settings in a table

| Units | nepochs | lr | Acc (Train) | Acc (Valid) |
| --- | :-: | :-: | :-: | :-: |
| (784,10,10) | 20 | 0.5 | 91.1% | 90.3% |
| (784,300,100,10) | 10 | 0.05 | 99.9% | 97.7% |
| (784,500,300,100,10) | 30 | 0.005 | 99.9% | 97.8% |
| (784,100,500,10) | 10 | 0.05 | 97.8% | 96.7% |
| (784,100,50,10) | 10 | 0.05 | 97.38% | 96.5% |


I prever the Model with the following attributes `(784,100,500,10)	10	0.05	97.8%	96.7%`

It does not seem to have such an immense overfit as the last one in the table