In [17]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchsummary import summary
import numpy as np
import matplotlib.pyplot as plt

### Loading Data

Load train and test partition of the MNIST dataset.

Prepare the training by splitting the training partition into a training and validation.

In [87]:
training_data = datasets.MNIST(root="data", train=True, download=True, transform=ToTensor())
test_data = datasets.MNIST(root="data", train=False, download=True, transform=ToTensor())

<class 'torchvision.datasets.mnist.MNIST'>


In [22]:
training_size = int(len(training_data)*0.8)
test_size = len(training_data)-training_size

In [23]:
# Partition into train and validate
#from sklearn.model_selection import train_test_split

### YOUR CODE START ###

training_set, validation_set = random_split(training_data,[training_size,test_size])

### YOUR CODE END ###

### MLP

Implement an MLP model that can be configured with a an arbitrary number of layers and units per layer.

To that end, implement a suitable sub-class of `torch.nn.Module` with a constructor that accepts the following arguments:
* `units`: list of integers that specify the number of units in the different layers. The first element corresponds to the number of units in the input layer (layer '0'), the last element is the number of output units, i.e. the number of classes the classifier is designed for (10 for an MNIST classifier). Hence, MLP will have $n$ hidden layers if `units` has $n+1$ elements. 
* `activation_class`: Class name of the activation function layer to be used (such as `torch.nn.ReLU`). Instances can be created by `activation_class()` and added to the succession of layers defined by the model. 

Alternatively, you can implement a utility method that creates a `torch.nn.Sequential` model accordingly. 


In [54]:
### YOUR CODE START ###

class MLP(torch.nn.Module):
    
    def __init__(self, units, activation_class = None):
        super(MLP,self).__init__()
        self.flatten_1 = torch.nn.Flatten()
        self.linear_2  = torch.nn.Linear(units[0],units[1])
        self.relu_3  =  torch.nn.ReLU()
        self.linear_4  = torch.nn.Linear(units[1],units[2])
        self.relu_5    = torch.nn.ReLU()
        self.linear_6  = torch.nn.Linear(units[2],units[3])
 
        
    def forward(self, x):
        z = self.linear_2(self.flatten_1(x))
        z = self.relu_3(z)
        z = self.linear_4(z)
        z = self.relu_5(z)
        z = self.linear_6(z)
        return z
        

### YOUR CODE END ###

In [55]:
model = MLP([28*28,300, 100, 10])

from torchsummary import summary
summary(model, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 300]         235,500
              ReLU-3                  [-1, 300]               0
            Linear-4                  [-1, 100]          30,100
              ReLU-5                  [-1, 100]               0
            Linear-6                   [-1, 10]           1,010
Total params: 266,610
Trainable params: 266,610
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 1.02
Estimated Total Size (MB): 1.03
----------------------------------------------------------------


### Training Loop

For training, implement a method with the arguments:
* `model`: Model to be trained
* `lr`: Learning rate
* `nepochs`: Number of epochs
* `batchsize`: Batch size
* `training_data`: Training set (subclassed of `Dataset`)
* `validation_data`: Validation set (subclassed of `Dataset`)

Remember the training and validation cost and accuracy, respectively for monitoring the progress of the training. <br>
Note that for the training cost and accuracy you can use the per batch quantities averaged over an epoch. 

Furthermore, you can use the SGD optimizer of pytorch (`torch.optim.SGD`) - but without momentum.

In [65]:
def train_eval(model, lr, nepochs, nbatch, training_set, validation_set):
    # finally return the sequence of per epoch values
    cost_hist = []
    cost_hist_valid = []
    acc_hist = []
    acc_hist_valid = []

    cost_ce = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    ### YOUR CODE START ###
    
    # epoch: current epoch
    # cost, cost_valid, acc, acc_valid: cost and acurracy (for training, validation set) per epoch     
    
    training_loader = DataLoader(training_data, batch_size=nbatch, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=10000, shuffle=True)
    Xtest, Ytest = next(iter(test_loader))
    size = len(training_loader.dataset)
    nbatches = len(training_loader)
    
    for epoch in range(nepochs):
        cost, acc = 0.0, 0.0
        for batch, (X, Y) in enumerate(training_loader):
            pred = model(X)
            cost = cost_ce(pred, Y)
            acc += (pred.argmax(dim=1) == Y).type(torch.float).sum().item()

            # gradient, parameter update
            optimizer.zero_grad()
            cost.backward()
            optimizer.step()
        # cost /= nbatches
        acc /= size

        cost_valid, acc_valid = 0.0, 0.0
        with torch.no_grad():
            for X, Y, in test_loader:
                pred = model(X)
                cost_valid = cost_ce(pred, Y)
                acc_valid = (pred.argmax(dim=1) == Y).type(torch.float).sum().item()
        cost_valid /= len(test_loader)
        acc_valid /= len(test_loader.dataset)
        
        print("Epoch %i: %f, %f, %f, %f"%(epoch, cost, acc, cost_valid, acc_valid))

        ### YOUR CODE END ###
        
        cost_hist.append(cost)
        cost_hist_valid.append(cost_valid)
        acc_hist.append(acc)
        acc_hist_valid.append(acc_valid)
    return cost_hist, cost_hist_valid, acc_hist, acc_hist_valid

### Exploration

Now use this functionality to explore different layer configurations: 
* Number of layers
* Number of units per layer
* Suitable learning rate
* Suitable number of epochs.

Use a batchsize of 64.

Make sure that you choose a sufficinetly large number of epochs so that the learning has more or less stabilizes (converged). 

In [72]:
class MLP4L(torch.nn.Module):
    
    def __init__(self, units, activation_class = None):
        super(MLP4L,self).__init__()
        self.flatten_1 = torch.nn.Flatten()
        self.linear_2  = torch.nn.Linear(units[0],units[1])
        self.relu_3  =  torch.nn.ReLU()
        self.linear_4  = torch.nn.Linear(units[1],units[2])
        self.relu_5    = torch.nn.ReLU()
        self.linear_6  = torch.nn.Linear(units[2],units[3])
 
        
    def forward(self, x):
        z = self.linear_2(self.flatten_1(x))
        z = self.relu_3(z)
        z = self.linear_4(z)
        z = self.relu_5(z)
        z = self.linear_6(z)
        return z

class MLP5L(torch.nn.Module):
    
    def __init__(self, units, activation_class = None):
        super(MLP5L,self).__init__()
        self.flatten_1 = torch.nn.Flatten()
        self.linear_2  = torch.nn.Linear(units[0],units[1])
        self.relu_3  =  torch.nn.ReLU()
        self.linear_4  = torch.nn.Linear(units[1],units[2])
        self.relu_5    = torch.nn.ReLU()
        self.linear_6  = torch.nn.Linear(units[2],units[3])
        self.relu_7    = torch.nn.ReLU()
        self.linear_8  = torch.nn.Linear(units[3],units[4])
 
        
    def forward(self, x):
        z = self.linear_2(self.flatten_1(x))
        z = self.relu_3(z)
        z = self.linear_4(z)
        z = self.relu_5(z)
        z = self.linear_6(z)
        z = self.relu_7(z)
        z = self.linear_8(z)
        return z
    
class MLP6L(torch.nn.Module):
    
    def __init__(self, units, activation_class = None):
        super(MLP5L,self).__init__()
        self.flatten_1 = torch.nn.Flatten()
        self.linear_2  = torch.nn.Linear(units[0],units[1])
        self.relu_3  =  torch.nn.ReLU()
        self.linear_4  = torch.nn.Linear(units[1],units[2])
        self.relu_5    = torch.nn.ReLU()
        self.linear_6  = torch.nn.Linear(units[2],units[3])
        self.relu_7    = torch.nn.ReLU()
        self.linear_8  = torch.nn.Linear(units[3],units[4])
        self.relu_9    = torch.nn.ReLU()
        self.linear_10  = torch.nn.Linear(units[5],units[6])
 
        
    def forward(self, x):
        z = self.linear_2(self.flatten_1(x))
        z = self.relu_3(z)
        z = self.linear_4(z)
        z = self.relu_5(z)
        z = self.linear_6(z)
        z = self.relu_7(z)
        z = self.linear_8(z)
        z = self.relu_9(z)
        z = self.linear_10(z)
        return z
        

In [74]:
model = MLP5L([28*28,500, 300, 100, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.005, 30, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 2.230736, 0.154817, 2.244276, 0.365500
Epoch 1: 1.730013, 0.418333, 1.702673, 0.529200
Epoch 2: 0.696280, 0.690167, 0.770192, 0.774600
Epoch 3: 0.457589, 0.805833, 0.549864, 0.838400
Epoch 4: 0.627504, 0.855833, 0.444192, 0.870800
Epoch 5: 0.205761, 0.879017, 0.387338, 0.887200
Epoch 6: 0.187060, 0.891567, 0.354102, 0.898000
Epoch 7: 0.143832, 0.900133, 0.331479, 0.905400
Epoch 8: 0.309057, 0.907000, 0.310427, 0.909300
Epoch 9: 0.214746, 0.912583, 0.289715, 0.915900
Epoch 10: 0.336021, 0.916933, 0.278952, 0.920600
Epoch 11: 0.193686, 0.921517, 0.261938, 0.925600
Epoch 12: 0.222441, 0.926100, 0.249639, 0.927500
Epoch 13: 0.315549, 0.929817, 0.241067, 0.931900
Epoch 14: 0.295807, 0.932917, 0.224346, 0.936900
Epoch 15: 0.472278, 0.936783, 0.216438, 0.939700
Epoch 16: 0.159291, 0.939267, 0.204230, 0.941000
Epoch 17: 0.246632, 0.941917, 0.195720, 0.944400
Epoch 18: 0.094231, 0.944433, 0.188206, 0.945800
Epoch 19: 0.149558, 0.946900, 0.182838, 0.947900
Epoch 20: 0.111168, 0.948733, 

In [78]:
model = MLP4L([28*28, 300, 100, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 30, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.306316, 0.813167, 0.315312, 0.908500
Epoch 1: 0.251741, 0.924400, 0.216339, 0.936000
Epoch 2: 0.267334, 0.943233, 0.164234, 0.952900
Epoch 3: 0.223379, 0.955700, 0.141268, 0.956500
Epoch 4: 0.095921, 0.963833, 0.128064, 0.960700
Epoch 5: 0.012733, 0.969700, 0.100661, 0.968400
Epoch 6: 0.019854, 0.974033, 0.097006, 0.969900
Epoch 7: 0.019475, 0.977250, 0.085023, 0.972800
Epoch 8: 0.014026, 0.980133, 0.086449, 0.972100
Epoch 9: 0.009115, 0.982600, 0.077764, 0.976100
Epoch 10: 0.017065, 0.985350, 0.076842, 0.975800
Epoch 11: 0.100301, 0.986983, 0.077165, 0.975000
Epoch 12: 0.024994, 0.988683, 0.078208, 0.975200
Epoch 13: 0.014860, 0.990200, 0.067405, 0.977900
Epoch 14: 0.022580, 0.991133, 0.068705, 0.977700
Epoch 15: 0.012145, 0.992267, 0.067183, 0.978600
Epoch 16: 0.007576, 0.993433, 0.071132, 0.977500
Epoch 17: 0.006002, 0.994517, 0.064572, 0.979500
Epoch 18: 0.008288, 0.995300, 0.065683, 0.978900
Epoch 19: 0.025417, 0.996500, 0.067301, 0.979500
Epoch 20: 0.035615, 0.997033, 

In [79]:
model = MLP4L([28*28, 100, 50, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 30, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.561184, 0.810183, 0.312940, 0.907100
Epoch 1: 0.226070, 0.918017, 0.258294, 0.925500
Epoch 2: 0.233998, 0.940733, 0.191774, 0.941800
Epoch 3: 0.183037, 0.952767, 0.145208, 0.955900
Epoch 4: 0.137461, 0.960767, 0.128336, 0.961200
Epoch 5: 0.225403, 0.966750, 0.124678, 0.962300
Epoch 6: 0.179621, 0.971100, 0.115188, 0.966200
Epoch 7: 0.056146, 0.974567, 0.099261, 0.969600
Epoch 8: 0.054897, 0.977217, 0.096771, 0.970400
Epoch 9: 0.109749, 0.979283, 0.099586, 0.969700
Epoch 10: 0.017977, 0.982067, 0.086884, 0.973000
Epoch 11: 0.021430, 0.983450, 0.084277, 0.973200
Epoch 12: 0.093491, 0.985517, 0.101964, 0.969200
Epoch 13: 0.005897, 0.985967, 0.083285, 0.975200
Epoch 14: 0.008281, 0.988033, 0.079412, 0.975900
Epoch 15: 0.010568, 0.989633, 0.080077, 0.975800
Epoch 16: 0.028055, 0.990183, 0.081462, 0.974100
Epoch 17: 0.007607, 0.990917, 0.076026, 0.976100
Epoch 18: 0.051711, 0.992217, 0.077234, 0.977000
Epoch 19: 0.004848, 0.993017, 0.076359, 0.977900
Epoch 20: 0.003397, 0.993683, 

In [80]:
model_final = MLP4L([28*28, 100, 500, 10])
_,_, acc_hist, acc_val_hist = train_eval(model, 0.05, 10, 64, training_set, validation_set)
print(f'train_accuracy {acc_hist[-1]}, val_accuracy {acc_val_hist[-1]}')

Epoch 0: 0.003397, 0.998700, 0.079191, 0.978200
Epoch 1: 0.020406, 0.998867, 0.081839, 0.977600
Epoch 2: 0.001967, 0.998950, 0.079048, 0.978300
Epoch 3: 0.008159, 0.998967, 0.084851, 0.977500
Epoch 4: 0.002401, 0.999350, 0.080948, 0.978200
Epoch 5: 0.000453, 0.999433, 0.081796, 0.978500
Epoch 6: 0.004543, 0.999567, 0.082780, 0.977900
Epoch 7: 0.001239, 0.999650, 0.082206, 0.978600
Epoch 8: 0.000582, 0.999733, 0.083712, 0.978300
Epoch 9: 0.000614, 0.999767, 0.083512, 0.978400
train_accuracy 0.9997666666666667, val_accuracy 0.9784


### Summary

Summarize your findings with the different settings in a table

| Units | nepochs | lr | Acc (Train) | Acc (Valid) |
| --- | :-: | :-: | :-: | :-: |
| (784,10,10) | 20 | 0.5 | 94.1% | 93.4% |
| (784,300,100,10) | 10 | 0.05 | 98.2% | 97.4% |
| (784,500,300,100,10) | 30 | 0.005 | 96.4% | 96.0% |
| (784,100,500,10) | 10 | 0.05 | 98.0% | 97.3% |
| (784,100,50,10) | 10 | 0.05 | 99.84% | 97.7% |


I prever the Model with the following attributes `(784,100,500,10)	10	0.05	98.0%	97.3%`

It does not seem to have such an immense overfit as the last one in the table

In [88]:
model_final.eval()
pred = model_final(test_data)
Y = test_data.targets
print(Y)
#acc_valid = (pred.argmax(dim=1) == Y).type(torch.float).sum().item()


AttributeError: 