## Authors:
#### Daniel Stöckein (5018039), Alexander Triol (5018451)

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Load Data

In [3]:
mnist_train = datasets.FashionMNIST(
    root='../datasets/', 
    train=True, 
    download=True, 
    transform=transforms.ToTensor()
)

mnist_test = datasets.FashionMNIST(
    root='../datasets/', 
    train=False, 
    download=True, 
    transform=transforms.ToTensor()
)


## 2. Preparing DataLoader

In [4]:
def dloaders(batch_size):
    train_loader = DataLoader(mnist_train, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(mnist_test, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

## 3. Residual Block
- A custom class from nn.Module will be created and in the ``forward`` function the data will pass through various layers according to the diagram
- The following code implements the residual block with skip connections such that the input passed via the shortcut matches the dimensions of the main path's output.

In [5]:
class Residual(nn.Module):
    def __init__(self, input_channels, fc_output1, fc_output2):
        super(Residual, self).__init__()
        self.lin1 = nn.Linear(input_channels, fc_output1)
        self.lin2 = nn.Linear(fc_output1, fc_output2)
        self.lin3 = nn.Linear(input_channels, fc_output2)
        

        self.rel1 = nn.ReLU()
        self.rel2 = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(input_channels)
        self.bn2 = nn.BatchNorm1d(fc_output1)

    def forward(self, X):
        Y = self.bn1(X)
        Y = self.rel1(Y)
        Y = F.dropout(Y, p=0.5)
        Y = self.lin1(Y)
        Y = self.bn2(Y)
        Y = self.rel2(Y)
        Y = self.lin2(Y)
        Y += self.lin3(X)
        return Y

## 4. Model
Data reminder:
- 28x28 = 784 pixel values
- 10 classes
- 256 hidden units

In [6]:
class MLP(nn.Module):
    def __init__(self, input_features, fc_output1, fc_output2, outputs):
        super(MLP, self).__init__()
        self.net = nn.Sequential(nn.Flatten(), 
                          nn.Linear(input_features, 256),
                          Residual(256, fc_output1, fc_output2),
                          nn.BatchNorm1d(fc_output2),
                          nn.ReLU(),
                          nn.Linear(fc_output2, 64),
                          Residual(64, fc_output1, fc_output2),
                          nn.BatchNorm1d(fc_output2),
                          nn.ReLU(),
                          nn.Linear(fc_output2, outputs),
                          )

    def forward(self, X):
        Y = self.net(X)
        return Y


In [7]:
model = MLP(784, 120, 84, 10)
print(model)

MLP(
  (net): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=256, bias=True)
    (2): Residual(
      (lin1): Linear(in_features=256, out_features=120, bias=True)
      (lin2): Linear(in_features=120, out_features=84, bias=True)
      (lin3): Linear(in_features=256, out_features=84, bias=True)
      (rel1): ReLU()
      (rel2): ReLU()
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): BatchNorm1d(84, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU()
    (5): Linear(in_features=84, out_features=64, bias=True)
    (6): Residual(
      (lin1): Linear(in_features=64, out_features=120, bias=True)
      (lin2): Linear(in_features=120, out_features=84, bias=True)
      (lin3): Linear(in_features=64, out_features=84, bias=True)
      (rel1): ReLU()
      (rel2): 

## 5. Metrics
- computes accuracy and returns ``list(correct, wrong, accuracy)``

In [8]:
def comp_accuracy(model, data_loader):
    correct = 0
    wrong = 0
    num_examples = 0
    
    # turn on eval mode if model Inherits from nn.Module
    if isinstance(model, nn.Module):
        model.eval()
    
    with torch.no_grad():
        for batch_index, (features, labels) in enumerate(data_loader):
            features = features.to(device)
            labels = labels.to(device)

            logits = model(features)
            _, predictions = torch.max(logits, dim=1) # single class with highest probability. simply retain indices

            num_examples += labels.size(0)

            correct += (predictions == labels).sum().float()
            wrong += (predictions != labels).sum().float()
            
        accuracy = correct / num_examples * 100      
        
    return correct, wrong, accuracy

## 6. Training procedure
- Training will be done on GPU if available
- Everything else will remain the same as in the linear regression problem

In [9]:
def fit(model, train_loader, epochs, learning_rate, loss_func=nn.CrossEntropyLoss(), opt_func=torch.optim.SGD):
    
    optimizer = opt_func(model.parameters(), learning_rate) # objective function
    model = model.to(device)
    
    for epoch in range(epochs):
        
        model = model.train()
              
        for batch_index, (features, labels) in enumerate(train_loader):
            
            # gpu usage if possible
            features = features.to(device)
            labels = labels.to(device)
            
            # 1. forward
            logits = model(features)

            # 2. compute objective function (softmax, cross entropy)
            cost = loss_func(logits, labels)

            # 3. cleaning gradients
            optimizer.zero_grad() 

            # 4. accumulate partial derivatives
            cost.backward() 

            # 5. step in the opposite direction of the gradient
            optimizer.step() 
            
            if not batch_index % 250:
                print ('Epoch: {}/{} | Batch {}/{} | Cost: {:.4f}'.format(
                    epoch+1,
                    epochs,
                    batch_index,
                    len(train_loader),
                    cost
                ))
        
        correct, wrong, accuracy = comp_accuracy(model, train_loader)
        print ('Training: Correct[{:.0f}] | Wrong[{:.0f}] | Accuracy[{:.2f}%]'.format(
            correct,
            wrong,
            accuracy
        ), '\n')

    correct, wrong, accuracy = comp_accuracy(model, test_loader)  
    print ('Test: Correct[{:.0f}] | Wrong[{:.0f}] | Accuracy[{:.2f}%]'.format(
        correct,
        wrong,
        accuracy
    ), '\n')

## 1. Attempt

In [10]:
batch_size = 50
epochs = 10
learning_rate = 0.01
model = MLP(784, 200, 100, 10)
train_loader, test_loader = dloaders(batch_size=batch_size) # data iters
fit(model, train_loader, epochs, learning_rate) # training


Epoch: 1/10 | Batch 0/1200 | Cost: 2.3632
Epoch: 1/10 | Batch 250/1200 | Cost: 0.9561
Epoch: 1/10 | Batch 500/1200 | Cost: 0.7735
Epoch: 1/10 | Batch 750/1200 | Cost: 0.8270
Epoch: 1/10 | Batch 1000/1200 | Cost: 0.7934
Training: Correct[49300] | Wrong[10700] | Accuracy[82.17%] 

Epoch: 2/10 | Batch 0/1200 | Cost: 0.5931
Epoch: 2/10 | Batch 250/1200 | Cost: 0.6580
Epoch: 2/10 | Batch 500/1200 | Cost: 0.6855
Epoch: 2/10 | Batch 750/1200 | Cost: 0.6615
Epoch: 2/10 | Batch 1000/1200 | Cost: 0.5434
Training: Correct[50750] | Wrong[9250] | Accuracy[84.58%] 

Epoch: 3/10 | Batch 0/1200 | Cost: 0.5120
Epoch: 3/10 | Batch 250/1200 | Cost: 0.3446
Epoch: 3/10 | Batch 500/1200 | Cost: 0.3999
Epoch: 3/10 | Batch 750/1200 | Cost: 0.3637
Epoch: 3/10 | Batch 1000/1200 | Cost: 0.4108
Training: Correct[51524] | Wrong[8476] | Accuracy[85.87%] 

Epoch: 4/10 | Batch 0/1200 | Cost: 0.2805
Epoch: 4/10 | Batch 250/1200 | Cost: 0.4766
Epoch: 4/10 | Batch 500/1200 | Cost: 0.6631
Epoch: 4/10 | Batch 750/1200 | C

### Summary 1. Attempt with Problem 1 - Part B Parameters

With the hyper parameters from Problem 1 - Part B, our model achieves an accuracy of ~87 percent. It therefore provides approx. 4 percent better accuracy.

## 2. Attempt

In [11]:
batch_size = 50
epochs = 10
learning_rate = 0.1

train_loader, test_loader = dloaders(batch_size=batch_size) # data iters
model = MLP(784, 200, 100, 10)
fit(model, train_loader, epochs, learning_rate) # training


Epoch: 1/10 | Batch 0/1200 | Cost: 2.3516
Epoch: 1/10 | Batch 250/1200 | Cost: 0.5006
Epoch: 1/10 | Batch 500/1200 | Cost: 0.5191
Epoch: 1/10 | Batch 750/1200 | Cost: 0.5828
Epoch: 1/10 | Batch 1000/1200 | Cost: 0.5477
Training: Correct[50608] | Wrong[9392] | Accuracy[84.35%] 

Epoch: 2/10 | Batch 0/1200 | Cost: 0.4515
Epoch: 2/10 | Batch 250/1200 | Cost: 0.3562
Epoch: 2/10 | Batch 500/1200 | Cost: 0.3796
Epoch: 2/10 | Batch 750/1200 | Cost: 0.2359
Epoch: 2/10 | Batch 1000/1200 | Cost: 0.4101
Training: Correct[52048] | Wrong[7952] | Accuracy[86.75%] 

Epoch: 3/10 | Batch 0/1200 | Cost: 0.3070
Epoch: 3/10 | Batch 250/1200 | Cost: 0.4326
Epoch: 3/10 | Batch 500/1200 | Cost: 0.6614
Epoch: 3/10 | Batch 750/1200 | Cost: 0.3928
Epoch: 3/10 | Batch 1000/1200 | Cost: 0.2975
Training: Correct[52596] | Wrong[7404] | Accuracy[87.66%] 

Epoch: 4/10 | Batch 0/1200 | Cost: 0.3212
Epoch: 4/10 | Batch 250/1200 | Cost: 0.3847
Epoch: 4/10 | Batch 500/1200 | Cost: 0.3021
Epoch: 4/10 | Batch 750/1200 | Co

### Summary 2. Attempt, testing different parameters
After playing with the hyper-parameters we have found that these:
*   batch_size = 50
*   epochs = 10
*   learning_rate = 0.1

gave us the best results.




