## Authors:
#### Daniel Stöckein (5018039), Alexander Triol (5018451)

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms

In [None]:
# Run on the CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## 1. Load Data

In [None]:
# Both Fashion-MNIST splits share the same storage folder, download flag and
# tensor conversion; only the `train` flag differs between them.
_fashion_mnist_args = dict(
    root='datasets/',
    download=True,
    transform=transforms.ToTensor(),
)

# Training split.
mnist_train = datasets.FashionMNIST(train=True, **_fashion_mnist_args)

# Held-out test split.
mnist_test = datasets.FashionMNIST(train=False, **_fashion_mnist_args)


## 2. Preparing DataLoader

In [None]:
def dloaders(batch_size):
    """Create the DataLoaders for the training and test datasets.

    Args:
        batch_size: Number of samples per batch, used for both loaders.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader
        shuffles its samples; the test loader keeps the dataset order.
    """
    return (
        DataLoader(mnist_train, shuffle=True, batch_size=batch_size),
        DataLoader(mnist_test, shuffle=False, batch_size=batch_size),
    )

## 3. Residual Block

In [None]:
class Residual(nn.Module):
    """Residual block built from fully connected layers.

    Main path:  BatchNorm -> ReLU -> Dropout(p=0.5) -> Linear
                -> BatchNorm -> ReLU -> Linear
    Skip path:  a separate Linear projection of the block input, so that
                both paths end with ``fc_output2`` features and can be summed.

    Args:
        input_channels: Number of features of the incoming tensor.
        fc_output1: Width of the hidden linear layer on the main path.
        fc_output2: Number of features produced by the block.
    """

    def __init__(self, input_channels, fc_output1, fc_output2):
        super(Residual, self).__init__()
        self.lin1 = nn.Linear(input_channels, fc_output1)
        self.lin2 = nn.Linear(fc_output1, fc_output2)
        # Projection used on the skip connection to match the output width.
        self.lin3 = nn.Linear(input_channels, fc_output2)

        self.rel1 = nn.ReLU()
        self.rel2 = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(input_channels)
        self.bn2 = nn.BatchNorm1d(fc_output1)

    def forward(self, X):
        """Return ``main_path(X) + lin3(X)``.

        Args:
            X: Input tensor whose last dimension has ``input_channels``
                features.

        Returns:
            Tensor with ``fc_output2`` features per sample.
        """
        Y = self.bn1(X)
        Y = self.rel1(Y)
        # F.dropout defaults to training=True, which would keep dropping
        # activations after model.eval(). Tie it to the module's mode so
        # evaluation is deterministic and uses the full network.
        Y = F.dropout(Y, p=0.5, training=self.training)
        Y = self.lin1(Y)
        Y = self.bn2(Y)
        Y = self.rel2(Y)
        Y = self.lin2(Y)
        Y += self.lin3(X)
        return Y

## 4. Model

In [None]:
class MLP(nn.Module):
    """Fully connected classifier containing two ``Residual`` blocks.

    The input is flattened and projected to 256 features, passed through
    a residual block, normalised and activated, projected to 64 features,
    passed through a second residual block, normalised and activated
    again, and finally mapped to ``outputs`` values per sample.

    Args:
        input_features: Number of features per sample after flattening.
        fc_output1: Hidden width used inside both residual blocks.
        fc_output2: Output width of both residual blocks.
        outputs: Number of values produced per sample.
    """

    def __init__(self, input_features, fc_output1, fc_output2, outputs):
        super().__init__()
        layers = [
            nn.Flatten(),
            # Stage 1: project to 256 features, then first residual block.
            nn.Linear(input_features, 256),
            Residual(256, fc_output1, fc_output2),
            nn.BatchNorm1d(fc_output2),
            nn.ReLU(),
            # Stage 2: project to 64 features, then second residual block.
            nn.Linear(fc_output2, 64),
            Residual(64, fc_output1, fc_output2),
            nn.BatchNorm1d(fc_output2),
            nn.ReLU(),
            # Output layer.
            nn.Linear(fc_output2, outputs),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, X):
        """Run ``X`` through the sequential stack and return its output."""
        return self.net(X)


In [None]:
# Instantiate a network and print its layer structure.
model = MLP(input_features=784, fc_output1=120, fc_output2=84, outputs=10)
print(model)

## 5. Metrics

In [None]:
def comp_accuracy(model, data_loader):
    """Count correct and wrong predictions of ``model`` over ``data_loader``.

    Every batch is moved to the module-level ``device`` before the forward
    pass. If ``model`` is an ``nn.Module`` it is switched to eval mode and
    left in that mode when the function returns.

    Args:
        model: Callable that maps a batch of features to per-class scores.
        data_loader: Iterable that yields ``(features, labels)`` batches.

    Returns:
        A ``(correct, wrong, accuracy)`` tuple: the number of correctly
        classified samples, the number of misclassified samples, and the
        accuracy in percent.
    """
    # Plain callables have no eval(); only nn.Module instances are switched.
    if isinstance(model, nn.Module):
        model.eval()

    n_correct = 0
    n_wrong = 0
    n_seen = 0

    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            scores = model(inputs)
            # The predicted class is the index of the largest score per row.
            predicted = torch.max(scores, dim=1).indices

            n_seen += targets.size(0)
            n_correct += predicted.eq(targets).sum().float()
            n_wrong += predicted.ne(targets).sum().float()

        accuracy = n_correct / n_seen * 100

    return n_correct, n_wrong, accuracy

## 6. Training procedure

In [None]:
def fit(
    model,
    train_loader,
    epochs,
    learning_rate,
    loss_func=nn.CrossEntropyLoss(),
    opt_func=torch.optim.SGD,
    eval_loader=None,
):
    """Train ``model`` and print progress, training and test accuracy.

    For every epoch the model is trained on all batches of
    ``train_loader``; the cost is printed every 250 batches and the
    training accuracy is printed at the end of the epoch. After the last
    epoch the accuracy on the evaluation data is printed.

    Args:
        model: Network to train; it is moved to the module-level ``device``.
        train_loader: Iterable yielding ``(features, labels)`` batches.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to ``opt_func``.
        loss_func: Callable ``loss_func(logits, labels)`` returning the cost.
        opt_func: Optimizer factory called as
            ``opt_func(parameters, learning_rate)``.
        eval_loader: Optional loader used for the final evaluation. When
            omitted, the module-level ``test_loader`` is used, which keeps
            existing calls working unchanged.

    Returns:
        None. Results are reported through ``print``.
    """
    # Move the model first and only then build the optimizer, as the
    # torch.optim documentation recommends, so the optimizer is guaranteed
    # to hold the parameters that actually live on the target device.
    model = model.to(device)
    optimizer = opt_func(model.parameters(), learning_rate)

    for epoch in range(epochs):

        # comp_accuracy() below switches the model to eval mode, so
        # training mode has to be re-enabled at the start of every epoch.
        model = model.train()

        for batch_index, (features, labels) in enumerate(train_loader):

            # gpu usage if possible
            features = features.to(device)
            labels = labels.to(device)

            # 1. forward
            logits = model(features)

            # 2. compute objective function
            cost = loss_func(logits, labels)

            # 3. clear the gradients left over from the previous step
            optimizer.zero_grad()

            # 4. compute the partial derivatives
            cost.backward()

            # 5. step in the opposite direction of the gradient
            optimizer.step()

            if batch_index % 250 == 0:
                print ('Epoch: {}/{} | Batch {}/{} | Cost: {:.4f}'.format(
                    epoch+1,
                    epochs,
                    batch_index,
                    len(train_loader),
                    cost.item()
                ))

        correct, wrong, accuracy = comp_accuracy(model, train_loader)
        print ('Training: Correct[{:.0f}] | Wrong[{:.0f}] | Accuracy[{:.2f}%]'.format(
            correct,
            wrong,
            accuracy
        ), '\n')

    # Prefer the explicitly supplied loader; fall back to the module-level
    # `test_loader` only when the caller did not pass one.
    final_loader = test_loader if eval_loader is None else eval_loader
    correct, wrong, accuracy = comp_accuracy(model, final_loader)
    print ('Test: Correct[{:.0f}] | Wrong[{:.0f}] | Accuracy[{:.2f}%]'.format(
        correct,
        wrong,
        accuracy
    ), '\n')

## 1. Attempt

In [None]:
# Hyper-parameters of the first run.
batch_size, epochs, learning_rate = 50, 10, 0.01

# Fresh network, then the data iterators, then training.
model = MLP(input_features=784, fc_output1=200, fc_output2=100, outputs=10)
train_loader, test_loader = dloaders(batch_size=batch_size)
fit(model, train_loader, epochs=epochs, learning_rate=learning_rate)


### Summary 1. Attempt with Problem 1 - Part B Parameters

With the hyperparameters from Problem 1 - Part B, our model achieves an accuracy of ~86 percent, which is approx. 4 percent better than the accuracy obtained in Problem 1 - Part B.

## 2. Attempt

In [None]:
# Hyper-parameters of the second run (larger learning rate).
batch_size, epochs, learning_rate = 50, 10, 0.1

# Data iterators first, then a fresh network, then training.
train_loader, test_loader = dloaders(batch_size=batch_size)
model = MLP(input_features=784, fc_output1=200, fc_output2=100, outputs=10)
fit(model, train_loader, epochs=epochs, learning_rate=learning_rate)


### Summary 2. Attempt, testing different parameters
After experimenting with the hyperparameters, we found that the following values:
*   batch_size = 50
*   epochs = 10
*   learning_rate = 0.1

gave us the best accuracy of ~88 percent.




