## Authors:
#### Daniel Stöckein (5018039), Alexander Triol (5018451)

In [5]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from matplotlib import pyplot as plt
%matplotlib inline

In [103]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## 1. Load Data

In [104]:
# Single place that defines where the dataset is stored. A forward slash is
# accepted on every operating system; the previous '..\datasets' literal relied
# on the invalid escape sequence '\d' and only named the parent directory on
# Windows (elsewhere it created a folder literally called '..\datasets').
DATA_ROOT = '../datasets'

# Training split of Fashion-MNIST; images are converted to tensors on access.
mnist_train = datasets.FashionMNIST(
    root=DATA_ROOT,
    train=True,
    download=True,
    transform=transforms.ToTensor()
)

# Held-out test split, prepared the same way as the training split.
mnist_test = datasets.FashionMNIST(
    root=DATA_ROOT,
    train=False,
    download=True,
    transform=transforms.ToTensor()
)

## 2. Preparing DataLoader

In [105]:
def dloaders(batch_size):
    """Build the train and test data loaders for the notebook's datasets.

    Args:
        batch_size: number of samples per batch for both loaders.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles
        ``mnist_train``; the test loader iterates ``mnist_test`` in order.
    """
    return (
        DataLoader(mnist_train, batch_size=batch_size, shuffle=True),
        DataLoader(mnist_test, batch_size=batch_size, shuffle=False),
    )

## 3. Model definition
- 28x28 = 784 pixel values
- 10 classes
- 256 hidden units in the first stage, 64 in the second

The following code implements a residual block: its input is added back onto the output of the main path through a skip connection. The block's last linear layer maps back to the input width, so the shortcut and the main path always have matching dimensions.

### 3.1 Residual Block

In [107]:
class ResBlock(nn.Module):
    """Pre-activation residual block for flat feature vectors.

    Main path: BatchNorm -> ReLU -> Dropout -> Linear(in, out)
    -> BatchNorm -> ReLU -> Linear(out, in). The second linear layer maps
    back to ``in_features`` so the result can be added to the unchanged input.

    Args:
        in_features: width of the block's input and output.
        out_features: width of the hidden layer inside the block.
        dropout_p: probability of zeroing an activation in the dropout layer.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, in_features, out_features, dropout_p=0.5):
        super().__init__()

        # Layers are created in the order they are applied in forward().
        self.bn1 = nn.BatchNorm1d(in_features)
        self.relu1 = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)  # proportion of activations to drop
        self.fc1 = nn.Linear(in_features, out_features)
        self.bn2 = nn.BatchNorm1d(out_features)
        self.relu2 = nn.ReLU()
        # Project back to the input width so the skip connection can be added.
        self.fc2 = nn.Linear(out_features, in_features)

    def forward(self, x):
        """Return ``x`` plus the output of the main path applied to ``x``."""
        residual = x
        out = self.bn1(x)
        out = self.relu1(out)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.fc2(out)
        return out + residual

### 3.2 MLP

In [108]:
class MLP(nn.Module):
    """Fully connected classifier with two residual stages.

    Pipeline: flatten -> Linear(in_features, hidden1_features) -> ResBlock
    -> BatchNorm -> ReLU -> Linear(hidden1_features, hidden2_features)
    -> ResBlock -> BatchNorm -> ReLU -> Linear(hidden2_features, num_classes).

    Args:
        in_features: number of values per flattened sample (default 28*28).
        hidden1_features: width of the first hidden stage (default 256).
        hidden2_features: width of the second hidden stage (default 64).
        num_classes: number of output logits (default 10).
    """

    def __init__(self, in_features=28 * 28, hidden1_features=256,
                 hidden2_features=64, num_classes=10):
        super().__init__()

        # Remembered so forward() can flatten its input to the right width.
        self.in_features = in_features

        self.fc1 = nn.Linear(in_features, hidden1_features)

        self.resblock1 = ResBlock(hidden1_features, hidden1_features)

        self.bn1 = nn.BatchNorm1d(hidden1_features)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_features, hidden2_features)

        self.resblock2 = ResBlock(hidden2_features, hidden2_features)

        self.bn2 = nn.BatchNorm1d(hidden2_features)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2_features, num_classes)

    def forward(self, x):
        """Return the class logits for a batch of inputs.

        The input is flattened to rows of ``in_features`` values. ``reshape``
        is used instead of ``view`` so that non-contiguous tensors (for
        example the result of a transpose) are accepted as well.
        """
        out = x.reshape(-1, self.in_features)  # flatten

        out = self.fc1(out)

        out = self.resblock1(out)

        out = self.bn1(out)
        out = self.relu1(out)
        out = self.fc2(out)

        out = self.resblock2(out)

        out = self.bn2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

## 4. Metrics

In [139]:
def comp_accuracy(model, data_loader, eval_device=None):
    """Count correct and wrong predictions of ``model`` over ``data_loader``.

    Args:
        model: callable mapping a feature batch to per-class scores. If it is
            an ``nn.Module`` it is switched to eval mode and left in eval mode.
        data_loader: iterable yielding ``(features, labels)`` batches.
        eval_device: device the batches are moved to. When omitted, the device
            of the model's first parameter is used; models without parameters
            fall back to the notebook-level ``device``.

    Returns:
        ``(correct, wrong, accuracy)`` as 0-dim float tensors, with
        ``accuracy`` in percent. For an empty loader the counts are zero and
        ``accuracy`` is NaN.
    """
    is_module = isinstance(model, nn.Module)

    # turn on eval mode if model inherits from nn.Module
    if is_module:
        model.eval()

    if eval_device is None:
        # Follow the model instead of a global, so that data and weights
        # cannot end up on different devices.
        first_param = next(model.parameters(), None) if is_module else None
        eval_device = first_param.device if first_param is not None else device

    correct = torch.zeros((), device=eval_device)
    num_examples = 0

    with torch.no_grad():
        for features, labels in data_loader:
            features = features.to(eval_device)
            labels = labels.to(eval_device)

            logits = model(features)
            # Class with the highest score; only the indices are needed.
            _, predictions = torch.max(logits, dim=1)

            num_examples += labels.size(0)
            correct += (predictions == labels).sum().float()

    # Every example is either right or wrong, so no second reduction is needed.
    wrong = num_examples - correct

    if num_examples:
        accuracy = correct / num_examples * 100
    else:
        # Nothing was evaluated: report NaN instead of dividing by zero.
        accuracy = torch.full_like(correct, float("nan"))

    return correct, wrong, accuracy

## 5. Training

In [153]:
def fit(model, train_loader, epochs, learning_rate, loss_func=nn.CrossEntropyLoss(), opt_func=torch.optim.SGD, eval_loader=None, log_every=250):
    """Train ``model`` and report accuracy on the training and test data.

    Args:
        model: ``nn.Module`` to optimise in place.
        train_loader: iterable of ``(features, labels)`` batches; it is also
            used for the accuracy report at the end of every epoch.
        epochs: number of passes over ``train_loader``.
        learning_rate: learning rate handed to ``opt_func``.
        loss_func: callable ``(logits, labels) -> loss``.
        opt_func: optimizer factory called as ``opt_func(params, learning_rate)``.
        eval_loader: loader for the final test report. When omitted, a
            notebook-level ``test_loader`` is used if one exists (the previous
            behaviour); if there is none, the test report is skipped.
        log_every: print the running cost every ``log_every`` batches; a falsy
            value disables these per-batch messages.
    """
    optimizer = opt_func(model.parameters(), learning_rate)

    # Move batches to wherever the model's weights live instead of relying on
    # a global; parameter-less models fall back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else device

    for epoch in range(epochs):

        # comp_accuracy() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch.
        model.train()

        for batch_index, (features, labels) in enumerate(train_loader):

            features = features.to(model_device)
            labels = labels.to(model_device)

            # 1. forward
            logits = model(features)

            # 2. compute objective function (softmax, cross entropy)
            cost = loss_func(logits, labels)

            # 3. cleaning gradients
            optimizer.zero_grad()

            # 4. accumulate partial derivatives
            cost.backward()

            # 5. step in the opposite direction of the gradient
            optimizer.step()

            if log_every and batch_index % log_every == 0:
                print ('Epoch: {}/{} | Batch {}/{} | Cost: {:.4f}'.format(
                    epoch+1,
                    epochs,
                    batch_index,
                    len(train_loader),
                    cost.item()
                ))

        correct, wrong, accuracy = comp_accuracy(model, train_loader)
        print ('Training: Correct[{:.0f}] | Wrong[{:.0f}] | Accuracy[{:.2f}%]'.format(
            correct,
            wrong,
            accuracy
        ), '\n')

    if eval_loader is None:
        # Backward compatibility: earlier calls relied on a global `test_loader`.
        eval_loader = globals().get("test_loader")
    if eval_loader is None:
        return

    correct, wrong, accuracy = comp_accuracy(model, eval_loader)
    print ('Test: Correct[{:.0f}] | Wrong[{:.0f}] | Accuracy[{:.2f}%]'.format(
        correct,
        wrong,
        accuracy
    ), '\n')
        

## 6. Playground

### 1. Attempt

In [154]:
# Hyperparameters for attempt 1: small batches, moderate learning rate.
batch_size, epochs, learning_rate = 50, 10, 0.01

In [155]:
# Fresh loaders and a freshly initialised model for this attempt.
# `test_loader` must stay a notebook-level name: fit() reads it for the test report.
train_loader, test_loader = dloaders(batch_size)
model = MLP().to(device)
fit(model=model, train_loader=train_loader, epochs=epochs, learning_rate=learning_rate)

Epoch: 1/10 | Batch 0/1200 | Cost: 2.3177
Epoch: 1/10 | Batch 250/1200 | Cost: 0.8728
Epoch: 1/10 | Batch 500/1200 | Cost: 0.7239
Epoch: 1/10 | Batch 750/1200 | Cost: 0.5144
Epoch: 1/10 | Batch 1000/1200 | Cost: 0.3994
Training: Correct[50436] | Wrong[9564] | Accuracy[84.06%] 

Epoch: 2/10 | Batch 0/1200 | Cost: 0.5419
Epoch: 2/10 | Batch 250/1200 | Cost: 0.4994
Epoch: 2/10 | Batch 500/1200 | Cost: 0.3134
Epoch: 2/10 | Batch 750/1200 | Cost: 0.3356
Epoch: 2/10 | Batch 1000/1200 | Cost: 0.4121
Training: Correct[51838] | Wrong[8162] | Accuracy[86.40%] 

Epoch: 3/10 | Batch 0/1200 | Cost: 0.6254
Epoch: 3/10 | Batch 250/1200 | Cost: 0.4114
Epoch: 3/10 | Batch 500/1200 | Cost: 0.5007
Epoch: 3/10 | Batch 750/1200 | Cost: 0.4742
Epoch: 3/10 | Batch 1000/1200 | Cost: 0.4106
Training: Correct[52479] | Wrong[7521] | Accuracy[87.46%] 

Epoch: 4/10 | Batch 0/1200 | Cost: 0.3027
Epoch: 4/10 | Batch 250/1200 | Cost: 0.2631
Epoch: 4/10 | Batch 500/1200 | Cost: 0.5483
Epoch: 4/10 | Batch 750/1200 | Co

### 2. Attempt

In [156]:
# Hyperparameters for attempt 2: larger batches, smaller learning rate, more epochs.
batch_size, epochs, learning_rate = 256, 30, 0.001

In [157]:
# Fresh loaders and a freshly initialised model for this attempt.
# `test_loader` must stay a notebook-level name: fit() reads it for the test report.
train_loader, test_loader = dloaders(batch_size)
model = MLP().to(device)
fit(model=model, train_loader=train_loader, epochs=epochs, learning_rate=learning_rate)

Epoch: 1/30 | Batch 0/235 | Cost: 2.3590
Training: Correct[30652] | Wrong[29348] | Accuracy[51.09%] 

Epoch: 2/30 | Batch 0/235 | Cost: 1.9217
Training: Correct[37582] | Wrong[22418] | Accuracy[62.64%] 

Epoch: 3/30 | Batch 0/235 | Cost: 1.7218
Training: Correct[39908] | Wrong[20092] | Accuracy[66.51%] 

Epoch: 4/30 | Batch 0/235 | Cost: 1.4809
Training: Correct[41169] | Wrong[18831] | Accuracy[68.61%] 

Epoch: 5/30 | Batch 0/235 | Cost: 1.3557
Training: Correct[41771] | Wrong[18229] | Accuracy[69.62%] 

Epoch: 6/30 | Batch 0/235 | Cost: 1.2388
Training: Correct[42363] | Wrong[17637] | Accuracy[70.60%] 

Epoch: 7/30 | Batch 0/235 | Cost: 1.1237
Training: Correct[42952] | Wrong[17048] | Accuracy[71.59%] 

Epoch: 8/30 | Batch 0/235 | Cost: 1.1238
Training: Correct[43553] | Wrong[16447] | Accuracy[72.59%] 

Epoch: 9/30 | Batch 0/235 | Cost: 0.9957
Training: Correct[44089] | Wrong[15911] | Accuracy[73.48%] 

Epoch: 10/30 | Batch 0/235 | Cost: 1.0029
Training: Correct[44693] | Wrong[15307] 

In [160]:
# Hyperparameters for the third run: medium batches, large learning rate.
batch_size, epochs, learning_rate = 128, 15, 0.2

In [161]:
# Fresh loaders and a freshly initialised model for this attempt.
# `test_loader` must stay a notebook-level name: fit() reads it for the test report.
train_loader, test_loader = dloaders(batch_size)
model = MLP().to(device)
fit(model=model, train_loader=train_loader, epochs=epochs, learning_rate=learning_rate)

Epoch: 1/15 | Batch 0/469 | Cost: 2.4079
Epoch: 1/15 | Batch 250/469 | Cost: 0.4494
Training: Correct[51888] | Wrong[8112] | Accuracy[86.48%] 

Epoch: 2/15 | Batch 0/469 | Cost: 0.5381
Epoch: 2/15 | Batch 250/469 | Cost: 0.5519
Training: Correct[52186] | Wrong[7814] | Accuracy[86.98%] 

Epoch: 3/15 | Batch 0/469 | Cost: 0.4128
Epoch: 3/15 | Batch 250/469 | Cost: 0.2326
Training: Correct[52824] | Wrong[7176] | Accuracy[88.04%] 

Epoch: 4/15 | Batch 0/469 | Cost: 0.2926
Epoch: 4/15 | Batch 250/469 | Cost: 0.3408
Training: Correct[53984] | Wrong[6016] | Accuracy[89.97%] 

Epoch: 5/15 | Batch 0/469 | Cost: 0.2953
Epoch: 5/15 | Batch 250/469 | Cost: 0.2725
Training: Correct[52275] | Wrong[7725] | Accuracy[87.12%] 

Epoch: 6/15 | Batch 0/469 | Cost: 0.2883
Epoch: 6/15 | Batch 250/469 | Cost: 0.2392
Training: Correct[54227] | Wrong[5773] | Accuracy[90.38%] 

Epoch: 7/15 | Batch 0/469 | Cost: 0.2226
Epoch: 7/15 | Batch 250/469 | Cost: 0.2571
Training: Correct[54522] | Wrong[5478] | Accuracy[90