# Deep Learning Homework \#03
### Deep Learning Course $\in$ DSSC @ UniTS (Spring 2021)  

#### Submitted by [Emanuele Ballarin](mailto:emanuele@ballarin.cc)  

In [None]:
# Type hints
from torch import Tensor
#from typing import Union, Optional

# Just to force-load MKL (if available)
import numpy as np

# Mathematical functions
from math import sqrt as msqrt

# Neural networks and friends
import torch as th
from torch.nn import Sequential, BatchNorm1d, Linear, LogSoftmax, Dropout
import torch.nn.functional as F

# Optimization and scheduling
from torch.optim.lr_scheduler import StepLR, MultiStepLR

# Bespoke Modules / Functions / Optimizers
from ebtorch.logging import AverageMeter
from ebtorch.nn import Mish, mishlayer_init
from ebtorch.optim import Lookahead
from madgrad.madgrad import MADGRAD as MadGrad

# Model summarization
from torchinfo import summary

# Dataset handling for PyTorch
import os
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Normalize, Compose, Lambda

In [None]:
# MNIST DataLoader(s) builder

def spawn_mnist_loaders(
    data_root="datasets/",
    batch_size_train=256,
    batch_size_test=512,
    cuda_accel=False,
    **kwargs
):
    """Build the three MNIST data loaders used by this notebook.

    Images are converted to tensors, normalized and flattened to 1-D.
    The labels of the training split are randomly permuted once, so a
    training image is no longer paired with its own label; the test split
    is left untouched.

    Args:
        data_root: directory holding the dataset; created if missing, and
            MNIST is requested with ``download=True``.
        batch_size_train: batch size of the shuffled training loader.
        batch_size_test: batch size of both evaluation loaders.
        cuda_accel: when true, every loader gets ``num_workers=1`` and
            ``pin_memory=True``.
        **kwargs: accepted but not used by this function.

    Returns:
        A ``(trainloader, testloader, tontrloader)`` tuple: the shuffled
        training loader, the test loader, and an unshuffled loader over the
        (label-permuted) training split ("test on train").
    """
    os.makedirs(data_root, exist_ok=True)

    preprocessing = Compose(
        [
            ToTensor(),
            # usual normalization constants for MNIST
            Normalize((0.1307,), (0.3081,)),
            Lambda(lambda img: th.flatten(img)),
        ]
    )

    train_data = MNIST(data_root, train=True, transform=preprocessing, download=True)
    test_data = MNIST(data_root, train=False, transform=preprocessing, download=True)

    # Randomly permute the training labels, keeping their original shape
    labels = train_data.targets
    shuffled_order = th.randperm(labels.nelement())
    train_data.targets = labels.view(-1)[shuffled_order].view(labels.size())

    # Extra DataLoader options, only requested together with CUDA acceleration
    loader_extras = {"num_workers": 1, "pin_memory": True} if cuda_accel else {}

    trainloader = DataLoader(
        train_data, batch_size=batch_size_train, shuffle=True, **loader_extras
    )
    testloader = DataLoader(
        test_data, batch_size=batch_size_test, shuffle=False, **loader_extras
    )
    # tontr == test on train: same data as trainloader, fixed order
    tontrloader = DataLoader(
        train_data, batch_size=batch_size_test, shuffle=False, **loader_extras
    )

    return trainloader, testloader, tontrloader

In [None]:
# Train / Test tooling

# Module-level meter that `train_epoch` resets at the start of every epoch and
# feeds with each batch's loss value.
# NOTE: despite "acc" in the name, it tracks the training loss, not accuracy.
train_acc_avgmeter = AverageMeter("Training Loss")

def train_epoch(
    model, device, train_loader, loss_fn, optimizer, epoch, print_every_nep, inner_scheduler=None, quiet=False,
):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: network to optimize; switched to training mode.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches.
        loss_fn: callable ``loss_fn(output, target)`` returning the batch loss.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.
        print_every_nep: a progress line is printed every this many batches.
        inner_scheduler: optional scheduler stepped after every optimizer step.
        quiet: when true, no progress line is printed.

    Side effects:
        Resets the module-level ``train_acc_avgmeter`` and updates it with
        every batch loss.
    """
    train_acc_avgmeter.reset()
    model.train()

    progress_fmt = "Train Epoch: {} [{}/{} ({:.0f}%)]\tAvg. loss: {:.6f}"

    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Standard forward / backward / update cycle
        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        if inner_scheduler is not None:
            inner_scheduler.step()

        train_acc_avgmeter.update(batch_loss.item())

        # Report progress only on every `print_every_nep`-th batch
        if quiet or step % print_every_nep != 0:
            continue

        # Sample count is estimated from the size of the current batch
        seen = step * len(inputs)
        total = len(train_loader.dataset)
        percent = 100.0 * step / len(train_loader)
        print(progress_fmt.format(epoch, seen, total, percent, train_acc_avgmeter.avg))


def test(model, device, test_loader, loss_fn, quiet=False):
    model.eval()
    test_loss = 0
    correct = 0
    with th.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += loss_fn(
                output, target, reduction="sum"
            ).item()  # sum up batch loss
            pred = output.argmax(
                dim=1, keepdim=True
            )  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    ltlds = len(test_loader.dataset)

    test_loss /= ltlds
    
    if not quiet:
        print(
            "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
                test_loss,
                correct,
                ltlds,
                100.0 * correct / ltlds,
            )
        )
    
    return test_loss, correct / ltlds

In [None]:
# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")

In [None]:
# Hyperparameters & co.

# Mini-batch sizes handed to the MNIST loader builder: one for the training
# loader, one shared by the two evaluation loaders.
minibatch_size_train: int = 1024 # I know it's high; I just want a "little" more stability
minibatch_size_test: int = 512

# Number of training epochs.
nrepochs = 90

# Negative log-likelihood loss: matches the LogSoftmax output layer of the
# model and accepts the `reduction="sum"` keyword used by `test`.
lossfn = F.nll_loss

In [None]:
# Build the training, test and test-on-train loaders.
# A `torch.device` does not compare equal to a plain string, so the previous
# `device == "cuda"` check evaluated to False even on a CUDA machine and the
# CUDA-specific loader options were never enabled. The device kind is exposed
# through its `.type` attribute instead.
train_loader, test_loader, test_on_train_loader = spawn_mnist_loaders(
    batch_size_train=minibatch_size_train,
    batch_size_test=minibatch_size_test,
    cuda_accel=(device.type == "cuda"),
)

In [None]:
# Fully-connected classifier for flattened 28x28 images: 784 -> 1500 -> 500 -> 10
model = Sequential(
    # Input block: expand the 784 input features to 1500 (~1.9x the input)
    Linear(28 * 28, 1500),
    Mish(),
    # Hidden block: normalize, then compress to 500 features (~0.64x the input)
    BatchNorm1d(1500),
    Linear(1500, 500),
    Mish(),
    # Output block: normalize, then map to 10 per-class log-probabilities
    BatchNorm1d(500),
    Linear(500, 10),
    LogSoftmax(dim=1),
).to(device)

# MADGRAD is the inner optimizer; Lookahead wraps it with la_steps=4.
base_optimizer = MadGrad(model.parameters(), lr=1.7e-4)
optimizer = Lookahead(base_optimizer, la_steps=4)

# The milestone list is empty, so this schedule never rescales the learning rate.
scheduler = MultiStepLR(optimizer, milestones=[], gamma=0.4)

In [None]:
# Run ebtorch's `mishlayer_init` on every layer of the model (presumably an
# initialization scheme suited to Mish activations; see ebtorch for details).
# NOTE(review): this cell runs after the optimizers were constructed. If
# Lookahead snapshots the parameters when it is created, that snapshot predates
# this initialization; confirm against the ebtorch implementation.
for layer in model:
    mishlayer_init(layer)

In [None]:
# Show the torchinfo summary of the model.
summary(model)

In [None]:
# Main loop: train for one epoch, then evaluate on both the training and the
# test set. No inner scheduler is passed, and `scheduler` is not stepped here.
uses_lookahead = isinstance(optimizer, Lookahead)

for epoch in range(1, nrepochs + 1):

    print("TRAINING...")
    train_epoch(
        model=model,
        device=device,
        train_loader=train_loader,
        loss_fn=lossfn,
        optimizer=optimizer,
        epoch=epoch,
        print_every_nep=15,
        inner_scheduler=None,
        quiet=False,
    )

    # Lookahead bookkeeping before evaluation (presumably swaps in its cached
    # weights; confirm against the ebtorch implementation)
    if uses_lookahead:
        optimizer._backup_and_load_cache()

    # Evaluation on the training split first, then on the test split
    print("\nON TRAINING SET:")
    _ = test(model, device, test_on_train_loader, lossfn, quiet=False)
    print("\nON TEST SET:")
    _ = test(model, device, test_loader, lossfn, quiet=False)
    print("\n\n")

    # Lookahead bookkeeping after evaluation (presumably restores the weights
    # that were backed up above)
    if uses_lookahead:
        optimizer._clear_and_load_backup()