In [1]:
import torch
import os
import numpy as np
from torch import nn
import torchvision

from matplotlib import pyplot as plt


from scripts import mnist
from scripts.train_utils import accuracy, AverageMeter

In [3]:
class MLP_BN(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28*28, 1000),
            nn.ReLU(),

            nn.BatchNorm1d(num_features=1000), # we specify the dimensionality of the incoming data
            nn.Linear(1000, 500),
            nn.ReLU(),

            nn.BatchNorm1d(num_features=500),
            nn.Linear(500, 250),
            nn.ReLU(),

            #nn.BatchNorm1d(num_features=64),
            #nn.Linear(64, 64),
            #nn.ReLU(),

            nn.BatchNorm1d(num_features=250),
            nn.Linear(250, 100),
            nn.ReLU(),


            nn.BatchNorm1d(num_features=100),
            nn.Linear(100, 10),

        )


    def forward(self, X):
        return self.layers(X)

In [4]:
def get_params_and_gradients_norm(named_parameters):
    square_norms_params = []
    square_norms_grads = []

    for _, param in named_parameters:

        # Q: what is this and why did I write it here?
        if param.requires_grad:
            square_norms_params.append((param ** 2).sum())
            square_norms_grads.append((param.grad ** 2).sum())
    
    norm_params = sum(square_norms_params).sqrt().item()
    norm_grads = sum(square_norms_grads).sqrt().item()

    return norm_params, norm_grads

In [5]:
def train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance): # note: I've added a generic performance to replace accuracy
    for X, y in dataloader:
        # 1. reset the gradients previously accumulated by the optimizer
        #    this will avoid re-using gradients from previous loops
        optimizer.zero_grad() 
        # 2. get the predictions from the current state of the model
        #    this is the forward pass
        y_hat = model(X)
        # 3. calculate the loss on the current mini-batch
        loss = loss_fn(y_hat, y)
        # 4. execute the backward pass given the current loss
        loss.backward()
        # 5. update the value of the params
        optimizer.step()
        # 6. calculate the accuracy for this mini-batch
        acc = performance(y_hat, y)
        # 7. update the loss and accuracy AverageMeter
        loss_meter.update(val=loss.item(), n=X.shape[0])
        performance_meter.update(val=acc, n=X.shape[0])

def train_model(model, dataloader, loss_fn, optimizer, num_epochs, checkpoint_loc=None, checkpoint_name="checkpoint.pt", performance=accuracy, lr_scheduler=None, epoch_start_scheduler=1):
    # added lr_scheduler

    # create the folder for the checkpoints (if it's not None)
    if checkpoint_loc is not None:
        os.makedirs(checkpoint_loc, exist_ok=True)
    
    model.train()

    # epoch loop
    for epoch in range(num_epochs):

        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        # added print for LR
        print(f"Epoch {epoch+1} --- learning rate {optimizer.param_groups[0]['lr']:.5f}")

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        # produce checkpoint dictionary -- but only if the name and folder of the checkpoint are not None
        if checkpoint_name is not None and checkpoint_loc is not None:
            checkpoint_dict = {
                "parameters": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "epoch": epoch
            }
            torch.save(checkpoint_dict, os.path.join(checkpoint_loc, checkpoint_name))
        
        if lr_scheduler is not None:
            if epoch >= epoch_start_scheduler:
                lr_scheduler.step()
            # or you can use a MultiStepLR with milestones=[6, 11] thus deleting the `if` construct for the epoch
        if performance_meter.avg == 1:
            return loss_meter.sum, performance_meter.avg

    return loss_meter.sum, performance_meter.avg

def test_model(model, dataloader, performance=accuracy, loss_fn=None, device=None):
    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # create an AverageMeter for the loss if passed
    if loss_fn is not None:
        loss_meter = AverageMeter()
    
    performance_meter = AverageMeter()

    model.to(device)
    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            y_hat = model(X)
            loss = loss_fn(y_hat, y) if loss_fn is not None else None
            acc = performance(y_hat, y)
            if loss_fn is not None:
                loss_meter.update(loss.item(), X.shape[0])
            performance_meter.update(acc, X.shape[0])
    # get final performances
    fin_loss = loss_meter.sum if loss_fn is not None else None
    fin_perf = performance_meter.avg
    print(f"TESTING - loss {fin_loss if fin_loss is not None else '--'} - performance {fin_perf}")
    return fin_loss, fin_perf

In [6]:
minibatch_size_train = 256
minibatch_size_test = 512


trainloader, testloader, trainset, testset = mnist.get_data(batch_size_train=minibatch_size_test, batch_size_test=minibatch_size_test)


In order to reach 100% accuracy on training what I've done has been to enlarge the MLP, adding one layer and specially increasing the number of parameters for each layer. The same model also worked on the permutated labels reaching 99,9% on accuracy.

In [7]:
learn_rate = 0.1


num_epochs = 15

best_model = MLP_BN()
loss_fn = nn.CrossEntropyLoss()


optimizer = torch.optim.SGD(best_model.parameters(), lr=learn_rate)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=15)




In [8]:
train_model(best_model, trainloader, loss_fn, optimizer, num_epochs, lr_scheduler=scheduler)

Epoch 1 --- learning rate 0.10000
Epoch 1 completed. Loss - total: 14178.19571018219 - average: 0.23630326183636982; Performance: 0.9414
Epoch 2 --- learning rate 0.10000
Epoch 2 completed. Loss - total: 3623.018170952797 - average: 0.060383636182546614; Performance: 0.9847833333333333
Epoch 3 --- learning rate 0.09891
Epoch 3 completed. Loss - total: 1550.2869215011597 - average: 0.025838115358352662; Performance: 0.9955
Epoch 4 --- learning rate 0.09568
Epoch 4 completed. Loss - total: 802.5168937146664 - average: 0.013375281561911105; Performance: 0.9984166666666666
Epoch 5 --- learning rate 0.09045
Epoch 5 completed. Loss - total: 414.4504934847355 - average: 0.006907508224745592; Performance: 0.9996833333333334
Epoch 6 --- learning rate 0.08346
Epoch 6 completed. Loss - total: 254.04075384140015 - average: 0.0042340125640233355; Performance: 0.9999333333333333
Epoch 7 --- learning rate 0.07500
Epoch 7 completed. Loss - total: 194.5168101489544 - average: 0.0032419468358159064; Per

(119.8120453953743, 1.0)

In [9]:
model = MLP_BN()

trainset_permuted=trainset
trainset_permuted.targets= trainset_permuted.targets[torch.randperm(trainset_permuted.targets.size()[0])]
trainloader_permuted = torch.utils.data.DataLoader(trainset_permuted, batch_size=minibatch_size_train, shuffle=True)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=150)
train_model(model, trainloader_permuted, loss_fn, optimizer, 150, lr_scheduler=scheduler)

3333336
Epoch 17 --- learning rate 0.09755
Epoch 17 completed. Loss - total: 88765.5573310852 - average: 1.4794259555180866; Performance: 0.48291666666666666
Epoch 18 --- learning rate 0.09722
Epoch 18 completed. Loss - total: 83576.23254776001 - average: 1.3929372091293335; Performance: 0.5158666666666667
Epoch 19 --- learning rate 0.09686
Epoch 19 completed. Loss - total: 79160.08657455444 - average: 1.319334776242574; Performance: 0.54085
Epoch 20 --- learning rate 0.09649
Epoch 20 completed. Loss - total: 74688.8642616272 - average: 1.2448144043604532; Performance: 0.5665666666666667
Epoch 21 --- learning rate 0.09609
Epoch 21 completed. Loss - total: 70150.38455200195 - average: 1.1691730758666992; Performance: 0.5944166666666667
Epoch 22 --- learning rate 0.09568
Epoch 22 completed. Loss - total: 65413.09572601318 - average: 1.0902182621002197; Performance: 0.6197333333333334
Epoch 23 --- learning rate 0.09524
Epoch 23 completed. Loss - total: 60915.782329559326 - average: 1.0152

(135.00259064137936, 0.9999333333333333)

In [13]:
test_model(model, testloader, performance=accuracy, loss_fn=None, device=None)

TESTING - loss -- - performance 0.9998666666666667


(None, 0.9998666666666667)

In [12]:
test_model(best_model, testloader, performance=accuracy, loss_fn=None, device=None)

TESTING - loss -- - performance 0.1013


(None, 0.1013)