# Deep Learning Homework \#05
### Deep Learning Course $\in$ DSSC @ UniTS (Spring 2021)  

#### Submitted by [Emanuele Ballarin](mailto:emanuele@ballarin.cc)  

### Preliminaries:

#### Imports:

We start off by importing all the libraries, modules, classes and functions we are going to use *today*...

In [1]:
# System interaction
import os

# Typing
from torch import Tensor

# Tensor computation and ANNs
import torch        # Backward compatibility
import torch as th  # Forward compatibility

# Pruning utilities
from imp_roved import IdxSet, Mask, paramsplit, maskterialize, magnitude_pruning, mask_size

# Scripted easers
from scripts import mnist, train_utils, architectures, train
from scripts.torch_utils import use_gpu_if_possible
from scripts.train_utils import accuracy, AverageMeter

#### *Training with pruning* routine

Taken from the provided *Jupyter* notebook, and slightly adapted.

In [2]:
def train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, mask, layers_to_prune, params_type_to_prune):
    """Run one optimisation pass over `dataloader`, optionally masking gradients.

    For every batch the function moves the data to `device`, runs the forward
    and backward passes, multiplies the gradients of the selected parameters by
    their materialised mask (when `mask` is given), performs an optimizer step
    and finally updates both meters.

    Only gradients are masked here: parameter values themselves are never
    overwritten by this function.

    Args:
        model: network to optimise; called as `model(X)` and iterated through
            `model.named_parameters()`.
        dataloader: iterable yielding `(X, y)` batches.
        loss_fn: callable `loss_fn(y_hat, y)` returning a scalar tensor.
        optimizer: optimizer exposing `zero_grad()` and `step()`.
        loss_meter: object with `update(val=..., n=...)`, fed the batch loss.
        performance_meter: object with `update(val=..., n=...)`, fed the value
            returned by `performance`.
        performance: callable `performance(y_hat, y)`.
        device: device the batches and the masks are moved to.
        mask: `None` to train without masking; otherwise an object indexed as
            `mask[layer_name][param_name]` whose entries are handed to
            `maskterialize`.
        layers_to_prune: container of layer names to restrict masking to;
            an empty/`None` value means "no restriction".
        params_type_to_prune: container of parameter names (e.g. "weight")
            to restrict masking to; an empty/`None` value means "no restriction".

    Returns:
        None. Results are accumulated in `loss_meter` and `performance_meter`.
    """
    # The mask is not modified while the epoch runs, so every dense mask tensor
    # is built (and moved to `device`) at most once instead of once per batch.
    materialized_masks = {}

    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        y_hat = model(X)

        loss = loss_fn(y_hat, y)

        loss.backward()

        if mask is not None:
            for name, param in model.named_parameters():

                # Frozen parameters (requires_grad=False) and parameters that did
                # not take part in the forward pass have no gradient to mask.
                if param.grad is None:
                    continue

                # NOTE(review): paramsplit is assumed to return a sequence whose
                # first two items are the layer name and the parameter name.
                name_parts = paramsplit(name, 1)
                layname = name_parts[0]
                parname = name_parts[1]

                if (not layers_to_prune or layname in layers_to_prune) and (
                    not params_type_to_prune or parname in params_type_to_prune
                ):
                    if name not in materialized_masks:
                        materialized_masks[name] = maskterialize(param.grad.numel(), mask[layname][parname]).view(param.grad.shape).to(device)
                    param.grad *= materialized_masks[name]

        optimizer.step()

        acc = performance(y_hat, y)

        # Meters are weighted by the number of samples in the batch.
        loss_meter.update(val=loss.item(), n=X.shape[0])
        performance_meter.update(val=acc, n=X.shape[0])

In [3]:
def train_model(model, dataloader, loss_fn, optimizer, num_epochs, checkpoint_loc=None, checkpoint_name="checkpoint.pt", performance=accuracy, lr_scheduler=None, device=None, mask=None, layers_to_prune=None, params_type_to_prune=["weight", "bias"]):
    """Train `model` for `num_epochs` epochs, optionally with gradient masking.

    Each epoch gets fresh loss/performance meters, delegates the actual work to
    `train_epoch`, prints a summary, optionally writes a checkpoint and
    optionally steps the learning-rate scheduler.

    Args:
        model: network to train; it is moved to `device` and put in train mode
            once, before the first epoch.
        dataloader: iterable of `(X, y)` batches, forwarded to `train_epoch`.
        loss_fn: loss callable, forwarded to `train_epoch`.
        optimizer: optimizer; its first param group's `lr` is printed and its
            state is stored in checkpoints.
        num_epochs: number of epochs to run.
        checkpoint_loc: directory for checkpoints (created if missing);
            `None` disables checkpointing.
        checkpoint_name: checkpoint file name. The same path is written after
            every epoch, so only the latest checkpoint is kept.
        performance: metric callable `performance(y_hat, y)`.
        lr_scheduler: optional scheduler, stepped once at the end of each epoch.
        device: target device; when `None`, `use_gpu_if_possible()` decides.
        mask: optional pruning mask forwarded to `train_epoch`.
        layers_to_prune: optional layer-name filter forwarded to `train_epoch`.
        params_type_to_prune: parameter-name filter forwarded to `train_epoch`.
            The list default is shared between calls; it is only read here,
            never mutated.

    Returns:
        Tuple `(loss_meter.sum, performance_meter.avg)` of the last epoch run.
        When `num_epochs <= 0` no epoch runs and the values come from freshly
        constructed `AverageMeter` instances.
    """
    if checkpoint_loc is not None:
        os.makedirs(checkpoint_loc, exist_ok=True)

    if device is None:
        device = use_gpu_if_possible()

    model = model.to(device)
    model.train()

    # Bound before the loop so that the final `return` is well-defined even
    # when the loop body never executes (num_epochs <= 0).
    loss_meter = AverageMeter()
    performance_meter = AverageMeter()

    for epoch in range(num_epochs):
        # Meters are per-epoch: the returned figures describe the last epoch only.
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        print(f"Epoch {epoch+1} --- learning rate {optimizer.param_groups[0]['lr']:.5f}")

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device, mask, layers_to_prune, params_type_to_prune)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        if checkpoint_name is not None and checkpoint_loc is not None:
            checkpoint_dict = {
                "parameters": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "epoch": epoch
            }
            torch.save(checkpoint_dict, os.path.join(checkpoint_loc, checkpoint_name))

        if lr_scheduler is not None:
            lr_scheduler.step()

    return loss_meter.sum, performance_meter.avg

In [4]:
# Per-layer specification for `architectures.MLPCustom`. Only the first entry
# states its input width; the printed model below shows that the later layers
# take theirs from the previous `n_out`. Judging from that same printout,
# `"batchnorm": True` places a BatchNorm1d on the layer's input.
layers = [{"n_in": 784, "n_out": 16, "batchnorm": False}]
layers += [{"n_out": width, "batchnorm": True} for width in (32, 64, 10)]

# MLPCustom is a helper architecture (see the `architectures` script) that
# builds an MLP from such a specification with little boilerplate.
net = architectures.MLPCustom(layers)
print(net)

MLPCustom(
  (layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=16, bias=True)
    (2): ReLU()
    (3): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=16, out_features=32, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Linear(in_features=32, out_features=64, bias=True)
    (8): ReLU()
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Linear(in_features=64, out_features=10, bias=True)
    (11): ReLU()
  )
)


In [5]:
# MNIST loaders: only the first two returned items are used in this notebook.
trainloader, testloader, _, _ = mnist.get_data()

# Objective and optimiser shared by every training round below.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.1)

In [6]:
# Initial training without any mask (`mask` keeps its default of None).
train_model(
    model=net,
    dataloader=trainloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=2,
    layers_to_prune=["1", "4", "7", "10"],
)

Epoch 1 --- learning rate 0.10000
Epoch 1 completed. Loss - total: 23144.67014503479 - average: 0.3857445024172465; Performance: 0.8955833333333333
Epoch 2 --- learning rate 0.10000
Epoch 2 completed. Loss - total: 11758.425701618195 - average: 0.1959737616936366; Performance: 0.9435666666666667


(11758.425701618195, 0.9435666666666667)

In [7]:
# First pruning round over the weights and biases of the four Linear layers
# (indices 1, 4, 7 and 10 in the printed model).
mymask = magnitude_pruning(
    net,
    0.6,
    {"1", "4", "7", "10"},
    {"weight", "bias"},
)

In [8]:
# Size of the mask after the first pruning round. The recorded value (9519)
# is 60% of the 15866 weights and biases of the four Linear layers, rounded
# down; presumably this is the number of pruned entries (confirm in `imp_roved`).
mask_size(mymask)

9519

In [9]:
# Fine-tuning after the first pruning round, with gradients masked by `mymask`.
train_model(
    model=net,
    dataloader=trainloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=2,
    layers_to_prune=["1", "4", "7", "10"],
    mask=mymask,
)

Epoch 1 --- learning rate 0.10000
Epoch 1 completed. Loss - total: 9833.770930290222 - average: 0.1638961821715037; Performance: 0.9519333333333333
Epoch 2 --- learning rate 0.10000
Epoch 2 completed. Loss - total: 8796.41000509262 - average: 0.14660683341821035; Performance: 0.95625


(8796.41000509262, 0.95625)

In [10]:
# Second pruning round: the previous mask is passed back in, and the recorded
# mask sizes grow from round to round.
mymask = magnitude_pruning(
    net,
    0.6,
    {"1", "4", "7", "10"},
    {"weight", "bias"},
    mask=mymask,
)

In [11]:
# Size of the mask after the second pruning round. The recorded value (13327)
# exceeds the previous 9519 by 3808, i.e. 60% of the 6347 entries left
# outside the earlier mask, rounded down.
mask_size(mymask)

13327

In [12]:
# Fine-tuning after the second pruning round, with the updated `mymask`.
train_model(
    model=net,
    dataloader=trainloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=2,
    layers_to_prune=["1", "4", "7", "10"],
    mask=mymask,
)

Epoch 1 --- learning rate 0.10000
Epoch 1 completed. Loss - total: 16329.837278842926 - average: 0.2721639546473821; Performance: 0.91915
Epoch 2 --- learning rate 0.10000
Epoch 2 completed. Loss - total: 13796.016090869904 - average: 0.2299336015144984; Performance: 0.9305


(13796.016090869904, 0.9305)

In [13]:
# Third pruning round, again starting from the mask of the previous round.
mymask = magnitude_pruning(
    net,
    0.6,
    {"1", "4", "7", "10"},
    {"weight", "bias"},
    mask=mymask,
)

In [14]:
# Size of the mask after the third pruning round. The recorded value (14850)
# exceeds the previous 13327 by 1523, i.e. 60% of the 2539 entries left
# outside the earlier mask, rounded down.
mask_size(mymask)

14850

In [15]:
# Fine-tuning after the third pruning round, with the updated `mymask`.
train_model(
    model=net,
    dataloader=trainloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=2,
    layers_to_prune=["1", "4", "7", "10"],
    mask=mymask,
)

Epoch 1 --- learning rate 0.10000
Epoch 1 completed. Loss - total: 35785.282616615295 - average: 0.5964213769435882; Performance: 0.8122333333333334
Epoch 2 --- learning rate 0.10000
Epoch 2 completed. Loss - total: 30058.1871881485 - average: 0.5009697864691416; Performance: 0.8438666666666667


(30058.1871881485, 0.8438666666666667)