# Utils

In [1]:
import time
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision import models
from torchinfo import summary

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
from lab1_utils import train, test, get_lr
from lab1_utils import multiple_diagnostic, test_class

In [4]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [5]:
# batch_size = 64
batch_size = 128
max_epochs = 30

In [6]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  # some augmentation
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

dataset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform_train)
# create a split for train/validation. We can use early stop
trainset, valset = torch.utils.data.random_split(dataset, [40000, 10000])

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2,
                                          drop_last=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                          shuffle=False, num_workers=2,
                                          drop_last=False)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2,
                                          drop_last=False)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
def train_loop(train_loader, test_loader, model, criterion, device,
               lr, momentum, max_epochs, do_test=True):

    model.to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=1e-4)

    losses_train, accs_train = [], []
    losses_test, accs_test = [], []
    
    _start = time.time()
    _epoch_time = time.time()

    for epoch in range(1, max_epochs + 1):
        loss_train, acc_train = train(model, device, train_loader, criterion, optimizer)
        print(f"Epoch: {epoch}, Learning rate: {get_lr(optimizer):.6f}")
        print(f"Training - Loss: {loss_train:.4f}, Accuracy: {acc_train:.2f}, Runtime: {(time.time() - _epoch_time):.2f}")
        losses_train.append(loss_train)
        accs_train.append(acc_train)

        if do_test:
            loss_test, acc_test = test(model, device, criterion, test_loader)
            losses_test.append(loss_test)
            accs_test.append(acc_test)
            print(f"Test - Loss: {loss_test:.4f}, Accuracy: {acc_test:.2f}")

        _epoch_time = time.time()

    _end = time.time()
    print(f"Done! - Runtime: {(_end-_start):.2f} seconds")

    # test_class(model, device, criterion, testloader)

    if do_test:
        return losses_train, accs_train, losses_test, accs_test
    else:
        return losses_train, accs_train

# Exercise 1

Scegliamo di usare `resnet18`

Due prove di fine-tuning:
- Modificare il layer finale di classificazione `resnet18_1` partendo dai pesi originali cercando di arrivare alle migliori performance possibili
- Mettere in coda un MLP `resnet18_2`
- Riaddestrare un layer precedente `resnet18_3`

## Classification head: linear

In [7]:
resnet18_1 = models.resnet18(weights="DEFAULT")

# start with random weights
resnet18_1.fc = nn.Linear(resnet18_1.fc.in_features, 10)

resnet18_1 = resnet18_1.to(device)

# print(resnet18_1)
# print(resnet18_1.fc.weight.data)
print(summary(resnet18_1))

Layer (type:depth-idx)                   Param #
ResNet                                   --
├─Conv2d: 1-1                            9,408
├─BatchNorm2d: 1-2                       128
├─ReLU: 1-3                              --
├─MaxPool2d: 1-4                         --
├─Sequential: 1-5                        --
│    └─BasicBlock: 2-1                   --
│    │    └─Conv2d: 3-1                  36,864
│    │    └─BatchNorm2d: 3-2             128
│    │    └─ReLU: 3-3                    --
│    │    └─Conv2d: 3-4                  36,864
│    │    └─BatchNorm2d: 3-5             128
│    └─BasicBlock: 2-2                   --
│    │    └─Conv2d: 3-6                  36,864
│    │    └─BatchNorm2d: 3-7             128
│    │    └─ReLU: 3-8                    --
│    │    └─Conv2d: 3-9                  36,864
│    │    └─BatchNorm2d: 3-10            128
├─Sequential: 1-6                        --
│    └─BasicBlock: 2-3                   --
│    │    └─Conv2d: 3-11                 73,728

In [8]:
# freeze all layers
for param in resnet18_1.parameters():
    param.requires_grad = False

# let final layer be trainable, that goes into classification head
resnet18_1.fc.weight.requires_grad = True
resnet18_1.fc.bias.requires_grad = True

print(count_trainable_parameters(resnet18_1))
print(summary(resnet18_1))

5130
Layer (type:depth-idx)                   Param #
ResNet                                   --
├─Conv2d: 1-1                            (9,408)
├─BatchNorm2d: 1-2                       (128)
├─ReLU: 1-3                              --
├─MaxPool2d: 1-4                         --
├─Sequential: 1-5                        --
│    └─BasicBlock: 2-1                   --
│    │    └─Conv2d: 3-1                  (36,864)
│    │    └─BatchNorm2d: 3-2             (128)
│    │    └─ReLU: 3-3                    --
│    │    └─Conv2d: 3-4                  (36,864)
│    │    └─BatchNorm2d: 3-5             (128)
│    └─BasicBlock: 2-2                   --
│    │    └─Conv2d: 3-6                  (36,864)
│    │    └─BatchNorm2d: 3-7             (128)
│    │    └─ReLU: 3-8                    --
│    │    └─Conv2d: 3-9                  (36,864)
│    │    └─BatchNorm2d: 3-10            (128)
├─Sequential: 1-6                        --
│    └─BasicBlock: 2-3                   --
│    │    └─Conv2d: 3-

### Train with `batch_size=64`

In [9]:
criterion = nn.CrossEntropyLoss()

resnet18_1_dict = dict(model=resnet18_1, criterion=criterion, device=device, lr=0.01, momentum=0.,
                       max_epochs=max_epochs, do_test=True)

stats = train_loop(trainloader, valloader, **resnet18_1_dict)

print("=========")
test_class(resnet18_1, device, criterion, valloader, classes)

Epoch: 1, Learning rate: 0.010000
Training - Loss: 1.8961, Accuracy: 0.32, Runtime: 8.57
Test - Loss: 1.7694, Accuracy: 0.38
Epoch: 2, Learning rate: 0.010000
Training - Loss: 1.7298, Accuracy: 0.39, Runtime: 8.17
Test - Loss: 1.7118, Accuracy: 0.40
Epoch: 3, Learning rate: 0.010000
Training - Loss: 1.7199, Accuracy: 0.40, Runtime: 8.18
Test - Loss: 1.6913, Accuracy: 0.41
Epoch: 4, Learning rate: 0.010000
Training - Loss: 1.7070, Accuracy: 0.40, Runtime: 8.29
Test - Loss: 1.6642, Accuracy: 0.42
Epoch: 5, Learning rate: 0.010000
Training - Loss: 1.6889, Accuracy: 0.41, Runtime: 8.17
Test - Loss: 1.6822, Accuracy: 0.42
Epoch: 6, Learning rate: 0.010000
Training - Loss: 1.6868, Accuracy: 0.41, Runtime: 8.16
Test - Loss: 1.6863, Accuracy: 0.41
Epoch: 7, Learning rate: 0.010000
Training - Loss: 1.6910, Accuracy: 0.41, Runtime: 8.23
Test - Loss: 1.6602, Accuracy: 0.42
Epoch: 8, Learning rate: 0.010000
Training - Loss: 1.6825, Accuracy: 0.41, Runtime: 8.19
Test - Loss: 1.6524, Accuracy: 0.43


### Train with `batch_size=128`

In [11]:
criterion = nn.CrossEntropyLoss()

resnet18_1_dict = dict(model=resnet18_1, criterion=criterion, device=device, lr=0.01, momentum=0.9,
                       max_epochs=max_epochs, do_test=True)

stats = train_loop(trainloader, valloader, **resnet18_1_dict)

print("=========")
test_class(resnet18_1, device, criterion, valloader, classes)

Epoch: 1, Learning rate: 0.010000
Training - Loss: 1.8610, Accuracy: 0.35, Runtime: 8.25
Test - Loss: 1.7563, Accuracy: 0.39
Epoch: 2, Learning rate: 0.010000
Training - Loss: 1.7389, Accuracy: 0.39, Runtime: 8.02
Test - Loss: 1.7122, Accuracy: 0.41
Epoch: 3, Learning rate: 0.010000
Training - Loss: 1.7203, Accuracy: 0.40, Runtime: 7.95
Test - Loss: 1.7240, Accuracy: 0.40
Epoch: 4, Learning rate: 0.010000
Training - Loss: 1.7270, Accuracy: 0.40, Runtime: 7.96
Test - Loss: 1.6928, Accuracy: 0.42
Epoch: 5, Learning rate: 0.010000
Training - Loss: 1.7161, Accuracy: 0.41, Runtime: 7.93
Test - Loss: 1.7467, Accuracy: 0.40
Epoch: 6, Learning rate: 0.010000
Training - Loss: 1.7112, Accuracy: 0.40, Runtime: 7.92
Test - Loss: 1.8087, Accuracy: 0.38
Epoch: 7, Learning rate: 0.010000
Training - Loss: 1.7245, Accuracy: 0.40, Runtime: 7.93
Test - Loss: 1.7509, Accuracy: 0.39
Epoch: 8, Learning rate: 0.010000
Training - Loss: 1.7102, Accuracy: 0.40, Runtime: 7.88
Test - Loss: 1.7310, Accuracy: 0.40


## Classification head: MLP

In [10]:
resnet18_2 = models.resnet18(weights="DEFAULT")
resnet18_2.fc = nn.Sequential(
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 10)  # logits
)
resnet18_2 = resnet18_2.to(device)
print(resnet18_2)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [37]:
(512*256 + 256) + (256*128 + 128) + (128*10 + 10)

165514

In [12]:
# freeze all layers
for param in resnet18_2.parameters():
    param.requires_grad = False

# let final layer be trainable, that goes into classification head
# random initialization
for layer_idx in (0, 2, 4):
    resnet18_2.fc[layer_idx].weight.requires_grad = True
    resnet18_2.fc[layer_idx].bias.requires_grad = True
# resnet18_2.fc.weight.requires_grad = True
# resnet18_2.fc.bias.requires_grad = True
print(count_trainable_parameters(resnet18_2))
print(summary(resnet18_2))

165514
Layer (type:depth-idx)                   Param #
ResNet                                   --
├─Conv2d: 1-1                            (9,408)
├─BatchNorm2d: 1-2                       (128)
├─ReLU: 1-3                              --
├─MaxPool2d: 1-4                         --
├─Sequential: 1-5                        --
│    └─BasicBlock: 2-1                   --
│    │    └─Conv2d: 3-1                  (36,864)
│    │    └─BatchNorm2d: 3-2             (128)
│    │    └─ReLU: 3-3                    --
│    │    └─Conv2d: 3-4                  (36,864)
│    │    └─BatchNorm2d: 3-5             (128)
│    └─BasicBlock: 2-2                   --
│    │    └─Conv2d: 3-6                  (36,864)
│    │    └─BatchNorm2d: 3-7             (128)
│    │    └─ReLU: 3-8                    --
│    │    └─Conv2d: 3-9                  (36,864)
│    │    └─BatchNorm2d: 3-10            (128)
├─Sequential: 1-6                        --
│    └─BasicBlock: 2-3                   --
│    │    └─Conv2d: 

### Training with `batch_size=64`

In [13]:
criterion = nn.CrossEntropyLoss()

resnet18_2_dict = dict(model=resnet18_2, criterion=criterion, device=device, lr=0.01, momentum=0.9,
                       max_epochs=max_epochs, do_test=True)

stats = train_loop(trainloader, valloader, **resnet18_2_dict)

print("=========")
test_class(resnet18_2, device, criterion, valloader, classes)

Epoch: 1, Learning rate: 0.010000
Training - Loss: 1.8652, Accuracy: 0.33, Runtime: 8.43
Test - Loss: 1.7251, Accuracy: 0.39
Epoch: 2, Learning rate: 0.010000
Training - Loss: 1.7355, Accuracy: 0.38, Runtime: 8.12
Test - Loss: 1.7407, Accuracy: 0.38
Epoch: 3, Learning rate: 0.010000
Training - Loss: 1.7252, Accuracy: 0.39, Runtime: 8.38
Test - Loss: 1.7009, Accuracy: 0.39
Epoch: 4, Learning rate: 0.010000
Training - Loss: 1.7179, Accuracy: 0.39, Runtime: 8.02
Test - Loss: 1.6821, Accuracy: 0.40
Epoch: 5, Learning rate: 0.010000
Training - Loss: 1.7161, Accuracy: 0.39, Runtime: 8.77
Test - Loss: 1.6914, Accuracy: 0.40
Epoch: 6, Learning rate: 0.010000
Training - Loss: 1.7121, Accuracy: 0.39, Runtime: 8.54
Test - Loss: 1.7317, Accuracy: 0.39
Epoch: 7, Learning rate: 0.010000
Training - Loss: 1.7172, Accuracy: 0.39, Runtime: 8.48
Test - Loss: 1.7116, Accuracy: 0.40
Epoch: 8, Learning rate: 0.010000
Training - Loss: 1.7098, Accuracy: 0.39, Runtime: 8.40
Test - Loss: 1.6876, Accuracy: 0.39


### Training with `batch_size=128`

In [18]:
criterion = nn.CrossEntropyLoss()

resnet18_2_dict = dict(model=resnet18_2, criterion=criterion, device=device, lr=0.01, momentum=0.9,
                       max_epochs=max_epochs, do_test=True)

stats = train_loop(trainloader, valloader, **resnet18_2_dict)

print("=========")
test_class(resnet18_2, device, criterion, valloader, classes)

Epoch: 1, Learning rate: 0.010000
Training - Loss: 1.6623, Accuracy: 0.41, Runtime: 8.18
Test - Loss: 1.6614, Accuracy: 0.42
Epoch: 2, Learning rate: 0.010000
Training - Loss: 1.6558, Accuracy: 0.41, Runtime: 8.19
Test - Loss: 1.6517, Accuracy: 0.42
Epoch: 3, Learning rate: 0.010000
Training - Loss: 1.6553, Accuracy: 0.41, Runtime: 8.21


KeyboardInterrupt: 

## Classification head + previous layer

# Exercise 2

Cambiare il learning rate dei layer finali oppure mettere trainable un layer di convoluzione interno e cambiargli learning rate

# Exercise 3

Model selection strategy