In [74]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import random_split
from datetime import datetime

torch.manual_seed(123)
torch.set_default_dtype(torch.double)

In [75]:
device = (torch.device('cuda') if torch.cuda.is_available()
else torch.device('cpu'))
print(device)

cuda


In [76]:
batch_size =  256
n_epoch =  30
loss_fn =  nn.CrossEntropyLoss()
seed =  265

# 3.1
Load and preprocess the CIFAR-10 dataset. Split it into 3 datasets: training, validation and
test. Take a subset of these datasets by keeping only 2 labels: bird and plane.

In [77]:
def load_cifar(train_val_split=0.9, data_path='../data/', preprocessor=None):
    
    # Define preprocessor if not already given
    if preprocessor is None:
        preprocessor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4915, 0.4823, 0.4468),
                                (0.2470, 0.2435, 0.2616))
        ])
    
    # load datasets
    data_train_val = datasets.CIFAR10(
        data_path,      
        train=True,      
        download=True,
        transform=preprocessor)

    data_test = datasets.CIFAR10(
        data_path, 
        train=False,
        download=True,
        transform=preprocessor)

    # train/validation split
    n_train = int(len(data_train_val)*train_val_split)
    n_val =  len(data_train_val) - n_train

    data_train, data_val = random_split(
        data_train_val, 
        [n_train, n_val],
        generator=torch.Generator().manual_seed(123)
    )

    print("Size of the train dataset:        ", len(data_train))
    print("Size of the validation dataset:   ", len(data_val))
    print("Size of the test dataset:         ", len(data_test))
    
    return (data_train, data_val, data_test)

cifar10_train, cifar10_val, cifar10_test = load_cifar()

# Now define a lighter version of CIFAR10: cifar
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']

# For each dataset, keep only airplanes and birds
cifar2_train = [(img, label_map[label]) for img, label in cifar10_train if label in [0, 2]]
cifar2_val = [(img, label_map[label]) for img, label in cifar10_val if label in [0, 2]]
cifar2_test = [(img, label_map[label]) for img, label in cifar10_test if label in [0, 2]]

print('Size of the training dataset: ', len(cifar2_train))
print('Size of the validation dataset: ', len(cifar2_val))
print('Size of the test dataset: ', len(cifar2_test))

Files already downloaded and verified
Files already downloaded and verified
Size of the train dataset:         45000
Size of the validation dataset:    5000
Size of the test dataset:          10000
Size of the training dataset:  9017
Size of the validation dataset:  983
Size of the test dataset:  2000


# 3.2
Write a MyMLP class that implements a MLP in PyTorch (so only fully connected layers) such
that:
(a) The input dimension is 3072 (= 32*32*3) and the output dimension is 2 (for the 2
classes).
(b) The hidden layers have respectively 512, 128 and 32 hidden units.
(c) All activation functions are ReLU. The last layer has no activation function since the
cross-entropy loss already includes a softmax activation function.

In [78]:
class MyMLP(nn.Module):
    def __init__(self):
        super().__init__()  

        self.flat = nn.Flatten()
        # 32*32*3: determined by our dataset: 32x32 RGB images
        self.fc1 = nn.Linear(32*32*3, 512)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(512, 128)
        self.act2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 32)
        self.act3 = nn.ReLU()
        # 2: determined by our number of classes (birds and planes)
        self.fc4 = nn.Linear(32, 2)

    def forward(self, x):
        out = self.flat(x)
        out = self.act1(self.fc1(out))
        out = self.act2(self.fc2(out))
        out = self.act3(self.fc3(out))
        out = self.fc4(out)
        return out

# 3.3
Write a train(n epochs, optimizer, model, loss fn, train loader) function that trains
model for n epochs epochs given an optimizer optimizer, a loss function loss fn and a dat-
aloader train loader.

In [79]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    optimizer.zero_grad(set_to_none=True)
    
    for epoch in range(1, n_epochs + 1):
        
        loss_train = 0.0
        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train

# 3.4
Write a similar function train manual update that has no optimizer parameter, but a learn-
ing rate lr parameter instead and that manually updates each trainable parameter of model
using equation (3). Do not forget to zero out all gradients after each iteration.

In [80]:
def train_manual_update(n_epochs, lr, model, loss_fn, train_loader, verbose = False):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            # optimizer step
            with torch.no_grad():
                for p in model.parameters():
                    p -= lr * p.grad
            
            # zero out all gradients
            with torch.no_grad():
                for p in model.parameters():
                    p.grad.zero_()

            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0 or verbose:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train

# 3.5
Train 2 instances of MyMLP, one using train and the other using train manual update (use
the same parameter values for both models). Compare their respective training losses.

In [81]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
optimizer = optim.SGD(model.parameters(), lr=0.01)
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
res = train(n_epoch, optimizer, model, loss_fn, train_loader)

11:27:46.619482  |  Epoch 1  |  Training loss 0.681
11:27:54.501430  |  Epoch 10  |  Training loss 0.465
11:28:03.374169  |  Epoch 20  |  Training loss 0.384
11:28:12.270145  |  Epoch 30  |  Training loss 0.318


In [82]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
lr = 0.01
res = train_manual_update(n_epoch, lr, model, loss_fn, train_loader)

11:28:13.220978  |  Epoch 1  |  Training loss 0.681
11:28:21.257603  |  Epoch 10  |  Training loss 0.465
11:28:30.120996  |  Epoch 20  |  Training loss 0.384
11:28:38.974485  |  Epoch 30  |  Training loss 0.318


We see that the train and train_manual_update function produces the same training losses when given the same data and learning rate.

# 3.6

Modify train manual update by adding a L2 regularization term in your manual parameter
update. Add an additional weight decay parameter to train manual update. Compare
again train and train manual update results with 0 < weight decay < 1

In [96]:
def train_manual_update(n_epochs, lr, weight_decay, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)

            L2_reg = None
            for param in model.parameters():
                if L2_reg == None:
                    L2_reg = 0.5 * torch.pow(param, 2).sum()
                else:
                    L2_reg += 0.5 * torch.pow(param, 2).sum()
            
            loss += (weight_decay * L2_reg) / (2*len(train_loader))
            
            loss.backward()
                
            # optimizer step
            with torch.no_grad():
                for p in model.parameters():
                    p -= lr * p.grad
                model.zero_grad()
        
            
            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train




In [84]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.01)
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
res = train(n_epoch, optimizer, model, loss_fn, train_loader)

11:28:40.025306  |  Epoch 1  |  Training loss 0.681
11:28:48.019227  |  Epoch 10  |  Training loss 0.470
11:28:56.864729  |  Epoch 20  |  Training loss 0.394
11:29:05.831795  |  Epoch 30  |  Training loss 0.336


In [97]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
lr = 0.01
weight_decay= 0.01
res = train_manual_update(n_epoch, lr, weight_decay, model, loss_fn, train_loader)

11:38:00.219010  |  Epoch 1  |  Training loss 0.697
11:38:08.667992  |  Epoch 10  |  Training loss 0.481
11:38:18.020535  |  Epoch 20  |  Training loss 0.400
11:38:27.312707  |  Epoch 30  |  Training loss 0.334


# 3.7

Modify train manual update by adding a momentum term in your parameter update. Add
an additional momentum parameter to train manual update. Check again the correctness of
the new update rule by comparing it to train function (with 0 < momentum < 1).

In [98]:
def train_manual_update(n_epochs, lr, weight_decay, momentum_coeff, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0

        layers = sum([1 for _ in model.parameters()])
        velocity = [[]] * layers

        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)

            # L2 Reg
            L2_reg = None
            for param in model.parameters():
                if L2_reg == None:
                    L2_reg = 0.5 * torch.pow(param, 2).sum()
                else:
                    L2_reg += 0.5 * torch.pow(param, 2).sum()
            
            loss += (weight_decay * L2_reg) / (2*len(train_loader))
            loss.backward()


            # Momentum 
            with torch.no_grad():
                for param_idx, param in enumerate(model.parameters()):
                    if velocity[param_idx] == []:
                        velocity[param_idx] = torch.zeros(param.grad.shape).to(device=device)

                    velocity[param_idx] = velocity[param_idx] * momentum_coeff + param.grad
                    
                    param -= lr*velocity[param_idx]
                model.zero_grad()
            
            
            # optimizer step
            with torch.no_grad():
                for p in model.parameters():
                    p -= lr * p.grad
                model.zero_grad()
        
            
            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train

In [99]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.01, momentum=0.9)
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
res train(n_epoch, optimizer, model, loss_fn, train_loader)

14:52:37.363943  |  Epoch 1  |  Training loss 0.620
14:52:45.837682  |  Epoch 10  |  Training loss 0.262
14:52:54.905316  |  Epoch 20  |  Training loss 0.191
14:53:03.846555  |  Epoch 30  |  Training loss 0.087


[0.6197159367150589,
 0.4851744007381643,
 0.4337747364279043,
 0.39923619147999084,
 0.37261967820073433,
 0.3412975960276278,
 0.31786830551926454,
 0.2982694451620969,
 0.27962311738510653,
 0.2621730343009682,
 0.25557925876148574,
 0.2514835548522462,
 0.2332476599459216,
 0.21645391711632564,
 0.2004332527346621,
 0.18797397852520473,
 0.18233553354178433,
 0.19818053112630502,
 0.19478543418845584,
 0.19053297655903967,
 0.20984107986232303,
 0.21646185132131848,
 0.21819009632657427,
 0.18378265287522422,
 0.15727066061441133,
 0.13241703954642914,
 0.11206583415673098,
 0.09618548591236015,
 0.08704017878793552,
 0.08699225187862035]

In [100]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
lr=0.01
weight_decay=0.01
momentum=0.9
train_manual_update(n_epoch, lr, weight_decay, momentum, model, loss_fn, train_loader)

14:53:04.985964  |  Epoch 1  |  Training loss 0.634


# 3.8

Train different instances (at least 4) of the MyMLP model with different learning rate, momentum
and weight decay values . You can choose the same values as in the
gradient descent output.txt file

# 3.9
Select the best model among those trained in the previous question based on their accuracy

# 3.10
Evaluate the best model