In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import random_split
from torch import linalg as LA
from datetime import datetime

torch.manual_seed(123)
torch.set_default_dtype(torch.double)

In [2]:
device = (torch.device('cuda') if torch.cuda.is_available()
else torch.device('cpu'))
print(device)

cuda


In [3]:
batch_size =  64
n_epoch =  30
loss_fn =  nn.CrossEntropyLoss()
seed =  265

# 3.1
Load and preprocess the CIFAR-10 dataset. Split it into 3 datasets: training, validation and
test. Take a subset of these datasets by keeping only 2 labels: bird and plane.

In [4]:
def load_cifar(train_val_split=0.9, data_path='../data/', preprocessor=None):
    
    # Define preprocessor if not already given
    if preprocessor is None:
        preprocessor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4915, 0.4823, 0.4468),
                                (0.2470, 0.2435, 0.2616))
        ])
    
    # load datasets
    data_train_val = datasets.CIFAR10(
        data_path,      
        train=True,      
        download=True,
        transform=preprocessor)

    data_test = datasets.CIFAR10(
        data_path, 
        train=False,
        download=True,
        transform=preprocessor)

    # train/validation split
    n_train = int(len(data_train_val)*train_val_split)
    n_val =  len(data_train_val) - n_train

    data_train, data_val = random_split(
        data_train_val, 
        [n_train, n_val],
        generator=torch.Generator().manual_seed(123)
    )

    print("Size of the train dataset:        ", len(data_train))
    print("Size of the validation dataset:   ", len(data_val))
    print("Size of the test dataset:         ", len(data_test))
    
    return (data_train, data_val, data_test)

cifar10_train, cifar10_val, cifar10_test = load_cifar()

# Now define a lighter version of CIFAR10: cifar
label_map = {0: 0, 2: 1}
class_names = ['airplane', 'bird']

# For each dataset, keep only airplanes and birds
cifar2_train = [(img, label_map[label]) for img, label in cifar10_train if label in [0, 2]]
cifar2_val = [(img, label_map[label]) for img, label in cifar10_val if label in [0, 2]]
cifar2_test = [(img, label_map[label]) for img, label in cifar10_test if label in [0, 2]]

print('Size of the training dataset: ', len(cifar2_train))
print('Size of the validation dataset: ', len(cifar2_val))
print('Size of the test dataset: ', len(cifar2_test))

Files already downloaded and verified
Files already downloaded and verified
Size of the train dataset:         45000
Size of the validation dataset:    5000
Size of the test dataset:          10000
Size of the training dataset:  9017
Size of the validation dataset:  983
Size of the test dataset:  2000


# 3.2
Write a MyMLP class that implements a MLP in PyTorch (so only fully connected layers) such
that:
(a) The input dimension is 3072 (= 32*32*3) and the output dimension is 2 (for the 2
classes).
(b) The hidden layers have respectively 512, 128 and 32 hidden units.
(c) All activation functions are ReLU. The last layer has no activation function since the
cross-entropy loss already includes a softmax activation function.

In [5]:
class MyMLP(nn.Module):
    def __init__(self):
        super().__init__()  

        self.flat = nn.Flatten()
        # 32*32*3: determined by our dataset: 32x32 RGB images
        self.fc1 = nn.Linear(32*32*3, 512)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(512, 128)
        self.act2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 32)
        self.act3 = nn.ReLU()
        # 2: determined by our number of classes (birds and planes)
        self.fc4 = nn.Linear(32, 2)

    def forward(self, x):
        out = self.flat(x)
        out = self.act1(self.fc1(out))
        out = self.act2(self.fc2(out))
        out = self.act3(self.fc3(out))
        out = self.fc4(out)
        return out

# 3.3
Write a train(n epochs, optimizer, model, loss fn, train loader) function that trains model for n epochs epochs given an optimizer optimizer, a loss function loss fn and a dataloader train loader.

In [6]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    optimizer.zero_grad(set_to_none=True)
    
    for epoch in range(1, n_epochs + 1):
        
        loss_train = 0.0
        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train

# 3.4
Write a similar function train manual update that has no optimizer parameter, but a learning rate lr parameter instead and that manually updates each trainable parameter of model
using equation (3). Do not forget to zero out all gradients after each iteration.

In [7]:
def train_manual_update(n_epochs, lr, model, loss_fn, train_loader, verbose = False):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)
            loss.backward()
            
            # optimizer step
            with torch.no_grad():
                for p in model.parameters():
                    p -= lr * p.grad
            
            # zero out all gradients
            with torch.no_grad():
                for p in model.parameters():
                    p.grad.zero_()

            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0 or verbose:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train

# 3.5
Train 2 instances of MyMLP, one using train and the other using train manual update (use
the same parameter values for both models). Compare their respective training losses.

In [8]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
optimizer = optim.SGD(model.parameters(), lr=0.01)
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
res = train(n_epoch, optimizer, model, loss_fn, train_loader)

13:03:03.904185  |  Epoch 1  |  Training loss 0.635
13:03:13.876134  |  Epoch 10  |  Training loss 0.317
13:03:24.915506  |  Epoch 20  |  Training loss 0.195
13:03:36.010210  |  Epoch 30  |  Training loss 0.131


In [9]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
lr = 0.01
res = train_manual_update(n_epoch, lr, model, loss_fn, train_loader)

13:03:37.468136  |  Epoch 1  |  Training loss 0.635
13:03:48.224021  |  Epoch 10  |  Training loss 0.317
13:03:59.492313  |  Epoch 20  |  Training loss 0.195
13:04:10.656989  |  Epoch 30  |  Training loss 0.131


We see that the train and train_manual_update function produces the same training losses when given the same data and learning rate.

# 3.6

Modify train manual update by adding a L2 regularization term in your manual parameter
update. Add an additional weight decay parameter to train manual update. Compare
again train and train manual update results with 0 < weight decay < 1

In [None]:
def train_manual_update(n_epochs, lr, weight_decay, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)
            
            # L2 regularization
            if weight_decay != 0:
                for parameter in model.parameters():
                    if parameter.grad is None:
                        continue
                    else:
                        grad = parameter.grad.data

                        # Adding p.data (weight) multiplied by weight_decay since in the backward pass the gradient of w**2 is 2*w
                        grad.add_(weight_decay, parameter.data)

                        # Updates the weights with the gradient with the standard SGD formula.
                        parameter.data.add_(-lr, grad)
            
            loss.backward()
                
            # optimizer step
            with torch.no_grad():
                for p in model.parameters():
                    p -= lr * p.grad
            
            # zero out all gradients
            with torch.no_grad():
                for p in model.parameters():
                    p.grad.zero_()
        
            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train




In [11]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.01)
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
res = train(n_epoch, optimizer, model, loss_fn, train_loader)

13:04:11.903401  |  Epoch 1  |  Training loss 0.636
13:04:22.025526  |  Epoch 10  |  Training loss 0.334
13:04:33.572157  |  Epoch 20  |  Training loss 0.217
13:04:44.922731  |  Epoch 30  |  Training loss 0.143


In [12]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
lr = 0.01
weight_decay= 0.01
res = train_manual_update(n_epoch, lr, weight_decay, model, loss_fn, train_loader)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /tmp/pip-req-build-kchz20tw/torch/csrc/utils/python_arg_parser.cpp:882.)
  grad.add_(weight_decay, parameter.data)


13:04:46.214283  |  Epoch 1  |  Training loss 0.638
13:04:57.060284  |  Epoch 10  |  Training loss 0.349
13:05:08.924803  |  Epoch 20  |  Training loss 0.246
13:05:20.997607  |  Epoch 30  |  Training loss 0.192


We see that the results differ some with the manual implementation of L2 reguralization. We suspect this is from our L2 loss formula which i have struggled a bit with. We initially thought from Andrews videos was that we could implement with `L2_reg += torch.pow(param, 2).sum()` or `L2_reg += param.norm(2)`, but with this implementation we got drasticly different results from the pytorch implementation. We saw that often the formula is expressed as `/2 * w **2` so we tried with the implementation `0.5 * torch.pow(param, 2).sum()` which gave us more similar results. The current implementation works a bit simpler by using the fact that parameters have to be loaded and iterated over later during corrections performed by the optimizer. With this is mind we dont need to do power of 2 because the gradient of w**2 is 2*w
We modify the existing gradient by adding p.data (weight) multiplied by weight_decay which results in an implementation done in-place. The last line updates the weights with the gradient with the standard SGD formula.

# 3.7

Modify train manual update by adding a momentum term in your parameter update. Add
an additional momentum parameter to train manual update. Check again the correctness of
the new update rule by comparing it to train function (with 0 < momentum < 1).

In [13]:
def train_manual_update(n_epochs, lr, weight_decay, momentum, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0

        layers = sum([1 for _ in model.parameters()])
        velocity = [[]] * layers

        for imgs, labels in train_loader:

            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            outputs = model(imgs)
            
            loss = loss_fn(outputs, labels)

            # L2 regularization
            if weight_decay != 0:
                for parameter in model.parameters():
                    if parameter.grad is None:
                        continue
                    else:
                        grad = parameter.grad.data

                        # Adding p.data (weight) multiplied by weight_decay since in the backward pass the gradient of w**2 is 2*w
                        grad.add_(weight_decay, parameter.data)

                        # Updates the weights with the gradient with the standard SGD formula.
                        parameter.data.add_(-lr, grad)
            
            loss.backward()
            
            # Momentum 
            if momentum != 0:
                with torch.no_grad():
                    for param_idx, param in enumerate(model.parameters()):
                        if velocity[param_idx] == []:
                            velocity[param_idx] = torch.zeros(param.grad.shape).to(device=device)

                        velocity[param_idx] = velocity[param_idx] * momentum + param.grad
                        
                        param -= lr*velocity[param_idx]
                    model.zero_grad()
            
            # Optimizer step
            with torch.no_grad():
                for p in model.parameters():
                    p -= lr * p.grad
            
            # zero out all gradients
            with torch.no_grad():
                for p in model.parameters():
                    p.grad.zero_()
            
            loss_train += loss.item()
            
        losses_train.append(loss_train / n_batch)

        if epoch == 1 or epoch % 10 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))
    return losses_train

In [14]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0, momentum=0.9)
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
res = train(n_epoch, optimizer, model, loss_fn, train_loader)

13:05:22.312178  |  Epoch 1  |  Training loss 0.505
13:05:32.557347  |  Epoch 10  |  Training loss 0.191
13:05:44.171285  |  Epoch 20  |  Training loss 0.114
13:05:55.883847  |  Epoch 30  |  Training loss 0.057


In [15]:
torch.manual_seed(seed)
model = MyMLP().to(device=device) 
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)
lr=0.01
weight_decay=0
momentum=0.9
res = train_manual_update(n_epoch, lr, weight_decay, momentum, model, loss_fn, train_loader)

13:05:57.273626  |  Epoch 1  |  Training loss 0.505
13:06:08.862003  |  Epoch 10  |  Training loss 0.195
13:06:21.808895  |  Epoch 20  |  Training loss 0.095
13:06:34.839580  |  Epoch 30  |  Training loss 0.056


We implemented momentum by the defnition given in [Andrew's videos](https://www.youtube.com/watch?v=k8fTYJPd3_I). We initialize tensor of velocities which are computed for every gradient using the momentum input. With our manual momentum implementation we see that the two implementations converge in the same matter, but have slightly different numbers. We cannot quite pinpoint what makes these small differences, since to our understanding the momentum algorithm is implemented with the same logic as pytorch's implementation.

# 3.8

Train different instances (at least 4) of the MyMLP model with different learning rate, momentum
and weight decay values . You can choose the same values as in the
gradient descent output.txt file

In [52]:
batch_size =  256
n_epoch =  30
loss_fn =  nn.CrossEntropyLoss()
seed =  265
train_loader = torch.utils.data.DataLoader(cifar2_train, batch_size=batch_size, shuffle=False)

In [53]:
torch.manual_seed(seed)
model1 = MyMLP().to(device=device) 
lr=0.01
weight_decay=0
momentum=0
res = train_manual_update(n_epoch, lr, weight_decay, momentum, model1, loss_fn, train_loader)

13:32:03.234594  |  Epoch 1  |  Training loss 0.681
13:32:11.056466  |  Epoch 10  |  Training loss 0.465
13:32:19.983478  |  Epoch 20  |  Training loss 0.384
13:32:29.120044  |  Epoch 30  |  Training loss 0.318


In [54]:
torch.manual_seed(seed)
model2 = MyMLP().to(device=device) 
lr=0.01
weight_decay=0.01
momentum=0
res = train_manual_update(n_epoch, lr, weight_decay, momentum, model2, loss_fn, train_loader)

13:32:30.149938  |  Epoch 1  |  Training loss 0.681
13:32:38.541689  |  Epoch 10  |  Training loss 0.475
13:32:47.832251  |  Epoch 20  |  Training loss 0.404
13:32:57.022197  |  Epoch 30  |  Training loss 0.352


In [55]:
torch.manual_seed(seed)
model3 = MyMLP().to(device=device) 
lr=0.01
weight_decay=0
momentum=0.9
res = train_manual_update(n_epoch, lr, weight_decay, momentum, model3, loss_fn, train_loader)

13:32:58.046741  |  Epoch 1  |  Training loss 0.618
13:33:06.440061  |  Epoch 10  |  Training loss 0.266
13:33:15.812640  |  Epoch 20  |  Training loss 0.215
13:33:25.042414  |  Epoch 30  |  Training loss 0.100


In [56]:
torch.manual_seed(seed)
model4 = MyMLP().to(device=device) 
lr=0.01
weight_decay=0.001
momentum=0.9
res = train_manual_update(n_epoch, lr, weight_decay, momentum, model4, loss_fn, train_loader)

13:33:26.114930  |  Epoch 1  |  Training loss 0.618
13:33:34.709452  |  Epoch 10  |  Training loss 0.268
13:33:44.216785  |  Epoch 20  |  Training loss 0.194
13:33:53.849680  |  Epoch 30  |  Training loss 0.115


In [57]:
torch.manual_seed(seed)
model5 = MyMLP().to(device=device) 
lr=0.02
weight_decay=0.01
momentum=0.8
res = train_manual_update(n_epoch, lr, weight_decay, momentum, model5, loss_fn, train_loader)

13:33:55.102387  |  Epoch 1  |  Training loss 0.598
13:34:03.639401  |  Epoch 10  |  Training loss 0.269
13:34:13.202753  |  Epoch 20  |  Training loss 0.240
13:34:22.817917  |  Epoch 30  |  Training loss 0.233


In [58]:
models = [model1, model2, model3, model4, model5]

# 3.9
Select the best model among those trained in the previous question based on their accuracy

In [59]:
def validate(model, train_loader, val_loader):
    """
    Plot training and validation accuracy
    """
    model.eval()
    accdict = {}
    for name, loader in [("train", train_loader), ("val", val_loader)]:
        correct = 0
        total = 0

        with torch.no_grad():
            for imgs, labels in loader:
                imgs = imgs.to(device=device, dtype=torch.double)
                labels = labels.to(device=device)

                outputs = model(imgs)
                _, predicted = torch.max(outputs, dim=1)
                total += labels.shape[0]
                correct += int((predicted == labels).sum())

        print("Accuracy {}: {:.2f}".format(name , correct / total))
        accdict[name] = correct / total
    return accdict

val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=batch_size, shuffle=False)

In [60]:
for i, m in enumerate(models):
    print("-----------Model ",i+1,"-----------")
    validate(m, train_loader, val_loader)

-----------Model  1 -----------
Accuracy train: 0.88
Accuracy val: 0.83
-----------Model  2 -----------
Accuracy train: 0.86
Accuracy val: 0.83
-----------Model  3 -----------
Accuracy train: 0.90
Accuracy val: 0.81
-----------Model  4 -----------
Accuracy train: 0.96
Accuracy val: 0.84
-----------Model  5 -----------
Accuracy train: 0.88
Accuracy val: 0.81


# 3.10
We choose model 4 since it gives us the highest validation accuracy

In [63]:
test_loader =  torch.utils.data.DataLoader(cifar2_test, batch_size=batch_size, shuffle=False)

correct = 0
total = 0

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(device=device, dtype=torch.double)
        labels = labels.to(device=device)

        outputs = model4(imgs)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.shape[0]
        correct += int((predicted == labels).sum())

print("Test accuracy of model 4: {:.2f}".format(correct / total))

Test accuracy of model 4: 0.85


We get a test accuracy (85%) which is almost the same as for validation (84%) which is what we expected