<a href="https://colab.research.google.com/github/ekingit/CNN-for-CIFAR10/blob/main/pretrained_ResNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

import torchvision
from torchvision import datasets
from torchvision.transforms import v2

#Data

In [2]:
#mean and std. dev. for the pretrained model
mu = torch.tensor([0.485, 0.456, 0.406]) #  (3)
sigma = torch.tensor([0.229, 0.224, 0.225]) # (3)

v2_train = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True), v2.Normalize(mu, sigma),])
v2_test = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True), v2.Normalize(mu, sigma),])

In [3]:
ds_train = datasets.CIFAR10('./data', train=True, download=True, transform=v2_train)
ds_test = datasets.CIFAR10('./data', train=False, download=True, transform=v2_test)
dl_train = DataLoader(ds_train, batch_size=128,shuffle=True) #50.000 item = 128*390.625 = 128*390+80
dl_test = DataLoader(ds_test, batch_size=128,shuffle=True) #10.000 item = 128*78.125 = 128*78+16

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:11<00:00, 14.5MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


#Model

In [4]:
# import ResNet18 trained on ImageNet
pretrained_model = torchvision.models.resnet18(weights='IMAGENET1K_V1')
total_parameters = sum(p.numel() for p in pretrained_model.parameters())
print(f"Total Parameters: {total_parameters:,}") #12 million parameters

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 185MB/s]


Total Parameters: 11,689,512


In [None]:
#let's take a look at the layers of the model
for name, layer in pretrained_model.named_children():
    print(name, layer)

conv1 Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
bn1 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu ReLU(inplace=True)
maxpool MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
layer1 Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (1): BasicBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1

In [None]:
#While all layers in this model extract features from images, the final layer classifies the image into one of 1,000 labels.
pretrained_model.fc

Linear(in_features=512, out_features=1000, bias=True)

In [None]:
#We modify the last layer to classify into 10 classes, as required for CIFAR-10.
pretrained_model.fc = nn.Linear(pretrained_model.fc.in_features, 10)
nn.init.xavier_uniform_(pretrained_model.fc.weight)

Parameter containing:
tensor([[-0.0388, -0.1046,  0.0758,  ...,  0.0668, -0.0284,  0.0834],
        [ 0.1063,  0.0632, -0.0109,  ..., -0.0014,  0.0328,  0.0672],
        [ 0.0601, -0.0880, -0.0404,  ..., -0.0346,  0.0763,  0.0432],
        ...,
        [-0.0761,  0.0398,  0.0946,  ...,  0.0214,  0.0443, -0.0795],
        [-0.0573,  0.0936,  0.0144,  ..., -0.0964,  0.0273, -0.0672],
        [ 0.0799, -0.0011, -0.0401,  ..., -0.0927, -0.0899, -0.0725]],
       requires_grad=True)

In [None]:
def train(model, dl, optimizer, epoch, device='cpu'):
    model.to(device)
    model.train() #from nn.Module
    correct = 0
    train_loss = 0
    for data, target in dl:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data) #(batch_size,3,32,32) --> (10)
        batch_loss = loss(output, target) #(10x10) --> 1
        batch_loss.backward() #calculates gradients
        optimizer.step() #updates weights and kernels
        train_loss += batch_loss.detach().item()

        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item() # get the correct predictions

    correct = 100. * correct / len(dl.dataset)
    train_loss /= len(dl)
    trainloss.append(train_loss)
    trainacc.append(correct)
    print(f'Epoch: {epoch}, Avarage train loss: {train_loss:.2f}, Accuracy: {correct:.1f}%')

In [None]:
def test(model, dl_test, device='cpu'):
    model.to(device)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad(): # no need to track gradients. Saves memory.
        for data, target in dl_test:
            data, target = data.to(device), target.to(device)
            output = model(data)
            batch_loss = loss(output,target)
            test_loss += batch_loss.detach().item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item() # get the correct predictions

    test_loss /= len(dl_test)
    correct = 100. * correct / len(dl_test.dataset)
    testloss.append(test_loss)
    testacc.append(correct)
    print(f'Avarage test loss: {test_loss:.2f}, Accuracy: {correct:.1f}%')

#Train

All model parameters, except those of the final classification layer, are initialized using a pretrained model. During training, we aim to fine-tune these parameters while also training the final layer's parameters. To achieve this, we assign a larger learning rate to the last layer—specifically, 10 times the learning rate of the other layers.

In [None]:
lr = 5e-5
momentum = 0.9
weight_decay = 5e-5
epochs = 50

torch.manual_seed(4321)

device = 'cuda'
if device == 'cuda': torch.backends.cudnn.benchmark = True # additional speed up

#
params = [param for name, param in pretrained_model.named_parameters() if name not in ["fc.weight", "fc.bias"]] # all params except the last layer
optimizer = torch.optim.SGD([{'params': params}, {'params': pretrained_model.fc.parameters(), 'lr': 10 * lr}], lr=lr, momentum=momentum, weight_decay=weight_decay)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

loss = nn.CrossEntropyLoss()

# run()
a = time.time()
trainloss = []
testloss = []
trainacc = []
testacc = []

for epoch in range(0, epochs):
    train(pretrained_model, dl_train, optimizer, epoch+1, device=device)
    test(pretrained_model, dl_test, device=device)
    scheduler.step()

b = time.time()
print(f'Training took {round(b - a, 0)} seconds.')

Epoch: 1, Avarage train loss: 1.96, Accuracy: 35.9%
Avarage test loss: 1.56, Accuracy: 45.9%
Epoch: 2, Avarage train loss: 1.40, Accuracy: 51.0%
Avarage test loss: 1.34, Accuracy: 53.3%
Epoch: 3, Avarage train loss: 1.23, Accuracy: 56.9%
Avarage test loss: 1.24, Accuracy: 57.4%
Epoch: 4, Avarage train loss: 1.13, Accuracy: 60.6%
Avarage test loss: 1.16, Accuracy: 59.5%
Epoch: 5, Avarage train loss: 1.05, Accuracy: 63.1%
Avarage test loss: 1.12, Accuracy: 61.4%
Epoch: 6, Avarage train loss: 0.98, Accuracy: 65.4%
Avarage test loss: 1.07, Accuracy: 63.0%
Epoch: 7, Avarage train loss: 0.93, Accuracy: 67.3%
Avarage test loss: 1.03, Accuracy: 63.9%
Epoch: 8, Avarage train loss: 0.88, Accuracy: 68.9%
Avarage test loss: 1.01, Accuracy: 65.2%
Epoch: 9, Avarage train loss: 0.84, Accuracy: 70.2%
Avarage test loss: 0.99, Accuracy: 65.8%
Epoch: 10, Avarage train loss: 0.81, Accuracy: 71.6%
Avarage test loss: 0.97, Accuracy: 66.4%
Epoch: 11, Avarage train loss: 0.78, Accuracy: 72.7%
Avarage test los

To evaluate the effectiveness of pretraining, let's also train the model with random initialization for comparison.

In [None]:
model = torchvision.models.resnet18(pretrained=False)



In [None]:
model.fc = nn.Linear(model.fc.in_features, 10)
nn.init.xavier_uniform_(model.fc.weight)

Parameter containing:
tensor([[ 0.0790, -0.0576, -0.0450,  ..., -0.0095,  0.0788,  0.0545],
        [ 0.0374,  0.1041, -0.0254,  ...,  0.0206,  0.0418, -0.0794],
        [-0.0456,  0.0770,  0.0245,  ..., -0.0440,  0.1064,  0.0973],
        ...,
        [ 0.0055,  0.0475, -0.0400,  ..., -0.0041, -0.0834,  0.0660],
        [ 0.0418,  0.0817,  0.0433,  ..., -0.0074, -0.0647,  0.0371],
        [ 0.0039,  0.0132,  0.0398,  ...,  0.0661,  0.0467, -0.1025]],
       requires_grad=True)

In [None]:
lr = 5e-4
momentum = 0.9
weight_decay = 5e-5
epochs = 20

torch.manual_seed(4321) #fixes initial weights and kernels to test different models/hyperparameters

device = 'cuda'
if device == 'cuda': torch.backends.cudnn.benchmark = True # additional speed up

optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-5)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

loss = nn.CrossEntropyLoss()

# run()
a = time.time()
trainloss = []
testloss = []
trainacc = []
testacc = []

for epoch in range(0, epochs):
    train(model, dl_train, optimizer, epoch+1, device=device)
    test(model, dl_test, device=device)
    scheduler.step()

b = time.time()
print(f'Training took {round(b - a, 0)} seconds.')

Epoch: 1, Avarage train loss: 1.90, Accuracy: 35.0%
Avarage test loss: 1.59, Accuracy: 44.0%
Epoch: 2, Avarage train loss: 1.43, Accuracy: 48.9%
Avarage test loss: 1.43, Accuracy: 48.8%
Epoch: 3, Avarage train loss: 1.23, Accuracy: 56.2%
Avarage test loss: 1.36, Accuracy: 51.9%
Epoch: 4, Avarage train loss: 1.07, Accuracy: 62.1%
Avarage test loss: 1.34, Accuracy: 52.9%
Epoch: 5, Avarage train loss: 0.93, Accuracy: 67.3%
Avarage test loss: 1.35, Accuracy: 53.9%
Epoch: 6, Avarage train loss: 0.80, Accuracy: 72.2%
Avarage test loss: 1.35, Accuracy: 54.6%
Epoch: 7, Avarage train loss: 0.67, Accuracy: 77.3%
Avarage test loss: 1.38, Accuracy: 55.1%
Epoch: 8, Avarage train loss: 0.55, Accuracy: 81.9%
Avarage test loss: 1.44, Accuracy: 54.3%
Epoch: 9, Avarage train loss: 0.43, Accuracy: 86.4%
Avarage test loss: 1.50, Accuracy: 54.4%
Epoch: 10, Avarage train loss: 0.33, Accuracy: 90.0%
Avarage test loss: 1.59, Accuracy: 54.5%
Epoch: 11, Avarage train loss: 0.25, Accuracy: 92.9%
Avarage test los

As we see, model overfits very rapidly.