# Assignment 1 - Code Example - Part A

This code baseline is inspired by and modified from [this great tutorial](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

This code can achieve an accuracy of approximately 86.50% on CIFAR-10. Please set up the environment and run your experiments starting from this baseline. You are expected to achieve an accuracy higher than this baseline.

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import packages
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed the torch and NumPy RNGs so that weight initialisation, shuffling and
# random augmentation start from the same state on every "Restart & Run All".
# (GPU kernels may still introduce small run-to-run differences.)
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Hyperparameters for the CIFAR-10 experiment
num_epochs = 100   # number of passes over the training set
batch_size = 256   # samples per mini-batch for both the train and test loaders
num_workers = 2    # DataLoader worker processes

In [None]:
# Per-channel mean / std used to normalise the CIFAR-10 inputs. Defined once so
# the training and test pipelines cannot drift apart.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: random crop (with 4px padding), horizontal flip, rotation of
# up to 15 degrees and colour jitter, followed by tensor conversion and normalisation.
# Note: no cutout / random-erasing step is part of this pipeline.
train_transform = tv_transforms.Compose([
    tv_transforms.RandomCrop(32, padding=4),
    tv_transforms.RandomHorizontalFlip(),
    tv_transforms.RandomRotation(15),
    tv_transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    tv_transforms.ToTensor(),
    tv_transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Test pipeline: no augmentation, only tensor conversion and the same normalisation.
test_transform = tv_transforms.Compose([
    tv_transforms.ToTensor(),
    tv_transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

In [None]:
# # prepare datasets
# dataset, loader = {}, {}
# for data_type in ("train", "test"):
#     is_train = data_type=="train"
#     dataset[data_type] = tv_datasets.CIFAR10(
#         root="./data", train=is_train, download=True, transform=transformation[data_type],
#     )
#     loader[data_type] = torch.utils.data.DataLoader(
#         dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers,
#     )


In [None]:
# CIFAR-10 train / test splits, stored under ./data (downloaded if not present).
cifar_kwargs = dict(root='./data', download=True)
train_dataset = tv_datasets.CIFAR10(train=True, transform=train_transform, **cifar_kwargs)
test_dataset = tv_datasets.CIFAR10(train=False, transform=test_transform, **cifar_kwargs)

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_kwargs)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:04<00:00, 42.2MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# # our network architecture
# net = nn.Sequential(
#     nn.Conv2d(3, 128, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
#     nn.Conv2d(512, 256, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Dropout(0.3),
#     nn.Flatten(),
#     nn.Linear(256 * 4 * 4, 512), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(512, 256), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.5),
#     nn.Linear(128, 10),
# )

# # move to device
# net.to(device)

# # print the number of parameters
# print(f"number of parameters: {sum(p.numel() for p in net.parameters() if p.requires_grad) / 1_000_000:.2f}M")

In [None]:
# CIFAR-10 classifier: three Conv-BatchNorm-ReLU stages, global average pooling
# and a small fully connected head. The network is purely sequential.
class ImprovedNet(nn.Module):
    """Sequential CNN that maps a batch of 3-channel images to 10 class logits.

    ``features`` holds three 3x3-conv / batch-norm / ReLU stages (128, 256 and
    512 channels). The first two are followed by 2x2 max pooling and the last
    by adaptive average pooling to 1x1. ``classifier`` maps the resulting
    512-dimensional vector to 10 outputs through one hidden layer of 256 units
    with batch norm, ReLU and dropout (p=0.3).
    """

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels):
        """Return [3x3 conv with padding 1, batch norm, in-place ReLU] as a list."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]

    def __init__(self):
        super().__init__()
        stage = self._conv_bn_relu
        # Layer order (and therefore the state_dict indices) is:
        # conv, bn, relu, pool, conv, bn, relu, pool, conv, bn, relu, avgpool
        self.features = nn.Sequential(
            *stage(3, 128), nn.MaxPool2d(2),
            *stage(128, 256), nn.MaxPool2d(2),
            *stage(256, 512), nn.AdaptiveAvgPool2d((1, 1)),
        )

        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        """Return the class logits, one row of 10 values per input sample."""
        pooled = self.features(x)
        # Collapse the (channels, 1, 1) feature map into a flat vector per sample.
        flat = pooled.flatten(start_dim=1)
        return self.classifier(flat)

# Build the CIFAR-10 model, place it on the selected device and report its size.
net = ImprovedNet()
net = net.to(device)
print(f"Parameters: {sum(p.numel() for p in net.parameters())/1e6:.2f}M")

Parameters: 1.62M


## Start Training

In [None]:
# # the network optimizer
# optimizer = getattr(optim, optim_name)(net.parameters(), **optim_kwargs)

# # loss function
# criterion = nn.CrossEntropyLoss()

# # training loop
# net.train()
# for epoch in range(num_epochs):

#     running_loss = 0.0
#     for i, (img, target) in enumerate(loader["train"]):
#         img, target = img.to(device), target.to(device)

#         pred = net(img)
#         loss = criterion(pred, target)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.item()
#         if i % print_every == print_every - 1:
#             print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
#             running_loss = 0.0

# print("Finished Training")

## Training the network and evaluating its accuracy

In [None]:
# Training setup: label-smoothed cross-entropy, AdamW, and a cosine-annealed
# learning rate that spans the whole run (T_max = num_epochs).
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(net.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)


def train_one_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Puts ``model`` in training mode, performs one optimizer step per batch and
    returns the loss averaged over the number of batches.
    """
    model.train()
    running_loss = 0.0
    for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(data_loader)


def evaluate(model, data_loader, device):
    """Return the top-1 accuracy (in percent) of ``model`` on ``data_loader``.

    Puts ``model`` in evaluation mode and disables gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # argmax over the class dimension; no need to go through `.data`
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total


# Per-epoch history and the best accuracy seen so far
best_acc = 0.0
train_losses = []
val_accuracies = []

# NOTE(review): `test_loader` doubles as the validation set here, so the best
# checkpoint (and `best_acc`) is selected on the test data and is an optimistic
# estimate. Hold out part of the training set if an unbiased test score is needed.
for epoch in range(num_epochs):
    epoch_loss = train_one_epoch(net, train_loader, criterion, optimizer, device)
    acc = evaluate(net, test_loader, device)

    val_accuracies.append(acc)
    train_losses.append(epoch_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {train_losses[-1]:.4f} | Acc: {acc:.2f}%")

    # Keep the weights of the best-scoring epoch on disk
    if acc > best_acc:
        best_acc = acc
        torch.save(net.state_dict(), "best_cifar_model.pth")

    # The learning-rate schedule advances once per epoch, after the optimizer steps
    scheduler.step()

print(f"\nBest CIFAR-10 Accuracy: {best_acc:.2f}%")

Epoch 1/100 | Loss: 1.7401 | Acc: 49.97%
Epoch 2/100 | Loss: 1.4886 | Acc: 50.24%
Epoch 3/100 | Loss: 1.3925 | Acc: 56.70%
Epoch 4/100 | Loss: 1.3296 | Acc: 50.69%
Epoch 5/100 | Loss: 1.2777 | Acc: 63.80%
Epoch 6/100 | Loss: 1.2302 | Acc: 58.06%
Epoch 7/100 | Loss: 1.1987 | Acc: 62.36%
Epoch 8/100 | Loss: 1.1694 | Acc: 69.85%
Epoch 9/100 | Loss: 1.1397 | Acc: 69.51%
Epoch 10/100 | Loss: 1.1155 | Acc: 71.34%
Epoch 11/100 | Loss: 1.0965 | Acc: 71.34%
Epoch 12/100 | Loss: 1.0713 | Acc: 66.44%
Epoch 13/100 | Loss: 1.0571 | Acc: 70.30%
Epoch 14/100 | Loss: 1.0444 | Acc: 75.04%
Epoch 15/100 | Loss: 1.0320 | Acc: 72.64%
Epoch 16/100 | Loss: 1.0147 | Acc: 74.24%
Epoch 17/100 | Loss: 0.9993 | Acc: 76.05%
Epoch 18/100 | Loss: 0.9854 | Acc: 77.18%
Epoch 19/100 | Loss: 0.9778 | Acc: 79.56%
Epoch 20/100 | Loss: 0.9679 | Acc: 79.42%
Epoch 21/100 | Loss: 0.9551 | Acc: 78.11%
Epoch 22/100 | Loss: 0.9457 | Acc: 78.35%
Epoch 23/100 | Loss: 0.9344 | Acc: 78.97%
Epoch 24/100 | Loss: 0.9266 | Acc: 82.06%
E

In [None]:
# net.eval()
# correct, total = 0, 0
# with torch.no_grad():
#     for img, target in loader["test"]:
#         img, target = img.to(device), target.to(device)

#         # make prediction
#         pred = net(img)

#         # accumulate
#         total += len(target)
#         correct += (torch.argmax(pred, dim=1) == target).sum().item()

# print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

In [None]:
# MNIST model: three unpadded 3x3 Conv-BatchNorm-LeakyReLU stages followed by a
# two-layer fully connected head. Each unpadded 3x3 convolution trims 2 pixels
# per spatial dimension, so a 28x28 input reaches the head as 128 x 22 x 22.
conv_channels = [1, 32, 64, 128]
conv_layers = []
for c_in, c_out in zip(conv_channels[:-1], conv_channels[1:]):
    conv_layers += [nn.Conv2d(c_in, c_out, 3), nn.BatchNorm2d(c_out), nn.LeakyReLU()]

net = nn.Sequential(
    *conv_layers,

    nn.Flatten(),
    nn.Linear(128 * 22 * 22, 128),
    nn.LeakyReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 10),
)

# Place all parameters and buffers of the model on the selected device
net = net.to(device=device)

In [None]:
# torch and torchvision provide some very handy utilities for dataset loading
from torch.utils.data import DataLoader
import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms

In [None]:
# MNIST loader settings. They are assigned here, before the loaders are built:
# the later training cell sets batch_size = 128 only after the loaders already
# exist, so without this the loaders silently inherit batch_size = 256 from the
# CIFAR-10 section.
batch_size = 128
num_workers = 2

# preprocessing pipeline for input images: tensor conversion, then single-channel normalisation
mnist_transform = tv_transforms.Compose([
    tv_transforms.ToTensor(),
    tv_transforms.Normalize((0.1307,), (0.3081,)),
])

# prepare datasets: one dataset and one loader per split, keyed by "train" / "test"
dataset, loader = {}, {}
for data_type in ("train", "test"):
    is_train = data_type == "train"
    dataset[data_type] = tv_datasets.MNIST(
        root="./data", train=is_train, download=True, transform=mnist_transform,
    )
    # only the training split is shuffled
    loader[data_type] = DataLoader(
        dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers,
    )


In [None]:
# MNIST training configuration.
# NOTE: the MNIST loaders are constructed in an earlier cell, so the
# batch_size / num_workers values assigned here do not change those loaders.
num_epochs = 10
batch_size = 128
num_workers = 2
print_every = 100   # report the running loss every `print_every` iterations

# The optimizer class is looked up by name in torch.optim
optim_name = "Adam"
optim_kwargs = dict(lr=3e-4, weight_decay=1e-6)
optimizer_cls = getattr(optim, optim_name)
optimizer = optimizer_cls(net.parameters(), **optim_kwargs)

criterion = nn.CrossEntropyLoss()

# training loop
net.train()
for epoch in range(num_epochs):
    running_loss = 0.0

    for i, batch in enumerate(loader["train"]):
        img, target = (tensor.to(device) for tensor in batch)

        # forward / backward / update
        optimizer.zero_grad()
        loss = criterion(net(img), target)
        loss.backward()
        optimizer.step()

        # print the mean loss of the last `print_every` iterations, then reset
        running_loss += loss.item()
        if (i + 1) % print_every == 0:
            print(f"[epoch={epoch + 1:3d}, iter={i + 1:5d}] loss: {running_loss / print_every:.3f}")
            running_loss = 0.0

print("Finished Training")

[epoch=  1, iter=  100] loss: 0.323
[epoch=  1, iter=  200] loss: 0.099
[epoch=  2, iter=  100] loss: 0.066
[epoch=  2, iter=  200] loss: 0.057
[epoch=  3, iter=  100] loss: 0.046
[epoch=  3, iter=  200] loss: 0.046
[epoch=  4, iter=  100] loss: 0.036
[epoch=  4, iter=  200] loss: 0.036
[epoch=  5, iter=  100] loss: 0.028
[epoch=  5, iter=  200] loss: 0.027
[epoch=  6, iter=  100] loss: 0.023
[epoch=  6, iter=  200] loss: 0.021
[epoch=  7, iter=  100] loss: 0.016
[epoch=  7, iter=  200] loss: 0.019
[epoch=  8, iter=  100] loss: 0.016
[epoch=  8, iter=  200] loss: 0.016
[epoch=  9, iter=  100] loss: 0.017
[epoch=  9, iter=  200] loss: 0.019
[epoch= 10, iter=  100] loss: 0.013
[epoch= 10, iter=  200] loss: 0.017
Finished Training


In [None]:
# Top-1 accuracy of the trained network on the MNIST test loader
net.eval()
total = correct = 0
with torch.no_grad():
    for img, target in loader["test"]:
        img = img.to(device)
        target = target.to(device)

        # predicted class = index of the largest logit
        predicted = net(img).argmax(dim=1)

        # accumulate sample and hit counts
        total += target.size(0)
        correct += int((predicted == target).sum())

print(f"Accuracy of the network on the {total} test images: {100 * correct / total:.2f}%")

Accuracy of the network on the 10000 test images: 99.05%
