# Step 1

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from torchvision import transforms, datasets
import torch


In [None]:
# Mount Google Drive at /content/drive (Colab-only API) so that files under
# /content/drive/MyDrive are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip "/content/drive/MyDrive/cnn_dataset.zip" -d "/content/"

applying transformations

In [None]:
# Preprocessing pipeline: ToTensor followed by Normalize with mean 0.5 / std 0.5.
# The plotting cells undo this normalisation with `img * 0.5 + 0.5`.
normalize_step = transforms.Normalize(mean=[0.5], std=[0.5])
transform = transforms.Compose([transforms.ToTensor(), normalize_step])

# ImageFolder derives `classes` (and the label of each sample) from the folder layout.
data_dir = '/content/cnn_dataset'
dataset = datasets.ImageFolder(root=data_dir, transform=transform)

print("Number of images:", len(dataset))
print("Classes:", dataset.classes)



In [None]:
from collections import Counter

# Tally samples per label index; each entry of `dataset.samples` is a pair whose
# second element is the label.
class_counts = Counter(sample[1] for sample in dataset.samples)

print("Number of images per class:")
for class_idx in class_counts:
    class_name = dataset.classes[class_idx]
    count = class_counts[class_idx]
    print(f"{class_name}: {count} images")

showing some sample images

In [None]:
def show_sample_images(dataset, num_samples=3):
    """Plot up to `num_samples` example images per class, one class per row.

    Args:
        dataset: Object exposing `classes`, `class_to_idx`, `imgs` (pairs whose
            second element is the label index) and integer indexing that returns
            `(image_tensor, label)`. The tensor is permuted from channel-first to
            channel-last for display.
            Assumes the tensors were normalised with mean 0.5 / std 0.5, because
            that is what gets undone before plotting — TODO confirm if the
            transform changes.
        num_samples: Number of columns, i.e. the maximum images shown per class.
    """
    n_classes = len(dataset.classes)
    # squeeze=False keeps `axes` two-dimensional even for a single row or column,
    # so the axes[i, j] indexing below is always valid.
    fig, axes = plt.subplots(
        nrows=n_classes,
        ncols=num_samples,
        figsize=(num_samples * 3, n_classes * 3),
        squeeze=False,
    )
    for i, class_name in enumerate(dataset.classes):
        class_idx = dataset.class_to_idx[class_name]
        # Select indices from the label list first so that only the images that
        # are actually displayed get loaded and transformed.
        picked = [idx for idx, (_, label) in enumerate(dataset.imgs) if label == class_idx][:num_samples]
        for j, idx in enumerate(picked):
            img = dataset[idx][0].permute(1, 2, 0)
            axes[i, j].imshow((img * 0.5 + 0.5).numpy())
            axes[i, j].set_title(class_name)
            axes[i, j].axis('off')
        # Blank out leftover panels when a class has fewer than `num_samples` images.
        for j in range(len(picked), num_samples):
            axes[i, j].axis('off')
    plt.tight_layout()
    plt.show()

show_sample_images(dataset)

visualizing distribution of classes

In [None]:
import collections

# One bar per label index, with height equal to the number of samples carrying that label.
counts = collections.Counter(label for _, label in dataset.imgs)
class_indices = list(counts)
images_per_class = [counts[class_index] for class_index in class_indices]

plt.bar(class_indices, images_per_class)
plt.xlabel('Class Index')
plt.ylabel('Number of Images')
plt.title('Class Distribution')
plt.show()


histogram of pixel distribution of sample

In [None]:
# Histogram of all (normalised) pixel values of the first sample in the dataset.
first_image, _first_label = dataset[0]
sample_img = first_image.numpy().flatten()

hist_fig, hist_ax = plt.subplots()
hist_ax.hist(sample_img, bins=50, color='blue', alpha=0.7)
hist_ax.set_title("Histogram of Pixel Values")
hist_ax.set_xlabel("Pixel Value")
hist_ax.set_ylabel("Frequency")
plt.show()


Average image of each class

In [None]:
def compute_average_image(dataset, class_idx):
    """Return the element-wise mean image over all samples labelled `class_idx`.

    The mean is accumulated as a float64 running sum, so only one image is held
    in memory at a time instead of a stacked array of the whole class.

    Args:
        dataset: Object exposing `imgs` (pairs whose second element is the label
            index) and integer indexing that returns `(image_tensor, label)`.
        class_idx: Label index whose images are averaged.

    Returns:
        numpy array with the shape of a single image. Floating-point images keep
        their dtype; other dtypes yield float64.

    Raises:
        ValueError: If no sample carries `class_idx`, or if the images of the
            class do not all share one shape.
    """
    running_sum = None
    n_images = 0
    img_dtype = None
    for idx, (_, label) in enumerate(dataset.imgs):
        if label != class_idx:
            continue
        img = dataset[idx][0].numpy()
        if running_sum is None:
            running_sum = np.zeros(img.shape, dtype=np.float64)
            img_dtype = img.dtype
        elif img.shape != running_sum.shape:
            # Refuse silent broadcasting of differently sized images.
            raise ValueError(
                f"image {idx} has shape {img.shape}, expected {running_sum.shape}"
            )
        running_sum += img
        n_images += 1

    if n_images == 0:
        raise ValueError(f"no images found for class index {class_idx}")

    out_dtype = img_dtype if np.issubdtype(img_dtype, np.floating) else np.float64
    return (running_sum / n_images).astype(out_dtype)

# Mean image per class, keyed by class name.
avg_images = {}
for class_name, class_index in dataset.class_to_idx.items():
    avg_images[class_name] = compute_average_image(dataset, class_index)

# One panel per class on a single row.
fig, axes = plt.subplots(nrows=1, ncols=len(avg_images), figsize=(15, 5))
for position, class_name in enumerate(avg_images):
    # Channel-first -> channel-last, then undo the 0.5 / 0.5 normalisation for display.
    displayable = np.transpose(avg_images[class_name], (1, 2, 0)) * 0.5 + 0.5
    panel = axes[position]
    panel.imshow(displayable)
    panel.set_title(f'Average: {class_name}')
    panel.axis('off')
plt.show()


dataset split and loading splits

In [None]:
data_indices = list(range(len(dataset)))
sample_labels = [label for _, label in dataset.imgs]

# Hold out 20% of all samples as the test set, stratified by label.
train_idx, test_idx = train_test_split(
    data_indices,
    test_size=0.2,
    stratify=sample_labels,
    random_state=42,
)

# Split 10% of the remaining samples off as the validation set, again stratified.
train_idx, val_idx = train_test_split(
    train_idx,
    test_size=0.1,
    stratify=[sample_labels[i] for i in train_idx],
    random_state=42,
)



In [None]:
from torch.utils.data import Subset, DataLoader

# Index-based views onto the single ImageFolder dataset.
train_dataset, val_dataset, test_dataset = (
    Subset(dataset, split_indices) for split_indices in (train_idx, val_idx, test_idx)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [None]:
import torch.nn as nn

# Step 2

Architecture of VGG16 with some modifications to match dataset

In [None]:
class VGG16(nn.Module):
    """VGG-style CNN with four convolutional blocks and a three-layer classifier.

    The feature extractor has 2 + 2 + 3 + 2 = 9 convolutions (3x3, padding 1), each
    followed by ReLU, with a 2x2 max-pool closing every block. Four pools reduce a
    64x64 input to 4x4 feature maps with 512 channels.

    An adaptive average pool to 4x4 sits between the feature extractor and the
    classifier. For 64x64 inputs it leaves the 4x4 maps unchanged; for other
    spatial sizes it lets the fixed-size first linear layer still be used. It has
    no parameters, so state_dict keys are identical to a model without it.

    Args:
        num_classes: Size of the final linear layer's output (number of logits).
    """

    # Output channels of each convolution; 'M' marks a 2x2 max-pool.
    _FEATURE_CFG = (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 'M')

    def __init__(self, num_classes=3):
        super(VGG16, self).__init__()
        self.features = self._make_features(self._FEATURE_CFG, in_channels=3)
        self.avgpool = nn.AdaptiveAvgPool2d((4, 4))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 4 * 4, 4096),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )

    @staticmethod
    def _make_features(cfg, in_channels):
        """Build the Conv-ReLU / MaxPool stack described by `cfg` as one nn.Sequential."""
        layers = []
        for item in cfg:
            if item == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.append(nn.Conv2d(in_channels, item, kernel_size=3, padding=1))
                layers.append(nn.ReLU(inplace=True))
                in_channels = item
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of 3-channel images (N, 3, H, W) to logits (N, num_classes)."""
        x = self.features(x)
        x = self.avgpool(x)
        # Flatten everything except the batch dimension for the linear layers.
        x = x.flatten(1)
        x = self.classifier(x)
        return x

vgg_model = VGG16(num_classes=3)


1. This architecture uses convolution layers that learn low- to high-level features progressively: the early layers capture patterns such as edges and textures, and the later layers capture more complex shapes and objects.

2. The MaxPool2d layers reduce the spatial dimensions, which not only decreases the computational load but also makes the learned features more robust to small translations in the input image.

3. The classification section aggregates the extracted features into a compact representation. This is important to distinguish between the three classes by learning complex, non-linear relationships.

4. Dropout will help to decrease the level of overfitting. As we are training on 3 classes, each class (dogs, food, vehicles) might have varying amounts of data or some minor intra-class variability.

5. Here, I have used one block fewer than the original paper (four convolutional blocks, with two convolutions in the last one). Additionally, the input is 64 × 64, so the final feature maps are 4 × 4. VGG was designed for a 1000-class output, but this version is modified for 3 classes.

<span style="color:salmon; font-size:20px;">Model with kaiming initialization, Adam optimizer and 64 batch size (dropout, learning rate scheduler, regularization and transformations used):</span>

using Kaiming initialization first

In [None]:
def init_weights(m, init_type='xavier'):
    """Initialise one module in place; intended for use with `model.apply(...)`.

    Conv2d and Linear modules get their weight re-drawn and their bias (if any)
    set to zero. All other module types are left untouched.

    Args:
        m: The module visited by `apply`.
        init_type: 'xavier' (alias 'glorot') for Xavier-uniform weights, or
            'he' (alias 'kaiming') for Kaiming-uniform weights with ReLU gain.

    Raises:
        ValueError: If `init_type` is not one of the names above. Previously an
            unknown name silently kept the default weights but still zeroed biases.
    """
    if init_type in ('xavier', 'glorot'):
        use_kaiming = False
    elif init_type in ('he', 'kaiming'):
        use_kaiming = True
    else:
        raise ValueError(
            f"unknown init_type {init_type!r}; expected 'xavier'/'glorot' or 'he'/'kaiming'"
        )

    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return

    if use_kaiming:
        nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
    else:
        nn.init.xavier_uniform_(m.weight)
    if m.bias is not None:
        nn.init.zeros_(m.bias)

vgg_model.apply(lambda m: init_weights(m, init_type='he'))

adam optimizer with regularization

In [None]:
import torch.optim as optim

optimizer_adam = optim.Adam(vgg_model.parameters(), lr=0.001, weight_decay=1e-4)


In [None]:
!pip install wandb

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_model.to(device)

In [None]:
import wandb
import torch
import torch.nn.functional as F

starting training with learning rate scheduler and wandb to log the training

In [None]:
wandb.init(project="my_vgg16_project", name="vgg16_he_wandb")

criterion = nn.CrossEntropyLoss()
# ReduceLROnPlateau driven by the validation loss (mode='min', factor 0.1, patience 5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_adam, mode='min', factor=0.1, patience=5)

num_epochs = 20
best_val_acc = 0

# Per-epoch history; stored in the checkpoint and plotted at the end of the notebook.
train_loss_list, val_loss_list = [], []
train_acc_list,  val_acc_list  = [], []

# Epochs are numbered from 1 so the console reads "1/20 ... 20/20" and the wandb
# "Epoch" field uses the same 1-based numbering as the other runs in this notebook.
for epoch in range(1, num_epochs + 1):

    # ---- training pass ----
    vgg_model.train()
    running_train_loss = 0.0
    correct_train = 0
    total_train   = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer_adam.zero_grad()
        outputs = vgg_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_adam.step()

        # Weight each batch loss by its size so a smaller final batch does not skew
        # the epoch average (same convention as the test-set evaluation cells).
        running_train_loss += loss.item() * labels.size(0)
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    train_acc = 100. * correct_train / total_train
    avg_train_loss = running_train_loss / total_train

    # ---- validation pass (no gradient tracking, dropout disabled via eval()) ----
    vgg_model.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val   = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = vgg_model(images)
            loss = criterion(outputs, labels)

            running_val_loss += loss.item() * labels.size(0)
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    val_acc = 100. * correct_val / total_val
    avg_val_loss = running_val_loss / total_val

    scheduler.step(avg_val_loss)

    wandb.log({
        "Epoch": epoch,
        "Train Loss": avg_train_loss,
        "Train Accuracy": train_acc,
        "Val Loss": avg_val_loss,
        "Val Accuracy": val_acc,
        "Learning Rate": optimizer_adam.param_groups[0]['lr']
    })

    print(f"Epoch {epoch}/{num_epochs}: "
          f"Train Acc: {train_acc:.2f}% | Train Loss: {avg_train_loss:.4f} | "
          f"Val Acc: {val_acc:.2f}% | Val Loss: {avg_val_loss:.4f}")

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # Keep the weights from the epoch with the highest validation accuracy so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(vgg_model.state_dict(), 'vgg16_he_best.pth')


wandb.finish()


In [None]:
# Snapshot of the run as it stands after the last epoch. Note that
# 'model_state_dict' holds the final-epoch weights; the best-validation weights
# are the ones written separately to 'vgg16_he_best.pth' during training.
metric_history = {
    'train_loss_history': train_loss_list,
    'val_loss_history': val_loss_list,
    'train_acc_history': train_acc_list,
    'val_acc_history': val_acc_list,
}
checkpoint = {
    'epoch': num_epochs,
    'model_state_dict': vgg_model.state_dict(),
    'optimizer_state_dict': optimizer_adam.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    **metric_history,
    'best_val_acc': best_val_acc,
}
torch.save(checkpoint, 'checkpoint_vgg16_he.pth')

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

loading this model for evaluation

In [None]:
# Rebuild the architecture and restore the weights saved at the best validation accuracy.
best_model = VGG16(num_classes=3).to(device)
best_state = torch.load("vgg16_he_best.pth", map_location=device, weights_only=True)
best_model.load_state_dict(best_state)
best_model.eval()

criterion = nn.CrossEntropyLoss()

# Accumulators; all_preds / all_labels feed the confusion-matrix and metric cells.
all_preds, all_labels = [], []
test_loss, correct, total = 0.0, 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = best_model(images)

        # The batch loss is scaled by the batch size so that dividing by `total`
        # below yields a per-sample average.
        loss = criterion(outputs, labels)
        test_loss += loss.item() * images.size(0)

        predicted = outputs.max(dim=1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss = test_loss / total
test_acc = 100. * correct / total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

In [None]:
# Confusion matrix of the test predictions, annotated with raw counts.
cm = confusion_matrix(y_true=all_labels, y_pred=all_preds)
class_names = dataset.classes

plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    data=cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted Label")
heatmap_ax.set_ylabel("True Label")
heatmap_ax.set_title("Confusion Matrix - VGG16")
plt.show()

In [None]:
# Weighted-average precision / recall / F1 over the test predictions.
weighted_scores = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision, recall, f1 = weighted_scores[:3]
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

<span style="color:salmon; font-size:20px;">Model with Xavier initialization, Adam optimizer and 64 batch size (dropout, learning rate scheduler, regularization and transformations used):</span>

In [None]:
vgg_model_xavier = VGG16(num_classes=3)

In [None]:
vgg_model_xavier.apply(lambda m: init_weights(m, init_type='xavier'))

In [None]:
optimizer_adam = optim.Adam(vgg_model_xavier.parameters(), lr=0.001, weight_decay=1e-4)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_model_xavier.to(device)

training loop

In [None]:
criterion = nn.CrossEntropyLoss()
# ReduceLROnPlateau driven by the validation loss (mode='min', factor 0.1, patience 5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_adam, mode='min', factor=0.1, patience=5)

num_epochs = 20
best_val_acc = 0

# Per-epoch history; replayed into wandb and stored in the checkpoint afterwards.
train_loss_list, val_loss_list = [], []
train_acc_list,  val_acc_list  = [], []

for epoch in range(1, num_epochs+1):

    # ---- training pass ----
    vgg_model_xavier.train()
    running_train_loss = 0.0
    correct_train = 0
    total_train   = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer_adam.zero_grad()
        outputs = vgg_model_xavier(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_adam.step()

        # Weight each batch loss by its size so a smaller final batch does not skew
        # the epoch average (same convention as the test-set evaluation cells).
        running_train_loss += loss.item() * labels.size(0)
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    train_acc = 100. * correct_train / total_train
    avg_train_loss = running_train_loss / total_train

    # ---- validation pass (no gradient tracking, dropout disabled via eval()) ----
    vgg_model_xavier.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val   = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = vgg_model_xavier(images)
            loss = criterion(outputs, labels)

            running_val_loss += loss.item() * labels.size(0)
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    val_acc = 100. * correct_val / total_val
    avg_val_loss = running_val_loss / total_val

    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch}/{num_epochs}: "
          f"Train Acc: {train_acc:.2f}% | Train Loss: {avg_train_loss:.4f} | "
          f"Val Acc: {val_acc:.2f}% | Val Loss: {avg_val_loss:.4f}")

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # Keep the weights from the epoch with the highest validation accuracy so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(vgg_model_xavier.state_dict(), 'vgg16_xavier_best.pth')

saving logs to wandb

In [None]:
wandb.init(project="my_vgg16_project", name="vgg16_xavier_wandb")
num_epochs = 20

try:
    # Replay whatever the training cell recorded rather than assuming exactly
    # `num_epochs` entries, so an interrupted run does not raise IndexError.
    epoch_records = zip(train_loss_list, train_acc_list, val_loss_list, val_acc_list)
    for epoch, (logged_train_loss, logged_train_acc, logged_val_loss, logged_val_acc) in enumerate(epoch_records, start=1):
        wandb.log({
            "Epoch": epoch,
            "Train Loss": logged_train_loss,
            "Train Accuracy": logged_train_acc,
            "Val Loss": logged_val_loss,
            "Val Accuracy": logged_val_acc
        })
finally:
    # Close the run even if logging fails part-way through.
    wandb.finish()


In [None]:
# Snapshot of the run as it stands after the last epoch. Note that
# 'model_state_dict' holds the final-epoch weights; the best-validation weights
# are the ones written separately to 'vgg16_xavier_best.pth' during training.
metric_history = {
    'train_loss_history': train_loss_list,
    'val_loss_history': val_loss_list,
    'train_acc_history': train_acc_list,
    'val_acc_history': val_acc_list,
}
checkpoint = {
    'epoch': num_epochs,
    'model_state_dict': vgg_model_xavier.state_dict(),
    'optimizer_state_dict': optimizer_adam.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    **metric_history,
    'best_val_acc': best_val_acc,
}
torch.save(checkpoint, 'checkpoint_vgg16_xavier.pth')

evaluation on testing dataset

In [None]:
# Rebuild the architecture and restore the weights saved at the best validation accuracy.
best_model = VGG16(num_classes=3).to(device)
best_state = torch.load("vgg16_xavier_best.pth", map_location=device, weights_only=True)
best_model.load_state_dict(best_state)
best_model.eval()

criterion = nn.CrossEntropyLoss()

# Accumulators; all_preds / all_labels feed the confusion-matrix and metric cells.
all_preds, all_labels = [], []
test_loss, correct, total = 0.0, 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = best_model(images)

        # The batch loss is scaled by the batch size so that dividing by `total`
        # below yields a per-sample average.
        loss = criterion(outputs, labels)
        test_loss += loss.item() * images.size(0)

        predicted = outputs.max(dim=1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss = test_loss / total
test_acc = 100. * correct / total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

In [None]:
# Confusion matrix of the test predictions, annotated with raw counts.
cm = confusion_matrix(y_true=all_labels, y_pred=all_preds)
class_names = dataset.classes

plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    data=cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted Label")
heatmap_ax.set_ylabel("True Label")
heatmap_ax.set_title("Confusion Matrix - VGG16")
plt.show()

In [None]:
# Weighted-average precision / recall / F1 over the test predictions.
weighted_scores = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision, recall, f1 = weighted_scores[:3]
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Kaiming initialization is better

<span style="color:salmon; font-size:20px;">Model with kaiming initialization, AdamW optimizer and 64 batch size (dropout, learning rate scheduler, regularization and transformations used):</span>

In [None]:
vgg_model_adamw = VGG16(num_classes=3)

In [None]:
vgg_model_adamw.apply(lambda m: init_weights(m, init_type='he'))

In [None]:
optimizer_adamw = optim.AdamW(vgg_model_adamw.parameters(), lr=0.001, weight_decay=1e-4)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_model_adamw.to(device)

training

In [None]:
criterion = nn.CrossEntropyLoss()
# ReduceLROnPlateau driven by the validation loss (mode='min', factor 0.1, patience 5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_adamw, mode='min', factor=0.1, patience=5)

num_epochs = 20
best_val_acc = 0

# Per-epoch history; replayed into wandb and stored in the checkpoint afterwards.
train_loss_list, val_loss_list = [], []
train_acc_list,  val_acc_list  = [], []

for epoch in range(1, num_epochs+1):

    # ---- training pass ----
    vgg_model_adamw.train()
    running_train_loss = 0.0
    correct_train = 0
    total_train   = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer_adamw.zero_grad()
        outputs = vgg_model_adamw(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_adamw.step()

        # Weight each batch loss by its size so a smaller final batch does not skew
        # the epoch average (same convention as the test-set evaluation cells).
        running_train_loss += loss.item() * labels.size(0)
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    train_acc = 100. * correct_train / total_train
    avg_train_loss = running_train_loss / total_train

    # ---- validation pass (no gradient tracking, dropout disabled via eval()) ----
    vgg_model_adamw.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val   = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = vgg_model_adamw(images)
            loss = criterion(outputs, labels)

            running_val_loss += loss.item() * labels.size(0)
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    val_acc = 100. * correct_val / total_val
    avg_val_loss = running_val_loss / total_val

    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch}/{num_epochs}: "
          f"Train Acc: {train_acc:.2f}% | Train Loss: {avg_train_loss:.4f} | "
          f"Val Acc: {val_acc:.2f}% | Val Loss: {avg_val_loss:.4f}")

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # Keep the weights from the epoch with the highest validation accuracy so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(vgg_model_adamw.state_dict(), 'vgg16_adamw_best.pth')

High loss on validation

wandb logs

In [None]:
wandb.init(project="my_vgg16_project", name="vgg16_adamw_wandb")
num_epochs = 20

try:
    # Replay whatever the training cell recorded rather than assuming exactly
    # `num_epochs` entries, so an interrupted run does not raise IndexError.
    epoch_records = zip(train_loss_list, train_acc_list, val_loss_list, val_acc_list)
    for epoch, (logged_train_loss, logged_train_acc, logged_val_loss, logged_val_acc) in enumerate(epoch_records, start=1):
        wandb.log({
            "Epoch": epoch,
            "Train Loss": logged_train_loss,
            "Train Accuracy": logged_train_acc,
            "Val Loss": logged_val_loss,
            "Val Accuracy": logged_val_acc
        })
finally:
    # Close the run even if logging fails part-way through.
    wandb.finish()


In [None]:
# Snapshot of the run as it stands after the last epoch. Note that
# 'model_state_dict' holds the final-epoch weights; the best-validation weights
# are the ones written separately to 'vgg16_adamw_best.pth' during training.
metric_history = {
    'train_loss_history': train_loss_list,
    'val_loss_history': val_loss_list,
    'train_acc_history': train_acc_list,
    'val_acc_history': val_acc_list,
}
checkpoint = {
    'epoch': num_epochs,
    'model_state_dict': vgg_model_adamw.state_dict(),
    'optimizer_state_dict': optimizer_adamw.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    **metric_history,
    'best_val_acc': best_val_acc,
}
torch.save(checkpoint, 'checkpoint_vgg16_adamw.pth')

evaluation on testing dataset

In [None]:
# Rebuild the architecture and restore the weights saved at the best validation accuracy.
best_model = VGG16(num_classes=3).to(device)
best_state = torch.load("vgg16_adamw_best.pth", map_location=device, weights_only=True)
best_model.load_state_dict(best_state)
best_model.eval()

criterion = nn.CrossEntropyLoss()

# Accumulators; all_preds / all_labels feed the confusion-matrix and metric cells.
all_preds, all_labels = [], []
test_loss, correct, total = 0.0, 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = best_model(images)

        # The batch loss is scaled by the batch size so that dividing by `total`
        # below yields a per-sample average.
        loss = criterion(outputs, labels)
        test_loss += loss.item() * images.size(0)

        predicted = outputs.max(dim=1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss = test_loss / total
test_acc = 100. * correct / total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

In [None]:
# Confusion matrix of the test predictions, annotated with raw counts.
cm = confusion_matrix(y_true=all_labels, y_pred=all_preds)
class_names = dataset.classes

plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    data=cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted Label")
heatmap_ax.set_ylabel("True Label")
heatmap_ax.set_title("Confusion Matrix - VGG16")
plt.show()

In [None]:
# Weighted-average precision / recall / F1 over the test predictions.
weighted_scores = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision, recall, f1 = weighted_scores[:3]
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

<span style="color:salmon; font-size:20px;">Model with kaiming initialization, SGD optimizer and 64 batch size (dropout, learning rate scheduler, regularization and transformations used):</span>

In [None]:
vgg_model_sgd = VGG16(num_classes=3)

In [None]:
vgg_model_sgd.apply(lambda m: init_weights(m, init_type='he'))

In [None]:
optimizer_sgd = optim.SGD(vgg_model_sgd.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_model_sgd.to(device)

training

In [None]:
criterion = nn.CrossEntropyLoss()
# ReduceLROnPlateau driven by the validation loss (mode='min', factor 0.1, patience 5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sgd, mode='min', factor=0.1, patience=5)

num_epochs = 20
best_val_acc = 0

# Per-epoch history; replayed into wandb and stored in the checkpoint afterwards.
train_loss_list, val_loss_list = [], []
train_acc_list,  val_acc_list  = [], []

for epoch in range(1, num_epochs+1):

    # ---- training pass ----
    vgg_model_sgd.train()
    running_train_loss = 0.0
    correct_train = 0
    total_train   = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer_sgd.zero_grad()
        outputs = vgg_model_sgd(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_sgd.step()

        # Weight each batch loss by its size so a smaller final batch does not skew
        # the epoch average (same convention as the test-set evaluation cells).
        running_train_loss += loss.item() * labels.size(0)
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    train_acc = 100. * correct_train / total_train
    avg_train_loss = running_train_loss / total_train

    # ---- validation pass (no gradient tracking, dropout disabled via eval()) ----
    vgg_model_sgd.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val   = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = vgg_model_sgd(images)
            loss = criterion(outputs, labels)

            running_val_loss += loss.item() * labels.size(0)
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    val_acc = 100. * correct_val / total_val
    avg_val_loss = running_val_loss / total_val

    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch}/{num_epochs}: "
          f"Train Acc: {train_acc:.2f}% | Train Loss: {avg_train_loss:.4f} | "
          f"Val Acc: {val_acc:.2f}% | Val Loss: {avg_val_loss:.4f}")

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # Keep the weights from the epoch with the highest validation accuracy so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(vgg_model_sgd.state_dict(), 'vgg16_sgd_best.pth')

In [None]:
wandb.init(project="my_vgg16_project", name="vgg16_sgd_wandb")
num_epochs = 20

try:
    # Replay whatever the training cell recorded rather than assuming exactly
    # `num_epochs` entries, so an interrupted run does not raise IndexError.
    epoch_records = zip(train_loss_list, train_acc_list, val_loss_list, val_acc_list)
    for epoch, (logged_train_loss, logged_train_acc, logged_val_loss, logged_val_acc) in enumerate(epoch_records, start=1):
        wandb.log({
            "Epoch": epoch,
            "Train Loss": logged_train_loss,
            "Train Accuracy": logged_train_acc,
            "Val Loss": logged_val_loss,
            "Val Accuracy": logged_val_acc
        })
finally:
    # Close the run even if logging fails part-way through.
    wandb.finish()

In [None]:
# Snapshot of the run as it stands after the last epoch. Note that
# 'model_state_dict' holds the final-epoch weights; the best-validation weights
# are the ones written separately to 'vgg16_sgd_best.pth' during training.
metric_history = {
    'train_loss_history': train_loss_list,
    'val_loss_history': val_loss_list,
    'train_acc_history': train_acc_list,
    'val_acc_history': val_acc_list,
}
checkpoint = {
    'epoch': num_epochs,
    'model_state_dict': vgg_model_sgd.state_dict(),
    'optimizer_state_dict': optimizer_sgd.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    **metric_history,
    'best_val_acc': best_val_acc,
}
torch.save(checkpoint, 'checkpoint_vgg16_sgd.pth')

evaluation on testing dataset

In [None]:
# Rebuild the architecture and restore the weights saved at the best validation accuracy.
best_model = VGG16(num_classes=3).to(device)
best_state = torch.load("vgg16_sgd_best.pth", map_location=device, weights_only=True)
best_model.load_state_dict(best_state)
best_model.eval()

criterion = nn.CrossEntropyLoss()

# Accumulators; all_preds / all_labels feed the confusion-matrix and metric cells.
all_preds, all_labels = [], []
test_loss, correct, total = 0.0, 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = best_model(images)

        # The batch loss is scaled by the batch size so that dividing by `total`
        # below yields a per-sample average.
        loss = criterion(outputs, labels)
        test_loss += loss.item() * images.size(0)

        predicted = outputs.max(dim=1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss = test_loss / total
test_acc = 100. * correct / total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

In [None]:
# Confusion matrix of the test predictions, annotated with raw counts.
cm = confusion_matrix(y_true=all_labels, y_pred=all_preds)
class_names = dataset.classes

plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    data=cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted Label")
heatmap_ax.set_ylabel("True Label")
heatmap_ax.set_title("Confusion Matrix - VGG16")
plt.show()

In [None]:
# Weighted-average precision / recall / F1 over the test predictions.
weighted_scores = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision, recall, f1 = weighted_scores[:3]
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Decision: <br>
The model with Kaiming initialization, Adam optimizer, and batch size 64 (with the optimizations listed above) performs best overall. Using AdamW instead of Adam with the same configuration gives a slightly higher test accuracy of *94.32%* compared to *93.67%* for Adam, but its test loss is higher (*0.2284* for AdamW versus *0.2183* for Adam), and its validation loss is also high. So we will proceed with Kaiming initialization, Adam optimizer, and batch size 64.

<span style="color:salmon; font-size:20px;">Model with kaiming initialization, Adam optimizer and 128 batch size (same optimizations):</span>

In [None]:
# Same index splits as before; only the batch size changes (128 instead of 64).
train_dataset, val_dataset, test_dataset = (
    Subset(dataset, split_indices) for split_indices in (train_idx, val_idx, test_idx)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

In [None]:
vgg_model_batch = VGG16(num_classes=3)

In [None]:
vgg_model_batch.apply(lambda m: init_weights(m, init_type='he'))

In [None]:
optimizer_adam = optim.Adam(vgg_model_batch.parameters(), lr=0.001, weight_decay=1e-4)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_model_batch.to(device)

training loop

In [None]:
criterion = nn.CrossEntropyLoss()
# ReduceLROnPlateau driven by the validation loss (mode='min', factor 0.1, patience 5).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_adam, mode='min', factor=0.1, patience=5)

num_epochs = 20
best_val_acc = 0

# Per-epoch history; replayed into wandb and stored in the checkpoint afterwards.
train_loss_list, val_loss_list = [], []
train_acc_list,  val_acc_list  = [], []

for epoch in range(1, num_epochs+1):

    # ---- training pass ----
    vgg_model_batch.train()
    running_train_loss = 0.0
    correct_train = 0
    total_train   = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer_adam.zero_grad()
        outputs = vgg_model_batch(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_adam.step()

        # Weight each batch loss by its size so a smaller final batch does not skew
        # the epoch average (same convention as the test-set evaluation cells).
        running_train_loss += loss.item() * labels.size(0)
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    train_acc = 100. * correct_train / total_train
    avg_train_loss = running_train_loss / total_train

    # ---- validation pass (no gradient tracking, dropout disabled via eval()) ----
    vgg_model_batch.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val   = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = vgg_model_batch(images)
            loss = criterion(outputs, labels)

            running_val_loss += loss.item() * labels.size(0)
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    val_acc = 100. * correct_val / total_val
    avg_val_loss = running_val_loss / total_val

    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch}/{num_epochs}: "
          f"Train Acc: {train_acc:.2f}% | Train Loss: {avg_train_loss:.4f} | "
          f"Val Acc: {val_acc:.2f}% | Val Loss: {avg_val_loss:.4f}")

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # Keep the weights from the epoch with the highest validation accuracy so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(vgg_model_batch.state_dict(), 'vgg16_batch_best.pth')

In [None]:
wandb.init(project="my_vgg16_project", name="vgg16_batch_wandb")
num_epochs = 20

try:
    # Replay whatever the training cell recorded rather than assuming exactly
    # `num_epochs` entries, so an interrupted run does not raise IndexError.
    epoch_records = zip(train_loss_list, train_acc_list, val_loss_list, val_acc_list)
    for epoch, (logged_train_loss, logged_train_acc, logged_val_loss, logged_val_acc) in enumerate(epoch_records, start=1):
        wandb.log({
            "Epoch": epoch,
            "Train Loss": logged_train_loss,
            "Train Accuracy": logged_train_acc,
            "Val Loss": logged_val_loss,
            "Val Accuracy": logged_val_acc
        })
finally:
    # Close the run even if logging fails part-way through.
    wandb.finish()

saving the checkpoint, then evaluating on the testing dataset

In [None]:
# Snapshot of the run as it stands after the last epoch. Note that
# 'model_state_dict' holds the final-epoch weights; the best-validation weights
# are the ones written separately to 'vgg16_batch_best.pth' during training.
metric_history = {
    'train_loss_history': train_loss_list,
    'val_loss_history': val_loss_list,
    'train_acc_history': train_acc_list,
    'val_acc_history': val_acc_list,
}
checkpoint = {
    'epoch': num_epochs,
    'model_state_dict': vgg_model_batch.state_dict(),
    'optimizer_state_dict': optimizer_adam.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    **metric_history,
    'best_val_acc': best_val_acc,
}
torch.save(checkpoint, 'checkpoint_vgg16_batch.pth')

In [None]:
# Rebuild the architecture and restore the weights saved at the best validation accuracy.
best_model = VGG16(num_classes=3).to(device)
best_state = torch.load("vgg16_batch_best.pth", map_location=device, weights_only=True)
best_model.load_state_dict(best_state)
best_model.eval()

criterion = nn.CrossEntropyLoss()

# Accumulators; all_preds / all_labels feed the confusion-matrix and metric cells.
all_preds, all_labels = [], []
test_loss, correct, total = 0.0, 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = best_model(images)

        # The batch loss is scaled by the batch size so that dividing by `total`
        # below yields a per-sample average.
        loss = criterion(outputs, labels)
        test_loss += loss.item() * images.size(0)

        predicted = outputs.max(dim=1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss = test_loss / total
test_acc = 100. * correct / total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

In [None]:
# Confusion matrix of the test predictions, annotated with raw counts.
cm = confusion_matrix(y_true=all_labels, y_pred=all_preds)
class_names = dataset.classes

plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    data=cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted Label")
heatmap_ax.set_ylabel("True Label")
heatmap_ax.set_title("Confusion Matrix - VGG16")
plt.show()

In [None]:
# Support-weighted precision / recall / F1 over the collected test predictions.
weighted_scores = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision, recall, f1 = weighted_scores[:3]  # fourth entry (support) is not reported
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Decision: <br>
After trying different initializations, optimizers, and batch sizes combined with several optimization techniques, our best choice is Kaiming initialization, the Adam optimizer, a batch size of 64, and techniques such as dropout, data transformations, regularization, and a learning rate scheduler.

Loading the model for full visualization

In [None]:
# Rebuild VGG-16 and restore its weights plus the recorded training histories
# from the saved checkpoint so the curves can be plotted without retraining.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VGG16(num_classes=3).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint cannot execute arbitrary code. The same file is loaded this
# way elsewhere in the notebook.
checkpoint = torch.load("checkpoint_vgg16_he.pth", map_location=device, weights_only=True)

model.load_state_dict(checkpoint['model_state_dict'])

train_loss_history = checkpoint['train_loss_history']
val_loss_history   = checkpoint['val_loss_history']
train_acc_history  = checkpoint['train_acc_history']
val_acc_history    = checkpoint['val_acc_history']

# Older checkpoints may lack this key; fall back to 0 rather than failing.
best_val_acc = checkpoint.get('best_val_acc', 0)
print(f"Checkpoint loaded. Best Val Accuracy: {best_val_acc:.2f}%")

In [None]:
# Training vs. validation accuracy per epoch for the restored VGG-16 run.
# `epochs` is 1-based and is reused by the loss plot in the next cell.
epochs = range(1, len(train_acc_history) + 1)

plt.figure(figsize=(8,5))
plt.plot(epochs, train_acc_history, 'b-', label='Train Accuracy')
plt.plot(epochs, val_acc_history,   'r-', label='Val Accuracy')
plt.xlabel('Epoch')
# Accuracies in this notebook are computed as 100 * correct / total and printed with
# a '%' suffix, so the axis is labelled in percent rather than as a fraction.
# NOTE(review): confirm the he-init training loop stored percentages as well.
plt.ylabel('Accuracy (%)')
plt.title('Training vs. Validation Accuracy')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch, on the same 1-based `epochs` axis as the accuracy plot.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(epochs, train_loss_history, 'b-', label='Train Loss')
ax.plot(epochs, val_loss_history,   'r-', label='Val Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training vs. Validation Loss')
ax.legend()
plt.show()

In [None]:
# Score the checkpoint-restored `model` on the test split and keep every
# prediction/label pair for the confusion matrix and metrics that follow.
model.eval()
criterion = nn.CrossEntropyLoss()

all_preds = []
all_labels = []
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)

        # Batch-mean loss times batch size = summed loss; divided by `total` below.
        samples_in_batch = images.size(0)
        loss = criterion(outputs, labels)
        test_loss += samples_in_batch * loss.item()

        predicted = outputs.max(dim=1).indices
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss = test_loss / total
test_acc = 100. * correct / total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

In [None]:
# Annotated confusion-matrix heatmap for the restored VGG-16 model
# (rows: true class, columns: predicted class).
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=dataset.classes,
    yticklabels=dataset.classes,
)
heatmap_ax.set_xlabel("Predicted Label")
heatmap_ax.set_ylabel("True Label")
heatmap_ax.set_title("Confusion Matrix - VGG16")
plt.show()

In [None]:
# Weighted-average precision, recall and F1 for the restored VGG-16 model on the test split.
prf_result = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision = prf_result[0]
recall = prf_result[1]
f1 = prf_result[2]
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Visualizing misclassified images

In [None]:
# Collect the first few misclassified test images and show them in a grid
# together with their true and predicted class names.
GRID_ROWS, GRID_COLS = 2, 3
MAX_MISCLASSIFIED = GRID_ROWS * GRID_COLS

misclassified_images = []
misclassified_preds = []
misclassified_labels = []

model.eval()
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = outputs.max(1)

        # Positions inside this batch where the prediction is wrong.
        wrong_positions = (predicted != labels).nonzero(as_tuple=True)[0].tolist()
        for i in wrong_positions:
            if len(misclassified_images) >= MAX_MISCLASSIFIED:
                break
            misclassified_images.append(images[i].cpu())
            misclassified_preds.append(predicted[i].cpu().item())
            misclassified_labels.append(labels[i].cpu().item())

        # Stop scanning the loader once the grid can be filled.
        if len(misclassified_images) >= MAX_MISCLASSIFIED:
            break

fig, axes = plt.subplots(GRID_ROWS, GRID_COLS, figsize=(12,8))
axes = axes.flatten()
for i, ax in enumerate(axes):
    ax.axis('off')
    # Fewer mistakes than grid cells: leave the remaining panels blank instead of
    # raising IndexError.
    if i >= len(misclassified_images):
        continue
    # Undo the normalization before display; without this, clip(0, 1) throws away
    # the darker half of the value range.
    # NOTE(review): assumes the test transform ends with Normalize(mean=0.5, std=0.5),
    # as in the dataset transform at the top of the notebook — confirm.
    img = (misclassified_images[i] * 0.5 + 0.5).permute(1, 2, 0).numpy()
    ax.imshow(img.clip(0,1))
    ax.set_title(f"True: {dataset.classes[misclassified_labels[i]]}\n"
                 f"Pred: {dataset.classes[misclassified_preds[i]]}")
plt.tight_layout()
plt.show()


# Step 3

<span style="color:salmon; font-size:20px;"> ResNet 18</span>

In [None]:
# ---- ResNet-18 experiment configuration ----
BATCH_SIZE   = 64
LR           = 0.001
EPOCHS       = 20
NUM_CLASSES  = 3
RANDOM_STATE = 42

# Seed the global RNGs as well as the dataset split, so weight initialization and
# DataLoader shuffling repeat across "Restart & Run All".
# (GPU kernels may still introduce small non-deterministic differences.)
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


dataset_dir = "/content/cnn_dataset"


# Convert to tensors in [0, 1], then map each of the three channels to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5],
                         std=[0.5, 0.5, 0.5])
])

# ImageFolder derives the class labels from the sub-directory names.
full_dataset = datasets.ImageFolder(root=dataset_dir, transform=transform)

In [None]:
# Stratified train/val/test split on sample indices.
# 15% goes to test; 0.1765 of the remaining 85% (about 15% of the total) goes to
# validation, leaving roughly 70% for training.
all_targets = [target for _, target in full_dataset.samples]

train_val_indices, test_indices = train_test_split(
    range(len(full_dataset)),
    test_size=0.15,
    random_state=RANDOM_STATE,
    stratify=all_targets,
)

train_indices, val_indices = train_test_split(
    train_val_indices,
    test_size=0.1765,
    random_state=RANDOM_STATE,
    stratify=[all_targets[i] for i in train_val_indices],
)

In [None]:
# Wrap each index list in a Subset view of the full dataset and build one loader
# per split. Only the training loader shuffles, so validation and test order stays fixed.
train_dataset, val_dataset, test_dataset = (
    Subset(full_dataset, split_indices)
    for split_indices in (train_indices, val_indices, test_indices)
)

shared_loader_args = dict(batch_size=BATCH_SIZE, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader   = DataLoader(val_dataset, shuffle=False, **shared_loader_args)
test_loader  = DataLoader(test_dataset, shuffle=False, **shared_loader_args)

In [None]:
# Sanity check: report how many images ended up in each split.
print(f"Train set: {len(train_dataset)} images")
print(f"Val set:   {len(val_dataset)} images")
print(f"Test set:  {len(test_dataset)} images")

In [None]:
class BasicBlock(nn.Module):
    """Two-convolution residual block: conv-BN-ReLU, conv-BN, add shortcut, ReLU.

    Args:
        in_channels: Number of channels of the block input.
        out_channels: Number of channels produced by both 3x3 convolutions.
        stride: Stride of the first convolution (the second always uses 1).
        downsample: Optional module applied to the input so the shortcut matches
            the main path's shape; when ``None`` the input is added unchanged.
    """

    # Output channels = out_channels * expansion; read by ResNet18 to size its classifier.
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # Sub-modules are created in a fixed order so parameter names and
        # initialization order stay stable.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        # Main path: conv1 -> bn1 -> relu -> conv2 -> bn2.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut path: the raw input, or its projection when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet18(nn.Module):
    """ResNet-18 style classifier built from four stages of two ``BasicBlock``s each.

    The stem is a single stride-1 3x3 convolution and the max-pool slot is an
    identity, so no spatial downsampling happens before the residual stages.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        # Running channel count, updated by _make_layer as stages are stacked.
        self.in_channels = 64

        # Stem.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # Placeholder that passes its input through unchanged.
        self.maxpool = nn.Identity()

        # Residual stages; stages 2-4 halve the spatial size via stride 2.
        self.layer1 = self._make_layer(64, 2, stride=1)
        self.layer2 = self._make_layer(128, 2, stride=2)
        self.layer3 = self._make_layer(256, 2, stride=2)
        self.layer4 = self._make_layer(512, 2, stride=2)

        # Head: global average pool followed by a linear classifier.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * BasicBlock.expansion, num_classes)

    def _make_layer(self, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` BasicBlocks and advance ``self.in_channels``.

        Only the first block applies ``stride`` and, when the shape changes,
        a 1x1-conv + BatchNorm projection on the shortcut.
        """
        needs_projection = stride != 1 or self.in_channels != out_channels
        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [BasicBlock(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels
        # Remaining blocks keep the shape, so they use the defaults (stride 1, no projection).
        stage.extend(BasicBlock(self.in_channels, out_channels) for _ in range(blocks - 1))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Map an image batch to class logits."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)


In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    When ``optimizer`` is given the model is put in train mode and updated after
    every batch; otherwise the model is put in eval mode and gradients are disabled.

    The loss is averaged per sample (each batch mean is weighted by its batch size),
    so a smaller final batch does not skew the epoch average.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(is_training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            _, predicted = outputs.max(1)
            n_seen += batch_size
            n_correct += predicted.eq(labels).sum().item()

    return loss_sum / n_seen, 100.0 * n_correct / n_seen


model_resnet = ResNet18(num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_resnet.parameters(), lr=LR, weight_decay=1e-4)
# Cut the learning rate by 10x when validation loss has not improved for 5 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

train_loss_list = []
val_loss_list   = []
train_acc_list  = []
val_acc_list    = []

best_val_acc = 0.0


for epoch in range(EPOCHS):
    avg_train_loss, train_acc = _run_epoch(model_resnet, train_loader, criterion, DEVICE, optimizer)
    avg_val_loss, val_acc = _run_epoch(model_resnet, val_loader, criterion, DEVICE)

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    scheduler.step(avg_val_loss)

    print(f"Epoch [{epoch+1}/{EPOCHS}] "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Keep the weights of the epoch with the highest validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_resnet.state_dict(), "resnet18_best.pth")

print(f"Training Complete. Best Validation Accuracy: {best_val_acc:.2f}%")


In [None]:
# Persist the final ResNet-18 weights, optimizer/scheduler state and all metric
# histories in one file for later plotting and comparison.
checkpoint = dict(
    epoch=EPOCHS,
    model_state_dict=model_resnet.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
    scheduler_state_dict=scheduler.state_dict(),
    train_loss_history=train_loss_list,
    val_loss_history=val_loss_list,
    train_acc_history=train_acc_list,
    val_acc_history=val_acc_list,
    best_val_acc=best_val_acc,
)

torch.save(checkpoint, 'checkpoint_resnet18.pth')

In [None]:
# Reload the ResNet-18 checkpoint on CPU and pull out the metric histories for plotting.
# weights_only=True limits unpickling to tensors and plain containers (no arbitrary
# code execution), and matches how this file is loaded elsewhere in the notebook.
checkpoint = torch.load('checkpoint_resnet18.pth', map_location='cpu', weights_only=True)

train_loss_history = checkpoint['train_loss_history']
val_loss_history   = checkpoint['val_loss_history']
train_acc_history  = checkpoint['train_acc_history']
val_acc_history    = checkpoint['val_acc_history']

In [None]:
# ResNet-18 loss curves; the x-axis is the 0-based position in the history lists.
fig, ax = plt.subplots()
ax.plot(train_loss_history, label='Train Loss')
ax.plot(val_loss_history, label='Val Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('ResNet-18 Training/Validation Loss')
ax.legend()
plt.show()

In [None]:
# ResNet-18 accuracy curves (percent); the x-axis is the 0-based position in the history lists.
plt.figure()
for curve_values, curve_label in (
    (train_acc_history, 'Train Accuracy'),
    (val_acc_history, 'Val Accuracy'),
):
    plt.plot(curve_values, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('ResNet-18 Training/Validation Accuracy')
plt.legend()
plt.show()


Evaluation on testing dataset

In [None]:
# Evaluate the best-validation ResNet-18 weights on the held-out test split and
# keep every prediction/label pair for the confusion matrix and metrics below.
model_resnet = ResNet18(num_classes=NUM_CLASSES).to(DEVICE)
# map_location lets weights saved on a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers.
best_weights = torch.load("resnet18_best.pth", map_location=DEVICE, weights_only=True)
model_resnet.load_state_dict(best_weights)
model_resnet.eval()

criterion = nn.CrossEntropyLoss()


test_loss = 0.0
correct_test = 0
total_test = 0

all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(DEVICE), labels.to(DEVICE)

        outputs = model_resnet(images)
        loss = criterion(outputs, labels)
        # The criterion returns a batch mean; weight it by batch size so the result
        # is a true per-sample average even when the last batch is smaller.
        test_loss += loss.item() * labels.size(0)

        _, predicted = outputs.max(1)
        total_test += labels.size(0)
        correct_test += predicted.eq(labels).sum().item()

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

avg_test_loss = test_loss / total_test
test_acc = 100.0 * correct_test / total_test

print(f"Test Loss: {avg_test_loss:.4f} | Test Accuracy: {test_acc:.2f}%")

In [None]:
# Confusion matrix of the ResNet-18 test predictions (rows: true, columns: predicted).
cm = confusion_matrix(all_labels, all_preds)

heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=full_dataset.classes,
    yticklabels=full_dataset.classes,
)
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("True")
heatmap_ax.set_title("ResNet-18 Confusion Matrix (Test Set)")
plt.show()

In [None]:
# Support-weighted precision, recall and F1 for ResNet-18 on the test split.
weighted_scores = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
precision, recall, f1 = weighted_scores[:3]  # the trailing support entry is not reported
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")


In [None]:
import random

showing misclassified images

In [None]:
# Pick a random handful of misclassified test samples and show them with labels.
# Positions in all_labels/all_preds line up with test_dataset because the test
# loader does not shuffle.
misclassified_indices = [
    position
    for position, label_pair in enumerate(zip(all_labels, all_preds))
    if label_pair[0] != label_pair[1]
]
random.shuffle(misclassified_indices)

num_to_show = 5
sample_misclassified = misclassified_indices[:num_to_show]

fig, axes = plt.subplots(1, num_to_show, figsize=(5*num_to_show, 4))

# zip() stops early when there are fewer mistakes than panels.
for ax, idx in zip(axes, sample_misclassified):
    img, true_label = test_dataset[idx]

    # CHW -> HWC, undo Normalize(mean=0.5, std=0.5), then clamp to the displayable range.
    img_np = np.clip(img.permute(1, 2, 0).numpy() * 0.5 + 0.5, 0, 1)

    ax.imshow(img_np)
    ax.axis('off')
    ax.set_title(f"Pred: {full_dataset.classes[all_preds[idx]]}\nTrue: {full_dataset.classes[true_label]}")

plt.suptitle("Misclassified Examples")
plt.show()

In [None]:
# Replay the ResNet-18 per-epoch metrics into a dedicated W&B run.
wandb.init(project="my_vgg16_project", name="ResNet_wandb")

# Iterate over what was actually recorded rather than a fixed epoch count, so a
# shorter or longer training run logs correctly. zip() keeps the histories aligned.
history_rows = zip(train_loss_list, train_acc_list, val_loss_list, val_acc_list)
for epoch, (logged_train_loss, logged_train_acc, logged_val_loss, logged_val_acc) in enumerate(history_rows):
    wandb.log({
        "Epoch": epoch + 1,  # 1-based epoch number for the dashboard
        "Train Loss": logged_train_loss,
        "Train Accuracy": logged_train_acc,
        "Val Loss": logged_val_loss,
        "Val Accuracy": logged_val_acc
    })

wandb.finish()

![Alt text](https://lh3.googleusercontent.com/d/1I-n8AlLzQga3Jk_4_BfcvHXZYEIsLmsV=w1000)

In [None]:
# Load the metric histories of both models from their checkpoints for side-by-side plots.
history_keys = ('train_loss_history', 'val_loss_history',
                'train_acc_history', 'val_acc_history')

checkpoint_vgg = torch.load("checkpoint_vgg16_he.pth", map_location='cpu', weights_only=True)
train_loss_vgg, val_loss_vgg, train_acc_vgg, val_acc_vgg = (
    checkpoint_vgg[key] for key in history_keys
)

checkpoint_resnet = torch.load("checkpoint_resnet18.pth", map_location='cpu', weights_only=True)
train_loss_resnet, val_loss_resnet, train_acc_resnet, val_acc_resnet = (
    checkpoint_resnet[key] for key in history_keys
)


In [None]:
# Overlay VGG-16 (blue) and ResNet-18 (red) accuracy curves; solid = train, dashed = validation.
# The 1-based epoch axes are reused by the loss comparison in the next cell.
epochs_vgg = range(1, len(train_loss_vgg) + 1)
epochs_resnet = range(1, len(train_loss_resnet) + 1)

accuracy_curves = (
    (epochs_vgg, train_acc_vgg, 'b-', 'VGG-16 Train Acc'),
    (epochs_vgg, val_acc_vgg, 'b--', 'VGG-16 Val Acc'),
    (epochs_resnet, train_acc_resnet, 'r-', 'ResNet-18 Train Acc'),
    (epochs_resnet, val_acc_resnet, 'r--', 'ResNet-18 Val Acc'),
)
for x_values, y_values, line_format, curve_label in accuracy_curves:
    plt.plot(x_values, y_values, line_format, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Overlay VGG-16 (blue) and ResNet-18 (red) loss curves; solid = train, dashed = validation.
loss_curves = (
    (epochs_vgg, train_loss_vgg, 'b-', 'VGG-16 Train Loss'),
    (epochs_vgg, val_loss_vgg, 'b--', 'VGG-16 Val Loss'),
    (epochs_resnet, train_loss_resnet, 'r-', 'ResNet-18 Train Loss'),
    (epochs_resnet, val_loss_resnet, 'r--', 'ResNet-18 Val Loss'),
)
for x_values, y_values, line_format, curve_label in loss_curves:
    plt.plot(x_values, y_values, line_format, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.tight_layout()
plt.show()


**Final Decision** <br>

ResNet is the better option, with lower loss and comparable accuracy. Although VGG with Adam or AdamW has slightly higher accuracy, the difference is minor, so ResNet's accuracy is still good. The other important point is that ResNet's loss on the test set is low as well (0.16). So, we are considering ResNet as the final best model.

# Step 4

**Theoretical Concepts**

1. VGG
    - The VGG paper shows that stacking many small 3 * 3 filters instead of larger kernels like 5 * 5 or 7 * 7 can achieve very deep networks and also keep the number of parameters relatively manageable.
    - A single 3 * 3 convolution layer has fewer parameters than a 5 * 5 or 7 * 7, and stacking multiple 3 * 3 layers in sequence increases the effective receptive field and yet maintains the fewer parameters per layer. This design makes the network easier to train and improves generalization.
    - VGG architectures have multiple convolution blocks, each followed by max-pooling. As we go deeper, the network learns progressively more abstract features like from edges -> textures -> parts -> objects.
    - The deeper the network, the richer its representational capacity, but very deep networks incur high computational costs and may overfit if not properly regularized.

2. ResNet
    - When networks grow very deep, gradients must propagate back through many layers. This can lead to gradient values that are extremely small (vanishing gradients) or extremely large (exploding gradients). Vanishing gradients slow or halt learning in the early layers, while exploding gradients can cause numerical instability.
    - ResNet introduced the concept of skip connections, where the input to a set of layers is added directly to the output of those layers. This residual connection makes it easier for the network to learn an identity mapping when that is optimal. Gradients can flow directly through the skip path, which alleviates the vanishing gradient issue and allows networks to be much deeper without much loss in trainability.
    - Identity mapping is the idea that if a certain set of layers (the residual block) does not need to transform the input significantly, it can just learn to output zero so that the block’s output is effectively the same as its input. So, this flexibility significantly stabilizes and speeds up training.

**Regularization and Optimization Techniques**

1. Data Augmentation:
    - It reduces overfitting by increasing the effective size and variability of the dataset. This may lead to higher accuracy on the test/validation set.
    - VGG and ResNet both benefit equally in principle, but deeper networks may use augmented data even more effectively.
    - VGG has a large number of parameters in its fully connected layers, which makes it prone to overfitting if the dataset is not huge. Data augmentation helps the network generalize better. ResNet handles overfitting slightly better because its skip connections give smoother gradient flow and it requires fewer parameters overall.

2. Dropout:
    - It helps to prevent overfitting by randomly zeroing out neurons. In VGG, dropout is generally placed after the final pooling or in the fully connected layers. But in ResNet, it’s less frequently used in standard blocks.
    - Without dropout, the model may overfit the training data and perform slightly worse on validation, with slightly higher loss. Using dropout can avoid this issue. In ResNet, dropout might show only a minor improvement.

3. Weight Decay (Regularization):
    - Weight decay adds a penalty on the size of the weights, controlled by a regularization parameter. VGG is quite deep and has large fully connected layers, so weight decay is crucial to prevent overfitting.
    - ResNet also benefits from weight decay, but it may not make much difference because ResNet-18 has fewer total parameters than VGG-16.

4. Learning Rate Scheduler:
    - ReduceLROnPlateau monitors a metric that should either be minimized (e.g. validation loss) or maximized (e.g. validation accuracy). If the monitored metric does not improve for `patience` epochs, the scheduler reduces the learning rate by a factor. If the network is still improving, the LR remains the same, which lets training keep making fast progress.
    - Large learning rates can cause the model to overshoot or oscillate around a local optimum. Reducing the LR at the right time can lead to better convergence.

**Analyzing**

- VGG-16 may show a slower convergence initially but it can reach a high training accuracy near the end. But in case of ResNet-18 it converges faster, with a steeper climb in training accuracy during the early epochs. Generally, ResNet’s skip connections help gradients flow more easily so it will speed up the training.
- Training and validation loss are low and similar for ResNet. For VGG, even though the validation loss is low, the training accuracy is higher than the validation accuracy, which suggests possible overfitting.
- ResNet shows great accuracy on the test dataset. Even though VGG has slightly higher accuracy, that difference is much smaller than the difference in test loss between the two models, where ResNet shows much lower loss than VGG. Additionally, for ResNet the loss is mostly the same across the training, validation, and test datasets, and the same holds for accuracy. So, we are considering ResNet as our final best model.
- VGG sometimes misclassifies images with busy backgrounds or low contrast, whereas ResNet might handle complex textures better due to skip connections that preserve gradient flow in deeper layers.
- In most cases ResNet-18 outperforms VGG-16, especially on moderate to large datasets and the reason is the residual connections which makes training deeper networks easier. VGG-16 is older and has more parameters in the classifier section which might lead to overfitting if the dataset is not large enough. But ResNet-18 has skip connections, which helps it to learn faster and often produce higher validation accuracy with fewer training epochs.


**Summary**

- VGG stacks many 3 * 3 convolutions in succession without skip connections and relies on a large number of parameters in its final fully connected layers. But ResNet uses residual (skip) connections, which enables deeper networks to train more easily by allowing gradients to flow through identity paths.
-  VGG might converge more slowly and might show a larger gap between training and validation accuracy if the dataset is not large enough. It can also be more prone to overfitting without sufficient regularization. ResNet, in contrast, converges faster and achieves a better or comparable final accuracy and lower test loss with fewer parameters, thanks to the residual connections that alleviate the vanishing gradient problem.
- In ResNet, the skip connections allow gradients to bypass certain layers, which mitigates vanishing gradients in deeper networks. So ResNet might require fewer epochs to reach its best performance. Its validation and test accuracy are also very good, with very low loss on both.
- So, based on all the analysis and experiments we think that ResNet is the final best model for this task with less overfitting, more generalization and more computational efficiency.

**References:** <br>

https://matplotlib.org/stable/plot_types/index.html <br>
https://numpy.org/doc/stable/user/index.html#user <br>
https://seaborn.pydata.org/tutorial/introduction.html <br>
https://pytorch.org/docs/stable/index.html <br>
https://scikit-learn.org/stable/ <br>
https://arxiv.org/pdf/1512.03385 <br>
https://docs.wandb.ai/guides/track/ <br>
https://arxiv.org/abs/1409.1556 <br>