In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from torchvision import transforms, datasets
import torch

In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be read
# from this runtime. The google.colab import means this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip "/content/drive/MyDrive/DL/cnn_dataset.zip" -d "/content/cnn_dataset"

In [None]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# with mean 0.5 and std 0.5.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
]
transform = transforms.Compose(preprocessing_steps)

# ImageFolder builds its class list from the sub-directories of data_dir.
data_dir = '/content/cnn_dataset'
dataset = datasets.ImageFolder(root=data_dir, transform=transform)

print("Number of images:", len(dataset))
print("Classes:", dataset.classes)


In [None]:
from collections import Counter

# dataset.samples is a list of (path, class_index) pairs, so the labels can be
# tallied without loading any image.
class_counts = Counter(class_idx for _path, class_idx in dataset.samples)

print("Number of images per class:")
for class_idx in class_counts:
    print(f"{dataset.classes[class_idx]}: {class_counts[class_idx]} images")

In [None]:
# Stratified split over sample indices: 20% test, then 10% of the remainder for
# validation (roughly 72% / 8% / 20% train / val / test overall).
all_labels = [label for _path, label in dataset.imgs]
data_indices = list(range(len(dataset)))

train_idx, test_idx = train_test_split(
    data_indices,
    test_size=0.2,
    stratify=all_labels,
    random_state=42,
)
train_idx, val_idx = train_test_split(
    train_idx,
    test_size=0.1,
    stratify=[all_labels[i] for i in train_idx],
    random_state=42,
)

In [None]:
from torch.utils.data import Subset, DataLoader

# Index-based views of the single ImageFolder dataset, one per split.
train_dataset, val_dataset, test_dataset = (
    Subset(dataset, split_indices) for split_indices in (train_idx, val_idx, test_idx)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

Step 1 - Defining VGG-Deep

In [None]:
import torch.nn as nn

In [None]:
class VGGDeep(nn.Module):
    """VGG-style classifier: the 13-conv VGG-16 feature stack plus an extra 4-conv block.

    Every convolution is 3x3 with padding 1 and is followed by an in-place ReLU.
    Blocks 1-5 end with a 2x2 max-pool; the extra block 6 does not.

    Args:
        num_classes: number of output logits.
        input_size: (channels, height, width) of one input image. It is only
            used to size the first linear layer via a dummy forward pass; the
            first convolution always expects 3 input channels.
    """

    # One entry per block: (output channels, number of convs, ends with max-pool?)
    _BLOCKS = (
        (64, 2, True),      # Block 1
        (128, 2, True),     # Block 2
        (256, 3, True),     # Block 3
        (512, 3, True),     # Block 4
        (512, 3, True),     # Block 5
        (1024, 4, False),   # Extra Block (Block 6)
    )

    def __init__(self, num_classes=3, input_size=(3, 64, 64)):
        super().__init__()

        # Feature extraction part, assembled block by block in forward order.
        layers = []
        in_channels = 3
        for out_channels, num_convs, ends_with_pool in self._BLOCKS:
            for _ in range(num_convs):
                layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
                layers.append(nn.ReLU(inplace=True))
                in_channels = out_channels
            if ends_with_pool:
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.features = nn.Sequential(*layers)

        # The classifier's input width is measured rather than hard-coded.
        self.flattened_size = self._get_conv_output(input_size)
        self.classifier = nn.Sequential(
            nn.Linear(self.flattened_size, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)
        )

    def _get_conv_output(self, shape):
        """Return the number of features produced by ``self.features`` for one input of ``shape``."""
        with torch.no_grad():
            probe = self.features(torch.zeros(1, *shape))
        return probe.view(1, -1).size(1)

    def forward(self, x):
        """Return class logits for a batch of images."""
        feature_maps = self.features(x)
        flat = torch.flatten(feature_maps, 1)  # keep the batch dimension
        return self.classifier(flat)

Step 2 - Training VGG-Deep under the restrictions and constraints given in the question, to observe the effect of depth in isolation.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# ---- Model, loss and optimiser ----
num_classes = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGGDeep(num_classes=num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)  # plain SGD, no momentum

# ---- Per-epoch history (read again by the plotting cells) ----
num_epochs = 20
train_losses, val_losses = [], []
train_acc, val_acc = [], []


def _run_pass(loader, training):
    """Run one full pass over ``loader`` and return (mean per-sample loss, accuracy).

    With ``training`` true the model is put in train mode and updated after
    every batch; otherwise it is put in eval mode and gradients are disabled.
    """
    model.train(training)
    loss_sum, hits, seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            # criterion returns a batch mean, so weight it by the batch size.
            loss_sum += loss.item() * images.size(0)
            _, predicted = outputs.max(1)
            seen += labels.size(0)
            hits += predicted.eq(labels).sum().item()
    return loss_sum / len(loader.dataset), hits / seen


for epoch in range(num_epochs):
    epoch_loss, epoch_acc = _run_pass(train_loader, training=True)
    train_losses.append(epoch_loss)
    train_acc.append(epoch_acc)

    # Validation phase
    val_loss, val_accuracy = _run_pass(val_loader, training=False)
    val_losses.append(val_loss)
    val_acc.append(val_accuracy)

    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc:.4f} || "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

# Final evaluation on the held-out test split.
test_loss, test_accuracy = _run_pass(test_loader, training=False)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_accuracy:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Learning curves for the run above: loss on the left, accuracy on the right.
epoch_axis = range(1, num_epochs + 1)
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Training & validation loss
loss_ax.plot(epoch_axis, train_losses, label='Train Loss', marker='o')
loss_ax.plot(epoch_axis, val_losses, label='Validation Loss', marker='o')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss Curve')
loss_ax.legend()
loss_ax.grid(True)

# Training & validation accuracy
acc_ax.plot(epoch_axis, train_acc, label='Train Accuracy', marker='o')
acc_ax.plot(epoch_axis, val_acc, label='Validation Accuracy', marker='o')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Accuracy Curve')
acc_ax.legend()
acc_ax.grid(True)

fig.tight_layout()
plt.show()


Step 3 - Gradient analysis using hooks

In [None]:
class VGGDeep(nn.Module):
    """VGGDeep with out-of-place ReLUs (VGG-16 feature stack + an extra 4-conv block).

    The layer layout is 2-2-3-3-3 convolutions with a 2x2 max-pool after each
    of the first five blocks, then four 1024-channel convolutions without a
    pool. Unlike a variant built with ``nn.ReLU(inplace=True)``, no activation
    here overwrites its input.
    NOTE(review): presumably this is so that full backward hooks can be
    attached to the conv layers — confirm.

    Args:
        num_classes: number of output logits.
        input_size: (channels, height, width) of one input image, used only to
            size the first linear layer through a dummy forward pass.
    """

    def __init__(self, num_classes=3, input_size=(3, 64, 64)):
        super().__init__()

        def conv_relu(c_in, c_out):
            # One 3x3 same-padding convolution followed by a fresh ReLU.
            return [nn.Conv2d(c_in, c_out, kernel_size=3, padding=1), nn.ReLU()]

        def pool():
            return nn.MaxPool2d(kernel_size=2, stride=2)

        # Feature extraction part (VGG-16 base + extra conv block)
        self.features = nn.Sequential(
            # Block 1
            *conv_relu(3, 64), *conv_relu(64, 64), pool(),
            # Block 2
            *conv_relu(64, 128), *conv_relu(128, 128), pool(),
            # Block 3
            *conv_relu(128, 256), *conv_relu(256, 256), *conv_relu(256, 256), pool(),
            # Block 4
            *conv_relu(256, 512), *conv_relu(512, 512), *conv_relu(512, 512), pool(),
            # Block 5
            *conv_relu(512, 512), *conv_relu(512, 512), *conv_relu(512, 512), pool(),
            # Extra Block (Block 6) - no pooling at the end
            *conv_relu(512, 1024), *conv_relu(1024, 1024),
            *conv_relu(1024, 1024), *conv_relu(1024, 1024)
        )

        # Size of the flattened feature map, measured with a dummy input.
        self.flattened_size = self._get_conv_output(input_size)

        # Classifier part
        hidden = 4096
        self.classifier = nn.Sequential(
            nn.Linear(self.flattened_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, num_classes)
        )

    def _get_conv_output(self, shape):
        """Return how many features ``self.features`` emits for a single input of ``shape``."""
        with torch.no_grad():
            dummy_output = self.features(torch.zeros(1, *shape))
        return dummy_output.view(1, -1).size(1)

    def forward(self, x):
        """Return class logits for a batch of images."""
        flat = torch.flatten(self.features(x), 1)  # flatten everything except the batch axis
        return self.classifier(flat)

In [None]:
# Gradient-norm bookkeeping for the hooks registered at the bottom of this cell.
# The hooks are attached to whatever `model` is bound to when this cell runs.
gradient_data = {
    'norms': [],        # one list per recording slot; the caller opens a slot with .append([])
    'layer_names': []   # "Conv1", "Conv2", ... in forward order
}

def gradient_hook(module, grad_input, grad_output):
    """Record the L2 norm of the gradient w.r.t. this layer's output.

    The norm is appended to the most recent slot in ``gradient_data['norms']``.
    If no slot has been opened yet the hook does nothing; indexing ``[-1]`` on
    the still-empty list used to raise IndexError on the first backward pass.
    """
    if grad_output[0] is None or not gradient_data['norms']:
        return
    grad_norm = grad_output[0].norm(p=2).item()
    gradient_data['norms'][-1].append(grad_norm)

# Register hooks for all convolutional layers. The handles are kept so the
# hooks can be detached later with `handle.remove()`.
conv_layers = [layer for layer in model.features if isinstance(layer, nn.Conv2d)]
hook_handles = []
for idx, layer in enumerate(conv_layers, start=1):
    gradient_data['layer_names'].append(f"Conv{idx}")
    hook_handles.append(layer.register_full_backward_hook(gradient_hook))


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Gradient tracking setup.
# gradient_data['norms'][epoch][tracked_batch][layer] holds the L2 norm of the
# loss gradient w.r.t. the output of conv layer `layer`. `layer` is the
# forward-order position, so index 0 is "Conv1" and the list lines up with
# gradient_data['layer_names'].
gradient_data = {
    'norms': [],
    'layer_names': []
}
conv_layer_index = {}             # conv module -> forward-order position
recording = {'enabled': False}    # switched on only around the backward pass of tracked batches

def gradient_hook(module, grad_input, grad_output):
    """Store this layer's output-gradient L2 norm in the slot of the current tracked batch.

    Backward hooks fire from the last layer to the first, so the value is
    written by layer index instead of being appended; appending would store
    the layers in reverse order relative to ``layer_names``.
    """
    if not recording['enabled'] or grad_output[0] is None:
        return
    layer_idx = conv_layer_index[module]
    gradient_data['norms'][-1][-1][layer_idx] = grad_output[0].norm(p=2).item()

# Initialize model
num_classes = 3
model = VGGDeep(num_classes=num_classes)

# Register hooks on convolutional layers (handles are kept so they can be removed).
conv_layers = []
hook_handles = []
for layer in model.features:
    if isinstance(layer, nn.Conv2d):
        conv_layer_index[layer] = len(conv_layers)
        conv_layers.append(layer)
        gradient_data['layer_names'].append(f"Conv{len(conv_layers)}")
        hook_handles.append(layer.register_full_backward_hook(gradient_hook))

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize optimizer and loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

# Training parameters
num_epochs = 20
track_interval = 10  # Track gradients every 10 batches

# History of THIS run. It is deliberately kept apart from train_losses /
# val_losses / train_acc / val_acc, which hold the Step 2 run and are plotted
# against a 20-epoch axis in Step 4; appending to them here would make them
# 40 entries long.
grad_run_train_losses, grad_run_val_losses = [], []
grad_run_train_acc, grad_run_val_acc = [], []

# Main training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    # Initialize epoch entry with empty list for batches
    gradient_data['norms'].append([])

    for batch_idx, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Open the slot BEFORE the backward pass so the hooks have somewhere to
        # write; NaN marks a layer whose hook did not fire.
        track_this_batch = batch_idx % track_interval == 0
        if track_this_batch:
            gradient_data['norms'][-1].append([float('nan')] * len(conv_layers))

        # Forward pass
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass (hooks record only while `recording` is enabled)
        recording['enabled'] = track_this_batch
        loss.backward()
        recording['enabled'] = False
        optimizer.step()

        # Calculate accuracy
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = correct / total
    grad_run_train_losses.append(epoch_loss)
    grad_run_train_acc.append(epoch_acc)

    # Validation phase
    model.eval()
    running_loss_val = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss_val += loss.item() * images.size(0)
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    val_loss = running_loss_val / len(val_loader.dataset)
    val_accuracy = correct_val / total_val
    grad_run_val_losses.append(val_loss)
    grad_run_val_acc.append(val_accuracy)

    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc:.4f} || "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

# Detach the hooks so later backward passes through this model are not instrumented.
for handle in hook_handles:
    handle.remove()

# Visualization code
def plot_gradient_norms(gradient_data, selected_layers=None):
    """Plot the recorded gradient norms of every layer and, optionally, of a subset.

    Args:
        gradient_data: dict with 'layer_names' (list of labels) and 'norms'
            (list of epochs, each a list of tracked batches, each a list of
            per-layer norms indexed by position).
        selected_layers: optional positions into 'layer_names'; when given and
            non-empty, a second panel compares just those layers.

    Reads the module-level ``track_interval`` for the x-axis label.
    """
    def series_for(position):
        # Flatten epochs x batches into one sequence, skipping batches that
        # have no entry at this position.
        return [
            batch[position]
            for epoch_batches in gradient_data['norms']
            for batch in epoch_batches
            if len(batch) > position
        ]

    x_label = f'Batches (Every {track_interval}th)'
    plt.figure(figsize=(15, 6))

    # Left panel: all layers
    plt.subplot(1, 2, 1)
    for position, layer_name in enumerate(gradient_data['layer_names']):
        plt.plot(series_for(position), label=layer_name)

    plt.title('Gradient Norms - All Layers')
    plt.xlabel(x_label)
    plt.ylabel('L2 Norm')
    plt.legend(bbox_to_anchor=(1.05, 1))
    plt.grid(True)

    # Right panel: selected layers only
    if selected_layers:
        plt.subplot(1, 2, 2)
        for idx in selected_layers:
            plt.plot(series_for(idx),
                     label=f"{gradient_data['layer_names'][idx]} (Depth {idx+1})")

        plt.title('Gradient Norms - Depth Comparison')
        plt.xlabel(x_label)
        plt.ylabel('L2 Norm')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Generate plots.
# selected_layers holds zero-based positions into gradient_data['layer_names'],
# so [1, 4, 7, len(conv_layers)-1] selects Conv2, Conv5, Conv8 and the last conv layer.
plot_gradient_norms(gradient_data, selected_layers=[1, 4, 7, len(conv_layers)-1])

Step 4 - Comparison with VGG-16 and Resnet-18

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Epochs (x axis): 1..20, so every history below must hold exactly 20 values.
epochs = np.arange(1, 21)

# NOTE(review): train_acc_resnet, val_acc_resnet, train_acc_vgg and val_acc_vgg
# are not assigned in the visible part of this notebook; they have to exist in
# the kernel already for this cell to run.
resnet18_train, resnet18_val = train_acc_resnet, val_acc_resnet
vgg16_train, vgg16_val = train_acc_vgg, val_acc_vgg
vggdeep_train, vggdeep_val = train_acc, val_acc

plt.figure(figsize=(10,6))

# One colour per model; solid line = training, dashed line = validation.
accuracy_curves = (
    ("ResNet-18", "red", resnet18_train, resnet18_val),
    ("VGG-16", "blue", vgg16_train, vgg16_val),
    ("VGG-Deep", "green", vggdeep_train, vggdeep_val),
)
for model_label, colour, train_curve, val_curve in accuracy_curves:
    plt.plot(epochs, train_curve, label=f"{model_label} Train Acc", color=colour, linestyle="-")
    plt.plot(epochs, val_curve, label=f"{model_label} Val Acc", color=colour, linestyle="--")

# Labels and Legend
plt.title("Training and Validation Accuracy for ResNet-18,VGG-16,and VGG-Deep")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Epochs (x axis): 1..20, so every history below must hold exactly 20 values.
epochs = np.arange(1, 21)

# NOTE(review): train_loss_resnet, val_loss_resnet, train_loss_vgg and
# val_loss_vgg are not assigned in the visible part of this notebook; they
# have to exist in the kernel already for this cell to run.
resnet18_train_loss, resnet18_val_loss = train_loss_resnet, val_loss_resnet
vgg16_train_loss, vgg16_val_loss = train_loss_vgg, val_loss_vgg
vggdeep_train_loss, vggdeep_val_loss = train_losses, val_losses

plt.figure(figsize=(10,6))

# One colour per model; solid line = training, dashed line = validation.
loss_curves = (
    ("ResNet-18", "red", resnet18_train_loss, resnet18_val_loss),
    ("VGG-16", "blue", vgg16_train_loss, vgg16_val_loss),
    ("VGG-Deep", "green", vggdeep_train_loss, vggdeep_val_loss),
)
for model_label, colour, train_curve, val_curve in loss_curves:
    plt.plot(epochs, train_curve, label=f"{model_label} Train Loss", color=colour, linestyle="-")
    plt.plot(epochs, val_curve, label=f"{model_label} Val Loss", color=colour, linestyle="--")

# Labels and Legend
plt.title("Training and Validation Loss for ResNet-18, VGG-16, and VGG-Deep")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()


Step 5 - Three more setups on the base VGG: kernel size, max vs. average pooling, and activation-function variations (subparts a, c, d)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

class VGG16(nn.Module):
    """Reduced VGG-style baseline with a VGG-16 classifier head.

    Despite the name, the feature stack has four blocks with 2, 2, 3 and 2
    convolutions (9 conv layers and 4 max-pools), not the 13 conv layers in
    five blocks of the original VGG-16.

    The classifier expects a 512 x 4 x 4 feature map, which is what a 64 x 64
    input produces after four 2x2 pools. An adaptive average pool brings any
    other feature-map size to 4 x 4, so other input resolutions work as well;
    for 64 x 64 inputs it leaves the values unchanged. The pool has no
    parameters, so state-dict keys are unaffected.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes=3):
        super(VGG16, self).__init__()
        self.features = nn.Sequential(
            # Block 1: 3 -> 64
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 2: 64 -> 128
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 3: 128 -> 256
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 4: 256 -> 512
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # Guarantees the 4 x 4 spatial size the first linear layer is built for.
        self.avgpool = nn.AdaptiveAvgPool2d((4, 4))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 4 * 4, 4096),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of 3-channel images."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

# Build the baseline with 3 output classes. It is not moved to a device here.
vgg_model = VGG16(num_classes=3)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os  # Import the os module

# Set device: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a common training function
def train_and_validate(model, train_loader, val_loader, optimizer, criterion, num_epochs=10, device=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: the network to train; it is moved to ``device`` in place.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        optimizer: optimiser already bound to ``model``'s parameters.
        criterion: loss returning the mean over a batch.
        num_epochs: number of passes over ``train_loader``.
        device: where to run. ``None`` (default) selects CUDA when available
            and the CPU otherwise, so the function no longer needs a
            module-level ``device`` variable to exist.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs)``: four lists with
        one value per epoch. Losses are per-sample means over the samples that
        were actually iterated.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    def run_epoch(loader, training):
        # One pass over `loader`; updates the weights only when `training` is true.
        model.train(training)
        loss_sum, n_correct, n_seen = 0.0, 0, 0
        with torch.set_grad_enabled(training):
            for inputs, labels in loader:
                inputs, labels = inputs.to(device), labels.to(device)
                if training:
                    optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                if training:
                    loss.backward()
                    optimizer.step()

                # The loss is a batch mean, so weight it by the batch size.
                loss_sum += loss.item() * inputs.size(0)
                predicted = outputs.detach().argmax(dim=1)
                n_seen += labels.size(0)
                n_correct += (predicted == labels).sum().item()
        # Dividing by the samples seen stays correct for loaders that drop the
        # last batch or sample only part of their dataset.
        return loss_sum / n_seen, n_correct / n_seen

    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    for epoch in range(num_epochs):
        # Training phase
        epoch_loss, epoch_acc = run_epoch(train_loader, training=True)
        train_losses.append(epoch_loss)
        train_accs.append(epoch_acc)

        # Validation phase
        val_loss, val_acc = run_epoch(val_loader, training=False)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return train_losses, val_losses, train_accs, val_accs


# Function to plot training and validation curves
def plot_curves(train_losses, val_losses, train_accs, val_accs, title):
    """Draw loss (left) and accuracy (right) curves for one training run.

    Args:
        train_losses, val_losses: per-epoch loss values.
        train_accs, val_accs: per-epoch accuracy values.
        title: suffix appended to both panel titles.
    """
    epochs = range(1, len(train_losses) + 1)
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    panels = (
        (loss_ax, 'Loss', train_losses, val_losses),
        (acc_ax, 'Accuracy', train_accs, val_accs),
    )
    for ax, metric, train_curve, val_curve in panels:
        # Blue = training, red = validation.
        ax.plot(epochs, train_curve, 'b-', label='Training ' + metric)
        ax.plot(epochs, val_curve, 'r-', label='Validation ' + metric)
        ax.set_title('{} Curves - {}'.format(metric, title))
        ax.set_xlabel('Epochs')
        ax.set_ylabel(metric)
        ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
from PIL import Image

# Read the pixel size of the first image in the dataset.
# NOTE(review): image_width / image_height are later used as the input size for
# the SmallVGG* models, which assumes every image has the same size as this
# one (no Resize is applied in the transform) — confirm.
sample_image_path = dataset.imgs[0][0]
# The context manager closes the file once the size has been read;
# Image.open on its own leaves the handle open.
with Image.open(sample_image_path) as sample_image:
    image_width, image_height = sample_image.size
print(f"Image width: {image_width}, height: {image_height}")


In [None]:
class SmallVGG(nn.Module):
    """Three conv / ReLU / max-pool stages (32, 64, 128 channels) plus a two-layer classifier.

    Args:
        kernel_size: side length of the square conv kernels. Padding is
            ``kernel_size // 2``, which keeps the spatial size for odd kernels.
        num_classes: number of output logits. ``None`` (default) uses
            ``len(dataset.classes)`` from the notebook, looked up when the
            model is built rather than when the class is defined.
        input_size: ``(height, width)`` of the input images, used to size the
            first linear layer. ``None`` (default) uses the notebook globals
            ``image_height`` and ``image_width``.
    """

    def __init__(self, kernel_size=3, num_classes=None, input_size=None):
        super(SmallVGG, self).__init__()
        if num_classes is None:
            num_classes = len(dataset.classes)
        if input_size is None:
            input_size = (image_height, image_width)
        height, width = input_size

        # Use kernel_size//2 for padding to maintain spatial dimensions
        padding = kernel_size // 2
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=kernel_size, padding=padding),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=kernel_size, padding=padding),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=kernel_size, padding=padding),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Calculate feature map size with a dummy input
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, height, width)
            output = self.features(dummy_input)
            flattened_size = output.numel() // output.size(0)

        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes)
        )

        print(f"Kernel size: {kernel_size}, Feature map output shape: {output.shape}, Flattened size: {flattened_size}")

    def forward(self, x):
        """Return class logits for a batch of images."""
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

# 2. Max Pooling vs. Average Pooling - FIXED VERSION
class SmallVGGPooling(nn.Module):
    """SmallVGG variant (3x3 convs) whose down-sampling layers are max- or average-pooling.

    Args:
        pooling_type: ``'max'`` for ``nn.MaxPool2d`` or ``'avg'`` for
            ``nn.AvgPool2d``.
        num_classes: number of output logits. ``None`` (default) uses
            ``len(dataset.classes)`` from the notebook, looked up when the
            model is built rather than when the class is defined.
        input_size: ``(height, width)`` of the input images, used to size the
            first linear layer. ``None`` (default) uses the notebook globals
            ``image_height`` and ``image_width``.

    Raises:
        ValueError: if ``pooling_type`` is neither ``'max'`` nor ``'avg'``.
    """

    def __init__(self, pooling_type='max', num_classes=None, input_size=None):
        super(SmallVGGPooling, self).__init__()
        if pooling_type == 'max':
            pool = nn.MaxPool2d
        elif pooling_type == 'avg':
            pool = nn.AvgPool2d
        else:
            raise ValueError("pooling_type must be 'max' or 'avg'")

        if num_classes is None:
            num_classes = len(dataset.classes)
        if input_size is None:
            input_size = (image_height, image_width)
        height, width = input_size

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            pool(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            pool(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            pool(kernel_size=2, stride=2)
        )

        # Calculate feature map size with a dummy input
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, height, width)
            output = self.features(dummy_input)
            flattened_size = output.numel() // output.size(0)

        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes)
        )

        print(f"Pooling type: {pooling_type}, Feature map output shape: {output.shape}, Flattened size: {flattened_size}")

    def forward(self, x):
        """Return class logits for a batch of images."""
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

# 3. Activation Functions - FIXED VERSION
class SmallVGGActivation(nn.Module):
    """SmallVGG variant (3x3 convs, max-pooling) with a selectable activation function.

    Each of the four activation positions gets its own module instance.
    Sharing a single instance made all four positions the same object, so
    per-layer hooks or module listings could not tell them apart.

    Args:
        activation_type: one of ``'relu'``, ``'leaky_relu'``, ``'elu'``,
            ``'gelu'``.
        num_classes: number of output logits. ``None`` (default) uses
            ``len(dataset.classes)`` from the notebook, looked up when the
            model is built rather than when the class is defined.
        input_size: ``(height, width)`` of the input images, used to size the
            first linear layer. ``None`` (default) uses the notebook globals
            ``image_height`` and ``image_width``.

    Raises:
        ValueError: if ``activation_type`` is not one of the supported names.
    """

    def __init__(self, activation_type='relu', num_classes=None, input_size=None):
        super(SmallVGGActivation, self).__init__()
        # Factories rather than instances, so every call yields a fresh module.
        activation_factories = {
            'relu': lambda: nn.ReLU(inplace=True),
            'leaky_relu': lambda: nn.LeakyReLU(inplace=True),
            'elu': lambda: nn.ELU(inplace=True),
            'gelu': lambda: nn.GELU(),
        }
        if activation_type not in activation_factories:
            raise ValueError("activation_type must be 'relu', 'leaky_relu', 'elu', or 'gelu'")
        make_activation = activation_factories[activation_type]

        if num_classes is None:
            num_classes = len(dataset.classes)
        if input_size is None:
            input_size = (image_height, image_width)
        height, width = input_size

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            make_activation(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            make_activation(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            make_activation(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Calculate feature map size with a dummy input
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, height, width)
            output = self.features(dummy_input)
            flattened_size = output.numel() // output.size(0)

        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 128),
            make_activation(),
            nn.Linear(128, num_classes)
        )

        print(f"Activation type: {activation_type}, Feature map output shape: {output.shape}, Flattened size: {flattened_size}")

    def forward(self, x):
        """Return class logits for a batch of images."""
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

In [None]:
def run_variants(variants, build_model, describe, curve_title, num_epochs=5):
    """Train one fresh model per variant and collect its learning curves.

    Everything created for a run (model, optimiser, loss, histories) stays
    local to this function, so the experiments no longer overwrite the
    notebook-level ``model``, ``optimizer``, ``train_losses`` and
    ``val_losses`` that the earlier training and comparison cells use.

    Args:
        variants: iterable of setting values, one training run each.
        build_model: callable mapping a variant to a new model.
        describe: callable mapping a variant to the progress line printed
            before its run.
        curve_title: callable mapping a variant to the title passed to
            ``plot_curves``.
        num_epochs: epochs per run.

    Returns:
        ``defaultdict(dict)`` keyed by variant, each holding 'train_losses',
        'val_losses', 'train_accs' and 'val_accs'.
    """
    results = defaultdict(dict)
    for variant in variants:
        print(describe(variant))
        variant_model = build_model(variant)
        variant_optimizer = optim.Adam(variant_model.parameters(), lr=0.001)
        variant_criterion = nn.CrossEntropyLoss()
        run_train_losses, run_val_losses, run_train_accs, run_val_accs = train_and_validate(
            variant_model, train_loader, val_loader, variant_optimizer, variant_criterion,
            num_epochs=num_epochs
        )
        results[variant]['train_losses'] = run_train_losses
        results[variant]['val_losses'] = run_val_losses
        results[variant]['train_accs'] = run_train_accs
        results[variant]['val_accs'] = run_val_accs
        plot_curves(run_train_losses, run_val_losses, run_train_accs, run_val_accs,
                    curve_title(variant))
    return results


# 1. Kernel size
kernel_sizes = [3, 5, 7]
kernel_results = run_variants(
    kernel_sizes,
    build_model=lambda k: SmallVGG(kernel_size=k),
    describe=lambda k: f"Training SmallVGG with kernel size: {k}",
    curve_title=lambda k: f"Kernel Size {k}",
)

# 2. Max pooling vs. average pooling
pooling_types = ['max', 'avg']
pooling_results = run_variants(
    pooling_types,
    build_model=lambda pool_type: SmallVGGPooling(pooling_type=pool_type),
    describe=lambda pool_type: f"Training SmallVGG with {pool_type} pooling",
    curve_title=lambda pool_type: f"{pool_type} Pooling",
)

# 3. Activation function
activation_types = ['relu', 'leaky_relu', 'elu', 'gelu']
activation_results = run_variants(
    activation_types,
    build_model=lambda act_type: SmallVGGActivation(activation_type=act_type),
    describe=lambda act_type: f"Training SmallVGG with {act_type} activation",
    curve_title=lambda act_type: f"{act_type} Activation",
)

a. The gradient norm plots demonstrate the vanishing gradient problem. As we move deeper into the network, the gradient norm changes substantially. The gradient norm of layer 1 is 3.5x larger than that of layer 2.

b. The vanishing gradient problem occurs in deep neural networks during backpropagation. We know that backpropagation applies the chain rule of derivatives, so the gradient reaching an early layer is a product of many per-layer terms. Since these terms typically have magnitude less than 1, the product can become so small that the early layers effectively stop learning.

c. ResNet introduces residual connections that skip one or more layers. Instead of learning a direct mapping from input to output, the network learns the residual, i.e., the difference between the input and the output. This is achieved by adding the input of a layer to its output, creating a shortcut connection.
The identity mapping x ensures that the gradient can flow directly through the shortcut connection without being multiplied by small weights. This helps in preserving the gradient magnitude, making it easier for the network to propagate gradients back to the earlier layers.

d. Batch normalization, introduced by Ioffe and Szegedy in 2015, helps stabilize and accelerate the training of deep neural networks by normalizing the inputs of each layer.

Impact on Vanishing/Exploding Gradient Problem
Batch normalization addresses the vanishing/exploding gradient problem by normalizing the activations of each layer to have zero mean and unit variance. This normalization helps in several ways:

- Stabilizes Gradient Flow: By normalizing the inputs to each layer, batch normalization prevents the activations from becoming too large or too small, which helps in maintaining a stable gradient flow throughout the network.

- Reduces Internal Covariate Shift: Internal covariate shift refers to the change in the distribution of network activations due to the continual change in network parameters during training. Batch normalization reduces this shift, making the training process more stable.

- Allows Higher Learning Rates: With batch normalization, the network can use higher learning rates without the risk of gradients exploding. This accelerates the training process.

- Regularization Effect: Batch normalization also has a slight regularizing effect, similar to dropout, which can help in reducing overfitting.

e.
Key findings from my three chosen investigations are:

- Kernel sizes 3 and 5 perform better than kernel size 7.
- Max pooling performs better than average pooling.
- ReLU and Leaky ReLU perform almost identically, and both are much better than the ELU and GELU activation functions. The performance gap is around 4-5%.

f. References

- https://pytorch.org/vision/main/models/generated/torchvision.models.vgg16.html
- https://www.v7labs.com/blog/neural-networks-activation-functions
- https://pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_full_backward_hook.html
- https://pytorch.org/docs/stable/generated/torch.Tensor.register_hook.html
