In [2]:
# relevant for lightning.ai studio
%cd del_mc1/


/teamspace/studios/this_studio/del_mc1


In [3]:
# base libraries
import numpy as np
import matplotlib.pyplot as plt
import pickle
import random
from collections import OrderedDict

# ML related libraries
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
# todo do I need this?
from torcheval.metrics import MulticlassAccuracy
from torchvision import datasets, transforms

from torch.utils.data import DataLoader
import torch.nn as nn
import copy

import os

# mlops
import wandb

%env WANDB_NOTEBOOK_NAME=mc1.ipynb
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


## Tag ensure that wandb won't be cluttered
DEVELOPMENT = True
CUSTOM_TAGS = ['draft_5']

NUM_WORKERS = os.cpu_count()
print(NUM_WORKERS)

env: WANDB_NOTEBOOK_NAME=mc1.ipynb
env: CUBLAS_WORKSPACE_CONFIG=:4096:8
4


In [4]:

transform = transforms.Compose([
    transforms.ToTensor(),
])

# Load CIFAR-10 training data
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset))
train_images, train_labels = next(iter(train_loader))

# Load CIFAR-10 test data
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))
test_images, test_labels = next(iter(test_loader))


Files already downloaded and verified
Files already downloaded and verified


In [None]:

# Print shapes of the datasets to verify
print('train_images shape:', train_images.shape)
print('train_labels shape:', train_labels.shape)
print('test_images shape:', test_images.shape)
print('test_labels shape:', test_labels.shape)

# Accessing label names
# get all labels 
label_names = train_dataset.classes
print('label_names size:', len(label_names))
print('label_names:', label_names)


## Exploratory Data Analysis 
This section provides some insight into the CIFAR-10 dataset, which will help ht

In [None]:
np.unique(train_labels, return_counts=True)[1], np.unique(test_labels, return_counts=True)[1]

The labels are well distributed.

In [None]:

def verify_label_distribution(labels, label_names):
    unique, counts = np.unique(labels, return_counts=True)
    distribution = dict(zip(unique, counts))
    for label, count in distribution.items():
        print(f'{label_names[label]}: {count}')
    return distribution

def display_example_images(images, labels, label_names):
    fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(20, 8))
    axes = axes.flatten()
    
    for i, label_name in enumerate(label_names):
        index = np.where(labels == i)[0][0]
        image = images[index].permute(1, 2, 0) 
        
        axes[i].imshow(image)
        axes[i].set_title(label_name)
        axes[i].title.set_size(20)
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.show()

print("Training Labels Distribution:")
train_distribution = verify_label_distribution(train_labels, label_names)
print("\nTest Labels Distribution:")
test_distribution = verify_label_distribution(test_labels, label_names)

print("\nExample Images from Each Class:")
display_example_images(train_images, train_labels, label_names)




In [None]:
# todo maybe plot more images?

Let's calculate the mean and standard deviation for each color channel in the CIFAR-10 dataset.

#### Reasoning
This step is crucial for normalizing (relevant for preprocessing) the dataset, ensuring consistent model training and faster convergence.


In [5]:
def calculate_mean_std(loader):
    channels_sum, channels_squared_sum, num_batches = 0, 0, 0

    for data, _ in loader:
        channels_sum += torch.mean(data, dim=[0, 2, 3])
        channels_squared_sum += torch.mean(data ** 2, dim=[0, 2, 3])
        num_batches += 1

    mean = channels_sum / num_batches
    std = (channels_squared_sum / num_batches - mean ** 2) ** 0.5

    return mean, std

train_mean, train_std = calculate_mean_std(train_loader)
test_mean, test_std = calculate_mean_std(test_loader)

print(f"Training Mean: {train_mean}, Training Std: {train_std}")
print(f"Test Mean: {test_mean}, Test Std: {test_std}")


Training Mean: tensor([0.4914, 0.4822, 0.4465]), Training Std: tensor([0.2470, 0.2435, 0.2616])
Test Mean: tensor([0.4942, 0.4851, 0.4504]), Test Std: tensor([0.2467, 0.2429, 0.2616])


Let's plot histograms of pixel values for the Red, Green, and Blue channels.

#### Reasoning
Understanding pixel intensity distribution aids in identifying dataset biases and informs necessary preprocessing adjustments.


In [None]:
def plot_pixel_histograms(images):
    images = images.permute(1, 0, 2, 3).reshape(3, -1)
    plt.figure(figsize=(15, 5))
    for i, color in enumerate(['r', 'g', 'b']):
        plt.subplot(1, 3, i + 1)
        # todo validate bins
        plt.hist(images[i].numpy(), bins=20, color=color, alpha=0.7)
        plt.title(f'{color.upper()} Channel')
        plt.ylabel('Frequency')
        plt.xlabel('Pixel Intensity')
    plt.tight_layout()
    plt.show()

plot_pixel_histograms(train_images)


In [None]:
def plot_color_distribution_per_class(images, labels, label_names):
    num_classes = len(label_names)
    fig, axes = plt.subplots(nrows=num_classes, ncols=3, figsize=(20, num_classes * 4))
    
    for i, label_name in enumerate(label_names):
        class_images = images[labels == i]
        # Flatten the images to a single dimension per channel for plotting
        red_channel = class_images[:, 0, :, :].flatten()
        green_channel = class_images[:, 1, :, :].flatten()
        blue_channel = class_images[:, 2, :, :].flatten()
        
        for j, (channel, color) in enumerate(zip([red_channel, green_channel, blue_channel], ['Red', 'Green', 'Blue'])):
            ax = axes[i, j]
            ax.hist(channel.numpy(), bins=20, color=color.lower(), alpha=0.7)
            ax.set_title(f'{label_name} - {color} Channel')
            ax.set_xlabel('Pixel Intensity')
            ax.set_ylabel('Frequency')
    
    plt.tight_layout()
    plt.show()


plot_color_distribution_per_class(train_images, train_labels, label_names)


## Preprocessing 


Given that the CIFAR-10 dataset is very well maintained (already labeled, pictures dimensions are consistent and distribution between labels is the same) the only thing we really need to do is properly normalize the data and do some train, validation and test split relevant for training later.

In [6]:
normalize_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=train_mean, std=train_std), # using training mean and std for normalization
])

# Reload the datasets with normalization, keep it simple
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=normalize_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=normalize_transform)


Files already downloaded and verified
Files already downloaded and verified


### 3. Modeling structure

The metric used for evaluation in this challenge will be Accuracy. While simple, it is a good metric given that the dataset is very well balanced. 

Since this is a multi-class classification, using cross-entropy as the loss function is logical. 

In [7]:

wandb.login()

api = wandb.Api()

class ModelTrainer:
    def __init__(self, model, train_dataset, test_dataset, optimizer, criterion=torch.nn.CrossEntropyLoss(), batch_size=128, epochs=55, seed=None, experiment="development", track=True, run_name=''):
        self.seed = seed    
        if seed is not None:
            self.set_seed(seed)

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.experiment = experiment
        self.optimizer = optimizer
        self.criterion = criterion
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.model.initialize_weights()

        self.config = self.auto_generate_config()
        self.track = track
        if self.track:
            self.run_name = run_name
            self.init_wandb()

    def set_seed(self, seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)

    def auto_generate_config(self):
        self.lr = self.optimizer.param_groups[0]['lr']
        config = {
            'model_type': self.model.__class__.__name__,
            'experiment': self.experiment,
            'optimizer': self.optimizer.__class__.__name__,
            'learning_rate': self.lr,
            'criterion': self.criterion.__class__.__name__,
            'batch_size': self.batch_size,
            'epochs': self.epochs,
            'device': self.device,
            'seed': self.seed,
            'trainable_params': sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        }
        return config

    def init_wandb(self):
        if not self.run_name:
            self.run_name = f"{self.config['model_type']}_LR{self.config['learning_rate']}_BS{self.config['batch_size']}"

        tags = [self.config['experiment']]
        if DEVELOPMENT:
            tags.append('development')
        
        if CUSTOM_TAGS:
            tags.extend(CUSTOM_TAGS)

        wandb.init(project='del', name=self.run_name, group=self.config['experiment'], config=self.config, tags=tags)
        wandb.watch(self.model, self.criterion, log='all', log_freq=10, log_graph=True)

    def train(self, validate=False):
        self.model.train()
        train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
        total_steps = len(train_loader)
        
        for epoch in range(self.epochs):
            running_loss = 0.0
            correct_predictions = 0
            total_predictions = 0

            for i, (inputs, labels) in enumerate(train_loader):
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                # L1 reg is not supported by PyTorch optimizers natively
                if hasattr(self.model, 'l1_penalty'):
                    l1_loss = self.model.l1_penalty()
                    loss += l1_loss

                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

            epoch_loss = running_loss / total_steps
            epoch_accuracy = correct_predictions / total_predictions
            if self.track:
                wandb.log({'epoch': epoch + 1, 'train_loss': epoch_loss, 'train_accuracy': epoch_accuracy})
            print(f'Epoch [{epoch+1}/{self.epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

            if validate:
                self.validate_epoch(epoch)

    # todo this is 1:1 with test
    def validate_epoch(self, epoch):
        self.model.eval()
        valid_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                running_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

        avg_loss = running_loss / len(valid_loader)
        accuracy = correct_predictions / total_predictions
        if self.track:
            wandb.log({'validation_loss': avg_loss, 'validation_accuracy': accuracy, 'epoch': epoch + 1})
        print(f'Validation - Epoch [{epoch+1}/{self.epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    def test(self):
        self.model.eval()
        test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4, pin_memory=True)
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                running_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

        avg_loss = running_loss / len(test_loader)
        accuracy = correct_predictions / total_predictions
        if self.track:
            wandb.log({'test_loss': avg_loss, 'test_accuracy': accuracy})
        print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

    def run(self, validate=True):
        self.train(validate=validate)
        # we either validate or test, as they share the same test set, so doing both is redundant.
        if not validate:
            self.test()
        if self.track:
            w_run = wandb.run
            wandb.finish()
            return w_run


[34m[1mwandb[0m: Currently logged in as: [33mdmnkf[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
class BasicCNN(nn.Module):
    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super(BasicCNN, self).__init__()
        self.input_channels = input_channels
        self.num_classes = num_classes
 
        if layers is None:
            self.layers = [
                nn.Conv2d(input_channels, 32, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
                
                nn.Conv2d(32, 96, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),

                nn.Flatten(),
                nn.Linear(96 * 8 * 8, 128),
                nn.ReLU(),
                nn.Linear(128, num_classes),
            ]
        else:
            self.layers = layers

        self.features = nn.Sequential(*self.layers)
        
    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):             
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
            elif hasattr(m, 'reset_parameters'):
                m.reset_parameters()



    def forward(self, x):
        x = self.features(x)
        return x

In [None]:
# print total trainable features
model = BasicCNN()
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Total Trainable Parameters: {total_params}')

The `BasicCNN` class provided here is designed as a simple convolutional neural network (CNN) suitable for image classification tasks. The model architecture is chosen to offer a balanced approach between complexity and performance, making it well-suited for tasks that do not require extremely deep networks. It will serve as the base foundation for all following experiments and analysis.

1. **Layer Configuration**: It starts with an initial convolutional layer that takes in images with `input_channels` (defaulting to 3 for RGB images) and outputs 32 feature maps, using a kernel size of 3x3, stride of 1, and padding of 1 to preserve spatial dimensions. This is followed by a ReLU activation function for non-linearity and a max pooling layer to reduce the spatial dimensions by half, enhancing feature extraction while reducing computational load.

2. **Further Convolution and Pooling**: The process is repeated with another convolutional layer reducing the feature maps to 16, followed by ReLU and max pooling. This further helps in capturing more abstract features from the input images while continually reducing data dimensionality.

3. **Flattening and Output**: The network flattens the output of the last pooling layer and feeds it into a fully connected (linear) layer that maps the features to the `num_classes`, which represent the final classification scores for each class.

4. **Weight Initialization**: The model uses He initialization (`kaiming_uniform_`) for convolutional and linear layers to ensure the initial weights are scaled appropriately, reducing the chance of vanishing or exploding gradients, especially important in networks with ReLU activations. (comparison follows)


### Overfit check

To ensure that the model is capable of learning, the following block will train on a single batch (32 images) for train and test. The expectation here is that the model should overfit the train dataset quite easily hitting a perfect accuracy of 1.0. The train loss should smoothly converge to 0, whereas the validation loss will most likely increase over time. However given the small subset, the validation loss and accuracy should be bad as the model should overfit on train data and pick up every bit of noise so that it no longer is capable of generalizing unseen data.

In [11]:
bs = 32

model = BasicCNN()
optimizer = optim.SGD(model.parameters(), lr=0.001)
train_subset = torch.utils.data.Subset(train_dataset, range(bs))
test_subset = torch.utils.data.Subset(test_dataset, range(bs))
trainer = ModelTrainer(model=model, train_dataset=train_subset, test_dataset=test_subset, optimizer=optimizer, experiment='overfit', epochs=30, batch_size=bs, seed=55, track=True)

trainer.run()


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/30], Loss: 3.8694, Accuracy: 0.0625
Validation - Epoch [1/30], Loss: 3.3277, Accuracy: 0.1250
Epoch [2/30], Loss: 2.7890, Accuracy: 0.1875
Validation - Epoch [2/30], Loss: 3.1339, Accuracy: 0.1562
Epoch [3/30], Loss: 2.4539, Accuracy: 0.2500
Validation - Epoch [3/30], Loss: 3.0606, Accuracy: 0.1562
Epoch [4/30], Loss: 2.2021, Accuracy: 0.2812
Validation - Epoch [4/30], Loss: 2.9917, Accuracy: 0.1875
Epoch [5/30], Loss: 2.0029, Accuracy: 0.3750
Validation - Epoch [5/30], Loss: 2.9584, Accuracy: 0.1875
Epoch [6/30], Loss: 1.8376, Accuracy: 0.4062
Validation - Epoch [6/30], Loss: 2.9064, Accuracy: 0.1562
Epoch [7/30], Loss: 1.6954, Accuracy: 0.4062
Validation - Epoch [7/30], Loss: 2.8724, Accuracy: 0.1875
Epoch [8/30], Loss: 1.5761, Accuracy: 0.3750
Validation - Epoch [8/30], Loss: 2.8494, Accuracy: 0.1562
Epoch [9/30], Loss: 1.4744, Accuracy: 0.4688
Validation - Epoch [9/30], Loss: 2.8341, Accuracy: 0.1562
Epoch [10/30], Loss: 1.3847, Accuracy: 0.5938
Validation - Epoch [10/30],

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train_accuracy,▁▂▂▃▃▄▄▃▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇█████
train_loss,█▆▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▅▅██▅█▅▅█▅███████████████████
validation_loss,█▅▄▃▃▂▂▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄

0,1
epoch,30.0
train_accuracy,1.0
train_loss,0.60264
validation_accuracy,0.1875
validation_loss,3.01776


<iframe src="https://wandb.ai/dmnkf/del/reports/overfit--Vmlldzo3NTIzMDc4" style="border:none;height:1024px;width:100%">

#### Results

The results do align with my theory and understanding. Looking at the train loss we can see how it quickly converges and is hitting a perfect accuracy on the train dataset by epoch 20. The validation loss on the other hand does increase as train loss decreases, futher indicating strong overfitting.

## Baseline Model 

For the upcoming experiments and analysis, we will be training the BasicCNN once to serve as baseline for comparisons at each point. The results of this training will be included in every following report of the notebook. Doing this here improves the overall training time as we are reuse the results from this run instead of rerunning it at every step.

The defaults defined here will be used for all upcoming runs if not specified differently.

In [13]:
LEARNING_RATE = 0.001
BATCH_SIZE = 128
EPOCHS = 55

In [14]:
model = BasicCNN()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='baseline', epochs=EPOCHS, batch_size=BATCH_SIZE, seed=55, track=True, run_name=f'baseline_LR{LEARNING_RATE}_BS{BATCH_SIZE}')

trainer.run()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_accuracy,▁▄▅▅▆▇▇▇██
train_loss,█▅▄▄▃▂▂▂▁▁
validation_accuracy,▁▂▄▅▆▆▆▇██
validation_loss,█▇▅▄▃▂▂▂▁▁

0,1
epoch,10.0
train_accuracy,0.66244
train_loss,0.98152
validation_accuracy,0.6253
validation_loss,1.07294


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112327088888681, max=1.0…

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/55], Loss: 1.9839, Accuracy: 0.2995
Validation - Epoch [1/55], Loss: 1.7816, Accuracy: 0.3657
Epoch [2/55], Loss: 1.7136, Accuracy: 0.4002
Validation - Epoch [2/55], Loss: 1.6730, Accuracy: 0.4089
Epoch [3/55], Loss: 1.6097, Accuracy: 0.4362
Validation - Epoch [3/55], Loss: 1.5843, Accuracy: 0.4429
Epoch [4/55], Loss: 1.5393, Accuracy: 0.4627
Validation - Epoch [4/55], Loss: 1.5468, Accuracy: 0.4513
Epoch [5/55], Loss: 1.4843, Accuracy: 0.4855
Validation - Epoch [5/55], Loss: 1.4749, Accuracy: 0.4817
Epoch [6/55], Loss: 1.4390, Accuracy: 0.4999
Validation - Epoch [6/55], Loss: 1.4472, Accuracy: 0.4907
Epoch [7/55], Loss: 1.3999, Accuracy: 0.5133
Validation - Epoch [7/55], Loss: 1.4214, Accuracy: 0.4982
Epoch [8/55], Loss: 1.3643, Accuracy: 0.5268
Validation - Epoch [8/55], Loss: 1.3857, Accuracy: 0.5112
Epoch [9/55], Loss: 1.3332, Accuracy: 0.5374
Validation - Epoch [9/55], Loss: 1.3458, Accuracy: 0.5279
Epoch [10/55], Loss: 1.3048, Accuracy: 0.5463
Validation - Epoch [10/55],

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
train_loss,█▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇████████████
validation_loss,█▇▆▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,55.0
train_accuracy,0.73046
train_loss,0.80049
validation_accuracy,0.6428
validation_loss,1.02851


## Training with SGD, without REG, without BN



### Weight Initialisation
Weight initialisation refers to the process of assigning initial values to the weights of a network's neurons. This setup is crucial because it significantly influences the learning process and the model's performance. Proper initialisation can help avoid issues such as vanishing gradients, where updates to weights become insignificantly small during training, potentially halting the learning process.

**Key Points:**
- **Vanishing Gradient**: This problem can slow down the training or prevent convergence by making updates to the weights too small.
- **Convergence Speed**: Good initialisation methods can speed up convergence by ensuring weights start at a scale that prevents early saturation of neurons.
- **Local Minima**: Properly initialized weights can help the model avoid getting stuck in less optimal local minima during training.

Standards for weight initialization have evolved based on the activation function used in the network. For networks using ReLU activations, He Initialization, which adjusts weights based on the number of incoming nodes to a neuron, is considered best practice. 

**PyTorch Implementation:**
PyTorch applies He initialization for Linear and Conv layers tailored to LeakyReLU activations by default (`gain = sqrt(2 / (1 + negative_slope^2))` the negative slope being `sqrt(5)`). 

[Relevant PyTorch Source Code](https://arc.net/l/quote/juevbrgc)

#### Experiments

To better understand the impact of weight initialisation and its implication we will look at a variety of different initialisations down below and compare them. Given the initialisation done there are different expected outcomes which will then be analysed at the end.


In [None]:
class OneWeightCNN(BasicCNN):

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__(input_channels, num_classes, layers)

    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                nn.init.constant_(m.weight, 1.0)
            

model = OneWeightCNN()
# check if all weights are 1
model.initialize_weights()

# Validate that all weights are set to 1
for name, param in model.named_parameters():
    if "weight" in name:  # Ensure we're only checking weights, not biases
        assert torch.all(param == 1), f"{name} not all ones"
print("All weights verified as 1.")


optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', seed=55, track=True, run_name='OneWeightCNN')
trainer.run()

In [None]:
class RandomWeightCNN(BasicCNN):

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__(input_channels, num_classes, layers)

    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                # Initialize weights with a uniform distribution
                nn.init.uniform_(m.weight, -0.1, 0.1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

In [None]:
model = RandomWeightCNN()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', seed=55, track=True, run_name='RandomWeightCNN')
trainer.run()

In [None]:
class DefaultWeightCNN(BasicCNN):

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__(input_channels, num_classes, layers)

    def initialize_weights(self):
        for layer in self.layers:
            if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()



model = DefaultWeightCNN()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', epochs=EPOCHS, batch_size=BATCH_SIZE, seed=55, track=True, run_name="DefaultCNN")
trainer.run()

In [None]:
## plot wandb weight_init report

%wandb dmnkf/del/reports/weight_init

Given the results of the report we will change the default weight init of Conv2d and Linear Layers

### Hyperparameter Tuning
Für jedes Modell mit gegebener Anzahl Layer und Units pro Layer führe ein sorgfältiges Hyper-Parameter-Tuning durch. Untersuche, wie sich die das Training verändert bei unterschiedlicher Wahl für die Lernrate, in einer separaten Betrachtung auch für die Batch-Grösse. Achte stets darauf, dass das Training stabil läuft. Merke Dir bei jedem Training, den Loss, die Performance Metrik(en) inkl. Schätzfehler, die verwendete Anzahl Epochen, Lernrate und Batch-Grösse. Beachte: Keine Verfahren zur automatischen Hyperparameter-Suche (z.B. kein Bayesian und kein Random Parameter- Sweep Methoden) verwenden!

#### Learning Rate 



In [18]:
lr_epochs = {
    1: 5,
    0.1: 30,
    0.01: 45,
    0.00001: 100
}

for lr, epochs in lr_epochs.items():
    model = BasicCNN()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    trainer = ModelTrainer(model=model, epochs=epochs, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='learning_rate', seed=55, track=True, run_name=f"LR{lr}")
    trainer.run()

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/3], Loss: nan, Accuracy: 0.1000
Validation - Epoch [1/3], Loss: nan, Accuracy: 0.1000
Epoch [2/3], Loss: nan, Accuracy: 0.1000
Validation - Epoch [2/3], Loss: nan, Accuracy: 0.1000
Epoch [3/3], Loss: nan, Accuracy: 0.1000
Validation - Epoch [3/3], Loss: nan, Accuracy: 0.1000


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▅▅██
train_accuracy,▁▁▁
validation_accuracy,▁▁▁

0,1
epoch,3.0
train_accuracy,0.1
train_loss,
validation_accuracy,0.1
validation_loss,


### Batch Sizes

In [14]:

# bs : epochs dictionary to optimize training time
bs_epochs = {
    1000: 100,
    64: 30,
    32: 25,
    16: 15
}


for bs, epochs in bs_epochs.items():
    model = BasicCNN()
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='batch_size', batch_size=bs, epochs=epochs, seed=55, track=True, run_name=f"BS{bs}")
    trainer.run()

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/50], Loss: 2.2946, Accuracy: 0.2038
Validation - Epoch [1/50], Loss: 2.0561, Accuracy: 0.2649
Epoch [2/50], Loss: 1.9627, Accuracy: 0.3010
Validation - Epoch [2/50], Loss: 1.9033, Accuracy: 0.3233
Epoch [3/50], Loss: 1.8513, Accuracy: 0.3410
Validation - Epoch [3/50], Loss: 1.8213, Accuracy: 0.3542
Epoch [4/50], Loss: 1.7826, Accuracy: 0.3677
Validation - Epoch [4/50], Loss: 1.7656, Accuracy: 0.3795
Epoch [5/50], Loss: 1.7330, Accuracy: 0.3860
Validation - Epoch [5/50], Loss: 1.7232, Accuracy: 0.3902
Epoch [6/50], Loss: 1.6932, Accuracy: 0.4012
Validation - Epoch [6/50], Loss: 1.6898, Accuracy: 0.4075
Epoch [7/50], Loss: 1.6597, Accuracy: 0.4132
Validation - Epoch [7/50], Loss: 1.6622, Accuracy: 0.4174
Epoch [8/50], Loss: 1.6309, Accuracy: 0.4234
Validation - Epoch [8/50], Loss: 1.6358, Accuracy: 0.4264
Epoch [9/50], Loss: 1.6061, Accuracy: 0.4334
Validation - Epoch [9/50], Loss: 1.6126, Accuracy: 0.4303
Epoch [10/50], Loss: 1.5839, Accuracy: 0.4423
Validation - Epoch [10/50],

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
train_loss,█▆▅▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▂▃▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
validation_loss,█▇▆▅▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,50.0
train_accuracy,0.58136
train_loss,1.20244
validation_accuracy,0.5457
validation_loss,1.28399


## Complexity
Variiere die Anzahl Layer und Anzahl Units pro Layer, um eine möglichst gute Performance zu erreichen. Falls auch CNNs (ohne Transfer-Learning) verwendet werden variiere auch Anzahl Filter, Kernel-Grösse, Stride, Padding.

To simplify the training of different complexities without generating too much code I wrote the create_model function which will generate the model given a configuration.

In [10]:

def create_model(num_conv_layers, conv_channels, kernel_size, stride, padding,
                 input_channels=3, num_classes=10, input_size=32, linear_layer_sizes=[]):
    assert len(conv_channels) == num_conv_layers, "Length of conv_channels must match num_conv_layers"
    assert len(kernel_size) == num_conv_layers, "Length of kernel_size must match num_conv_layers"
    assert len(stride) == num_conv_layers, "Length of stride must match num_conv_layers"
    assert len(padding) == num_conv_layers, "Length of padding must match num_conv_layers"

    layers = []
    current_size = input_size
    channels_in = input_channels

    # Building convolutional layers
    for i in range(num_conv_layers):
        layers += [
            nn.Conv2d(channels_in, conv_channels[i], kernel_size=kernel_size[i], stride=stride[i], padding=padding[i]),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        ]
        # Update the current size after each conv and pooling operation
        current_size = (current_size + 2 * padding[i] - kernel_size[i]) // stride[i] + 1
        current_size = (current_size - 2) // 2 + 1
        channels_in = conv_channels[i]

    layers.append(nn.Flatten())

    # Calculating the flattened size for the first linear layer
    current_size = current_size * current_size * channels_in

    if linear_layer_sizes:
        for i, layer_size in enumerate(linear_layer_sizes):
            if i == 0:  # The first linear layer after flattening
                layers.append(nn.Linear(current_size, layer_size))
            else:
                layers.append(nn.Linear(linear_layer_sizes[i-1], layer_size))
            if i < len(linear_layer_sizes) - 1:  # No activation after the last specified layer before output
                layers.append(nn.ReLU())
        # Final layer to number of classes
        layers.append(nn.Linear(linear_layer_sizes[-1], num_classes))
    else:
        layers.append(nn.Linear(current_size, num_classes))

    return BasicCNN(input_channels, num_classes, layers)

### Layers
To get started, we will explore the impact of varying the number of layers in our neural network models. We will experiment with increasing the number of convolutional and fully connected layers to observe how this affects the model's performance across different metrics.

Adding more layers can potentially increase a model’s ability to learn nuanced features of the data due to a higher parameter count, which might lead to improved performance on complex tasks. However, this increase in complexity not only raises computational costs and training duration but also heightens the risk of overfitting. Overfitting can severely impair a model’s ability to generalize from the training data to unseen data, potentially resulting in poorer overall performance.

To mitigate these risks, techniques such as dropout, L2 regularization, and data augmentation can be employed to enhance model generalization. (will be looked at later)


In [None]:
# Filter Size Experiment
model_128_128ch = create_model(
    num_conv_layers=2,
    linear_layer_sizes=[128],
    conv_channels=[128, 128],
    kernel_size=[3, 3],
    stride=[1, 1],
    padding=[1, 1]
)

model_32_16ch = create_model(
    num_conv_layers=2,
    linear_layer_sizes=[128],
    conv_channels=[32, 16],
    kernel_size=[3, 3],
    stride=[1, 1],
    padding=[1, 1]
)

model_32_64_128ch = create_model(
    num_conv_layers=3,
    linear_layer_sizes=[128],
    conv_channels=[32, 64, 128],
    kernel_size=[3, 3, 3],
    stride=[1, 1, 1],
    padding=[1, 1, 1]
)

model_128_256_512ch = create_model(
    num_conv_layers=3,
    linear_layer_sizes=[128],
    conv_channels=[128, 256, 512],
    kernel_size=[3, 3, 3],
    stride=[1, 1, 1],
    padding=[1, 1, 1]
)

model_96_256_512_1024ch = create_model(
    num_conv_layers=4,
    linear_layer_sizes=[128],
    conv_channels=[96, 256, 512, 1024],
    kernel_size=[3, 3, 3, 3],
    stride=[1, 1, 1, 1],
    padding=[1, 1, 1, 1]
)


layer_dict = {
    "128_128ch_128L": model_128_128ch,
    "32_16ch_128L": model_32_16ch,
    "32_64_128ch_128L": model_32_64_128ch,
    "128_256_512ch_128L": model_128_256_512ch,
    "96_256_512_1024ch_128L": model_96_256_512_1024ch,
}

for name, model in layer_dict.items():
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='filter_size', seed=55, track=True, run_name=name)
    trainer.run()

### Kernel Size

The kernel size in convolutional layers plays a pivotal role in determining how effectively a CNN captures spatial information and feature granularity from input images. Larger kernels encompass a broader view of the input at each convolution step, enabling the model to capture more global features within fewer layers. This can be advantageous for identifying larger patterns but may increase computational cost and risk overfitting due to the greater number of parameters. Conversely, smaller kernels focus on finer details and require more layers to achieve a comparable field of view, promoting sensitivity to local features without drastically increasing the parameter count.

#### Expectation for CIFAR-10

Given that CIFAR-10 consists of relatively small images (32x32 pixels) with objects like animals and vehicles that often occupy much of the frame, smaller to medium kernel sizes (e.g., 3x3 or 5x5) are typically preferred. These sizes are likely to be optimal for balancing detailed feature extraction with computational efficiency while avoiding the unnecessary complexity that larger kernels might introduce. For CIFAR-10, we would expect smaller kernels to perform better, allowing the network to learn detailed textures and shapes effectively, leading to superior generalization on this dataset.

In [None]:
model_5x5kern = create_model(num_conv_layers=2, 
                             linear_layer_sizes=[128],
                             conv_channels=[32, 96],
                             kernel_size=[5, 5], 
                             stride=[1, 1],
                             padding=[2, 2])

model_7x7kern = create_model(num_conv_layers=2, 
                             linear_layer_sizes=[128],
                             conv_channels=[32, 96],
                             kernel_size=[7, 7],
                             stride=[1, 1],
                             padding=[3, 3])

model_15x15kern = create_model(num_conv_layers=2, 
                             linear_layer_sizes=[128],
                             conv_channels=[32, 96],
                             kernel_size=[15, 15],
                             stride=[1, 1],
                             padding=[7, 7])

kernel_dict = {
    "5x5K_1S_2P": model_5x5kern,
    "7x7K_1S_3P": model_7x7kern,
    "15x15K_1S_7P": model_15x15kern,
}


for name, model in kernel_dict.items():
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='kernel_size', seed=55, track=True, run_name=name)
    trainer.run()


### Stride

Stride defines the step size at which the kernel moves across the input image during a convolution operation in a convolutional neural network (CNN). Adjusting the stride affects how quickly the spatial dimensions of the output feature maps are reduced. A larger stride results in more aggressive down-sampling, reducing the output size quickly, which can decrease the computational load and speed up the processing time. However, a larger stride may also cause the network to miss finer details in the input, potentially leading to a loss in accuracy if important features are skipped. Conversely, a smaller stride tends to preserve spatial resolution better, capturing more detailed information but at the cost of increased computational complexity.

#### Expectation for CIFAR-10

For CIFAR-10, where the images are relatively small (32x32 pixels) and the important features such as edges and color blocks are closely packed, a smaller stride (e.g., stride of 1) is generally more appropriate. This setting helps in capturing fine details without losing important spatial information, essential for accurately classifying objects in such small images. Using a stride of 1 maximizes the amount of learned detail, enhancing the model's ability to distinguish between similar categories by focusing on subtle differences in features. This approach is likely to yield better performance on the CIFAR-10 dataset by preserving critical information during the convolution phases.

In [None]:
model_Stride2 = create_model(num_conv_layers=2, 
                             conv_channels=[32, 96],
                             linear_layer_sizes=[128],
                             kernel_size=[3, 3], 
                             stride=[2, 2], 
                             padding=[1, 1])

model_Stride3 = create_model(num_conv_layers=2, 
                             conv_channels=[32, 96],
                             linear_layer_sizes=[128],
                             kernel_size=[3, 3], 
                             stride=[3, 3], 
                             padding=[1, 1])

# todo add best kernel size from last check

stride_dict = {
    "3x3K_2S_1P": model_Stride2,
    "3x3K_3S_1P": model_Stride3,
}

for name, model in stride_dict.items():
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='stride_size', seed=55, track=True, run_name=name)
    trainer.run()


### Padding


In [None]:
model_0pad = create_model(num_conv_layers=2, 
                          conv_channels=[32, 96],
                          linear_layer_sizes=[128],
                          kernel_size=[3, 3], 
                          stride=[1, 1], 
                          padding=[0, 0])

# todo add best kernel

padding_dict = {
    "3x3K_1S_0P": model_0pad,
}

for name, model in padding_dict.items():
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='padding', seed=55, track=True, run_name=name)
    trainer.run()

### Linear


In [None]:
# Linear Layers Experiment
# no addtional linear layer
model_1Lin = create_model(num_conv_layers=2, linear_layer_sizes=[], conv_channels=[32, 96],
                          kernel_size=[3, 3], stride=[1, 1], padding=[1, 1])
model_3Lin = create_model(num_conv_layers=2, linear_layer_sizes=[256, 128], conv_channels=[32, 96],
                          kernel_size=[3, 3], stride=[1, 1], padding=[1, 1])

linear_dict = {
    "32_96CH": model_1Lin,
    "32_96CH_256_128L": model_3Lin,
}

for name, model in linear_dict.items():
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='linear_layers', seed=55, track=True, run_name=name)
    trainer.run()

### Regularization

Ziehe nun verschiedene Regularisierungsmethoden bei den MLP Layern in Betracht:
a. L1/L2 Weight Penalty
b. Dropout
Evaluiere den Nutzen der Regularisierung, auch unter Berücksichtigung verschiedener Regularisierungsstärken.
Beschreibe auch kurz, was allgemein das Ziel von Regularisierungsmethoden ist (Regularisierung im Allgemeinen, sowie auch Idee der einzelnen Methoden). Inwiefern wird dieses Ziel im gegebenen Fall erreicht?

In [None]:
weight_decays = [0.1, 0.01, 0.001, 0.0001, 0.00001]

for weight_decay in weight_decays:
    # BasicCNN with LR 0.1 did overfit
    model = BasicCNN()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, weight_decay=weight_decay) 

    trainer = ModelTrainer(model=model, epochs=50, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='regularization', seed=55, track=True, run_name=f'REGL2_{weight_decay}')
    trainer.run()


In [None]:

class CNNRegL1(BasicCNN):
    def __init__(self, l1_strength=0.0005, **kwargs):
        super(CNNRegL1, self).__init__(**kwargs)
        self.l1_strength = l1_strength

    def l1_penalty(self):
        """
        Calculate the L1 penalty for the model's weights only, excluding biases.
        This method iterates over all parameters that require gradients and have more than one dimension,
        which generally corresponds to the weights of the model.
        """
        l1_norm = sum(p.abs().sum() for p in self.parameters() if p.requires_grad and len(p.shape) > 1)
        return self.l1_strength * l1_norm



In [None]:

l1_strengths = [0.1, 0.01, 0.001, 0.0001, 0.00001]

for l1_strength in l1_strengths:
    model = CNNRegL1(input_channels=3, num_classes=10, l1_strength=l1_strength)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1) 

    trainer = ModelTrainer(model=model, epochs=50, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='regularization', seed=55, track=True, run_name=f'REGL1_{l1_strength}')
    trainer.run()


In [None]:
# TODO rethink this
class DropoutCNN(BasicCNN):
    def __init__(self, input_channels=3, num_classes=10, dropout_rate=0.5):
        layers = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate),

            # Third convolutional block
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate),

            # Flattening the output for the fully connected layer
            nn.Flatten(),

            # Fully connected layer
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),

            # Output layer
            nn.Linear(256, num_classes)
        )
        super().__init__(input_channels, num_classes, layers)
        
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5]

for dropout_rate in dropout_rates:
    model = DropoutCNN(input_channels=3, num_classes=10, dropout_rate=dropout_rate)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='regularization', epochs=EPOCHS, batch_size=BATCH_SIZE, seed=55, track=True, run_name=f'DropoutCNN_{dropout_rate}')
    trainer.run()



## Use of Batchnorm  



In [None]:
class BatchNormCNN(BasicCNN):
    def __init__(self, input_channels=3, num_classes=10):
        layers = nn.Sequential(
            nn.Conv2d(input_channels, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Flatten(),

            nn.Linear(512 * 4 * 4, num_classes)
        )
        super().__init__(input_channels, num_classes, layers)
            

model = BatchNormCNN(input_channels=3, num_classes=10)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='batchnorm', seed=55, track=True, run_name='BatchNormCNN')
trainer.run()


## Use of Adam



In [None]:

adam_learning_rates = [0.1, 0.01, 0.001, 0.0001, 0.00001]
adam_weight_decays = [0.1, 0.01, 0.001, 0.0001, 0.00001]

for lr in adam_learning_rates:
    model = BasicCNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='adam', seed=55, track=True, run_name=f'Adam_LR{lr}')
    trainer.run()

for weight_decay in adam_weight_decays:
    model = BasicCNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=weight_decay)

    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='adam', seed=55, track=True, run_name=f'Adam_WD{weight_decay}')
    trainer.run()
