In [None]:
# relevant for lightning.ai studio
%cd del_mc1/

In [2]:
# base libraries
import numpy as np
import matplotlib.pyplot as plt
import pickle
import random
from collections import OrderedDict

# ML related libraries
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
# todo do I need this?
from torcheval.metrics import MulticlassAccuracy
from torchvision import datasets, transforms

from torch.utils.data import DataLoader
import torch.nn as nn
import copy

# mlops
import wandb

%env WANDB_NOTEBOOK_NAME=mc1.ipynb

torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Tag ensure that wandb won't be cluttered
DEVELOPMENT = True

env: WANDB_NOTEBOOK_NAME=mc1.ipynb


In [3]:

# Un-normalised pipeline: the only transform applied is ToTensor.
transform = transforms.Compose([
    transforms.ToTensor(),
])

# Load CIFAR-10 training data
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
# batch_size=len(dataset) makes the loader yield the whole split as one batch,
# so next(iter(...)) returns every image and label of the split at once.
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset))
train_images, train_labels = next(iter(train_loader))

# Load CIFAR-10 test data
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
# Same single-batch trick for the test split.
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))
test_images, test_labels = next(iter(test_loader))


Files already downloaded and verified
Files already downloaded and verified


In [None]:

# Print shapes of the datasets to verify
print('train_images shape:', train_images.shape)
print('train_labels shape:', train_labels.shape)
print('test_images shape:', test_images.shape)
print('test_labels shape:', test_labels.shape)

# Accessing label names
# get all labels 
label_names = train_dataset.classes
print('label_names size:', len(label_names))
print('label_names:', label_names)


## Exploratory Data Analysis 
This section provides some insight into the CIFAR-10 dataset, which will help to inform the preprocessing and modelling decisions made later on.

In [None]:
np.unique(train_labels, return_counts=True)[1], np.unique(test_labels, return_counts=True)[1]

The labels are well distributed.

In [None]:

def verify_label_distribution(labels, label_names):
    """Print the number of samples per class and return the counts.

    Args:
        labels: array-like of integer class ids.
        label_names: sequence mapping a class id to its readable name.

    Returns:
        dict mapping each class id that occurs in ``labels`` to its count,
        in ascending class-id order.
    """
    class_ids, class_counts = np.unique(labels, return_counts=True)
    distribution = {class_id: n_samples for class_id, n_samples in zip(class_ids, class_counts)}
    for class_id in distribution:
        print(f'{label_names[class_id]}: {distribution[class_id]}')
    return distribution

def display_example_images(images, labels, label_names):
    """Show the first image of every class in a 2x5 grid.

    Args:
        images: indexable batch of images; each image is permuted with
            (1, 2, 0) before plotting (assumed channels-first — TODO confirm).
        labels: class id per image, compared against the position of each
            name in ``label_names``.
        label_names: class names; the grid has room for ten of them.
    """
    figure, grid = plt.subplots(nrows=2, ncols=5, figsize=(20, 8))
    panels = grid.flatten()

    for class_id, class_name in enumerate(label_names):
        # Position of the first sample carrying this class id.
        first_hit = np.where(labels == class_id)[0][0]
        # Move axis 0 to the end so imshow receives (H, W, C).
        picture = images[first_hit].permute(1, 2, 0)

        panel = panels[class_id]
        panel.imshow(picture)
        panel.set_title(class_name)
        panel.title.set_size(20)
        panel.axis('off')

    plt.tight_layout()
    plt.show()


print("Training Labels Distribution:")
train_distribution = verify_label_distribution(train_labels, label_names)
print("\nTest Labels Distribution:")
test_distribution = verify_label_distribution(test_labels, label_names)

print("\nExample Images from Each Class:")
display_example_images(train_images, train_labels, label_names)




In [None]:
# todo maybe plot more images?

Let's calculate the mean and standard deviation for each color channel in the CIFAR-10 dataset.

#### Reasoning
This step is crucial for normalizing (relevant for preprocessing) the dataset, ensuring consistent model training and faster convergence.


In [4]:
def calculate_mean_std(loader):
    """Compute the per-channel mean and standard deviation over a whole loader.

    Per-channel sums and squared sums are accumulated across all batches and
    divided by the total number of values per channel. Unlike averaging the
    per-batch means, this stays exact when batches differ in size (for
    example a smaller last batch).

    Args:
        loader: iterable yielding ``(data, target)`` pairs. ``data`` is
            reduced over dims 0, 2 and 3, so dim 1 is treated as the channel
            axis.

    Returns:
        tuple ``(mean, std)`` of 1-D tensors with one entry per channel.
        ``std`` is the population (biased) standard deviation.

    Raises:
        ValueError: if the loader yields no values at all.
    """
    channels_sum, channels_squared_sum, num_values = 0, 0, 0
    out_dtype = torch.get_default_dtype()

    for data, _ in loader:
        if data.is_floating_point():
            out_dtype = data.dtype
        # Accumulate in float64 to keep round-off small on large datasets;
        # the dtype argument avoids materialising a float64 copy of the batch.
        channels_sum += torch.sum(data, dim=[0, 2, 3], dtype=torch.float64)
        channels_squared_sum += torch.sum(data ** 2, dim=[0, 2, 3], dtype=torch.float64)
        # Number of values contributing to each channel in this batch.
        num_values += data.size(0) * data.size(2) * data.size(3)

    if num_values == 0:
        raise ValueError("calculate_mean_std received a loader without any data")

    mean = channels_sum / num_values
    # Round-off can push the variance marginally below zero; clamp to avoid NaN.
    variance = (channels_squared_sum / num_values - mean ** 2).clamp(min=0)
    std = variance.sqrt()

    return mean.to(out_dtype), std.to(out_dtype)

train_mean, train_std = calculate_mean_std(train_loader)
test_mean, test_std = calculate_mean_std(test_loader)

print(f"Training Mean: {train_mean}, Training Std: {train_std}")
print(f"Test Mean: {test_mean}, Test Std: {test_std}")


Training Mean: tensor([0.4914, 0.4822, 0.4465]), Training Std: tensor([0.2470, 0.2435, 0.2616])
Test Mean: tensor([0.4942, 0.4851, 0.4504]), Test Std: tensor([0.2467, 0.2429, 0.2616])


Let's plot histograms of pixel values for the Red, Green, and Blue channels.

#### Reasoning
Understanding pixel intensity distribution aids in identifying dataset biases and informs necessary preprocessing adjustments.


In [None]:
def plot_pixel_histograms(images):
    """Plot one pixel-intensity histogram per colour channel, side by side.

    Args:
        images: 4-D tensor whose dim 1 has size 3; dims 0 and 1 are swapped
            and everything else is flattened so each row holds one channel.
    """
    per_channel = images.permute(1, 0, 2, 3).reshape(3, -1)
    plt.figure(figsize=(15, 5))
    for position, (channel_pixels, color) in enumerate(zip(per_channel, 'rgb'), start=1):
        plt.subplot(1, 3, position)
        # todo validate bins
        plt.hist(channel_pixels.numpy(), bins=20, color=color, alpha=0.7)
        plt.title(f'{color.upper()} Channel')
        plt.ylabel('Frequency')
        plt.xlabel('Pixel Intensity')
    plt.tight_layout()
    plt.show()

plot_pixel_histograms(train_images)


In [None]:
def plot_color_distribution_per_class(images, labels, label_names):
    """Plot a grid of pixel-intensity histograms: one row per class, one column per channel.

    Args:
        images: 4-D tensor indexed as ``images[mask][:, channel, :, :]``.
        labels: class id per image, compared against the position of each
            name in ``label_names``.
        label_names: class names, one grid row each.
    """
    channel_names = ['Red', 'Green', 'Blue']
    n_rows = len(label_names)
    figure, grid = plt.subplots(nrows=n_rows, ncols=3, figsize=(20, n_rows * 4))

    for row, class_name in enumerate(label_names):
        members = images[labels == row]

        for col, channel_name in enumerate(channel_names):
            # All pixel values of this channel across the class, as one flat vector.
            pixels = members[:, col, :, :].flatten()
            panel = grid[row, col]
            panel.hist(pixels.numpy(), bins=20, color=channel_name.lower(), alpha=0.7)
            panel.set_title(f'{class_name} - {channel_name} Channel')
            panel.set_xlabel('Pixel Intensity')
            panel.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()


plot_color_distribution_per_class(train_images, train_labels, label_names)


## Preprocessing 


Given that the CIFAR-10 dataset is very well maintained (already labeled, picture dimensions are consistent and the label distribution is balanced), the only things we really need to do are to normalize the data properly and to split it into train, validation and test sets for the training later on.

In [5]:
# ToTensor followed by per-channel normalisation with the statistics computed
# earlier by calculate_mean_std.
normalize_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=train_mean, std=train_std), # using training mean and std for normalization
])

# Reload the datasets with normalization, keep it simple
# Both splits are normalised with the *training* statistics; this rebinds the
# train_dataset / test_dataset names created by the raw loading cell.
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=normalize_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=normalize_transform)


Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Fraction of the training indices held out for validation.
validation_split = 0.2
num_train = len(train_dataset)
indices = list(range(num_train))
# Number of samples that go into the validation part.
split = int(np.floor(validation_split * num_train))

# NOTE(review): no seed is set before this shuffle in the cells shown, so the
# split changes between kernel restarts — confirm whether that is intended.
np.random.shuffle(indices)

# The first `split` shuffled indices form the validation set, the rest training.
train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)


In [None]:
batch_size = 32

# Train and validation loaders read the same dataset object but are restricted
# to the index lists held by their samplers.
# NOTE(review): the ModelTrainer classes below build their own DataLoaders from
# the datasets passed to them — verify whether these loaders are still used.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
valid_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=valid_sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


### 3. Modeling structure

The metric used for evaluation in this challenge will be Accuracy. While simple, it is a good metric given that the dataset is very well balanced. 

Since this is a multi-class classification, using cross-entropy as the loss function is logical. 

In [None]:
class ModelTrainer:
    """First version of the training/evaluation helper with optional wandb logging.

    NOTE(review): a second ``class ModelTrainer`` is defined in a later cell of
    this notebook; once that cell has run, it replaces this definition.
    """

    def __init__(self, model, train_dataset, test_dataset, optimizer, criterion=nn.CrossEntropyLoss(), batch_size=32, epochs=10, seed=None, regularization=None, experiment="development", track=True, run_name=''):
        """Store the training setup, re-initialise the model and optionally start a wandb run.

        Args:
            model: module exposing ``initialize_weights()``; moved to the selected device.
            train_dataset: dataset used by ``train``.
            test_dataset: dataset used by ``test``.
            optimizer: optimizer stepped during training; its first param
                group's ``lr`` is recorded in the config.
            criterion: loss callable applied to ``(outputs, labels)``.
            batch_size: batch size of the loaders built by ``train`` and ``test``.
            epochs: number of passes over ``train_dataset``.
            seed: if not None, seeds torch, numpy and ``random``.
            regularization: only stored and written to the config / run name.
            experiment: used as the wandb group.
            track: when true, ``wandb.init`` is called here.
            run_name: accepted but not used by this version; ``init_wandb``
                always derives the run name from the config.
        """

        self.seed = seed    
        if seed is not None:
            self.set_seed(seed)

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.experiment = experiment


        self.optimizer = optimizer
        self.criterion = criterion
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        # reinitialize weights
        self.model.initialize_weights()
        self.regularization = regularization


        self.config = self.auto_generate_config()
        self.track = track
        if self.track:
            self.init_wandb()

    def set_seed(self, seed):
        """Seed torch (CPU and all CUDA devices), numpy and ``random`` with ``seed``."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)

    def auto_generate_config(self):
        """Return a dict describing this run, built from the stored attributes."""
        # Only the first parameter group's learning rate is recorded.
        lr = self.optimizer.param_groups[0]['lr']
        config = {
            'model_type': self.model.__class__.__name__,
            'experiment': self.experiment,
            'optimizer': self.optimizer.__class__.__name__,
            'learning_rate': lr,
            'criterion': self.criterion.__class__.__name__,
            'batch_size': self.batch_size,
            'epochs': self.epochs,
            'device': self.device,
            'seed': self.seed,
            'regularization': self.regularization
        }
        return config

    def init_wandb(self):
        """Start a wandb run in project 'del', named and grouped from the config."""
        run_name = f"{self.config['model_type']}_LR{self.config['learning_rate']}_BS{self.config['batch_size']}_OP{self.config['optimizer']}_REG{self.config['regularization']}"
        group = f"{self.config['experiment']}"
        wandb.init(project='del', name=run_name, group=group, config=self.config)

    def train(self):
        """Train for ``self.epochs`` epochs, printing (and optionally logging) loss and accuracy per epoch."""
        self.model.train()
        train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        total_steps = len(train_loader)
        
        for epoch in range(self.epochs):
            running_loss = 0.0
            correct_predictions = 0
            total_predictions = 0

            for i, (inputs, labels) in enumerate(train_loader):
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                
                self.optimizer.zero_grad()
                
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()
                # Predicted class = index of the largest output along dim 1.
                _, predicted = torch.max(outputs.data, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

            # Mean of the per-batch losses (not weighted by batch size).
            epoch_loss = running_loss / total_steps
            epoch_accuracy = correct_predictions / total_predictions
            if self.track:
                wandb.log({'epoch': epoch + 1, 'train_loss': epoch_loss, 'train_accuracy': epoch_accuracy})
            print(f'Epoch [{epoch+1}/{self.epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')


    
    def test(self):
        """Evaluate on ``self.test_dataset`` without gradients; print and optionally log loss and accuracy."""
        self.model.eval()
        test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)
        total_steps = len(test_loader)
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                running_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

        # Mean of the per-batch losses (not weighted by batch size).
        avg_loss = running_loss / total_steps
        accuracy = correct_predictions / total_predictions
        if self.track:
            wandb.log({'test_loss': avg_loss, 'test_accuracy': accuracy})
        print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

    def run(self):
        """Train, then test, then close the wandb run if tracking is enabled."""
        self.train()
        self.test()
        
        if self.track:
            wandb.finish()


In [6]:
import torch
from torch.utils.data import DataLoader
import wandb

class ModelTrainer:
    """Train a model and evaluate it, optionally logging to Weights & Biases.

    The trainer builds its own DataLoaders from the datasets it is given,
    re-initialises the model through ``model.initialize_weights()`` and, when
    ``track`` is true, opens a wandb run in ``__init__`` that ``run`` closes.
    """

    def __init__(self, model, train_dataset, test_dataset, optimizer, criterion=torch.nn.CrossEntropyLoss(), batch_size=32, epochs=40, seed=None, experiment="development", track=True, run_name='', num_workers=4):
        """Store the training setup, re-initialise the model and optionally start a wandb run.

        Args:
            model: module exposing ``initialize_weights()``; moved to the selected device.
            train_dataset: dataset used for training.
            test_dataset: dataset used by both ``validate_epoch`` and ``test``.
            optimizer: optimizer stepped during training; its first param
                group's ``lr`` is recorded in the config.
            criterion: loss callable applied to ``(outputs, labels)``.
            batch_size: batch size of every loader this trainer builds.
            epochs: number of passes over ``train_dataset``.
            seed: if not None, seeds torch, numpy and ``random``.
            experiment: used as the wandb group.
            track: when true, a wandb run is started here.
            run_name: wandb run name; when empty one is derived from the config.
            num_workers: worker processes per DataLoader (0 loads in the main process).
        """
        self.seed = seed
        if seed is not None:
            self.set_seed(seed)

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.experiment = experiment
        self.optimizer = optimizer
        self.criterion = criterion
        self.batch_size = batch_size
        self.epochs = epochs
        self.num_workers = num_workers
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Pinned host memory only helps host-to-GPU copies, so skip it on CPU.
        self.pin_memory = self.device == 'cuda'
        self.model = model.to(self.device)
        # Re-initialise in place so the optimizer keeps pointing at the same parameters.
        self.model.initialize_weights()

        self.config = self.auto_generate_config()
        self.track = track
        self.run_name = run_name
        if self.track:
            self.init_wandb()

    def set_seed(self, seed):
        """Seed torch (CPU and all CUDA devices), numpy and ``random`` with ``seed``."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)

    def auto_generate_config(self):
        """Return a dict describing this run, built from the stored attributes."""
        # Only the first parameter group's learning rate is recorded.
        self.lr = self.optimizer.param_groups[0]['lr']
        config = {
            'model_type': self.model.__class__.__name__,
            'experiment': self.experiment,
            'optimizer': self.optimizer.__class__.__name__,
            'learning_rate': self.lr,
            'criterion': self.criterion.__class__.__name__,
            'batch_size': self.batch_size,
            'epochs': self.epochs,
            'device': self.device,
            'seed': self.seed,
        }
        return config

    def init_wandb(self):
        """Start a wandb run in project 'del' and attach gradient/parameter watching."""
        if not self.run_name:
            self.run_name = f"{self.config['model_type']}_LR{self.config['learning_rate']}_BS{self.config['batch_size']}"

        # Runs started while the notebook-level DEVELOPMENT flag is set get tagged.
        tags = []
        if DEVELOPMENT:
            tags = ['development']

        wandb.init(project='del', name=self.run_name, group=self.config['experiment'], config=self.config, tags=tags)
        wandb.watch(self.model, self.criterion, log='all', log_freq=10, log_graph=True)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader over ``dataset`` with this trainer's loader settings."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=self.num_workers, pin_memory=self.pin_memory)

    def _evaluate(self, dataset):
        """Run the model over ``dataset`` in eval mode without gradients.

        Returns:
            tuple ``(avg_loss, accuracy)``; ``avg_loss`` is the mean of the
            per-batch losses and ``accuracy`` the fraction of correct predictions.
        """
        self.model.eval()
        loader = self._make_loader(dataset, shuffle=False)
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for inputs, labels in loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                running_loss += loss.item()
                # Predicted class = index of the largest output along dim 1.
                _, predicted = torch.max(outputs, 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

        return running_loss / len(loader), correct_predictions / total_predictions

    def train(self, validate=False):
        """Train for ``self.epochs`` epochs.

        Args:
            validate: when true, ``validate_epoch`` is run after every epoch.
        """
        train_loader = self._make_loader(self.train_dataset, shuffle=True)
        total_steps = len(train_loader)

        for epoch in range(self.epochs):
            # validate_epoch() leaves the model in eval mode, so training mode
            # has to be restored at the start of every epoch, not just once.
            self.model.train()
            running_loss = 0.0
            correct_predictions = 0
            total_predictions = 0

            for inputs, labels in train_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                # L1 reg is not supported by PyTorch optimizers natively
                if hasattr(self.model, 'l1_penalty'):
                    loss = loss + self.model.l1_penalty()

                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()
                _, predicted = torch.max(outputs.detach(), 1)
                total_predictions += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

            # Mean of the per-batch losses (not weighted by batch size).
            epoch_loss = running_loss / total_steps
            epoch_accuracy = correct_predictions / total_predictions
            if self.track:
                wandb.log({'epoch': epoch + 1, 'train_loss': epoch_loss, 'train_accuracy': epoch_accuracy})
            print(f'Epoch [{epoch+1}/{self.epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

            if validate:
                self.validate_epoch(epoch)

    def validate_epoch(self, epoch):
        """Evaluate on ``self.test_dataset`` after an epoch; print and optionally log the result.

        Args:
            epoch: zero-based epoch index; reported as ``epoch + 1``.
        """
        avg_loss, accuracy = self._evaluate(self.test_dataset)
        if self.track:
            wandb.log({'validation_loss': avg_loss, 'validation_accuracy': accuracy, 'epoch': epoch + 1})
        print(f'Validation - Epoch [{epoch+1}/{self.epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    def test(self):
        """Evaluate on ``self.test_dataset``; print and optionally log the result."""
        avg_loss, accuracy = self._evaluate(self.test_dataset)
        if self.track:
            wandb.log({'test_loss': avg_loss, 'test_accuracy': accuracy})
        print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

    def run(self, validate=True):
        """Train (optionally validating each epoch), test if not validating, then close the wandb run.

        Args:
            validate: forwarded to ``train``; when false, ``test`` runs once after training.
        """
        try:
            self.train(validate=validate)
            # we either validate or test, as they share the same test set, so doing both is redundant.
            if not validate:
                self.test()
        finally:
            # Close the run even when training fails so no wandb run is left open.
            if self.track:
                wandb.finish()


In [7]:

class BasicCNN(nn.Module):
    """Small configurable CNN whose layers live in ``self.layers`` / ``self.features``.

    Args:
        input_channels: number of channels of the input images.
        num_classes: size of the output layer of the default architecture.
        layers: optional list of modules to use instead of the default
            two-conv architecture; they are wrapped in an ``nn.Sequential``.
    """

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__()
        self.input_channels = input_channels
        self.num_classes = num_classes

        if layers is None:
            self.layers = [
                nn.Conv2d(input_channels, 32, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),

                nn.Conv2d(32, 96, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),

                # global pool
                nn.AdaptiveMaxPool2d(1),
                nn.Flatten(),
                nn.Linear(96, num_classes),
            ]
        else:
            self.layers = layers

        self.features = nn.Sequential(*self.layers)

    def initialize_weights(self):
        """Re-initialise weight matrices/kernels with Kaiming-uniform (ReLU gain).

        Every registered sub-module (including nested ones) that has
        ``reset_parameters`` and a weight with at least two dimensions is
        re-initialised. Modules with 1-D weights (e.g. batch norm) or without
        a weight are skipped, because Kaiming initialisation cannot compute a
        fan-in for them and would raise. Biases are left untouched.
        """
        for module in self.modules():
            if not hasattr(module, 'reset_parameters'):
                continue
            weight = getattr(module, 'weight', None)
            if weight is None or weight.dim() < 2:
                continue
            nn.init.kaiming_uniform_(weight, nonlinearity='relu')

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.features(x)

### Overfit check

In [9]:

# Shared hyper-parameters; later cells reuse these names.
learning_rate = 0.01
batch_size = 32
epochs = 40

model = BasicCNN()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# Keep only the first `batch_size` samples of each split, i.e. exactly one batch.
train_subset = torch.utils.data.Subset(train_dataset, range(batch_size))
test_subset = torch.utils.data.Subset(test_dataset, range(batch_size))
# NOTE(review): `epochs` is set to 40 above but this call hard-codes epochs=50 —
# confirm which value is intended for the overfit check.
trainer = ModelTrainer(model=model, train_dataset=train_subset, test_dataset=test_subset, optimizer=optimizer, experiment='overfit', epochs=50, batch_size=batch_size, seed=55, track=True)

trainer.run()

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/80], Loss: 5.7202, Accuracy: 0.1875
Validation - Epoch [1/80], Loss: 5.0559, Accuracy: 0.0938
Epoch [2/80], Loss: 4.8981, Accuracy: 0.0938
Validation - Epoch [2/80], Loss: 4.2257, Accuracy: 0.0312
Epoch [3/80], Loss: 3.4456, Accuracy: 0.1562
Validation - Epoch [3/80], Loss: 3.2324, Accuracy: 0.0625
Epoch [4/80], Loss: 2.5553, Accuracy: 0.2188
Validation - Epoch [4/80], Loss: 2.9350, Accuracy: 0.0938
Epoch [5/80], Loss: 2.1686, Accuracy: 0.3750
Validation - Epoch [5/80], Loss: 3.0562, Accuracy: 0.0938
Epoch [6/80], Loss: 2.0041, Accuracy: 0.2812
Validation - Epoch [6/80], Loss: 2.8917, Accuracy: 0.1250
Epoch [7/80], Loss: 1.9113, Accuracy: 0.3438
Validation - Epoch [7/80], Loss: 3.0723, Accuracy: 0.0938
Epoch [8/80], Loss: 1.8289, Accuracy: 0.3438
Validation - Epoch [8/80], Loss: 2.8553, Accuracy: 0.1250
Epoch [9/80], Loss: 1.7903, Accuracy: 0.4688
Validation - Epoch [9/80], Loss: 3.0675, Accuracy: 0.0938
Epoch [10/80], Loss: 1.6781, Accuracy: 0.4688
Validation - Epoch [10/80],

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▃▃▄▄▄▅▅▆▇▇▇▇██████████████████████████
train_loss,█▅▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▃▁▃▃▃▆███▆▆▆▆▃▃▃▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
validation_loss,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂

0,1
epoch,80.0
train_accuracy,1.0
train_loss,0.23978
validation_accuracy,0.125
validation_loss,3.04828


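Using only a small subset of the training data (a single batch of 32 images in the cell above) quickly shows that the BasicCNN model is overfitting, which makes sense given the training data size: the training accuracy reaches 100% while the validation accuracy stays close to chance level.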

On the held-out test subset we can see the consequences of the overfitting: the accuracy is poor, at about 12.5% in the run logged above.

## Training with SGD, without REG, without BN




### Weight Initialisation

Weight initialisation refers to the process of assigning initial values to the weights of the network's neurons. It can have a great impact on the learning process and performance of the model. 

Keywords:
- Vanishing Gradient
- Convergence speed
- Local Minima

Over the past years standards have established themselves based on the activation function in use.


#### todo this requires more love.... it is not as clear as I would like it to be.

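Fortunately PyTorch has good defaults: `Conv2d` and `Linear` layers are initialised with a Kaiming (He) uniform scheme out of the box. Note that this default is parameterised with `a=sqrt(5)`, i.e. the LeakyReLU form of the initialisation rather than the plain ReLU one.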

Source:
https://arc.net/l/quote/wfgutzqj

In [10]:
## START WRONG weight initialization

class OneWeightCNN(BasicCNN):
    """BasicCNN variant whose Conv2d and Linear weights are all set to 1.0."""

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__(input_channels=input_channels, num_classes=num_classes, layers=layers)

    def initialize_weights(self):
        """Fill the weight of every Conv2d and Linear sub-module with the constant 1.0."""
        weighted_types = (nn.Conv2d, nn.Linear)
        for module in self.modules():
            if not isinstance(module, weighted_types):
                continue
            nn.init.constant_(module.weight, 1.0)
            

In [11]:
model = OneWeightCNN()
# check if all weights are 1
model.initialize_weights()

# Validate that all weights are set to 1
for name, param in model.named_parameters():
    if "weight" in name:  # Ensure we're only checking weights, not biases
        assert torch.all(param == 1), f"{name} not all ones"
print("All weights verified as 1.")


optimizer = optim.SGD(model.parameters(), lr=learning_rate)

#todo enable tracking
trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', seed=55, track=True, run_name='OneWeightCNN')
trainer.run()

All weights verified as 1.


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/40], Loss: 9745.1190, Accuracy: 0.1003
Validation - Epoch [1/40], Loss: 2.3027, Accuracy: 0.1000
Epoch [2/40], Loss: 2.3027, Accuracy: 0.0995
Validation - Epoch [2/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [3/40], Loss: 2.3027, Accuracy: 0.0973
Validation - Epoch [3/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [4/40], Loss: 2.3027, Accuracy: 0.0975
Validation - Epoch [4/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [5/40], Loss: 2.3027, Accuracy: 0.0960
Validation - Epoch [5/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [6/40], Loss: 2.3027, Accuracy: 0.0992
Validation - Epoch [6/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [7/40], Loss: 2.3027, Accuracy: 0.0961
Validation - Epoch [7/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [8/40], Loss: 2.3027, Accuracy: 0.0982
Validation - Epoch [8/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [9/40], Loss: 2.3027, Accuracy: 0.0998
Validation - Epoch [9/40], Loss: 2.3026, Accuracy: 0.1000
Epoch [10/40], Loss: 2.3027, Accuracy: 0.0976
Validation - Epoch [10/4

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,█▇▄▄▂▆▂▅▇▄▅▆▂▇▅▄▄▅▃▆▁▄▃▁▂▅▃▅▅▃▇▇▇▄▄▄▂▄▅▆
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▂▁▁▁▂▁▂▂▁▁▁▂▂▂▂▁▁▃▁▁▁▁▁▁▁▂▁▂▂▂▁▂▂▂▁▁▁▂▁

0,1
epoch,40.0
train_accuracy,0.09864
train_loss,2.3027
validation_accuracy,0.1
validation_loss,2.3026


In [12]:
## Do RANDOM weight initialization
class RandomWeightCNN(BasicCNN):
    """BasicCNN variant with uniformly random weights in [-0.1, 0.1] and zero biases."""

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__(input_channels=input_channels, num_classes=num_classes, layers=layers)

    def initialize_weights(self):
        """Draw Conv2d/Linear weights from U(-0.1, 0.1) and zero any bias they have."""
        weighted_types = (nn.Conv2d, nn.Linear)
        for module in self.modules():
            if not isinstance(module, weighted_types):
                continue
            # Initialize weights with a uniform distribution
            nn.init.uniform_(module.weight, -0.1, 0.1)
            if module.bias is None:
                continue
            nn.init.constant_(module.bias, 0)

In [13]:
model = RandomWeightCNN()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', seed=55, track=True, run_name='RandomWeightCNN')
trainer.run()

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/40], Loss: 2.0966, Accuracy: 0.2390
Validation - Epoch [1/40], Loss: 1.8695, Accuracy: 0.3399
Epoch [2/40], Loss: 1.7547, Accuracy: 0.3708
Validation - Epoch [2/40], Loss: 1.6447, Accuracy: 0.4062
Epoch [3/40], Loss: 1.5437, Accuracy: 0.4488
Validation - Epoch [3/40], Loss: 1.4743, Accuracy: 0.4628
Epoch [4/40], Loss: 1.4151, Accuracy: 0.4988
Validation - Epoch [4/40], Loss: 1.4215, Accuracy: 0.4854
Epoch [5/40], Loss: 1.3342, Accuracy: 0.5267
Validation - Epoch [5/40], Loss: 1.3302, Accuracy: 0.5193
Epoch [6/40], Loss: 1.2777, Accuracy: 0.5504
Validation - Epoch [6/40], Loss: 1.3339, Accuracy: 0.5213
Epoch [7/40], Loss: 1.2310, Accuracy: 0.5652
Validation - Epoch [7/40], Loss: 1.3426, Accuracy: 0.5070
Epoch [8/40], Loss: 1.1926, Accuracy: 0.5813
Validation - Epoch [8/40], Loss: 1.2544, Accuracy: 0.5548
Epoch [9/40], Loss: 1.1583, Accuracy: 0.5933
Validation - Epoch [9/40], Loss: 1.2002, Accuracy: 0.5708
Epoch [10/40], Loss: 1.1283, Accuracy: 0.6064
Validation - Epoch [10/40],

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▃▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
train_loss,█▆▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▂▄▄▅▅▄▅▆▆▆▅▆▆▆▇▆▇▇▇▇▇▇▇▇█▇███▇███████▇█
validation_loss,█▆▅▅▄▄▄▃▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▁▁▂▁▁▁▂▂▁

0,1
epoch,40.0
train_accuracy,0.73996
train_loss,0.74036
validation_accuracy,0.6727
validation_loss,0.97857


In [14]:
## Do Default
model = BasicCNN()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', epochs=epochs, batch_size=batch_size, seed=55, track=True, run_name="HeWeightCNN")
trainer.run()

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/40], Loss: 1.8108, Accuracy: 0.3547
Validation - Epoch [1/40], Loss: 1.5663, Accuracy: 0.4370


Epoch [2/40], Loss: 1.4574, Accuracy: 0.4819
Validation - Epoch [2/40], Loss: 1.4797, Accuracy: 0.4646
Epoch [3/40], Loss: 1.3272, Accuracy: 0.5291
Validation - Epoch [3/40], Loss: 1.3726, Accuracy: 0.5052
Epoch [4/40], Loss: 1.2481, Accuracy: 0.5613
Validation - Epoch [4/40], Loss: 1.3166, Accuracy: 0.5284
Epoch [5/40], Loss: 1.1923, Accuracy: 0.5804
Validation - Epoch [5/40], Loss: 1.2498, Accuracy: 0.5517
Epoch [6/40], Loss: 1.1475, Accuracy: 0.5968
Validation - Epoch [6/40], Loss: 1.2736, Accuracy: 0.5483
Epoch [7/40], Loss: 1.1100, Accuracy: 0.6117
Validation - Epoch [7/40], Loss: 1.2665, Accuracy: 0.5484
Epoch [8/40], Loss: 1.0785, Accuracy: 0.6193
Validation - Epoch [8/40], Loss: 1.1687, Accuracy: 0.5888
Epoch [9/40], Loss: 1.0510, Accuracy: 0.6301
Validation - Epoch [9/40], Loss: 1.1788, Accuracy: 0.5748
Epoch [10/40], Loss: 1.0283, Accuracy: 0.6380
Validation - Epoch [10/40], Loss: 1.1064, Accuracy: 0.6066
Epoch [11/40], Loss: 1.0047, Accuracy: 0.6461
Validation - Epoch [11/40

In [None]:
class HeWeightCNN(BasicCNN):
    """BasicCNN variant that applies Kaiming-uniform (ReLU gain) init to Conv2d and Linear weights."""

    def __init__(self, input_channels=3, num_classes=10, layers=None):
        super().__init__(input_channels=input_channels, num_classes=num_classes, layers=layers)

    def initialize_weights(self):
        """Re-draw the weight of every Conv2d and Linear sub-module with kaiming_uniform_."""
        weighted_types = (nn.Conv2d, nn.Linear)
        for module in self.modules():
            if not isinstance(module, weighted_types):
                continue
            nn.init.kaiming_uniform_(module.weight, nonlinearity='relu')

In [None]:
model = HeWeightCNN()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

## todo Default 
trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='weight_init', seed=55, track=True, run_name='D')
trainer.run()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112408900064312, max=1.0…

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Epoch [1/40], Loss: 1.5761, Accuracy: 0.4736
Validation - Epoch [1/40], Loss: 1.3386, Accuracy: 0.5385
Epoch [2/40], Loss: 1.1850, Accuracy: 0.5867
Validation - Epoch [2/40], Loss: 1.1699, Accuracy: 0.5943
Epoch [3/40], Loss: 1.0765, Accuracy: 0.6278
Validation - Epoch [3/40], Loss: 1.1894, Accuracy: 0.5870
Epoch [4/40], Loss: 1.0116, Accuracy: 0.6496
Validation - Epoch [4/40], Loss: 1.1180, Accuracy: 0.6118
Epoch [5/40], Loss: 0.9619, Accuracy: 0.6713
Validation - Epoch [5/40], Loss: 1.0904, Accuracy: 0.6269
Epoch [6/40], Loss: 0.9269, Accuracy: 0.6812
Validation - Epoch [6/40], Loss: 1.0566, Accuracy: 0.6424
Epoch [7/40], Loss: 0.8975, Accuracy: 0.6920
Validation - Epoch [7/40], Loss: 1.1283, Accuracy: 0.6159
Epoch [8/40], Loss: 0.8715, Accuracy: 0.7008
Validation - Epoch [8/40], Loss: 1.0881, Accuracy: 0.6384
Epoch [9/40], Loss: 0.8516, Accuracy: 0.7074
Validation - Epoch [9/40], Loss: 1.0831, Accuracy: 0.6280
Epoch [10/40], Loss: 0.8303, Accuracy: 0.7133
Validation - Epoch [10/40],

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
train_loss,█▅▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▅▄▆▇█▆█▇▆█▇█▇▇▇▇██▇▇▇▇▇█▆▇▇▇▇▆▇▇▇▇▇▆▆▆▅
validation_loss,▇▃▄▂▂▁▂▂▂▃▁▂▂▂▂▂▂▂▂▃▃▃▄▃▃▅▄▄▅▅▆▅▅▅▅▅▅▇▆█

0,1
epoch,40.0
train_accuracy,0.8132
train_loss,0.54282
validation_accuracy,0.6014
validation_loss,1.41133


In [15]:
## plot wandb weight_init report

%wandb dmnkf/del/reports/weight_init

Given the results of the report we will change the default weight init of Conv2d and Linear Layers

### Hyperparameter Tuning
Für jedes Modell mit gegebener Anzahl Layer und Units pro Layer führe ein sorgfältiges Hyper-Parameter-Tuning durch. Untersuche, wie sich die das Training verändert bei unterschiedlicher Wahl für die Lernrate, in einer separaten Betrachtung auch für die Batch-Grösse. Achte stets darauf, dass das Training stabil läuft. Merke Dir bei jedem Training, den Loss, die Performance Metrik(en) inkl. Schätzfehler, die verwendete Anzahl Epochen, Lernrate und Batch-Grösse. Beachte: Keine Verfahren zur automatischen Hyperparameter-Suche (z.B. kein Bayesian und kein Random Parameter- Sweep Methoden) verwenden!

#### Learning Rate 



In [None]:
# Manual learning-rate sweep: one run per value, everything else held fixed.
learning_rates = [1, 0.1, 0.01, 0.001, 0.00001]

for lr in learning_rates:
    model = BasicCNN()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Name the run after the learning rate of THIS iteration (`lr`); using the
    # global `learning_rate` would give every run in the sweep the same name.
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='learning_rate', seed=55, track=True, run_name=f"BasicCNN_LR{lr}")
    trainer.run(validate=False)

### Batch Sizes

In [None]:
# Manual batch-size sweep: one run per value at the shared `learning_rate`.
batch_sizes = [1000, 64, 32, 16, 1]

for bs in batch_sizes:
    model = BasicCNN()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Name the run after the batch size of THIS iteration (`bs`); using the
    # global `batch_size` would give every run in the sweep the same name.
    trainer = ModelTrainer(model=model, train_dataset=train_dataset, test_dataset=test_dataset, optimizer=optimizer, experiment='batch_size', batch_size=bs, seed=55, track=True, run_name=f"BasicCNN_BS{bs}")
    trainer.run(validate=False)

## Complexity
Variiere die Anzahl Layer und Anzahl Units pro Layer, um eine möglichst gute Performance zu erreichen. Falls auch CNNs (ohne Transfer-Learning) verwendet werden variiere auch Anzahl Filter, Kernel-Grösse, Stride, Padding.

### Layers
To get started, we will look at the impact of different layers in our neural network model. We will experiment with varying the number of layers and observe how it affects the model's performance.


In [None]:

class CNN_3Conv(BasicCNN):
    """CNN with three conv -> ReLU -> 2x2 max-pool stages and a two-layer head.

    Keyword arguments are forwarded unchanged to ``BasicCNN.__init__``; the
    layer stack below reads ``self.input_channels`` and ``self.num_classes``
    afterwards, so the base class is expected to set both.

    The 3x3 / stride 1 / padding 1 convolutions keep the spatial size and each
    pool halves it, so a 32x32 input reaches ``Flatten`` as 128 x 4 x 4, which
    is what the first ``Linear`` layer expects.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Replaces whatever stack the base class built with this deeper one.
        self.layers = [
            nn.Conv2d(self.input_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 128),
            nn.ReLU(),
            nn.Linear(128, self.num_classes),
        ]
        self.features = nn.Sequential(*self.layers)


class CNN_4Conv(BasicCNN):
    """CNN with two (conv, ReLU, conv, ReLU, max-pool) blocks and a two-layer head.

    Keyword arguments are passed straight through to ``BasicCNN.__init__``.
    With two 2x2 pools a 32x32 input is reduced to 8x8 before flattening,
    matching the ``128 * 8 * 8`` input size of the first linear layer.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        def conv3x3(in_ch, out_ch):
            # Size-preserving 3x3 convolution used throughout this model.
            return nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1)

        # Modules are instantiated in network order so that weight
        # initialisation draws from the RNG in the same sequence.
        first_block = [
            conv3x3(self.input_channels, 32),
            nn.ReLU(),
            conv3x3(32, 64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        second_block = [
            conv3x3(64, 128),
            nn.ReLU(),
            conv3x3(128, 128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        classifier_head = [
            nn.Flatten(),
            nn.Linear(128 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, self.num_classes),
        ]

        self.layers = [*first_block, *second_block, *classifier_head]
        self.features = nn.Sequential(*self.layers)

class CNN_6Conv(BasicCNN):
    """CNN with three (conv, ReLU, conv, ReLU, max-pool) blocks and a two-layer head.

    Keyword arguments are forwarded unchanged to ``BasicCNN.__init__``; the
    layer stack reads ``self.input_channels`` and ``self.num_classes``
    afterwards, so the base class is expected to set both.

    All convolutions are 3x3 / stride 1 / padding 1 and therefore keep the
    spatial size. Each of the three 2x2 pools halves it, so a 32x32 input
    arrives at ``Flatten`` as 128 x 4 x 4 (32 -> 16 -> 8 -> 4).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.layers = [
            nn.Conv2d(self.input_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Flatten(),
            # Three pools leave a 4x4 feature map, so the flattened size is
            # 128 * 4 * 4; 128 * 8 * 8 would only fit a network with two pools.
            nn.Linear(128 * 4 * 4, 128),
            nn.ReLU(),
            nn.Linear(128, self.num_classes),
        ]
        self.features = nn.Sequential(*self.layers)


In [None]:
# Depth experiment: one tracked run per architecture, logged under the
# architecture's class name.
conv_models = {cls.__name__: cls for cls in (CNN_3Conv, CNN_4Conv, CNN_6Conv)}

for model_name, model_class in conv_models.items():
    model = model_class()
    # `learning_rate`, `epochs` and `batch_size` are taken from the notebook
    # namespace; they are not defined in this cell.
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    trainer = ModelTrainer(
        model=model,
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        optimizer=optimizer,
        experiment='conv_layers',
        epochs=epochs,
        batch_size=batch_size,
        seed=55,
        track=True,
        run_name=model_name,
    )
    trainer.run()

### Kernel Size


The kernel size in a convolution layer plays a significant role in determining the feature extraction capabilities of the layer. The kernel size refers to the width and height of the filter used in the convolution operation.

When the kernel size is increased, the convolution layer is able to capture more global or spatial information from the input. This can be beneficial when the important features in the data are spread out or when the data has a high degree of variability. However, a larger kernel size also means more parameters to learn, which can increase the computational complexity and the risk of overfitting.





### Stride


### Padding


### Regularization

Ziehe nun verschiedene Regularisierungsmethoden bei den MLP Layern in Betracht:
a. L1/L2 Weight Penalty
b. Dropout
Evaluiere den Nutzen der Regularisierung, auch unter Berücksichtigung verschiedener Regularisierungsstärken.
Beschreibe auch kurz, was allgemein das Ziel von Regularisierungsmethoden ist (Regularisierung im Allgemeinen, sowie auch Idee der einzelnen Methoden). Inwiefern wird dieses Ziel im gegebenen Fall erreicht?

In [None]:
# L2 regularization: the penalty is applied by the optimizer through
# `weight_decay`, so the unmodified BasicCNN architecture is used here.
model = BasicCNN(input_channels=3, num_classes=10)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    weight_decay=0.01,
)

trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    experiment='regularization',
    seed=55,
    track=True,
    run_name='BasicCNN_REGL2',
)
trainer.run()


In [None]:
import torch
import torch.nn as nn

class CNNRegL1(BasicCNN):
    """BasicCNN variant that carries an L1 regularization strength.

    The architecture is whatever ``BasicCNN`` builds from ``**kwargs``; this
    subclass only stores ``l1_strength`` and offers ``l1_penalty()``.
    """

    def __init__(self, l1_strength=0.0005, **kwargs):
        super().__init__(**kwargs)
        self.l1_strength = l1_strength

    def l1_penalty(self):
        """
        Return ``l1_strength`` times the summed absolute values of the weights.

        Only parameters that require gradients and have more than one
        dimension contribute, which leaves out 1-D parameters such as biases.
        """
        abs_weight_total = 0
        for param in self.parameters():
            # Skip frozen parameters and 1-D tensors (e.g. biases).
            if not param.requires_grad or param.dim() <= 1:
                continue
            abs_weight_total = abs_weight_total + param.abs().sum()
        return self.l1_strength * abs_weight_total

# L1 regularization run: no weight_decay on the optimizer; the penalty is
# provided by the model's own l1_penalty() method.
# NOTE(review): this cell never calls l1_penalty() itself — confirm that
# ModelTrainer adds it to the loss, otherwise this run is unregularized.
model = CNNRegL1(input_channels=3, num_classes=10, l1_strength=0.0005)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    experiment='regularization',
    seed=55,
    track=True,
    run_name='BasicCNN_REGL1',
)
trainer.run()


In [None]:
class DropoutCNN(BasicCNN):
    """Three-conv CNN with dropout in front of the single linear classifier.

    Args:
        *args, **kwargs: forwarded unchanged to ``BasicCNN.__init__``.
        dropout_rate: drop probability passed to ``nn.Dropout`` (default 0.5).

    The adaptive max-pool fixes the feature map at 8x8 regardless of the
    incoming spatial size, hence the ``128 * 8 * 8`` classifier input.
    """

    def __init__(self, *args, dropout_rate=0.5, **kwargs):
        super().__init__(*args, **kwargs)

        # Channel widths of consecutive conv layers: in -> 32 -> 64 -> 128.
        widths = (self.input_channels, 32, 64, 128)
        conv_stack = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            conv_stack.append(nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1))
            conv_stack.append(nn.ReLU())

        classifier = [
            nn.AdaptiveMaxPool2d((8, 8)),
            nn.Flatten(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(128 * 8 * 8, self.num_classes),
        ]

        self.layers = conv_stack + classifier
        self.features = nn.Sequential(*self.layers)
        
# Dropout run: 15 epochs at batch size 32 with a drop probability of 0.5.
model = DropoutCNN(input_channels=3, num_classes=10, dropout_rate=0.5)
optimizer = optim.SGD(model.parameters(), lr=0.01)

trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    experiment='regularization',
    epochs=15,
    batch_size=32,
    seed=55,
    track=True,
    run_name='DropoutCNN',
)
trainer.run(validate=True)


## Use of Batchnorm  



In [None]:
## todo common thing to do something completely different here I think, not sure anymore, sigmoid??
class BatchNormCNN(BasicCNN):
    """Three-conv CNN with batch normalization between each conv and its ReLU.

    Positional and keyword arguments are forwarded unchanged to
    ``BasicCNN.__init__``. A single 2x2 max-pool follows the conv stack, so a
    32x32 input is flattened from 128 x 16 x 16 into the linear classifier.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Channel widths of consecutive conv layers: in -> 32 -> 64 -> 128.
        widths = (self.input_channels, 32, 64, 128)
        conv_stack = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            conv_stack.extend([
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
            ])

        classifier = [
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(128 * 16 * 16, self.num_classes),
        ]

        self.layers = conv_stack + classifier
        self.features = nn.Sequential(*self.layers)

        

# Batch-norm run: plain SGD at lr=0.01, validation enabled.
model = BatchNormCNN(input_channels=3, num_classes=10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    experiment='batchnorm',
    seed=55,
    track=True,
    run_name='BatchNormCNN',
)
trainer.run(validate=True)


## Use of Adam



In [None]:
# Adam experiment: only the optimizer changes, so the unmodified BasicCNN
# architecture is reused.
model = BasicCNN(input_channels=3, num_classes=10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Give the tracked run an explicit name, like every other experiment in this
# notebook, so it can be identified among the logged runs.
trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    optimizer=optimizer,
    experiment='adam',
    seed=55,
    track=True,
    run_name='BasicCNN_Adam',
)
trainer.run(validate=True)
