# Multi-Task CNN on Fashion-MNIST

In [1]:
# Install dependencies first
!pip install wandb torch torchvision matplotlib

# Login to wandb
import wandb
wandb.login()



  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaicharanbakaram30[0m ([33msaicharanbakaram30-iiit-hyderabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Data Loading and Preprocessing

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
import numpy as np
import matplotlib.pyplot as plt
import wandb
from typing import Tuple, Dict, List

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)


class FashionMNISTDataset(Dataset):
    """Custom Dataset for Multi-Task Learning on Fashion-MNIST."""

    def __init__(self, images, labels, transform=None):
        """
        Args:
            images: tensor of images
            labels: tensor of class labels
            transform: optional transform to apply to images
        """
        self.images = images
        self.labels = labels
        self.transform = transform

        # Calculate ink_target (normalized pixel intensity)
        self.ink_targets = torch.mean(images.float() / 255.0, dim=(1, 2))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]

        if self.transform:
            image = self.transform(image)

        class_label = self.labels[idx]
        ink_target = self.ink_targets[idx]

        return image, class_label, ink_target


def load_fashion_data(batch_size=128, val_split_ratio=0.1, augment_train=True):
    """
    Load Fashion-MNIST dataset and create train/val/test splits.

    Args:
        batch_size: batch size for training data loader
        val_split_ratio: ratio of validation split (default 0.1 for 90/10 split)
        augment_train: whether to apply augmentations to training data

    Returns:
        train_loader, val_loader, test_loader, mean, std
    """
    # Download full dataset
    full_train = FashionMNIST(root='./data', train=True, download=True)
    test_dataset = FashionMNIST(root='./data', train=False, download=True)

    # Split train into train (90%) and val (10%)
    train_size = int((1 - val_split_ratio) * len(full_train))
    val_size = len(full_train) - train_size

    indices = torch.randperm(len(full_train))
    train_indices = indices[:train_size]
    val_indices = indices[train_size:]

    # Calculate mean and std from training data
    train_images = full_train.data[train_indices].float()
    mean = train_images.mean() / 255.0
    std = train_images.std() / 255.0

    # Define transforms
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(28, padding=2) if augment_train else transforms.Lambda(lambda x: x),
        transforms.RandomRotation(5) if augment_train else transforms.Lambda(lambda x: x),
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    val_test_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        transforms.Normalize((mean,), (std,))
    ])

    # Create datasets
    train_dataset = FashionMNISTDataset(
        full_train.data[train_indices],
        full_train.targets[train_indices],
        train_transform
    )

    val_dataset = FashionMNISTDataset(
        full_train.data[val_indices],
        full_train.targets[val_indices],
        val_test_transform
    )

    test_dataset = FashionMNISTDataset(
        test_dataset.data,
        test_dataset.targets,
        val_test_transform
    )

    # Create dataloaders with dynamic batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size * 2, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=batch_size * 2, shuffle=False, num_workers=2)

    return train_loader, val_loader, test_loader, mean.item(), std.item()




## Model Implementation: Multi-Task CNN

In [3]:
class MultiTaskCNN(nn.Module):
    """Multi-Task CNN with shared backbone and separate heads."""

    def __init__(self, num_classes=10, num_conv_layers=3, out_channels=[32, 64, 128],
                 dropout_rate=0.3, kernel_size=3, padding=1):
        """
        Args:
            num_classes: number of classification classes
            num_conv_layers: number of convolutional blocks in the backbone
            out_channels: list of output channels for each conv layer
            dropout_rate: dropout probability
            kernel_size: kernel size for all conv layers
            padding: padding for all conv layers
        """
        super(MultiTaskCNN, self).__init__()

        assert len(out_channels) == num_conv_layers, \
            f"Length of out_channels ({len(out_channels)}) must equal num_conv_layers ({num_conv_layers})"

        self.num_conv_layers = num_conv_layers
        self.out_channels = out_channels

        # Dynamically create shared convolutional backbone
        self.conv_layers = nn.ModuleList()
        self.bn_layers = nn.ModuleList()
        self.pool_layers = nn.ModuleList()

        in_channels = 1  # Fashion-MNIST is grayscale
        for i in range(num_conv_layers):
            self.conv_layers.append(
                nn.Conv2d(in_channels, out_channels[i], kernel_size=kernel_size, padding=padding)
            )
            self.bn_layers.append(nn.BatchNorm2d(out_channels[i]))
            self.pool_layers.append(nn.MaxPool2d(2, 2))
            in_channels = out_channels[i]

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

        # Calculate the flattened feature size dynamically
        self.flattened_size = self._get_flattened_size()

        # Classification head
        self.fc_class = nn.Linear(self.flattened_size, 256)
        self.bn_class = nn.BatchNorm1d(256)
        self.fc_class_out = nn.Linear(256, num_classes)

        # Regression head
        self.fc_reg = nn.Linear(self.flattened_size, 128)
        self.bn_reg = nn.BatchNorm1d(128)
        self.fc_reg_out = nn.Linear(128, 1)

        self.feature_maps = []

    def _get_flattened_size(self):
        """
        Calculate the flattened feature size after conv layers by passing a dummy input.
        """
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, 28, 28)  # Fashion-MNIST input size
            x = dummy_input

            for i in range(self.num_conv_layers):
                x = self.conv_layers[i](x)
                x = self.bn_layers[i](x)
                x = self.relu(x)
                x = self.pool_layers[i](x)

            return x.view(1, -1).size(1)

    def forward(self, x, return_features=False):
        """
        Forward pass through the network.

        Args:
            x: input tensor
            return_features: if True, return intermediate feature maps

        Returns:
            classification logits, regression predictions, (optional) feature maps
        """
        self.feature_maps = []

        # Shared backbone - dynamically iterate through layers
        for i in range(self.num_conv_layers):
            x = self.conv_layers[i](x)
            x = self.bn_layers[i](x)
            x = self.relu(x)
            x = self.pool_layers[i](x)

            if return_features:
                self.feature_maps.append(x.detach())

        x = self.dropout(x)

        # Flatten
        x = x.view(x.size(0), -1)

        # Classification head
        x_class = self.fc_class(x)
        x_class = self.bn_class(x_class)
        x_class = self.relu(x_class)
        logits = self.fc_class_out(x_class)

        # Regression head
        x_reg = self.fc_reg(x)
        x_reg = self.bn_reg(x_reg)
        x_reg = self.relu(x_reg)
        regression = self.fc_reg_out(x_reg).squeeze()

        if return_features:
            return logits, regression, self.feature_maps
        return logits, regression


def compute_loss(logits, class_labels, reg_preds, ink_targets, lambda1, lambda2):
    """
    Compute joint loss: L = λ1*L_classification + λ2*L_regression

    Args:
        logits: classification predictions
        class_labels: true class labels
        reg_preds: regression predictions
        ink_targets: true ink values
        lambda1: weight for classification loss
        lambda2: weight for regression loss

    Returns:
        total_loss, ce_loss, mse_loss
    """
    ce_loss = nn.CrossEntropyLoss()(logits, class_labels)
    mse_loss = nn.MSELoss()(reg_preds, ink_targets)
    total_loss = lambda1 * ce_loss + lambda2 * mse_loss

    return total_loss, ce_loss.item(), mse_loss.item()



## Hyperparameter Tuning and wandb Logging

In [4]:
from tqdm.auto import tqdm

def train_epoch(model, loader, optimizer, device, lambda1, lambda2):
    """Train for one epoch."""
    model.train()
    total_loss_sum, total_ce_sum, total_mse_sum = 0, 0, 0
    correct = 0
    total_samples = 0

    for images, class_labels, ink_targets in loader:
        images = images.to(device)
        class_labels = class_labels.to(device)
        ink_targets = ink_targets.to(device)

        optimizer.zero_grad()
        logits, reg_preds = model(images)

        loss, ce, mse = compute_loss(logits, class_labels, reg_preds, ink_targets, lambda1, lambda2)
        loss.backward()
        optimizer.step()

        batch_size = class_labels.size(0)
        total_samples += batch_size

        total_loss_sum += loss.item() * batch_size
        total_ce_sum += ce * batch_size
        total_mse_sum += mse * batch_size

        _, predicted = torch.max(logits, 1)
        correct += (predicted == class_labels).sum().item()

    return {
        'loss': total_loss_sum / total_samples,
        'ce_loss': total_ce_sum / total_samples,
        'mse_loss': total_mse_sum / total_samples,
        'accuracy': 100 * correct / total_samples
    }


def validate(model, loader, device, lambda1, lambda2):
    """Validate the model."""
    model.eval()
    total_loss_sum, total_ce_sum, total_mse_sum = 0, 0, 0
    correct = 0
    total_samples = 0
    mae_sum = 0

    with torch.no_grad():
        for images, class_labels, ink_targets in loader:
            images = images.to(device)
            class_labels = class_labels.to(device)
            ink_targets = ink_targets.to(device)

            logits, reg_preds = model(images)
            loss, ce, mse = compute_loss(logits, class_labels, reg_preds, ink_targets, lambda1, lambda2)

            batch_size = class_labels.size(0)
            total_samples += batch_size

            total_loss_sum += loss.item() * batch_size
            total_ce_sum += ce * batch_size
            total_mse_sum += mse * batch_size

            _, predicted = torch.max(logits, 1)
            correct += (predicted == class_labels).sum().item()

            mae_sum += torch.abs(reg_preds - ink_targets).sum().item()

    return {
        'loss': total_loss_sum / total_samples,
        'ce_loss': total_ce_sum / total_samples,
        'mse_loss': total_mse_sum / total_samples,
        'accuracy': 100 * correct / total_samples,
        'mae': mae_sum / total_samples,
        'rmse': np.sqrt(total_mse_sum / total_samples)
    }


def train_model(model, train_loader, val_loader, config, device):
    """
    Train the multi-task model.

    Args:
        model: the neural network
        train_loader: training data loader
        val_loader: validation data loader
        config: dictionary with hyperparameters
        device: torch device

    Returns:
        history: dictionary with training history
    """
    # Dynamically select optimizer based on config
    optimizer_name = config.get('optimizer', 'Adam')
    if optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    elif optimizer_name == 'AdamW':
        optimizer = optim.AdamW(model.parameters(), lr=config['lr'])
    elif optimizer_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    history = {
        'train_loss': [], 'train_ce': [], 'train_mse': [], 'train_acc': [],
        'val_loss': [], 'val_ce': [], 'val_mse': [], 'val_acc': [], 'val_mae': [], 'val_rmse': []
    }

    epoch_bar = tqdm(range(config['epochs']), desc="Overall Progress", leave=True)

    for epoch in epoch_bar:
        train_metrics = train_epoch(model, train_loader, optimizer, device,
                                    config['lambda1'], config['lambda2'])
        val_metrics = validate(model, val_loader, device,
                              config['lambda1'], config['lambda2'])

        history['train_loss'].append(train_metrics['loss'])
        history['train_ce'].append(train_metrics['ce_loss'])
        history['train_mse'].append(train_metrics['mse_loss'])
        history['train_acc'].append(train_metrics['accuracy'])

        history['val_loss'].append(val_metrics['loss'])
        history['val_ce'].append(val_metrics['ce_loss'])
        history['val_mse'].append(val_metrics['mse_loss'])
        history['val_acc'].append(val_metrics['accuracy'])
        history['val_mae'].append(val_metrics['mae'])
        history['val_rmse'].append(val_metrics['rmse'])

        # if (epoch + 1) % 5 == 0:
        #     print(f"Epoch {epoch+1}/{config['epochs']}: "
        #           f"Val Acc: {val_metrics['accuracy']:.2f}%, "
        #           f"Val RMSE: {val_metrics['rmse']:.4f}")

        epoch_bar.set_postfix(
            Val_Acc=f"{val_metrics['accuracy']:.2f}%",
            Val_RMSE=f"{val_metrics['rmse']:.4f}"
        )

        # Log to wandb
        wandb.log({
            'epoch': epoch + 1,
            'train/loss': train_metrics['loss'],
            'train/ce_loss': train_metrics['ce_loss'],
            'train/mse_loss': train_metrics['mse_loss'],
            'train/accuracy': train_metrics['accuracy'],
            'val/loss': val_metrics['loss'],
            'val/ce_loss': val_metrics['ce_loss'],
            'val/mse_loss': val_metrics['mse_loss'],
            'val/accuracy': val_metrics['accuracy'],
            'val/mae': val_metrics['mae'],
            'val/rmse': val_metrics['rmse']
        })

    return history


def plot_training_curves(history, run_name):
    """Plot training and validation curves."""
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Total loss
    axes[0, 0].plot(history['train_loss'], label='Train Loss')
    axes[0, 0].plot(history['val_loss'], label='Val Loss')
    axes[0, 0].set_xlabel('Epoch')
    axes[0, 0].set_ylabel('Total Loss')
    axes[0, 0].set_title(f'{run_name} - Total Loss')
    axes[0, 0].legend()
    axes[0, 0].grid(True)

    # Classification loss
    axes[0, 1].plot(history['train_ce'], label='Train CE')
    axes[0, 1].plot(history['val_ce'], label='Val CE')
    axes[0, 1].set_xlabel('Epoch')
    axes[0, 1].set_ylabel('Cross-Entropy Loss')
    axes[0, 1].set_title(f'{run_name} - Classification Loss')
    axes[0, 1].legend()
    axes[0, 1].grid(True)

    # Regression loss
    axes[1, 0].plot(history['train_mse'], label='Train MSE')
    axes[1, 0].plot(history['val_mse'], label='Val MSE')
    axes[1, 0].set_xlabel('Epoch')
    axes[1, 0].set_ylabel('MSE Loss')
    axes[1, 0].set_title(f'{run_name} - Regression Loss')
    axes[1, 0].legend()
    axes[1, 0].grid(True)

    # Accuracy
    axes[1, 1].plot(history['train_acc'], label='Train Acc')
    axes[1, 1].plot(history['val_acc'], label='Val Acc')
    axes[1, 1].set_xlabel('Epoch')
    axes[1, 1].set_ylabel('Accuracy (%)')
    axes[1, 1].set_title(f'{run_name} - Classification Accuracy')
    axes[1, 1].legend()
    axes[1, 1].grid(True)

    plt.tight_layout()
    fig.text(
        0.95, 0.95, "bakaram.charan",
        ha='right', va='top',
        fontsize=10, color='gray', alpha=0.7
    )
    wandb.log({f"{run_name}_training_curves": wandb.Image(fig)})
    plt.close()

def visualize_feature_maps(model, test_loader, device):
    """Visualize feature maps from intermediate layers with enhanced presentation."""
    model.eval()

    # Get 3 test images
    images, labels, _ = next(iter(test_loader))
    images = images[:3].to(device)
    labels = labels[:3].cpu()

    with torch.no_grad():
        _, _, feature_maps = model(images, return_features=True)

    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    for img_idx in range(3):
        num_layers = len(feature_maps)

        # Create figure with better layout and more vertical space
        fig = plt.figure(figsize=(5 * (num_layers + 1), 8))
        gs = fig.add_gridspec(2, num_layers + 1, hspace=0.5, wspace=0.35,
                             top=0.92, bottom=0.05, left=0.05, right=0.98)

        # Original image (spans both rows)
        ax_input = fig.add_subplot(gs[:, 0])
        img = images[img_idx].cpu().squeeze()

        # Denormalize for better visualization
        img_display = img.numpy()
        img_display = (img_display - img_display.min()) / (img_display.max() - img_display.min()+ 1e-8)

        ax_input.imshow(img_display, cmap='gray', interpolation='bilinear')
        ax_input.set_title(f'Input Image\n{class_names[labels[img_idx]]}',
                          fontsize=13, fontweight='bold', pad=15)
        ax_input.axis('off')

        # Add border around input
        rect = plt.Rectangle((0, 0), img_display.shape[1]-1, img_display.shape[0]-1,
                            fill=False, edgecolor='blue', linewidth=3)
        ax_input.add_patch(rect)

        # Visualize feature maps from each layer
        for layer_idx in range(num_layers):
            fmap = feature_maps[layer_idx][img_idx]
            num_channels = fmap.shape[0]

            # Top row: Average activation across all channels
            ax_avg = fig.add_subplot(gs[0, layer_idx + 1])
            fmap_avg = fmap.mean(0).cpu().numpy()

            # Normalize for better visualization
            fmap_avg = (fmap_avg - fmap_avg.min()) / (fmap_avg.max() - fmap_avg.min() + 1e-8)

            im1 = ax_avg.imshow(fmap_avg, cmap='jet', interpolation='bilinear')
            ax_avg.set_title(f'Layer {layer_idx + 1}\nAvg Activation\n({num_channels} ch)',
                           fontsize=11, fontweight='bold', pad=10)
            ax_avg.axis('off')
            plt.colorbar(im1, ax=ax_avg, fraction=0.046, pad=0.04)

            # Bottom row: Max activation across channels (shows strongest features)
            ax_max = fig.add_subplot(gs[1, layer_idx + 1])
            fmap_max, _ = fmap.max(0)
            fmap_max = fmap_max.cpu().numpy()

            # Normalize for better visualization
            fmap_max = (fmap_max - fmap_max.min()) / (fmap_max.max() - fmap_max.min() + 1e-8)

            im2 = ax_max.imshow(fmap_max, cmap='plasma', interpolation='bilinear')
            ax_max.set_title(f'Max Activation\nShape: {fmap.shape[1]}×{fmap.shape[2]}',
                           fontsize=11, fontweight='bold', pad=10)
            ax_max.axis('off')
            plt.colorbar(im2, ax=ax_max, fraction=0.046, pad=0.04)

        fig.text(
            0.95, 0.95, "bakaram.charan",
            ha='right', va='top',
            fontsize=10, color='gray', alpha=0.7
        )
        wandb.log({f"feature_maps_image_{img_idx + 1}": wandb.Image(fig)})
        plt.close()

def run_experiment(config, device):
    """Run a single experiment with given configuration."""
    # Create a descriptive run name
    run_name = f"layers={config['num_conv_layers']}_λ1={config['lambda1']}_λ2={config['lambda2']}_opt={config['optimizer']}_bs={config['batch_size']}"

    wandb.init(
        project="multitask-fashion-mnist",
        name=run_name,
        config=config,
        reinit=True
    )

    print(f"\n{'='*60}")
    print(f"Running: {run_name}")
    print(f"{'='*60}")

    # Load data with batch size from config
    print(f"Loading data with batch_size={config['batch_size']}...")
    train_loader, val_loader, test_loader, mean, std = load_fashion_data(
        batch_size=config['batch_size'],
        augment_train=True
    )
    print(f"Dataset loaded. Mean: {mean:.4f}, Std: {std:.4f}")

    # Create model with parameters from config
    model = MultiTaskCNN(
        num_classes=10,
        num_conv_layers=config['num_conv_layers'],
        out_channels=config['out_channels'],
        dropout_rate=config['dropout']
    ).to(device)

    print(f"Model created with {config['num_conv_layers']} conv layers: {config['out_channels']}")
    print(f"Flattened feature size: {model.flattened_size}")

    history = train_model(model, train_loader, val_loader, config, device)
    plot_training_curves(history, run_name)

    # Evaluate on test set
    test_metrics = validate(model, test_loader, device, config['lambda1'], config['lambda2'])
    print(f"\nTest Results:")
    print(f"  Accuracy: {test_metrics['accuracy']:.2f}%")
    print(f"  RMSE: {test_metrics['rmse']:.4f}")
    print(f"  MAE: {test_metrics['mae']:.4f}")

    wandb.log({
        'test/accuracy': test_metrics['accuracy'],
        'test/rmse': test_metrics['rmse'],
        'test/mae': test_metrics['mae']
    })

    # Visualize feature maps
    visualize_feature_maps(model, test_loader, device)

    wandb.finish()

    return model, history, test_metrics

## Main Analysis

In [5]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define diverse experimental configurations
configs = [
    # Experiment 1: shallow model - 2 layers with Adam optimizer
    {
        'optimizer': 'Adam',
        'lr': 0.001,
        'dropout': 0.3,
        'batch_size': 128,
        'num_conv_layers': 2,
        'out_channels': [32, 64],
        'lambda1': 1.0,
        'lambda2': 1.0,
        'epochs': 30
    },
    # Experiment 2: Shallow model - 2 layers with SGD optimizer
    {
        'optimizer': 'SGD',
        'lr': 0.001,
        'dropout': 0.3,
        'batch_size': 32,
        'num_conv_layers': 2,
        'out_channels': [64, 128],
        'lambda1': 1.2,
        'lambda2': 1.0,
        'epochs': 30
    },
    # Experiment 3: Classification-focused (higher λ1)
    {
        'optimizer': 'Adam',
        'lr': 0.001,
        'dropout': 0.5,
        'batch_size': 128,
        'num_conv_layers': 3,
        'out_channels': [32, 64, 128],
        'lambda1': 2.0,
        'lambda2': 0.5,
        'epochs': 30
    },
    # Experiment 4: Regression-focused (higher λ2)
    {
        'optimizer': 'Adam',
        'lr': 0.01,
        'dropout': 0.3,
        'batch_size': 128,
        'num_conv_layers': 3,
        'out_channels': [32, 64, 128],
        'lambda1': 0.5,
        'lambda2': 2.0,
        'epochs': 30
    },
    # Experiment 5: AdamW optimizer with different batch size
    {
        'optimizer': 'AdamW',
        'lr': 0.001,
        'dropout': 0.5,
        'batch_size': 64,
        'num_conv_layers': 3,
        'out_channels': [32, 64, 128],
        'lambda1': 1.0,
        'lambda2': 1.0,
        'epochs': 30
    },
    # Experiment 6: Wider model with more channels
    {
        'optimizer': 'Adam',
        'lr': 0.0005,
        'dropout': 0.4,
        'batch_size': 128,
        'num_conv_layers': 3,
        'out_channels': [64, 128, 256],
        'lambda1': 1.0,
        'lambda2': 1.0,
        'epochs': 30
    }
]

results = []

Using device: cuda


### Experiment 1

In [6]:
print(f"\n{'#'*60}")
print(f"Experiment {1}/{len(configs)}")
print(f"{'#'*60}")

model, history, test_metrics = run_experiment(configs[0], device)

results.append({
    'config': configs[0],
    'test_accuracy': test_metrics['accuracy'],
    'test_rmse': test_metrics['rmse'],
    'val_accuracy': history['val_acc'][-1],
    'val_rmse': history['val_rmse'][-1],
    'model': model
})


############################################################
Experiment 1/6
############################################################





Running: layers=2_λ1=1.0_λ2=1.0_opt=Adam_bs=128
Loading data with batch_size=128...


100%|██████████| 26.4M/26.4M [00:02<00:00, 11.8MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 203kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 3.73MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 2.35MB/s]


Dataset loaded. Mean: 0.2856, Std: 0.3528
Model created with 2 conv layers: [32, 64]
Flattened feature size: 3136


Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Test Results:
  Accuracy: 92.72%
  RMSE: 0.0121
  MAE: 0.0094


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test/accuracy,▁
test/mae,▁
test/rmse,▁
train/accuracy,▁▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇███████████
train/ce_loss,█▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/mse_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/accuracy,▁▂▄▅▅▆▅▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇███▇▇
val/ce_loss,█▇▅▄▃▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▂▁▁▁▁▁

0,1
epoch,30
test/accuracy,92.72
test/mae,0.00938
test/rmse,0.01206
train/accuracy,93.30741
train/ce_loss,0.17695
train/loss,0.17727
train/mse_loss,0.00032
val/accuracy,93.23333
val/ce_loss,0.18749


### Experiment 2

In [7]:
print(f"\n{'#'*60}")
print(f"Experiment {2}/{len(configs)}")
print(f"{'#'*60}")

model, history, test_metrics = run_experiment(configs[1], device)

results.append({
    'config': configs[1],
    'test_accuracy': test_metrics['accuracy'],
    'test_rmse': test_metrics['rmse'],
    'val_accuracy': history['val_acc'][-1],
    'val_rmse': history['val_rmse'][-1],
    'model': model
})


############################################################
Experiment 2/6
############################################################



Running: layers=2_λ1=1.2_λ2=1.0_opt=SGD_bs=32
Loading data with batch_size=32...
Dataset loaded. Mean: 0.2859, Std: 0.3529
Model created with 2 conv layers: [64, 128]
Flattened feature size: 6272


Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Test Results:
  Accuracy: 92.58%
  RMSE: 0.0263
  MAE: 0.0237


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test/accuracy,▁
test/mae,▁
test/rmse,▁
train/accuracy,▁▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇█████████
train/ce_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/mse_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/accuracy,▁▄▄▄▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇████
val/ce_loss,█▅▄▅▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁

0,1
epoch,30
test/accuracy,92.58
test/mae,0.02373
test/rmse,0.02635
train/accuracy,92.07222
train/ce_loss,0.21601
train/loss,0.26014
train/mse_loss,0.00093
val/accuracy,93.43333
val/ce_loss,0.18


### Experiment 3

In [8]:
print(f"\n{'#'*60}")
print(f"Experiment {3}/{len(configs)}")
print(f"{'#'*60}")

model, history, test_metrics = run_experiment(configs[2], device)

results.append({
    'config': configs[2],
    'test_accuracy': test_metrics['accuracy'],
    'test_rmse': test_metrics['rmse'],
    'val_accuracy': history['val_acc'][-1],
    'val_rmse': history['val_rmse'][-1],
    'model': model
})


############################################################
Experiment 3/6
############################################################



Running: layers=3_λ1=2.0_λ2=0.5_opt=Adam_bs=128
Loading data with batch_size=128...
Dataset loaded. Mean: 0.2858, Std: 0.3530
Model created with 3 conv layers: [32, 64, 128]
Flattened feature size: 1152


Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Test Results:
  Accuracy: 92.74%
  RMSE: 0.0225
  MAE: 0.0173


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test/accuracy,▁
test/mae,▁
test/rmse,▁
train/accuracy,▁▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇██████████
train/ce_loss,█▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/mse_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/accuracy,▁▂▄▄▄▆▆▆▆▆▆▇▆▇▆▇▆▇▇▇█▇▇▅██████
val/ce_loss,█▆▅▅▅▃▃▃▂▃▂▂▃▂▃▂▃▂▂▁▂▂▁▄▁▁▁▂▁▁

0,1
epoch,30
test/accuracy,92.74
test/mae,0.01732
test/rmse,0.02247
train/accuracy,92.48148
train/ce_loss,0.20281
train/loss,0.40615
train/mse_loss,0.00105
val/accuracy,93.46667
val/ce_loss,0.17659


### Experiment 4

In [9]:
print(f"\n{'#'*60}")
print(f"Experiment {4}/{len(configs)}")
print(f"{'#'*60}")

model, history, test_metrics = run_experiment(configs[3], device)

results.append({
    'config': configs[3],
    'test_accuracy': test_metrics['accuracy'],
    'test_rmse': test_metrics['rmse'],
    'val_accuracy': history['val_acc'][-1],
    'val_rmse': history['val_rmse'][-1],
    'model': model
})


############################################################
Experiment 4/6
############################################################



Running: layers=3_λ1=0.5_λ2=2.0_opt=Adam_bs=128
Loading data with batch_size=128...
Dataset loaded. Mean: 0.2857, Std: 0.3528
Model created with 3 conv layers: [32, 64, 128]
Flattened feature size: 1152


Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Test Results:
  Accuracy: 92.21%
  RMSE: 0.0171
  MAE: 0.0132


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test/accuracy,▁
test/mae,▁
test/rmse,▁
train/accuracy,▁▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
train/ce_loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/mse_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/accuracy,▁▄▄▆▅▅▇▆▇▇▆▇▇▇▇▆▇▇██▇██▇█▇███▇
val/ce_loss,█▅▆▄▃▄▂▃▂▂▃▂▂▂▂▂▁▂▁▁▂▁▂▁▁▂▁▁▁▁

0,1
epoch,30
test/accuracy,92.21
test/mae,0.01316
test/rmse,0.01714
train/accuracy,92.54259
train/ce_loss,0.20043
train/loss,0.10144
train/mse_loss,0.00061
val/accuracy,92.1
val/ce_loss,0.21043


### Experiment 5

In [10]:
print(f"\n{'#'*60}")
print(f"Experiment {5}/{len(configs)}")
print(f"{'#'*60}")

model, history, test_metrics = run_experiment(configs[4], device)

results.append({
    'config': configs[4],
    'test_accuracy': test_metrics['accuracy'],
    'test_rmse': test_metrics['rmse'],
    'val_accuracy': history['val_acc'][-1],
    'val_rmse': history['val_rmse'][-1],
    'model': model
})


############################################################
Experiment 5/6
############################################################



Running: layers=3_λ1=1.0_λ2=1.0_opt=AdamW_bs=64
Loading data with batch_size=64...
Dataset loaded. Mean: 0.2860, Std: 0.3530
Model created with 3 conv layers: [32, 64, 128]
Flattened feature size: 1152


Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Test Results:
  Accuracy: 92.58%
  RMSE: 0.0201
  MAE: 0.0156


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test/accuracy,▁
test/mae,▁
test/rmse,▁
train/accuracy,▁▄▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███████████
train/ce_loss,█▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/mse_loss,█▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/accuracy,▁▂▄▅▅▅▆▆▇▇▆▇▆▆▇▆▇███▇█▇▇██████
val/ce_loss,█▆▄▄▃▄▃▃▂▂▂▂▃▃▂▂▂▂▂▁▁▁▂▂▁▁▁▁▁▁

0,1
epoch,30
test/accuracy,92.58
test/mae,0.01559
test/rmse,0.02014
train/accuracy,92.29259
train/ce_loss,0.20635
train/loss,0.20732
train/mse_loss,0.00097
val/accuracy,92.85
val/ce_loss,0.18345


### Experiment 6

In [11]:
print(f"\n{'#'*60}")
print(f"Experiment {6}/{len(configs)}")
print(f"{'#'*60}")

model, history, test_metrics = run_experiment(configs[5], device)

results.append({
    'config': configs[5],
    'test_accuracy': test_metrics['accuracy'],
    'test_rmse': test_metrics['rmse'],
    'val_accuracy': history['val_acc'][-1],
    'val_rmse': history['val_rmse'][-1],
    'model': model
})


############################################################
Experiment 6/6
############################################################



Running: layers=3_λ1=1.0_λ2=1.0_opt=Adam_bs=128
Loading data with batch_size=128...
Dataset loaded. Mean: 0.2863, Std: 0.3532
Model created with 3 conv layers: [64, 128, 256]
Flattened feature size: 2304


Overall Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Test Results:
  Accuracy: 93.37%
  RMSE: 0.0170
  MAE: 0.0134


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
test/accuracy,▁
test/mae,▁
test/rmse,▁
train/accuracy,▁▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████
train/ce_loss,█▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train/mse_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/accuracy,▁▃▄▅▅▅▅▆▆▆▇▆▆▇▇▇▇▆▇▇▇█▇██▇▇█▇█
val/ce_loss,█▅▆▄▄▃▃▃▃▃▂▂▂▂▂▂▁▂▂▁▂▁▁▁▁▂▁▁▁▁

0,1
epoch,30
test/accuracy,93.37
test/mae,0.01339
test/rmse,0.01697
train/accuracy,94.32407
train/ce_loss,0.15416
train/loss,0.15475
train/mse_loss,0.00059
val/accuracy,93.46667
val/ce_loss,0.17786


In [12]:
# Model selection
print(f"\n{'='*60}")
print("MODEL SELECTION RESULTS")
print(f"{'='*60}")

best_val_acc_idx = max(range(len(results)), key=lambda i: results[i]['val_accuracy'])
best_val_rmse_idx = min(range(len(results)), key=lambda i: results[i]['val_rmse'])

print(f"\nBest Classification Model as per validation (Run {best_val_acc_idx + 1}):")
print(f"  Config: {configs[best_val_acc_idx]}")
print(f"  Test Accuracy: {results[best_val_acc_idx]['test_accuracy']:.2f}%")
print(f"  Test RMSE: {results[best_val_acc_idx]['test_rmse']:.4f}")

print(f"\nBest Regression Model as per validation (Run {best_val_rmse_idx + 1}):")
print(f"  Config: {configs[best_val_rmse_idx]}")
print(f"  Test Accuracy: {results[best_val_rmse_idx]['test_accuracy']:.2f}%")
print(f"  Test RMSE: {results[best_val_rmse_idx]['test_rmse']:.4f}")


MODEL SELECTION RESULTS

Best Classification Model as per validation (Run 3):
  Config: {'optimizer': 'Adam', 'lr': 0.001, 'dropout': 0.5, 'batch_size': 128, 'num_conv_layers': 3, 'out_channels': [32, 64, 128], 'lambda1': 2.0, 'lambda2': 0.5, 'epochs': 30}
  Test Accuracy: 92.74%
  Test RMSE: 0.0225

Best Regression Model as per validation (Run 1):
  Config: {'optimizer': 'Adam', 'lr': 0.001, 'dropout': 0.3, 'batch_size': 128, 'num_conv_layers': 2, 'out_channels': [32, 64], 'lambda1': 1.0, 'lambda2': 1.0, 'epochs': 30}
  Test Accuracy: 92.72%
  Test RMSE: 0.0121
