# Learning Rate Scheduler Sweep (Central Baseline)

This notebook compares different LR schedulers to optimize the **Central Baseline**.

**Configuration:**
- **Model:** DINO ViT-S/16
- **Strategy:** Fine-tune backbone (`finetune_all`), Frozen Head
- **Optimizer:** SGDM (lr=1e-5, momentum=0.9, wd=1e-4)
- **Epochs:** 16 (fewer than the actual baseline, for faster testing)

**Schedulers tested:**
1. Cosine Annealing
2. Step LR
3. Exponential LR
4. ReduceLROnPlateau

In [None]:
# Clone Repository & Install Dependencies
# Fetch the project sources, then make the fresh checkout the working directory.
!git clone https://github.com/emanueleR3/AML-Project-2.git
%cd AML-Project-2
# Install everything listed in the repository's requirements file (-q: quiet output).
!pip install -q -r requirements.txt
# Also install the libraries named below explicitly.
!pip install torch torchvision numpy matplotlib tqdm pandas

In [None]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    StepLR,
    ExponentialLR,
    ReduceLROnPlateau
)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Put the current working directory on the import path *before* importing the
# project's `src` package, so the path entry can actually take part in
# resolving those imports. The membership check avoids piling up duplicate
# entries when the cell is re-run.
if '.' not in sys.path:
    sys.path.append('.')

from src.utils import set_seed, get_device, ensure_dir, save_metrics_json
from src.data import load_cifar100, create_dataloader
from src.model import build_model
from src.train import train_one_epoch, evaluate

# Every artifact of this sweep is written below OUTPUT_DIR.
OUTPUT_DIR = 'output/scheduler_sweep'
ensure_dir(OUTPUT_DIR)
device = get_device()
# Seed once, up front, with a fixed value.
set_seed(42)

In [None]:
# Load CIFAR-100 and carve a validation subset out of the training set.
train_full, test_data = load_cifar100(data_dir='./data', download=True)

# 90% of the training samples stay in the training split; the remainder
# becomes the validation split. The split uses its own generator with a
# fixed seed (42).
n_total = len(train_full)
train_size = int(0.9 * n_total)
val_size = n_total - train_size
split_rng = torch.Generator().manual_seed(42)
train_data, val_data = torch.utils.data.random_split(
    train_full, [train_size, val_size], generator=split_rng
)

# Same batch size for all loaders; only the training loader shuffles.
loader_kwargs = {'batch_size': 64}
train_loader = create_dataloader(train_data, shuffle=True, **loader_kwargs)
val_loader = create_dataloader(val_data, shuffle=False, **loader_kwargs)
test_loader = create_dataloader(test_data, shuffle=False, **loader_kwargs)

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

In [None]:
# Model config (handed unchanged to build_model()).
config = {
    'model_name': 'dino_vits16',
    'num_classes': 100,
    'freeze_policy': 'finetune_all',
    'freeze_head': True,
    'dropout': 0.1,
    'device': device
}


# Sweep hyperparameters shared by every scheduler run.
NUM_EPOCHS = 16
BASE_LR = 1e-5       # initial learning rate given to SGD
WEIGHT_DECAY = 1e-4
MOMENTUM = 0.9
EVAL_FREQ = 2        # evaluate every EVAL_FREQ epochs (and always on the last epoch)

# Check for pretrained head: warn early if the checkpoint is missing.
BASELINE_PATH = 'output/main/pretrained_head.pt'
if not os.path.exists(BASELINE_PATH):
    # Build the message from BASELINE_PATH so it cannot drift from the path
    # that is actually checked.
    print(f"⚠ CAUTION: {BASELINE_PATH} not found. Results may be suboptimal.")

In [None]:
def get_scheduler(name, optimizer, num_epochs):
    """Create the learning-rate scheduler registered under ``name``.

    Args:
        name: One of 'cosine', 'step', 'exponential' or 'plateau'.
        optimizer: Optimizer whose learning rate the scheduler will drive.
        num_epochs: Planned number of epochs. Used as the cosine period and
            to derive the StepLR step size (a third of the run, at least 1).

    Returns:
        The newly constructed scheduler instance.

    Raises:
        ValueError: If ``name`` is not one of the supported scheduler keys.
    """
    # Each entry is a zero-argument builder, so only the requested scheduler
    # is ever constructed and attached to the optimizer.
    builders = {
        'cosine': lambda: CosineAnnealingLR(optimizer, T_max=num_epochs),
        'step': lambda: StepLR(
            optimizer, step_size=max(1, num_epochs // 3), gamma=0.1
        ),
        'exponential': lambda: ExponentialLR(optimizer, gamma=0.9),
        'plateau': lambda: ReduceLROnPlateau(
            optimizer, mode='max', patience=3, factor=0.5
        ),
    }

    try:
        build = builders[name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which are also just "unknown".
        raise ValueError(f"Unknown scheduler: {name}") from None
    return build()


def train_with_scheduler(scheduler_name):
    """Train a freshly built model for NUM_EPOCHS with the given LR scheduler.

    The model is built from the module-level ``config``; if the checkpoint at
    ``BASELINE_PATH`` exists, its ``'model_state_dict'`` is loaded first.
    Training uses SGD with momentum and cross-entropy loss. Validation and
    test evaluation run every ``EVAL_FREQ`` epochs and on the final epoch.

    Args:
        scheduler_name: Scheduler key accepted by ``get_scheduler``
            ('cosine', 'step', 'exponential' or 'plateau').

    Returns:
        dict: Per-epoch lists under 'epoch', 'train_loss', 'train_acc',
        'val_loss', 'val_acc', 'test_loss', 'test_acc' and 'lr' (the
        validation/test entries are ``None`` for epochs without evaluation),
        plus the scalars 'best_val_acc' and 'final_test_acc'.

    Raises:
        ValueError: Propagated from ``get_scheduler`` for an unknown name.
    """
    print(f"\n{'='*40}")
    print(f"Training with: {scheduler_name.upper()}")
    print(f"{'='*40}")

    # 1. Build model
    model = build_model(config)
    model.to(device)

    # 2. Load pretrained head (optional: skipped when the checkpoint is absent)
    if os.path.exists(BASELINE_PATH):
        ckpt = torch.load(BASELINE_PATH, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])

    # 3. Set up optimizer, scheduler and loss
    optimizer = optim.SGD(
        model.get_trainable_params(),
        lr=BASE_LR,
        momentum=MOMENTUM,
        weight_decay=WEIGHT_DECAY
    )
    scheduler = get_scheduler(scheduler_name, optimizer, NUM_EPOCHS)
    criterion = nn.CrossEntropyLoss()

    # 4. Training loop
    history = {
        'epoch': [],
        'train_loss': [], 'train_acc': [],
        'val_loss': [], 'val_acc': [],
        'test_loss': [], 'test_acc': [],
        'lr': []
    }

    best_val_acc = 0.0

    for epoch in range(1, NUM_EPOCHS + 1):
        loss, acc = train_one_epoch(
            model, train_loader, optimizer, criterion, device, show_progress=False
        )
        # Read the LR before the scheduler steps, i.e. the value that was in
        # effect for the epoch that just finished.
        current_lr = optimizer.param_groups[0]['lr']

        # Evaluate periodically, and always on the last epoch so that a final
        # test accuracy is available.
        if epoch % EVAL_FREQ == 0 or epoch == NUM_EPOCHS:
            val_loss, val_acc = evaluate(model, val_loader, criterion, device, show_progress=False)
            test_loss, test_acc = evaluate(model, test_loader, criterion, device, show_progress=False)

            best_val_acc = max(best_val_acc, val_acc)

            print(f"Epoch {epoch}/{NUM_EPOCHS} | LR: {current_lr:.2e} | Train: {acc:.1f}% | Val: {val_acc:.1f}% | Test: {test_acc:.1f}%")
        else:
            # None placeholders keep every history list aligned with 'epoch'.
            val_loss = val_acc = test_loss = test_acc = None
            print(f"Epoch {epoch}/{NUM_EPOCHS} | LR: {current_lr:.2e} | Train: {acc:.1f}%")

        history['epoch'].append(epoch)
        history['train_loss'].append(loss)
        history['train_acc'].append(acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
        history['lr'].append(current_lr)

        # Step scheduler
        if scheduler is not None:
            if scheduler_name == 'plateau':
                # Only feed real validation measurements to the plateau
                # scheduler. A placeholder metric (e.g. 0) on non-evaluation
                # epochs would look like a regression in 'max' mode and
                # trigger LR reductions unrelated to actual performance.
                # Consequently, its patience counts evaluations, not epochs.
                if val_acc is not None:
                    scheduler.step(val_acc)
            else:
                scheduler.step()

    # Capture final stats
    history['best_val_acc'] = best_val_acc
    valid_test = [x for x in history['test_acc'] if x is not None]
    history['final_test_acc'] = valid_test[-1] if valid_test else 0.0

    return history

In [None]:
# Run all Schedulers: one full training run per entry, with each run's
# history kept in `results` and also written to its own JSON file.
SCHEDULERS = ['cosine', 'step', 'exponential', 'plateau']
results = {}

for name in SCHEDULERS:
    run_history = train_with_scheduler(name)
    results[name] = run_history
    metrics_path = os.path.join(OUTPUT_DIR, f'scheduler_{name}.json')
    save_metrics_json(metrics_path, run_history)