# AML Project 2: Federated Learning

- Setup & Central Baseline
- FedAvg IID
- Non-IID Sweep (with Scaled Rounds)

In [None]:
# Clone Repository & Install Dependencies
!git clone https://github.com/emanueleR3/AML-Project-2.git
%cd AML-Project-2
!pip install -r requirements.txt
!pip install torch torchvision numpy matplotlib tqdm

In [None]:
# Imports & Setup
import sys
import os

# Make the repository root importable BEFORE the `src.*` imports below are executed.
# Appending it after those imports (as this cell used to do) comes too late to help
# them; the guard avoids piling up duplicate entries when the cell is re-run.
if '.' not in sys.path:
    sys.path.append('.')

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm

from src.utils import set_seed, get_device, ensure_dir, save_checkpoint, save_metrics_json, count_parameters
from src.data import load_cifar100, create_dataloader, partition_iid, partition_non_iid
from src.model import build_model
from src.train import evaluate, train_one_epoch
from src.fedavg import run_fedavg
from src.masking import compute_sensitivity_scores, create_mask, save_mask

# Seed once up front so that model initialisation and data shuffling in the
# centralized baseline are repeatable across "Restart & Run All".
# NOTE(review): assumes set_seed takes the seed as its first positional argument
# and seeds the RNGs used by the project - confirm against src/utils.
SEED = 42
set_seed(SEED)

# Setup output dirs
OUTPUT_DIR = 'output/main'
ensure_dir(OUTPUT_DIR)
device = get_device()
print(f"Device: {device}")

## Setup & Central Baseline

In [None]:
# Load CIFAR-100 and build the train / validation / test inputs
print("Loading CIFAR-100...")
train_trainval, test_dataset = load_cifar100(data_dir='./data', image_size=224, download=True)

# 90% of the training pool is used for training, the remainder for validation.
train_size = int(0.9 * len(train_trainval))
val_size = len(train_trainval) - train_size

# A dedicated, fixed-seed generator keeps the split identical between runs.
split_rng = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    train_trainval,
    [train_size, val_size],
    generator=split_rng,
)

# Evaluation loaders (validation first, then test); neither is shuffled.
val_loader, test_loader = (
    create_dataloader(eval_split, batch_size=64, shuffle=False)
    for eval_split in (val_dataset, test_dataset)
)

print("Data loaded successfully.")

In [None]:
# Real Central Baseline
# Trains one model on the full (non-federated) training split and keeps the
# checkpoint with the highest validation accuracy as the centralized reference.
print("\nRunning Real Central Baseline...")
config = {
    'model_name': 'dino_vits16',
    'num_classes': 100,
    'freeze_policy': 'head_only',  # presumably only the classifier head is trained - see src/model
    'dropout': 0.1,
    'device': device
}

model = build_model(config)
model.to(device)

# Hyperparameters Baseline
epochs = 20
eval_freq = 2  # validate every `eval_freq` epochs (and always on the last epoch)
optimizer = torch.optim.AdamW(model.get_trainable_params(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
criterion = nn.CrossEntropyLoss()

train_loader = create_dataloader(train_dataset, batch_size=64, shuffle=True)
best_acc = 0.0

# Train metrics are recorded every epoch, validation metrics only on evaluated
# epochs. 'val_epoch' stores the (1-based) epoch of each validation entry so the
# two series can be aligned later without having to know `eval_freq`.
history = {'train_loss': [], 'train_acc': [], 'val_epoch': [], 'val_loss': [], 'val_acc': []}

for epoch in range(epochs):
    loss, acc = train_one_epoch(model, train_loader, optimizer, criterion, device, show_progress=False)

    # Validation logic
    current_epoch = epoch + 1
    if current_epoch % eval_freq == 0 or current_epoch == epochs:
        val_loss, val_acc = evaluate(model, val_loader, criterion, device, show_progress=False)

        # Only overwrite the checkpoint when validation accuracy strictly improves.
        if val_acc > best_acc:
            best_acc = val_acc
            save_checkpoint({'model_state_dict': model.state_dict()}, os.path.join(OUTPUT_DIR, 'central_baseline.pt'))

        print(f"Epoch {current_epoch}/{epochs} | Train Acc: {acc:.2f}% | Val Acc: {val_acc:.2f}% | Best: {best_acc:.2f}%")

        history['val_epoch'].append(current_epoch)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

    else:
        print(f"Epoch {current_epoch}/{epochs} | Train Acc: {acc:.2f}% | (Skipping Eval)")

    # The cosine schedule advances once per epoch, after that epoch's optimizer updates.
    scheduler.step()

    history['train_loss'].append(loss)
    history['train_acc'].append(acc)

print(f"Baseline finished. Best Val Acc: {best_acc:.2f}%")

save_metrics_json(os.path.join(OUTPUT_DIR, 'central_baseline_metrics.json'), history)

## FedAvg IID

In [None]:
# FedAvg IID: configuration passed to run_fedavg (and reused as the template for the sweep)
iid_config = dict(
    num_clients=100,
    clients_per_round=0.1,
    local_steps=4,
    num_rounds=300,
    batch_size=64,
    lr=0.001,
    weight_decay=1e-4,
    seed=42,
    eval_freq=10,
)

# Split the training set across the clients and wrap every shard in its own loader
print("Partitioning IID...")
client_datasets = partition_iid(train_dataset, iid_config['num_clients'], iid_config['seed'])
client_loaders = []
for client_shard in client_datasets:
    client_loaders.append(create_dataloader(client_shard, iid_config['batch_size'], True, 0))

# Fresh model for the federated run (same architecture config as the central baseline)
model = build_model(config)
model.to(device)

print("Starting FedAvg IID...")
history = run_fedavg(model, client_loaders, val_loader, test_loader, iid_config, device)

# Persist the metrics and the model weights as they are once run_fedavg returns.
# NOTE(review): the file is named '*_best.pt' but this stores the model's current
# state - confirm whether run_fedavg leaves the best or the last weights in `model`.
iid_metrics_path = os.path.join(OUTPUT_DIR, 'fedavg_iid_metrics.json')
iid_checkpoint_path = os.path.join(OUTPUT_DIR, 'fedavg_iid_best.pt')
save_metrics_json(iid_metrics_path, history)
save_checkpoint({'model_state_dict': model.state_dict()}, iid_checkpoint_path)

## Non-IID Sweep (Scaled Rounds)

When increasing local steps J, we scale rounds inversely to keep total computation constant:
- J=4 → 100 rounds (baseline for this sweep; note that the FedAvg IID run above uses 300 rounds)
- J=8 → 50 rounds  
- J=16 → 25 rounds

Total local steps over a run = J × Rounds = 400 for every value of J (each client sampled in a round performs J local steps in that round).

In [None]:
# Scaled Rounds Configuration
# The sweep keeps J * rounds fixed at the budget of the J=4 / 100-round setting.
BASE_J = 4
BASE_ROUNDS = 100
TOTAL_STEPS = BASE_J * BASE_ROUNDS  # = 400

def get_scaled_rounds(j):
    """Return the number of rounds that keeps J * rounds at TOTAL_STEPS.

    Args:
        j: Number of local steps per round; must be positive.

    Returns:
        ``TOTAL_STEPS // j``. The division is floored, so the budget is matched
        exactly only when ``j`` divides TOTAL_STEPS, and the result is 0 when
        ``j`` exceeds TOTAL_STEPS.

    Raises:
        ValueError: If ``j`` is zero or negative (previously this surfaced as a
            ZeroDivisionError or a meaningless negative round count).
    """
    if j <= 0:
        raise ValueError(f"local steps j must be positive, got {j!r}")
    return TOTAL_STEPS // j

# Sweep Params
NC_VALUES = [1, 5, 10, 50]  # values passed as the third argument of partition_non_iid (presumably classes per client - confirm in src/data)
J_VALUES = [4, 8, 16]       # local steps per round

print("Scaled Rounds Configuration:")
for j in J_VALUES:
    print(f"  J={j:2d} → Rounds={get_scaled_rounds(j):3d} | Total Steps={j * get_scaled_rounds(j)}")

# One independent FedAvg run per (Nc, J) pair; each run writes its own metrics file.
for nc in NC_VALUES:
    for j in J_VALUES:
        scaled_rounds = get_scaled_rounds(j)
        print(f"\n--- Runs Non-IID: Nc={nc}, J={j}, Rounds={scaled_rounds} ---")

        # Config with scaled rounds (shallow copy so iid_config itself is never modified)
        sweep_config = iid_config.copy()
        sweep_config['local_steps'] = j
        sweep_config['num_rounds'] = scaled_rounds

        # Partition
        # Client count, seed and batch size are read from the config instead of being
        # repeated as literals (100 / 42 / 64), so the partition can never drift away
        # from the settings handed to run_fedavg.
        client_datasets = partition_non_iid(
            train_dataset, sweep_config['num_clients'], nc, sweep_config['seed']
        )
        client_loaders = [
            create_dataloader(ds, sweep_config['batch_size'], True, 0)
            for ds in client_datasets
        ]

        # Run: every configuration starts from a freshly built model
        model = build_model(config)
        model.to(device)
        history = run_fedavg(model, client_loaders, val_loader, test_loader, sweep_config, device)

        # Save
        save_metrics_json(os.path.join(OUTPUT_DIR, f'noniid_nc{nc}_j{j}.json'), history)