In [2]:
import numpy as np
import wandb
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive (the cell output
# confirms the mount point); force_remount=True requests a remount on re-runs.
drive.mount('/content/drive', force_remount=True)
import shutil
import os                              # Import the 'os' module for changing directories
# Work from the project folder on Drive.
# NOTE(review): presumably this is what makes the `FederatedLearningProject`
# package importable in the next cell -- confirm.
os.chdir('/content/drive/MyDrive/FL')  # Change the directory
import datetime as datetime
import copy
import json

Mounted at /content/drive


In [None]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR

import torch.nn as nn
import torchvision
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import Subset, DataLoader, random_split

from FederatedLearningProject.data.cifar100_loader import get_cifar100
from FederatedLearningProject.checkpoints.checkpointing import save_checkpoint, load_checkpoint, save_checkpoint_test
from FederatedLearningProject.training.centralized_training import train_and_validate, train_and_test, train_epoch, validate_epoch, test_epoch, log_to_wandb, log_to_wandb_test, generate_configs


import FederatedLearningProject.experiments.models as models
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [8]:
# Force a fresh Weights & Biases login; this asks for the API key.
wandb.login(relogin=True)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcerbellifederico[0m ([33mcerbellifederico-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Load the CIFAR-100 splits: train_set, val_set, test_set.
# The transforms are applied inside the loader module before the datasets are returned.
# valid_split_perc is the fraction of the 50000 training images held out for
# validation (0.2 -> 40000 train / 10000 validation, as reported in the cell output).
valid_split_perc = 0.2    # of the 50000 training data
train_set, val_set, test_set = get_cifar100(valid_split_perc)

Number of images in Training Set:   40000
Number of images in Validation Set: 10000
Number of images in Test Set:       10000
✅ Datasets loaded successfully


In [None]:
# Build the DataLoaders for the training, validation and test sets.
# batch_size is a hyperparameter (64, 128, ...), and so is num_workers
# (2 or 4 is the suggested value on Colab).
# Only the training loader shuffles its samples.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=128,
    num_workers=2,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_set,
    batch_size=128,
    num_workers=2,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_set,
    batch_size=128,
    num_workers=2,
    shuffle=False,
)

In [None]:
# Hyperparameter candidates handed to generate_configs: each key names a
# hyperparameter and lists the values to try for it.
c = dict(
    lr=dict(values=[0.001, 0.01, 0.005]),
    weight_decay=dict(values=[0.0001, 0.001]),
)


In [9]:
# Build the reference model once; the training cells deep-copy it so that every
# run starts from the same weights.
o_model = models.LinearFlexibleDino()     # original model
# NOTE(review): the argument is presumably the number of backbone blocks to
# freeze (the debug listing shows backbone parameters with requires_grad=False)
# -- confirm against LinearFlexibleDino.freeze.
o_model.freeze(12)
o_model.to_cuda()   # moves the model to the GPU (the debug output reports cuda:0)
o_model.debug()     # prints device, train/eval mode and per-parameter details

Downloading: "https://github.com/facebookresearch/dino/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dino_deitsmall16_pretrain.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 165MB/s]


moving model to cuda

--- Debugging Model ---
Model is primarily on device: cuda:0
Model overall mode: Train

Parameter Details (Name | Device | Requires Grad? | Inferred Block | Module Mode):
- backbone.cls_token                                 | cuda:0     | False           | N/A             | Train
- backbone.pos_embed                                 | cuda:0     | False           | N/A             | Train
- backbone.patch_embed.proj.weight                   | cuda:0     | False           | N/A             | Train
- backbone.patch_embed.proj.bias                     | cuda:0     | False           | N/A             | Train
- backbone.blocks.0.norm1.weight                     | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.norm1.bias                       | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.attn.qkv.weight                  | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.attn.qkv.bias       

In [None]:
# Get the hyperparameter configurations to try.
# NOTE(review): presumably one dict per combination of the values listed in `c`;
# the search loop reads "lr" and "weight_decay" from each entry -- confirm
# against generate_configs.
configs = generate_configs(c)

In [None]:
# Directory that holds the checkpoints
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"

# Bookkeeping files stored next to the checkpoints
best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
best_combination_path = os.path.join(checkpoint_dir, "best_combination.json")
completed_combinations_path = os.path.join(checkpoint_dir, "completed_combinations.json")

# Restore the list of combinations that have already been run to completion
if not os.path.exists(completed_combinations_path):
    completed_combinations = []
    print("No completed combinations")
else:
    with open(completed_combinations_path, "r") as f:
        completed_combinations = json.load(f)
    print(f"Completed combinations: {completed_combinations}")

# Restore the best result saved so far, if there is one
if not os.path.exists(best_combination_path):
    best_val_accuracy = 0.0
    best_index = None
else:
    with open(best_combination_path, "r") as f:
        best_combination_info = json.load(f)
        best_val_accuracy = best_combination_info.get("best_val_accuracy", 0.0)
        best_index = best_combination_info.get("best_index", None)

# Hyperparameter search over every combination in `configs`.
#
# Each run is identified by f"{model_name}_run_{i}", where i is the index of the
# hyperparameter combination. That identifier is used as the W&B run name/id,
# is passed to load_checkpoint, and prefixes the checkpoint file name.
#
# Progress is persisted after every combination (completed indices and the best
# result so far), so the cell can be re-run after an interruption.

# Make sure the checkpoint directory exists before anything is written to it.
os.makedirs(checkpoint_dir, exist_ok=True)

for i, config_i in enumerate(configs):          # one dict of hyperparameters per combination
    if str(i) in completed_combinations:        # indices are stored as strings in the JSON file
        print(f"Skipping combination {i} (already completed)")
        continue

    learning_rate = config_i["lr"]
    weight_decay = config_i["weight_decay"]
    momentum = config_i.get("momentum", 0.9)
    epochs = config_i.get("num_epochs", 30)

    model = copy.deepcopy(o_model)    # start every run from a fresh copy of the original model
    params_to_optimize = model.parameters()

    optimizer = optim.SGD(params_to_optimize, lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    model_name = "dino_vit-s-16_"
    project_name = "BaselineCentralized_CosineLR"
    run_name = f"{model_name}_run_{i}"

    wandb.init(
        project=project_name,
        name=run_name,
        id=run_name,
        # The run id is deterministic, so a run restarted from a checkpoint reuses
        # it. Without `resume`, W&B overwrites the data already logged under that
        # id; "allow" continues the existing run, or creates it if it is new.
        resume="allow",
        config={
            "model": model_name,
            "epochs": epochs,
            "batch_size": train_loader.batch_size,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "momentum": momentum,
            "architecture": model.__class__.__name__,
        }
    )

    config = wandb.config

    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_run_{i}_checkpoint_cosLR.pth")
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    # Earlier experiment, kept for reference:
    #   scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    #   -> best combination is 3 with 46.64

    try:
        # NOTE(review): load_checkpoint receives run_name while training saves to
        # checkpoint_path -- confirm that both resolve to the same file.
        start_epoch, checkpoint_data = load_checkpoint(model, optimizer, scheduler, run_name)

        # Train and get back the validation accuracy used to rank this combination.
        val_accuracy = train_and_validate(
            start_epoch,
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            scheduler=scheduler,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            checkpoint_path=checkpoint_path,
            num_epochs=epochs,
            checkpoint_interval=config_i.get("checkpoint_interval", 5)
        )
    except BaseException:
        # Close the W&B run (marked as failed) even when training crashes or the
        # cell is interrupted, then let the error surface unchanged.
        wandb.finish(exit_code=1)
        raise

    wandb.finish()

    # Keep the model weights if this combination improved the validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_index = i
        torch.save(model.state_dict(), best_model_path)

        # Persist the winning index and its validation accuracy as JSON too
        with open(best_combination_path, "w") as f:
            json.dump({"best_index": best_index, "best_val_accuracy": best_val_accuracy}, f)

        print(f" Best model updated! Combination: {best_index} | Accuracy: {best_val_accuracy:.4f}")
    else:
        print(f"Best combination is {best_index} with val accuracy {best_val_accuracy:.4f}")

    # Mark the combination as completed so that a re-run skips it
    completed_combinations.append(str(i))
    with open(completed_combinations_path, "w") as f:
        json.dump(completed_combinations, f)

    print(f" Finished combination {i}")

Completed combinations: ['0', '1', '2', '3', '4', '5']
Skipping combination 0 (already completed)
Skipping combination 1 (already completed)
Skipping combination 2 (already completed)
Skipping combination 3 (already completed)
Skipping combination 4 (already completed)
Skipping combination 5 (already completed)


In [10]:
# Reload the data without a validation hold-out: with a split of 0 the loader
# returns only the training and test sets (50000 / 10000 images in the output).
valid_split_perc = 0   # of the 50000 training data
train_set, test_set = get_cifar100(valid_split_perc)

# Rebuild the loaders on the new splits; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=128,
    num_workers=2,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_set,
    batch_size=128,
    num_workers=2,
    shuffle=False,
)

Number of images in Training Set:   50000
Number of images in Test Set:       10000
✅ Datasets loaded successfully (no validation split)


In [11]:
# Checkpoint directory on Drive (same path used by the hyperparameter search cell).
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"
# Create the directory, including missing parents; no error if it already exists.
os.makedirs(checkpoint_dir, exist_ok=True)
best_model_path = os.path.join(checkpoint_dir, "best_model.pth")


In [None]:
# Final run: train on the full training set and evaluate on the test set.
# NOTE(review): the hyperparameters below are hard-coded; presumably they are
# the best combination found by the search -- confirm against best_combination.json.
learning_rate = 0.01
weight_decay = 0.0001
momentum = 0.9
epochs = 30

model = copy.deepcopy(o_model)    # start from a fresh copy of the original model
params_to_optimize = model.parameters()

optimizer = optim.SGD(params_to_optimize, lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

model_name = "dino_vit-s-16_"
project_name = "BaselineCentralized_Test_Best_Model"
run_name = f"{model_name}_run"

wandb.init(
    project=project_name,
    name=run_name,
    id=run_name,
    # The run id is fixed, so restarting this cell from a checkpoint reuses it.
    # Without `resume`, W&B overwrites the data already logged under that id;
    # "allow" continues the existing run, or creates it if it is new.
    resume="allow",
    config={
        "model": model_name,
        "epochs": epochs,
        "batch_size": train_loader.batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "architecture": model.__class__.__name__,
    }
)

config = wandb.config

checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_run_checkpoint_Test_Best.pth")
scheduler = CosineAnnealingLR(optimizer, T_max=epochs)

try:
    # NOTE(review): load_checkpoint receives run_name while training saves to
    # checkpoint_path -- confirm that both resolve to the same file.
    start_epoch, checkpoint_data = load_checkpoint(model, optimizer, scheduler, run_name)

    test_accuracy = train_and_test(
        start_epoch,
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        scheduler=scheduler,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        checkpoint_path=checkpoint_path,
        num_epochs=epochs,
        checkpoint_interval=5
    )
except BaseException:
    # Close the W&B run (marked as failed) even when training crashes or the
    # cell is interrupted, then let the error surface unchanged.
    wandb.finish(exit_code=1)
    raise

wandb.finish()

