In [1]:
import numpy as np
import wandb
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import shutil
import os                              # Import the 'os' module for changing directories
os.chdir('/content/drive/MyDrive/FL')  # Change the directory
import datetime as datetime
import copy
import json

Mounted at /content/drive


In [12]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

import torch.nn as nn
import torchvision
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import Subset, DataLoader, random_split

from FederatedLearningProject.data.cifar100_loader import get_cifar100
from FederatedLearningProject.checkpoints.checkpointing import save_checkpoint, load_checkpoint
from FederatedLearningProject.training.centralized_training import train_and_validate, train_epoch, validate_epoch, log_to_wandb, generate_configs


import FederatedLearningProject.experiments.models as models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
wandb.login() # Ask for your API key for logging in to the wandb library.

[34m[1mwandb[0m: Currently logged in as: [33mcerbellifederico[0m ([33mcerbellifederico-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Import CIFAR100 dataset: train_set, val_set, test_set
# The transforms are applied before returning the dataset (in the module)
valid_split_perc = 0.2    # of the 50000 training data
train_set, val_set, test_set = get_cifar100(valid_split_perc)

Number of images in Training Set:   40000
Number of images in Validation Set: 10000
Number of images in Test Set:       10000
✅ Datasets loaded successfully


In [5]:
# Create DataLoaders for training, validation, and test sets
# batch_size è in hyperparameter (64, 128, ..), anche num_workers (consigliato per colab 2 o 4)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=128, shuffle=False, num_workers=2)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=2)

In [6]:
# c= {
#     'lr': {
#         'values': [1e-2]
#     },
#     'weight_decay': {
#         'values': [1e-3]
#     }
# }

# c = {
#     'lr': {
#         'values': [1e-4, 5e-4, 1e-3, 5e-3]
#     },
#     'weight_decay': {
#         'values': [1e-4, 1e-3]
#     }
# }

c = {
    'lr': {
        'values': [0.003]  # Più basso = più stabile, meno rischio overfitting
    },
    'weight_decay': {
        'values': [0.0001, 0.0005]}}


In [7]:
o_model = models.FlexibleDino()     # original model
o_model.freeze(12)
o_model.to_cuda()
o_model.debug()

Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


moving model to cuda

--- Debugging Model ---
Model is primarily on device: cuda:0
Model overall mode: Train

Parameter Details (Name | Device | Requires Grad? | Inferred Block | Module Mode):
- backbone.cls_token                                 | cuda:0     | False           | N/A             | Train
- backbone.pos_embed                                 | cuda:0     | False           | N/A             | Train
- backbone.patch_embed.proj.weight                   | cuda:0     | False           | N/A             | Train
- backbone.patch_embed.proj.bias                     | cuda:0     | False           | N/A             | Train
- backbone.blocks.0.norm1.weight                     | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.norm1.bias                       | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.attn.qkv.weight                  | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.attn.qkv.bias       

In [13]:
# Get hyperparam config
configs = generate_configs(c)

In [26]:
# Directory dei checkpoint
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"

# Percorsi dei file JSON
completed_combinations_path = os.path.join(checkpoint_dir, "completed_combinations.json")
best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
best_combination_path = os.path.join(checkpoint_dir, "best_combination.json")

# Carica le combinazioni completate
if os.path.exists(completed_combinations_path):
    with open(completed_combinations_path, "r") as f:
        completed_combinations = json.load(f)
    print(f"Completed combinations: {completed_combinations}")
else:
    completed_combinations = []
    print("No completed combinations")

# Carica la migliore combinazione salvata, se esiste
if os.path.exists(best_combination_path):
    with open(best_combination_path, "r") as f:
        best_combination_info = json.load(f)
        best_val_accuracy = best_combination_info.get("best_val_accuracy", 0.0)
        best_index = best_combination_info.get("best_index", None)
else:
    best_val_accuracy = 0.0
    best_index = None

'''
Il modello viene salvato nella checkpoint dir with the name {model_name}_run{i}
where i is the idx of the combination of hyperparameters

'''

# Loop su tutte le configurazioni
for i in range(len(configs)):                                       # iterate over combinations dictionaries
    if str(i) in completed_combinations:                            # check if the combination has already been tried
        print(f"Skipping combination {i} (already completed)")
        continue

    config_i = configs[i]

    learning_rate = config_i["lr"]
    weight_decay = config_i["weight_decay"]
    momentum = config_i.get("momentum", 0.9)
    epochs = config_i.get("num_epochs", 10)

    model = copy.deepcopy(o_model)    # load a fresh model

    last_blocks = list(model.backbone.blocks) # backbone blocks
    head = model.head                         # head

    # Collect all the parameter, we need the optimizer to have all params to avoid load_checkpoint errors
    # params_to_optiallmize = []
    # for block in last_blocks:
    #     params_to_optimize += list(block.parameters())

    # params_to_optimize += list(head.parameters())
    params_to_optimize = model.parameters()

    optimizer = optim.SGD(params_to_optimize, lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    model_name = "dino_vit-s-16"
    project_name = "BaselineCentralized"
    run_name = f"{model_name}_run_{i}"

    wandb.init(
        project=project_name,
        name=run_name,
        id=run_name,
        config={
            "model": model_name,
            "epochs": epochs,
            "batch_size": train_loader.batch_size,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "momentum": momentum,
            "architecture": model.__class__.__name__,
        }
    )

    config = wandb.config



    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_run_{i}_checkpoint.pth")
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    start_epoch, checkpoint_data = load_checkpoint(model, optimizer, scheduler, run_name)

    # Esegui training e ottieni la migliore validation accuracy del run
    val_accuracy = train_and_validate(
        start_epoch,
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        checkpoint_path=checkpoint_path,
        num_epochs=epochs,
        checkpoint_interval=config_i.get("checkpoint_interval", 5)
    )

    wandb.finish()

    # Salva il miglior modello se la val_accuracy è migliorata
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_index = i
        torch.save(model.state_dict(), best_model_path)

        # Salva anche l'indice e la val_accuracy nel file JSON
        with open(best_combination_path, "w") as f:
            json.dump({"best_index": best_index, "best_val_accuracy": best_val_accuracy}, f)

        print(f" Best model updated! Combination: {best_index} | Accuracy: {best_val_accuracy:.4f}")
    else:
        print(f"Best combination is {best_index} with val accuracy {best_val_accuracy:.4f}")

    # Segna il run come completato
    completed_combinations.append(str(i))
    with open(completed_combinations_path, "w") as f:
        json.dump(completed_combinations, f)

    print(f" Finished combination {i}")


No completed combinations


0,1
epoch,▁▂▃▄▅▅▆▇█
train_accuracy,▁▅▆▇▇▇███
train_loss,█▄▃▂▂▁▁▁▁
val_accuracy,▁▄▆▇▇▇███
val_loss,█▄▃▂▂▁▁▁▁

0,1
epoch,9.0
train_accuracy,33.9475
train_loss,2.56651
val_accuracy,40.81
val_loss,2.24834


 Checkpoint caricato da /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/dino_vit-s-16_run_0_checkpoint.pth, riprendo da epoca 6.
[Epoch 6] Train Loss: 2.6593, Train Accuracy: 31.93%, Val Loss: 2.2919, Val Accuracy: 39.37%
current LR: [0.0010364745084375793]
[Epoch 7] Train Loss: 2.6107, Train Accuracy: 32.88%, Val Loss: 2.2743, Val Accuracy: 40.38%
current LR: [0.0006183221215612907]
[Epoch 8] Train Loss: 2.5885, Train Accuracy: 33.25%, Val Loss: 2.2563, Val Accuracy: 40.81%
current LR: [0.0002864745084375791]
[Epoch 9] Train Loss: 2.5703, Train Accuracy: 33.97%, Val Loss: 2.2470, Val Accuracy: 40.68%
current LR: [7.341522555726972e-05]
[Epoch 10] Train Loss: 2.5607, Train Accuracy: 34.19%, Val Loss: 2.2445, Val Accuracy: 40.83%
current LR: [0.0]
Checkpoint salvato su: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/dino_vit-s-16_run_0_checkpoint.pth
[train and validate]: final val accuracy: 40.83


0,1
epoch,▁▃▅▆█
train_accuracy,▁▄▅▇█
train_loss,█▅▃▂▁
val_accuracy,▁▆█▇█
val_loss,█▅▃▁▁

0,1
epoch,10.0
train_accuracy,34.185
train_loss,2.5607
val_accuracy,40.83
val_loss,2.24454


 Best model updated! Combination: 0 | Accuracy: 40.8300
 Finished combination 0


 Nessun checkpoint trovato, inizio da epoca 1.
[Epoch 1] Train Loss: 3.9742, Train Accuracy: 10.75%, Val Loss: 2.9647, Val Accuracy: 27.62%
current LR: [0.0029265847744427307]
[Epoch 2] Train Loss: 3.1514, Train Accuracy: 22.18%, Val Loss: 2.5860, Val Accuracy: 34.16%
current LR: [0.0027135254915624215]
[Epoch 3] Train Loss: 2.9124, Train Accuracy: 26.70%, Val Loss: 2.4430, Val Accuracy: 36.82%
current LR: [0.0023816778784387102]
[Epoch 4] Train Loss: 2.7972, Train Accuracy: 29.00%, Val Loss: 2.3628, Val Accuracy: 37.97%
current LR: [0.0019635254915624217]
[Epoch 5] Train Loss: 2.7214, Train Accuracy: 30.71%, Val Loss: 2.3180, Val Accuracy: 39.66%
current LR: [0.0015000000000000005]
Checkpoint salvato su: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/dino_vit-s-16_run_1_checkpoint.pth
[Epoch 6] Train Loss: 2.6614, Train Accuracy: 31.88%, Val Loss: 2.2896, Val Accuracy: 39.92%
current LR: [0.0010364745084375793]
[Epoch 7] Train Loss: 2.6169, Train Accuracy: 32.91%, Val 

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▆▇▇▇████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy,▁▄▆▆▇▇████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_accuracy,33.9525
train_loss,2.56506
val_accuracy,41.55
val_loss,2.24282


 Best model updated! Combination: 1 | Accuracy: 41.5500
 Finished combination 1


In [9]:
# Directory dei checkpoint
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

best_model_path = os.path.join(checkpoint_dir, "best_model.pth")

In [15]:
# Percorsi dei file JSON
completed_combinations_path = os.path.join(checkpoint_dir, "completed_combinations2.json")
best_model_path_2 = os.path.join(checkpoint_dir, "best_model_2.pth")
best_combination_path_2 = os.path.join(checkpoint_dir, "best_combination_2.json")

# Carica le combinazioni completate
if os.path.exists(completed_combinations_path):
    with open(completed_combinations_path, "r") as f:
        completed_combinations2 = json.load(f)
    print(f"Completed combinations: {completed_combinations2}")
else:
    completed_combinations2 = []
    print("No completed combinations")

# Carica la migliore combinazione salvata, se esiste
if os.path.exists(best_combination_path_2):
    with open(best_combination_path_2, "r") as f:
        best_combination_info = json.load(f)
        best_val_accuracy = best_combination_info.get("best_val_accuracy", 0.0)
        best_index = best_combination_info.get("best_index", None)
else:
    best_val_accuracy = 0.0
    best_index = None


# Loop su tutte le configurazioni
for i in range(len(configs)):
    if str(i) in completed_combinations2:
        print(f"Skipping combination {i} (already completed)")
        continue

    config_i = configs[i]

    learning_rate = config_i["lr"]
    weight_decay = config_i["weight_decay"]
    momentum = config_i.get("momentum", 0.9)
    epochs = config_i.get("num_epochs", 50)

    model = copy.deepcopy(o_model)
    model.freeze(9)
    model.load_state_dict(torch.load(best_model_path))

    params_to_optimize = model.parameters()

    optimizer = optim.SGD(params_to_optimize, lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    model_name = "dino_vit-s-16"
    project_name = "BaselineCentralized_unfreeze"
    run_name = f"{model_name}_run_{i}"

    wandb.init(
        project=project_name,
        name=run_name,
        id=run_name,
        config={
            "model": model_name,
            "epochs": epochs,
            "batch_size": train_loader.batch_size,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "momentum": momentum,
            "architecture": model.__class__.__name__,
        }
    )

    config = wandb.config

    checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_run_{i}_checkpoint.pth")
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    start_epoch, checkpoint_data = load_checkpoint(model, optimizer, scheduler, run_name)

    # Esegui training e ottieni la migliore validation accuracy del run
    val_accuracy = train_and_validate(
        start_epoch,
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        checkpoint_path=checkpoint_path,
        num_epochs=epochs,
        checkpoint_interval=config_i.get("checkpoint_interval", 2)
    )

    wandb.finish()

    # Salva il miglior modello se la val_accuracy è migliorata
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_index = i
        torch.save(model.state_dict(), best_model_path_2)

        # Salva anche l'indice e la val_accuracy nel file JSON
        with open(best_combination_path_2, "w") as f:
            json.dump({"best_index": best_index, "best_val_accuracy": best_val_accuracy}, f)

        print(f" Best model updated! Combination: {best_index} | Accuracy: {best_val_accuracy:.4f}")
    else:
        print(f"Best combination is {best_index} with val accuracy {best_val_accuracy:.4f}")

    # Segna il run come completato
    completed_combinations2.append(str(i))
    with open(completed_combinations_path, "w") as f:
        json.dump(completed_combinations2, f)

    print(f" Finished combination {i}")

No completed combinations


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▂▃▄▅▇█
train_loss,█▇▆▅▄▂▁
val_accuracy,▁▅▆▇▆█▆
val_loss,▁▂▆▅▅▅█

0,1
epoch,37.0
train_accuracy,68.95
train_loss,1.09093
val_accuracy,54.26
val_loss,1.8681


 Checkpoint caricato da /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/dino_vit-s-16_run_0_checkpoint.pth, riprendo da epoca 11.


KeyboardInterrupt: 