In [1]:
# --- Standard library ---
import os      # used below to switch the working directory
import shutil

# --- Third-party ---
import numpy as np
import wandb
from google.colab import drive

# Mount Google Drive inside the Colab VM (force_remount refreshes an existing mount).
drive.mount('/content/drive', force_remount=True)

# Work from the project folder so the relative paths used later in the notebook resolve.
os.chdir('/content/drive/MyDrive/FL')

Mounted at /content/drive


In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torchvision
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import Subset, DataLoader, random_split

from FederatedLearningProject.data.cifar100_loader import get_cifar100
import FederatedLearningProject.checkpoints.checkpointing as checkpointing
from FederatedLearningProject.training.FL_training import train_server_model_editing
from FederatedLearningProject.experiments import models

In [3]:
### TRAIN / VALIDATION / TEST DATA ###

valid_split_perc = 0.2
# The splits are read back from disk instead of being regenerated on every run.
# NOTE(review): presumably they were produced once with the call below - confirm.
# train_set, val_set, test_set = get_cifar100(valid_split_perc=valid_split_perc)

# NOTE: weights_only=False unpickles arbitrary Python objects (needed here because
# the files hold whole dataset objects); only load files you created yourself.
val_set = torch.load('FederatedLearningProject/masks/val_set.pth', weights_only=False)
train_set = torch.load('FederatedLearningProject/masks/train_set.pth', weights_only=False)
test_set = torch.load('FederatedLearningProject/masks/test_set.pth', weights_only=False)


# Only the training loader is shuffled. Evaluation metrics do not depend on sample
# order, so the validation/test loaders iterate sequentially, which keeps evaluation
# order deterministic and avoids needless random sampling.
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=128, shuffle=False, num_workers=2)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=2)

In [19]:
### LOAD THE MODEL ###
# Build the model with 12 layers frozen, then restore the weights saved by the
# 300-round IID federated run.
model = models.LinearFlexibleDino(num_layers_to_freeze=12)
# map_location="cpu": deserialize the checkpoint on the CPU so loading does not
# depend on the device it was saved from. load_state_dict copies the values into
# the model's own parameters, and to_cuda() moves the model to the GPU afterwards.
model_checkpoint = torch.load(
    "FederatedLearningProject/checkpoints/FL_IID_300round/dino_vits_16_iid_local_steps_4_checkpoint.pth",
    map_location="cpu",
)
model.load_state_dict(model_checkpoint['model_state_dict'])
model.to_cuda()

Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


moving model to cuda


In [5]:
### LOAD THE CLIENTS ###
from FederatedLearningProject.data.cifar100_loader import create_iid_splits, create_non_iid_splits

# Number of federated clients the training set is partitioned into.
num_clients = 100

# IID partition of the training set, one split per client.
client_dataset_iid = create_iid_splits(train_set, num_clients=num_clients)
# Non-IID alternative (50 classes per client), currently disabled:
# client_dataset_non_iid_50 = create_non_iid_splits(train_set, num_clients = num_clients, classes_per_client = 50)

Dataset has 40000 samples across 100 classes.
Creating 100 IID splits with 100 classes each.


Each of the 100 classes split into 100 shards.

Checking unique classes that each client sees:
Client 0 has samples from classes: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), 

In [7]:
### CLIENT MASKS ###
# The per-client masks were computed once and stored on Google Drive,
# so this cell only reads them back.

from FederatedLearningProject.training.model_editing import compute_mask_clients, plot_all_layers_mask_sparsity

masks_path = "FederatedLearningProject/masks/client_masks_iid.pth"

# To regenerate the masks, uncomment the two lines below:
# client_masks_iid = compute_mask_clients(model, client_dataset_iid, num_examples=100, num_classes=100, n_per_class=1)
# torch.save(client_masks_iid, masks_path)
client_masks_iid = torch.load(masks_path)

In [13]:
# --- RUN CONFIGURATION: FEDAVG, OPTIMIZER AND LOSS ---

# Federation size and number of communication rounds
num_clients = 100
num_rounds = 300

# Default hyperparameters for FedAvg
fraction = 0.1        # client fraction
num_local_steps = 4   # fixed number of local steps

# Optimizer settings; lr and weight_decay are the best values from the centralized runs
optimizer_config = dict(
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001,
)

criterion = nn.CrossEntropyLoss()
model_name = "dino_vits16_J4"

# Checkpoints go to a Drive folder that is created on first use.
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/FL/"
os.makedirs(checkpoint_dir, exist_ok=True)
# The file name embeds the model name; change it to keep separate checkpoints per run.
checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_model_editing_iid.pth")

In [9]:
wandb.login() # Prompts for your API key to log in to Weights & Biases (wandb).

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdepetrofabio[0m ([33mdepetrofabio-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [18]:
device = "cuda"
project_name = "FedAvg_ModelEditing_Corretto"

# Name under which this run is listed in the W&B project.
run_name = f"{model_name}_FedAvg_model_editing_iid"

# Start a W&B run. The logged config reads the same variables the training call
# uses, so the dashboard cannot drift from what was actually run.
wandb.init(
    project=project_name,
    name=run_name,
    config={
        "model": model_name,
        "num_rounds": num_rounds,  # same value that is passed to the training call
        "batch_size": 128,  # batch size used for the loaders and for client training
        "optimizer": dict(optimizer_config),  # lr / momentum / weight_decay
    },
    reinit=True  # allows re-running this cell: a new run can be started in the same kernel
)

# Handle to the run's config, kept for convenience.
config = wandb.config


0,1
client_avg_accuracy,█▅▁
client_avg_loss,▁▆█
round,▁▅█
server_val_accuracy,▁▁▁
server_val_loss,▁▁▁

0,1
client_avg_accuracy,45.42969
client_avg_loss,2.03744
round,14.0
server_val_accuracy,44.6
server_val_loss,2.20149


In [20]:
# The model was built with num_layers_to_freeze=12; unfreeze before federated training.
# NOTE(review): presumably the argument is the number of layers to unfreeze - confirm
# against LinearFlexibleDino.unfreeze.
model.unfreeze(12)
# Move the model to the GPU again (prints "moving model to cuda").
model.to_cuda()

moving model to cuda


In [None]:
# Launch federated training with model editing (per-client masks).
# Every hyperparameter comes from the configuration cell rather than being repeated
# as a literal, so the values used here, the values logged to W&B, and the J4 suffix
# in model_name cannot silently drift apart.
train_server_model_editing(model=model,
             num_clients = num_clients,           # total number of clients (100)
             num_rounds=num_rounds,
             client_dataset = client_dataset_iid,
             frac=fraction,                       # client fraction (0.1)
             batch_size=128,
             client_masks = client_masks_iid,
             optimizer_config=optimizer_config,
             val_loader = val_loader,
             criterion = criterion,
             num_client_steps = num_local_steps,  # local steps per client (4)
             model_name = model_name,
             checkpoint_path = checkpoint_path,
             device = device)

Checkpoint salvato su: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/FL/dino_vits16_J4_model_editing_iid.pth

Round 5/300
Selected Clients: [39 72 59 84 47  2 60  6  3 86]
Avg Client Loss: 1.9832 | Avg Client Accuracy: 47.23%
Evaluation Loss: 2.1572 | Val Accuracy: 45.90%
--------------------------------------------------
Checkpoint salvato su: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/FL/dino_vits16_J4_model_editing_iid.pth

Round 10/300
Selected Clients: [65 46 79 72  6 90 27  1 75 24]
Avg Client Loss: 1.9509 | Avg Client Accuracy: 48.24%
Evaluation Loss: 2.1257 | Val Accuracy: 46.67%
--------------------------------------------------
Checkpoint salvato su: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/FL/dino_vits16_J4_model_editing_iid.pth

Round 15/300
Selected Clients: [52 35 76 42 34 37  1 65 61 36]
Avg Client Loss: 1.8158 | Avg Client Accuracy: 50.43%
Evaluation Loss: 2.1025 | Val Accuracy: 47.14%
--------------------------