In [1]:
# Imports and Google Colab environment setup.
import numpy as np
import wandb
from google.colab import drive
# Mount Google Drive (forcing a remount) so the project folder stored there is reachable.
drive.mount('/content/drive', force_remount=True)
import shutil
import os  # used right below to change the working directory
# Work from the project root on Drive: later cells use paths relative to this
# directory (e.g. 'FederatedLearningProject/masks').
os.chdir('/content/drive/MyDrive/FL')

Mounted at /content/drive


In [19]:
import torch
import torch.optim as optim
import torch.nn as nn
import torchvision
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import Subset, DataLoader, random_split

from FederatedLearningProject.data.cifar100_loader import get_cifar100, create_iid_splits, create_non_iid_splits
import FederatedLearningProject.checkpoints.checkpointing as checkpointing
from FederatedLearningProject.training.FedMETA import aggregate_with_task_arithmetic, aggregate_masks, distribution_function, train_server
from FederatedLearningProject.training.model_editing import plot_all_layers_mask_sparsity

from FederatedLearningProject.experiments import models
import copy

In [3]:
import importlib

# Import the project's own modules.
# The submodules that are reloaded below (`checkpointing`, `models`) are
# imported explicitly: importing only the parent package does not guarantee
# that `checkpoints.checkpointing` / `experiments.models` exist as attributes,
# so relying on them made this cell depend on another cell having run first.
from FederatedLearningProject.data import cifar100_loader
from FederatedLearningProject import checkpoints
from FederatedLearningProject.checkpoints import checkpointing
from FederatedLearningProject.training import FedMETA, model_editing
from FederatedLearningProject import experiments
from FederatedLearningProject.experiments import models

# Reload only the custom modules (NOT torch) so edits made to the project
# sources are picked up without restarting the kernel.
importlib.reload(cifar100_loader)
importlib.reload(checkpointing)
importlib.reload(FedMETA)
importlib.reload(model_editing)
importlib.reload(models)

# Re-bind: import again the functions/classes/aliases used in the notebook,
# so the names point at the objects defined by the freshly reloaded modules.
from FederatedLearningProject.data.cifar100_loader import (
    get_cifar100, create_iid_splits, create_non_iid_splits
)

import FederatedLearningProject.checkpoints.checkpointing as checkpointing

from FederatedLearningProject.training.FedMETA import (
    aggregate_with_task_arithmetic,
    aggregate_masks,
    distribution_function,
    train_server
)

from FederatedLearningProject.training.model_editing import (
    plot_all_layers_mask_sparsity
)

from FederatedLearningProject.experiments import models


In [13]:
# Authenticate with Weights & Biases; prompts for an API key when none is stored yet.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdepetrofabio[0m ([33mdepetrofabio-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
# Identifiers for this experiment and the Weights & Biases run that tracks it.
project_name = "FederatedProjectPROVA_PARTE4"
model_name = "dino_vits16_J4"
run_name = f"{model_name}_rounds_prova"
device = "cuda"

# Start the W&B run and record the main hyperparameters alongside it.
# reinit=True lets this cell be executed again in the same kernel to open a new run.
wandb.init(
    name=run_name,
    project=project_name,
    reinit=True,
    config={
        "model": model_name,
        "num_rounds": 100,   # number of federated rounds
        "batch_size": 128,   # batch size recorded for this run
    },
)

# Keep a handle on the run configuration held by W&B.
config = wandb.config

In [6]:
# List the serialized artefacts available in the FederatedLearningProject/masks folder.
print(os.listdir('FederatedLearningProject/masks'))

# Restore the pre-built dataset splits from disk.
# NOTE(review): weights_only=False unpickles arbitrary Python objects, so these
# files must come from a trusted source.
train_set = torch.load('FederatedLearningProject/masks/train_set.pth', weights_only=False)
val_set = torch.load('FederatedLearningProject/masks/val_set.pth', weights_only=False)
test_set = torch.load('FederatedLearningProject/masks/test_set.pth', weights_only=False)

['train_set.pth', 'val_set.pth', 'test_set.pth', 'client_masks_iid.pth', 'client_masks_non_iid_1.pth']


In [8]:
# Build the base model from the project's LinearFlexibleDino class.
# num_layers_to_freeze=12 presumably freezes 12 backbone blocks — confirm in experiments.models.
o_model = models.LinearFlexibleDino(num_layers_to_freeze=12)
# Load the previously saved client masks for the non-IID setting
# (presumably one mask per client — verify against the code that wrote the file).
local_masks = torch.load('FederatedLearningProject/masks/client_masks_non_iid_1.pth')

Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


In [20]:
# Work on a deep copy so the freshly constructed o_model is left untouched.
model = copy.deepcopy(o_model)
# Load the saved checkpoint of the federated non-IID(1) run with 4 local steps (per the file name).
model_checkpoint = torch.load("FederatedLearningProject/checkpoints/FL_NON_IID(1)/dino_vits_16_non_iid(1)_local_steps_4_checkpoint.pth")

In [21]:
# Restore the checkpointed weights (stored under the 'model_state_dict' key) into the model.
model.load_state_dict(model_checkpoint['model_state_dict'])
# Print the model's debug report: device, requires_grad flag and train/eval mode per parameter.
model.debug()


--- Debugging Model ---
Model is primarily on device: cuda:0
Model overall mode: Train

Parameter Details (Name | Device | Requires Grad? | Inferred Block | Module Mode):
- backbone.cls_token                                 | cuda:0     | False           | N/A             | Train
- backbone.pos_embed                                 | cuda:0     | False           | N/A             | Train
- backbone.patch_embed.proj.weight                   | cuda:0     | False           | N/A             | Train
- backbone.patch_embed.proj.bias                     | cuda:0     | False           | N/A             | Train
- backbone.blocks.0.norm1.weight                     | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.norm1.bias                       | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.attn.qkv.weight                  | cuda:0     | False           | Block 0         | Eval
- backbone.blocks.0.attn.qkv.bias                    | cuda:0

In [22]:
# Unfreeze the model (argument 12 is presumably the number of backbone blocks to
# unfreeze — confirm in experiments.models).
model.unfreeze(12)
# Print the debug report again to check which parameters now require gradients.
model.debug()


--- Debugging Model ---
Model is primarily on device: cuda:0
Model overall mode: Train

Parameter Details (Name | Device | Requires Grad? | Inferred Block | Module Mode):
- backbone.cls_token                                 | cuda:0     | True            | N/A             | Train
- backbone.pos_embed                                 | cuda:0     | True            | N/A             | Train
- backbone.patch_embed.proj.weight                   | cuda:0     | True            | N/A             | Train
- backbone.patch_embed.proj.bias                     | cuda:0     | True            | N/A             | Train
- backbone.blocks.0.norm1.weight                     | cuda:0     | True            | Block 0         | Train
- backbone.blocks.0.norm1.bias                       | cuda:0     | True            | Block 0         | Train
- backbone.blocks.0.attn.qkv.weight                  | cuda:0     | True            | Block 0         | Train
- backbone.blocks.0.attn.qkv.bias                    | cud

In [9]:
# Combine the loaded client masks into a single mask with FedMETA.aggregate_masks
# (the aggregation rule is defined in that module).
final_mask = aggregate_masks(local_masks)

In [10]:
# Derive the masks handed to the 100 clients from the aggregated mask
# (presumably one mask per client — confirm in FedMETA.distribution_function).
# The call also prints total / masked / unmasked parameter counts.
partition_masks = distribution_function(final_mask, number_clients=100)

Total parameters: 21293568
Masked parameters (zeros): 19902014
Unmasked parameters (ones): 1391554


In [11]:
# Split the training set into 100 non-IID client datasets, each seeing a single class.
client_dataset = create_non_iid_splits(train_set, num_clients=100, classes_per_client=1)

Dataset has 40000 samples across 100 classes.
Creating 100 non IID splits with 1 classes each.


Each of the 100 classes split into 1 shards.

Checking unique classes that each client sees:
Client 0 has samples from classes: {np.int64(0)}
Total: 1
Client 1 has samples from classes: {np.int64(1)}
Total: 1
Client 2 has samples from classes: {np.int64(2)}
Total: 1
Client 3 has samples from classes: {np.int64(3)}
Total: 1
Client 4 has samples from classes: {np.int64(4)}
Total: 1
Client 5 has samples from classes: {np.int64(5)}
Total: 1
Client 6 has samples from classes: {np.int64(6)}
Total: 1
Client 7 has samples from classes: {np.int64(7)}
Total: 1
Client 8 has samples from classes: {np.int64(8)}
Total: 1
Client 9 has samples from classes: {np.int64(9)}
Total: 1
Client 10 has samples from classes: {np.int64(10)}
Total: 1
Client 11 has samples from classes: {np.int64(11)}
Total: 1
Client 12 has samples from classes: {np.int64(12)}
Total: 1
Client 13 has samples from classes: {np.int64(13)}

In [24]:
# Optimizer hyperparameters forwarded to train_server
# (presumably used to build each client's optimizer — confirm in FedMETA).
optimizer_config = {
    'lr': 0.01,
    'momentum': 0.9,
    'weight_decay': 0.0001
}

# Move the model to the GPU via the model's own helper.
model.to_cuda()

checkpoint_path = 'FederatedLearningProject/checkpoints/'
# The validation loader is only used for evaluation, so it is not shuffled:
# shuffling brings no benefit there, makes the evaluation batch order differ
# between rounds, and draws from the global torch RNG on every pass.
val_loader = DataLoader(val_set, batch_size=128, shuffle=False)
criterion = nn.CrossEntropyLoss()

# Launch federated training: 100 rounds, 10% of the clients sampled per round.
# NOTE(review): the recorded log of this cell shows identical evaluation loss and
# validation accuracy at rounds 5, 10 and 15 while the client metrics change.
# Worth verifying that train_server evaluates the updated global model.
train_server(
    model,
    num_rounds=100,
    client_dataset=client_dataset,
    client_masks=partition_masks,
    optimizer_config=optimizer_config,
    device='cuda',
    frac=0.1,
    batch_size=128,
    val_loader=val_loader,
    checkpoint_path=checkpoint_path,
    criterion=criterion,
)

moving model to cuda

Round 5/100
Selected Clients: [71 90 80 72  5  1 45 95 85 39]
Avg Client Loss: 5.2103 | Avg Client Accuracy: 22.59%
Evaluation Loss: 4.4985 | Val Accuracy: 32.76%
--------------------------------------------------

Round 10/100
Selected Clients: [ 3 97 27 89  5 45  7 15 36 46]
Avg Client Loss: 7.4155 | Avg Client Accuracy: 9.65%
Evaluation Loss: 4.4985 | Val Accuracy: 32.76%
--------------------------------------------------

Round 15/100
Selected Clients: [74 72 24 43 88 82 97  7 92 19]
Avg Client Loss: 5.7687 | Avg Client Accuracy: 28.48%
Evaluation Loss: 4.4985 | Val Accuracy: 32.76%
--------------------------------------------------


KeyboardInterrupt: 