In [1]:
# Standard library
import os
import shutil

# Third-party
import numpy as np
import wandb
from google.colab import drive

# Mount Google Drive (forcing a fresh mount), then make the project folder on Drive the
# working directory -- presumably so the local FederatedLearningProject package imported
# in the next cell can be resolved from there.
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/FL')

Mounted at /content/drive


In [2]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

import torch.nn as nn
import torchvision
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import Subset, DataLoader, random_split

from FederatedLearningProject.data.cifar100_loader import get_cifar100
import FederatedLearningProject.checkpoints.checkpointing as checkpointing
from FederatedLearningProject.training.centralized_training import train_and_validate

In [3]:
# Authenticate with Weights & Biases; an API key is requested interactively when no
# stored credentials are found.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnicco-to[0m ([33mnicco-to-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Load the CIFAR-100 splits: train_set, val_set, test_set.
# The transforms are applied inside the loader module before the datasets are returned.

# Fraction of the 50000 training images that is held out for validation.
valid_split_perc = 0.2
train_set, val_set, test_set = get_cifar100(valid_split_perc)

Number of images in Training Set:   40000
Number of images in Validation Set: 10000
Number of images in Test Set:       10000
✅ Datasets loaded successfully


In [5]:
# Create DataLoaders for the training, validation, and test sets.

# Both values are hyperparameters: typical batch sizes are 64, 128, ...;
# 2 or 4 workers are the suggested setting on Colab.
BATCH_SIZE = 128
NUM_WORKERS = 2

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)


### Possible Models
|                        | **Simple linear head**                               | **MLP head w/ Dropout**                                                      |
| :--------------------- | :--------------------------------------------------- | :--------------------------------------------------------------------------- |
| **Definition**         | `nn.Linear(384 → 100)`                               | `Dropout → Linear(384 → 256) → ReLU → Dropout → Linear(256 → 100)`           |
| **# trainable params** | 384×100 + 100 ≈ **38 500**                           | 384×256+256 + 256×100+100 ≈ **124 000**                                      |
| **Regularization**     | none                                                 | dropout on both layers                                                       |
| **Expressive power**   | low  – just a single hyperplane on the CLS embedding | higher – small nonlinear bottleneck can learn more complex features in heads |
| **Compute / memory**   | minimal                                              | \~3× more weights, a bit more forward/backward cost                          |

---

**Appunto sui layer di testa:**

1. **`self.classifier`**

   * **Cosa contiene?** Un singolo `nn.Linear(embed_dim → num_classes)`.
   * **Quando usarlo?** Se vuoi un *linear probe* puro: un solo layer che prende il CLS token e mappa direttamente alle classi.
   * **Pro:** estremamente leggero (∼38 K parametri), veloce da addestrare e da inferire.
   * **Contro:** capacità espressiva minima (è solo un’iper‐superficie lineare sullo spazio degli embedding).

2. **`self.head`**

   * **Cosa contiene?** Una piccola sequenza (`nn.Sequential`) di layer:

     * Dropout
     * Linear (embed\_dim → hidden\_dim)
     * ReLU
     * Dropout
     * Linear (hidden\_dim → num\_classes)
   * **Quando usarlo?** Se vuoi dare al tuo “probe” un po’ più di capacità espressiva, trasformando non-linearmente il CLS prima della classificazione.
   * **Pro:** maggiore capacità di apprendere rappresentazioni complesse nella testa, un minimo di regolarizzazione via dropout.
   * **Contro:** più pesante (∼3× parametri in più rispetto al solo `classifier`), leggermente più lento da addestrare e inferire.

---

### Perché una piuttosto che l’altra?

* **Vincoli di risorse** (GPU/RAM, tempo d’addestramento):

  * Se sei sotto forte pressione computazionale o vuoi risultati rapidi, opta per `self.classifier`.
* **Prestazioni** (accuratezza su dataset di piccole/medie dimensioni come CIFAR-100):

  * Se noti che il linear probe raggiunge un plateau basso, un piccolo MLP (`self.head`) può guadagnare qualche punto percentuale in più.
* **Semplicità vs flessibilità**:

  * Con una sola `classifier` hai un codice più pulito e diretto.
  * Con `head` puoi sperimentare — cambiare `hidden_dim`, aggiungere altro dropout, batchnorm o ulteriori layer.

In definitiva, **il nome** (`classifier` vs `head`) è arbitrario: serve a rendere più chiaro nel codice di che “peso” stiamo parlando. Se hai un solo layer, chiamalo `classifier`; se invece è un blocco più articolato, chiamalo `head` o `projection_head`, per tener separata la parte “feature extractor” (backbone) dalla parte “feature consumer” (testa di classificazione). Nota: nel codice di questo notebook la testa MLP è comunque assegnata a `self.classifier`.


In [None]:
# Load the pretrained DINO ViT-S/16 backbone from torch.hub and freeze its input embeddings.
import torch

import torch.nn as nn
backbone = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')

# Freeze the patch embedding (the projection that turns image patches into token vectors).
for p in backbone.patch_embed.parameters():
    p.requires_grad = False

# Freeze the positional embedding and the CLS token as well.
backbone.pos_embed.requires_grad = False
backbone.cls_token.requires_grad = False

# backbone.pos_drop is deliberately left untouched: it is a plain Dropout layer
# (the model printout shows Dropout(p=0.0)), and a standard nn.Dropout has no
# learnable parameters, so there is nothing to freeze. Its behavior is governed
# only by train()/eval() mode. If pos_drop were ever replaced by a custom module
# with learnable parameters, those would have to be frozen here too.

# Define the classifier head with optional dropout/MLP
class DinoClassifier(nn.Module):
    """Backbone feature extractor followed by a small MLP classification head.

    Args:
        backbone: Module exposing ``embed_dim`` and ``get_intermediate_layers``
            (``embed_dim`` is 384 for ViT-S/16).
        num_classes: Number of output logits.
        hidden_dim: Width of the head's hidden layer.
        drop: Dropout probability applied after the hidden activation.
    """

    def __init__(self, backbone, num_classes=100, hidden_dim=256, drop=0.5):
        super().__init__()
        self.backbone = backbone
        in_features = backbone.embed_dim
        # Head layout: Linear -> ReLU -> Dropout -> Linear.
        # No dropout is applied directly on the input features.
        head_layers = [
            nn.Linear(in_features, hidden_dim),   # e.g. 384 -> 256
            nn.ReLU(inplace=True),                # overwrites the Linear output in place
            nn.Dropout(drop),
            nn.Linear(hidden_dim, num_classes),   # e.g. 256 -> 100
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits computed from the token at index 0 of the backbone output.

        The original code treats that token as the CLS token.
        """
        tokens = self.backbone.get_intermediate_layers(x, n=1)[0]
        first_token = tokens[:, 0]
        return self.classifier(first_token)

# Instantiate the model and move it to the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DinoClassifier(backbone, num_classes=100).to(device)

# Number of leading transformer blocks kept frozen; the remaining blocks are fine-tuned.
NUM_FROZEN_BLOCKS = 9

# Freeze the first NUM_FROZEN_BLOCKS blocks (eval mode, no gradients) and make sure
# every later block is trainable (train mode, gradients enabled).
for block_idx, block in enumerate(model.backbone.blocks):
    trainable = block_idx >= NUM_FROZEN_BLOCKS
    block.train(trainable)  # train(False) is the same as eval()
    for param in block.parameters():
        param.requires_grad = trainable

# NOTE(review): the per-block train/eval flags set above are overwritten by any later
# model.train() / model.eval() call (nn.Module.train() recurses into every submodule),
# including model.backbone.train(). What actually keeps the blocks frozen is
# requires_grad=False; the model printout shows the backbone only has Dropout(p=0.0)
# and LayerNorm layers, so the mode flag should not change their outputs.
model.classifier.train()


Downloading: "https://github.com/facebookresearch/dino/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dino_deitsmall16_pretrain.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 213MB/s]


Sequential(
  (0): Linear(in_features=384, out_features=256, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=256, out_features=100, bias=True)
)

In [7]:
def debug_model(model: nn.Module, model_name: str = "Model"):
    """
    Print a per-parameter diagnostic report for a PyTorch model.

    The report contains:
    - the device of the model's first parameter (used as the model's "primary" device);
    - one row per named parameter: full name, device, ``requires_grad`` flag and,
      when the name contains ``blocks.<digits>``, the corresponding block index;
    - whether the top-level module is in training or evaluation mode.

    Nothing is returned. For a model without parameters only the header and a
    "has no parameters" line are printed.
    """
    print(f"\n--- Debugging {model_name} ---")

    # The first parameter stands in for the whole model when reporting the device.
    first_param = next(model.parameters(), None)
    if first_param is None:
        print(f"{model_name} has no parameters.")
        return
    print(f"{model_name} is primarily on device: {first_param.device}")

    print("\nParameter Details (Name | Device | Requires Grad? | Inferred Block):")
    for param_name, tensor in model.named_parameters():
        block_label = "N/A"
        if "blocks." in param_name:
            # Token right after "blocks.", e.g. "0" in "blocks.0.attn.qkv.weight".
            index_token = param_name.split("blocks.")[1].split(".")[0]
            if index_token.isdigit():
                block_label = f"Block {index_token}"

        device_text = str(tensor.device)
        grad_text = str(tensor.requires_grad)
        print(f"- {param_name:<50} | {device_text:<10} | {grad_text:<15} | {block_label}")

    # Mode of the top-level module only; submodules may have been switched individually.
    mode = "training" if model.training else "evaluation"
    print(f"{model_name} is in {mode} mode.")
    print(f"--- End Debugging {model_name} ---\n")


In [8]:
# Print the per-parameter report to check device placement and which weights are frozen.
debug_model(model=model)


--- Debugging Model ---
Model is primarily on device: cuda:0

Parameter Details (Name | Device | Requires Grad? | Inferred Block):
- backbone.cls_token                                 | cuda:0     | False           | N/A
- backbone.pos_embed                                 | cuda:0     | False           | N/A
- backbone.patch_embed.proj.weight                   | cuda:0     | False           | N/A
- backbone.patch_embed.proj.bias                     | cuda:0     | False           | N/A
- backbone.blocks.0.norm1.weight                     | cuda:0     | False           | Block 0
- backbone.blocks.0.norm1.bias                       | cuda:0     | False           | Block 0
- backbone.blocks.0.attn.qkv.weight                  | cuda:0     | False           | Block 0
- backbone.blocks.0.attn.qkv.bias                    | cuda:0     | False           | Block 0
- backbone.blocks.0.attn.proj.weight                 | cuda:0     | False           | Block 0
- backbone.blocks.0.attn.proj.bias    

In [10]:
# Show the full module tree of the assembled model (backbone + classifier head).
print(model)

DinoClassifier(
  (backbone): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (norm): L

In [11]:
# --- OPTIMIZER, LR SCHEDULER AND LOSS FUNCTION ---
learning_rate = 1e-4
momentum = 0.9
weight_decay = 5e-5
epochs = 50

# Alternative kept for reference only (not executed): AdamW with differential learning rates.
#   optimizer = torch.optim.AdamW([
#       {'params': model.backbone.blocks[9:].parameters(), 'lr': 1e-5},  # adjust block indices if needed
#       {'params': model.classifier.parameters(), 'lr': 1e-4},
#   ], weight_decay=0.05)
# backbone.norm could be added as a further low-LR group if it is fine-tuned too.

# Only parameters that still require gradients are handed to the optimizer;
# frozen backbone weights are left out.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    trainable_params,
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)
# Cosine annealing of the learning rate, with T_max set to the number of epochs.
scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
criterion = nn.CrossEntropyLoss()

In [12]:
# wandb.init() starts a tracked run and records the hyperparameters below;
# metrics are recorded later with wandb.log().

model_name = "dino_vits16"
project_name = "FederatedProject"
run_name = f"{model_name}_run"

# INITIALIZE W&B
wandb.init(
    project=project_name,
    name=run_name,
    config={
        "model": model_name,  # single source of truth for the model identifier
        "epochs": epochs,
        "batch_size": train_loader.batch_size,
        "learning_rate": learning_rate,  # initial LR handed to the optimizer
        "weight_decay": weight_decay,
        "momentum": momentum,
        "architecture": model.__class__.__name__,
    }
)

# Keep a handle on the run configuration; later cells read values such as config.epochs.
config = wandb.config


In [None]:
# Checkpoint directory on the mounted Google Drive; passed to load_checkpoint when resuming.
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"

In [None]:
# RECOVER CHECKPOINT
# Per the original author's notes, load_checkpoint already applies the saved state in place:
#     model.load_state_dict(checkpoint_data["model_state_dict"])
#     scheduler.load_state_dict(checkpoint_data["scheduler_state_dict"])
#     optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"])
# and it also returns the loaded checkpoint data in case it is needed.
# NOTE(review): `scheduler` is not passed in this call -- confirm in the checkpointing
# module whether (and how) the scheduler state is actually restored.

start_epoch, model_data = checkpointing.load_checkpoint(model, optimizer, checkpoint_dir)


 Nessun checkpoint trovato, inizio da epoca 1.



In [18]:
# --- TRAINING LOOP ---
# Run training + validation, resuming from `start_epoch` as returned by the checkpoint loader.
# NOTE(review): `checkpoint_path` is not assigned in any visible cell (only `checkpoint_dir`
# is), so this call raises NameError after Restart & Run All. Define it explicitly before
# this cell, using whatever train_and_validate expects for that argument.
train_and_validate(
    start_epoch,
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    checkpoint_path=checkpoint_path,
    num_epochs=epochs,  # same value used for the scheduler's T_max and the W&B config
    checkpoint_interval=10,
)

[Epoch 1] Train Loss: 4.6615, Train Accuracy: 1.84%, Val Loss: 4.4592, Val Accuracy: 3.93%
[Epoch 2] Train Loss: 3.9377, Train Accuracy: 12.24%, Val Loss: 3.3714, Val Accuracy: 22.09%
[Epoch 3] Train Loss: 3.0359, Train Accuracy: 27.64%, Val Loss: 2.8054, Val Accuracy: 31.85%
[Epoch 4] Train Loss: 2.6574, Train Accuracy: 33.95%, Val Loss: 2.5774, Val Accuracy: 35.54%
[Epoch 5] Train Loss: 2.4771, Train Accuracy: 37.35%, Val Loss: 2.4359, Val Accuracy: 38.24%
[Epoch 6] Train Loss: 2.3522, Train Accuracy: 39.91%, Val Loss: 2.3486, Val Accuracy: 40.09%
[Epoch 7] Train Loss: 2.2645, Train Accuracy: 41.39%, Val Loss: 2.2770, Val Accuracy: 41.36%
[Epoch 8] Train Loss: 2.1905, Train Accuracy: 42.92%, Val Loss: 2.2269, Val Accuracy: 42.97%
[Epoch 9] Train Loss: 2.1295, Train Accuracy: 44.65%, Val Loss: 2.1706, Val Accuracy: 43.40%
[Epoch 10] Train Loss: 2.0821, Train Accuracy: 45.40%, Val Loss: 2.1414, Val Accuracy: 44.14%
Checkpoint salvato su: /content/drive/MyDrive/FL/FederatedLearningProje

KeyboardInterrupt: 

In [None]:
## Display some information about the current run ##

print("Model:", model_name)
print("Train set size:", len(train_set))
print("Validation set size:", len(val_set))
print("Batch size:", train_loader.batch_size)
print("Number of epochs:", config.epochs)
print("DataLoader: ")
print("Learning rate:", optimizer.param_groups[0]['lr'])
print("Architecture:", model.__class__.__name__)
print("Device:", device)
print("Optimizer:", optimizer)
print("Loss function:", criterion)
print("Checkpoint directory:", checkpoint_dir)
# `checkpoint_path` and `epoch` are not assigned in any visible cell, so a plain reference
# raises NameError on a fresh kernel. Look them up defensively so this purely informational
# cell still runs; their values are printed unchanged when they do exist.
print("Checkpoint path:", globals().get("checkpoint_path", "<not defined>"))
print("Current epoch:", globals().get("epoch", "<not defined>"))
print()

# Same summary for both loaders: number of batches, batch size, and the tensor
# shapes of one batch.
for title, loader in (
    ("Train Loader Information:", train_loader),
    ("\nValidation Loader Information:", val_loader),
):
    print(title)
    print(f"  Number of batches: {len(loader)}")
    print(f"  Batch size: {loader.batch_size}")
    # Pull a single batch just to report its dimensions.
    for images, labels in loader:
        print(f"  Dimension of 1 batch (images): {images.shape}")
        print(f"  Dimension of 1 batch (labels): {labels.shape}")
        break  # one batch is enough
    print()

# Check for CUDA availability
print("CUDA AVAIABILITY:")
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    print("Number of GPUs:", torch.cuda.device_count())
    print("Current GPU:", torch.cuda.current_device())
    print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("CUDA is not available. Using CPU.")

# Print model architecture summary
print("\nMODEL ARCHITECTURE:")
print(model)
