In [6]:
# --- Standard library ---
import os       # needed below to switch the working directory
import shutil

# --- Third-party ---
import numpy as np
import torch
import torch.optim as optim
import torchvision
import wandb
from torch.utils.data import DataLoader, Subset, random_split
from torchvision import transforms
from torchvision.datasets import CIFAR100

# --- Colab / Google Drive ---
from google.colab import drive

# Mount Google Drive, then make the project folder the working directory.
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/FL')

# --- Project-local modules ---
# Kept after the chdir above: presumably the FederatedLearningProject package is
# resolved relative to the working directory — confirm before moving these lines.
#import git_manager
import FederatedLearningProject.data.dataset_utils as dataset_utils
import FederatedLearningProject.checkpoints.checkpointing as checkpointing



Mounted at /content/drive


In [None]:
# Log in to Weights & Biases. Per the original note this asks for an API key;
# the saved output shows it reports the current user when already logged in.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnicco-to[0m ([33mnicco-to-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
# Load the CIFAR-100 splits: train_set, val_set, test_set.
# Per the original note, the transforms are applied inside the dataset_utils module before the datasets are returned.

valid_split_perc = 0.2    # fraction of the 50000 training images held out for validation
train_set, val_set, test_set = dataset_utils.get_datasets(valid_split_perc)

Number of images in the Training Set: 40000
Number of images in the Validation Set: 10000
Number of images in the Test Set: 10000

✅ Datasets loaded successfully: training, validation, and test sets are ready.


In [8]:
# Build the DataLoaders for the training, validation and test sets.
#
# batch_size is a hyperparameter (64, 128, ...), and so is num_workers
# (2 or 4 is the suggested value on Colab).
#
# NOTE(review): the train and validation loaders wrap `.dataset`, not the split
# objects themselves. If train_set / val_set are torch.utils.data.Subset
# instances, `.dataset` is the full underlying dataset rather than the split —
# confirm against dataset_utils.get_datasets.
loader_kwargs = dict(batch_size=64, num_workers=2)

train_loader = DataLoader(train_set.dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_set.dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)

In [9]:
# --- MODEL LOADING AND SETUP ---

# Pick the compute device: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fetch the pre-trained DINO ViT-S/16 model from PyTorch Hub.
# NOTE(review): the printed architecture shows `(head): Identity()` and no
# 100-way classifier layer is visible — confirm this is the intended output
# layer for CIFAR-100 training.
model = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')

# Move the model to the selected device. Left as the cell's last expression so
# the notebook displays the module structure.
model.to(device)

Downloading: "https://github.com/facebookresearch/dino/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dino_deitsmall16_pretrain.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 237MB/s]


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
  (head): Identity()
)

In [10]:
# Show which device was selected for the model.
print(device)

cuda


In [11]:
# --- OPTIMIZER AND LOSS FUNCTION ---
# The active optimizer is plain SGD over all model parameters (lr=1e-4).
# The original note records Adam as the best setting found so far; that variant
# is kept commented out below. momentum=0.9 and weight_decay=5e-4 are noted as
# the recommended SGD settings but are not enabled here.
# optimizer = optim.Adam(model.parameters(), lr=1e-4)
optimizer = optim.SGD(params=model.parameters(), lr=1e-4)

# Cross-entropy loss for multi-class classification.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Start a W&B run. wandb.init() sets up tracking of hyperparameters/metrics so
# that performance can be recorded later with wandb.log().

model_name = "dino_vits16"
project_name = "FederatedProject"
run_name = model_name + "_run"

# Hyperparameters recorded with the run; batch size and learning rate are read
# back from the objects created in earlier cells.
run_config = dict(
    model=model_name,
    epochs=50,
    batch_size=train_loader.batch_size,
    learning_rate=optimizer.param_groups[0]['lr'],
    architecture=type(model).__name__,
)

# INITIALIZE W&B
wandb.init(project=project_name, name=run_name, config=run_config)

# Keep a handle on the run configuration for later cells.
config = wandb.config


In [None]:
# CHECKPOINT LOCATION
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)   # create the folder if it is missing

# The checkpoint file name is fixed in advance from the model name and lives
# inside the directory above.
checkpoint_filename = f"{model_name}_checkpoint.pth"
checkpoint_path = os.path.join(checkpoint_dir, checkpoint_filename)

In [None]:
# RECOVER CHECKPOINT
# load_checkpoint returns the epoch to use as the training start and the
# checkpoint payload. NOTE(review): it also receives `model` and `optimizer`,
# so it may already restore them — confirm in the checkpointing module.
epoch, model_data = checkpointing.load_checkpoint(model, optimizer, checkpoint_dir)

if not model_data:
    # Nothing to restore: say so explicitly instead of failing silently.
    print()
    print("No checkpoint data available: model and optimizer keep their current state.")
else:
    print()
    print(f"The 'model_data' dictionary contains the following keys: {list(model_data.keys())}")
    try:
        model.load_state_dict(model_data["model_state_dict"])
        optimizer.load_state_dict(model_data["optimizer_state_dict"])
    except (KeyError, RuntimeError, ValueError) as err:
        # Best effort, as before, but the failure is reported: otherwise training
        # would go on from the recovered epoch with weights that were never
        # restored. Only load-related errors are caught, so KeyboardInterrupt
        # and unrelated bugs are no longer swallowed.
        print(f"WARNING: could not restore the checkpoint state ({type(err).__name__}: {err})")



 Checkpoint caricato da /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/dino_vits16_checkpoint.pth, riprendo da epoca 21.

The 'model_data' dictionary contains the following keys: ['epoch', 'model_state_dict', 'optimizer_state_dict', 'train_loss', 'val_loss']


In [12]:
## Summary of the current run configuration ##
# NOTE(review): this cell reads model_name, config, checkpoint_dir,
# checkpoint_path and epoch, which are defined in the W&B and checkpoint cells;
# the saved output shows a NameError for model_name when it is run before them.

print(f"Model: {model_name}")
print(f"Train set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Batch size: {train_loader.batch_size}")
print(f"Number of epochs: {config.epochs}")
print("DataLoader: ")
print(f"Learning rate: {optimizer.param_groups[0]['lr']}")
print(f"Architecture: {model.__class__.__name__}")
print(f"Device: {device}")
print(f"Optimizer: {optimizer}")
print(f"Loss function: {loss_fn}")
print(f"Checkpoint directory: {checkpoint_dir}")
print(f"Checkpoint path: {checkpoint_path}")
print(f"Current epoch: {epoch}")
print()

# Same report for both loaders: number of batches, batch size, and the tensor
# shapes of the first batch (only one batch is pulled from each loader).
loader_reports = (
    ("Train Loader Information:", train_loader),
    ("\nValidation Loader Information:", val_loader),
)
for header, loader in loader_reports:
    print(header)
    print(f"  Number of batches: {len(loader)}")
    print(f"  Batch size: {loader.batch_size}")
    for images, labels in loader:
        print(f"  Dimension of 1 batch (images): {images.shape}")
        print(f"  Dimension of 1 batch (labels): {labels.shape}")
        break  # one batch is enough to read the shapes
    print()

# CUDA availability report
print("CUDA AVAIABILITY:")
if not torch.cuda.is_available():
    print("CUDA is not available. Using CPU.")
else:
    print("CUDA is available. Using GPU.")
    print("Number of GPUs:", torch.cuda.device_count())
    gpu_index = torch.cuda.current_device()
    print("Current GPU:", gpu_index)
    print("GPU Name:", torch.cuda.get_device_name(gpu_index))

# Model architecture summary
print("\nMODEL ARCHITECTURE:")
print(model)



NameError: name 'model_name' is not defined

In [None]:
# Resume from the epoch recovered with the checkpoint, or start at 1 when none was recovered.
start_epoch = epoch if epoch is not None else 1
num_epochs = wandb.config.epochs    # number of training epochs configured in the W&B run
checkpoint_interval = 10            # save a checkpoint every this many epochs


def _run_epoch(loader, training):
    """Run one full pass over `loader` and return (mean loss, accuracy in percent).

    Uses the notebook-level `model`, `optimizer`, `loss_fn` and `device`.

    Args:
        loader: iterable yielding (images, labels) batches.
        training: when True the model is put in train mode and the optimizer
            steps after every batch; when False the model is put in eval mode
            and gradient computation is disabled.

    Returns:
        Tuple (average loss per sample, accuracy as a percentage).

    Raises:
        ValueError: if the loader yields no samples.
    """
    model.train(training)   # train(False) is the same as eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for images, labels in loader:
            # Move the batch to the GPU when one is available.
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = loss_fn(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()    # backpropagation
                optimizer.step()   # update the weights

            # Weight the batch loss by the batch size so that a smaller last
            # batch is averaged correctly.
            n_batch = labels.size(0)
            loss_sum += loss.item() * n_batch
            # Keep only the arg-max indices: they are compared with the labels.
            _, predicted = torch.max(outputs, 1)
            n_seen += n_batch
            n_correct += (predicted == labels).sum().item()

    if n_seen == 0:
        raise ValueError("The DataLoader produced no samples; cannot compute loss/accuracy.")

    return loss_sum / n_seen, 100 * n_correct / n_seen


def _save_checkpoint(path, state):
    """Write `state` to `path` through a temporary file and an atomic rename.

    An interruption during the write then leaves the previous checkpoint intact
    instead of a truncated file.
    """
    tmp_path = f"{path}.tmp"
    torch.save(state, tmp_path)
    os.replace(tmp_path, path)


# TRAINING
for epoch in range(start_epoch, num_epochs + 1):
    avg_train_loss, train_accuracy = _run_epoch(train_loader, training=True)

    # VALIDATION
    avg_val_loss, val_accuracy = _run_epoch(val_loader, training=False)

    # LOG TO W&B
    wandb.log({
        "train_loss": avg_train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": avg_val_loss,
        "val_accuracy": val_accuracy,
        "epoch": epoch
    }, step=epoch)

    print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f},  Train Accuracy: {train_accuracy:.2f}%, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

    # CHECKPOINT SAVE -> includes the optimizer state so training can be resumed.
    # The last epoch is always saved, even when it is not a multiple of the interval.
    if epoch % checkpoint_interval == 0 or epoch == num_epochs:
        _save_checkpoint(checkpoint_path, {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),  # all learnable parameters (weights and biases)
            "optimizer_state_dict": optimizer.state_dict(),
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
        })
        print(f" Checkpoint salvato su Drive: {checkpoint_path}")

        # (Optional) LOG ARTIFACT TO W&B -> still to be decided; if checkpoints are
        # very large (e.g. >1GB) it is better to keep them local. Saving the best
        # model could make sense, but with only the recommended model it can be skipped for now.

        # artifact = wandb.Artifact(f"{model_name}_checkpoint_ep{epoch}", type="model")
        # artifact.add_file(checkpoint_path)
        # wandb.log_artifact(artifact)

wandb.finish()

[Epoch 21] Train Loss: 1.2003,  Train Accuracy: 65.24%, Val Loss: 1.5816, Val Accuracy: 56.32%
[Epoch 22] Train Loss: 1.1766,  Train Accuracy: 65.65%, Val Loss: 1.5118, Val Accuracy: 57.83%
[Epoch 23] Train Loss: 1.1551,  Train Accuracy: 66.42%, Val Loss: 1.5372, Val Accuracy: 58.05%
[Epoch 24] Train Loss: 1.1258,  Train Accuracy: 67.06%, Val Loss: 1.5267, Val Accuracy: 58.06%
[Epoch 25] Train Loss: 1.1004,  Train Accuracy: 67.69%, Val Loss: 1.5259, Val Accuracy: 58.27%
[Epoch 26] Train Loss: 1.0813,  Train Accuracy: 68.37%, Val Loss: 1.5273, Val Accuracy: 58.16%
[Epoch 27] Train Loss: 1.0487,  Train Accuracy: 69.16%, Val Loss: 1.5250, Val Accuracy: 58.28%
[Epoch 28] Train Loss: 1.0257,  Train Accuracy: 69.54%, Val Loss: 1.5097, Val Accuracy: 58.66%
[Epoch 29] Train Loss: 1.0047,  Train Accuracy: 70.47%, Val Loss: 1.5137, Val Accuracy: 58.67%
[Epoch 30] Train Loss: 0.9903,  Train Accuracy: 70.76%, Val Loss: 1.5267, Val Accuracy: 58.78%
 Checkpoint salvato su Drive: /content/drive/MyDri

KeyboardInterrupt: 