In [1]:
import torch
from torchvision.datasets import CIFAR100
from torch.utils.data import Subset, DataLoader, random_split
import numpy as np
import wandb
# import torchvision
from torchvision import transforms
import torch.optim as optim

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import shutil
import os                              # Import the 'os' module for changing directories
os.chdir('/content/drive/MyDrive/FL')  # Change the directory

import git_manager
import FederatedLearningProject.data.cifar100_loader as loader
import FederatedLearningProject.checkpoints.checkpointing as checkpointing



Mounted at /content/drive


In [2]:
wandb.login() # Ask for your API key for logging in to the wandb library.

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnicco-to[0m ([33mnicco-to-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
dataset = loader.get_cifar100()
train_set, val_set = loader.split_train_val(dataset)

# Create DataLoader
# DataLoader is a class from PyTorch's torch.utils.data module. It helps iterate through your dataset during training.
# batch_size=128: This specifies that during training, the model will process data in batches of 128 samples at a time.
# shuffle=True: It randomly shuffles the data within the train_set before each epoch (a full pass through the dataset). Shuffling helps prevent the model from learning patterns based on the order of data, leading to better generalization.
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
val_loader = DataLoader(val_set, batch_size=128, shuffle=False)


In [None]:
## Useful only to train a small fragment of dataset!

# subset_indices = np.random.choice(len(train_set), size=10000, replace=False)
# subset_train_dataset = Subset(train_set, subset_indices)
# train_set, val_set = loader.split_train_val(subset_train_dataset)
# train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
# val_loader = DataLoader(val_set, batch_size=128, shuffle=False)


In [4]:
# --- MODEL LOADING AND SETUP ---

# Load the pre-trained DINO ViT-S/16 model from PyTorch Hub
model = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')

# Device selection: Use GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Move the model to the selected device

Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
  (head): Identity()
)

In [5]:
# --- OPTIMIZER AND LOSS FUNCTION ---
# Define the optimizer (Adam is used best Federico found for now)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Define the loss function (Cross-entropy for multi-class classification)
loss_fn = torch.nn.CrossEntropyLoss()


In [6]:
# wandb.init() prepares the tracking of hyperparameters/metrics for later recording performance using wandb.log()

model_name = "dino_vits16"
project_name = "FederatedProject"
run_name = f"{model_name}_run"

# INITIALIZE W&B
wandb.init(
    project=project_name,
    name=run_name,
    config={
        "model": model_name,
        "epochs": 20,
        "batch_size": train_loader.batch_size,
        "learning_rate": optimizer.param_groups[0]['lr'],
        "architecture": model.__class__.__name__,
})

# Copy your config
config = wandb.config


In [7]:
#  PERCORSO CHECKPOINT
checkpoint_dir = "/content/drive/MyDrive/FL/FederatedLearningProject/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, f"{model_name}_checkpoint.pth")    # we predefine the name of the file inside the specified folder (dir)

In [9]:
# RECOVER CHECKPOINT
epoch, model_data = checkpointing.load_checkpoint(model, optimizer, checkpoint_dir)
try:
  print()
  print(f"The 'model_data' dictionary contains the following keys: {list(model_data.keys())}")
except: None

 Nessun checkpoint trovato, inizio da epoca 1.



In [13]:
## Display some informations ##

print("Model:", model_name)
print("Train set size:", len(train_set))
print("Validation set size:", len(val_set))
print("Batch size:", train_loader.batch_size)
print("Number of epochs:", config.epochs)
print("DataLoader: ")
print("Learning rate:", optimizer.param_groups[0]['lr'])
print("Architecture:", model.__class__.__name__)
print("Device:", device)
print("Optimizer:", optimizer)
print("Loss function:", loss_fn)
print("Checkpoint directory:", checkpoint_dir)
print("Checkpoint path:", checkpoint_path)
print("Current epoch:", epoch)
print()

print("Train Loader Information:")
print(f"  Number of batches: {len(train_loader)}")
print(f"  Batch size: {train_loader.batch_size}")
# Get the dimension of a single batch
for images, labels in train_loader:
  print(f"  Dimension of 1 batch (images): {images.shape}")
  print(f"  Dimension of 1 batch (labels): {labels.shape}")
  break  # Exit the loop after processing one batch
print()

print("\nValidation Loader Information:")
print(f"  Number of batches: {len(val_loader)}")
print(f"  Batch size: {val_loader.batch_size}")
# Get the dimension of a single batch
for images, labels in val_loader:
  print(f"  Dimension of 1 batch (images): {images.shape}")
  print(f"  Dimension of 1 batch (labels): {labels.shape}")
  break  # Exit the loop after processing one batch
print()

# Check for CUDA availability
print("CUDA AVAIABILITY:")
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    print("Number of GPUs:", torch.cuda.device_count())
    print("Current GPU:", torch.cuda.current_device())
    print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("CUDA is not available. Using CPU.")

# Print model architecture summary
print("\nMODEL ARCHITECTURE:")
print(model)



Model: dino_vits16
Train set size: 40000
Validation set size: 10000
Batch size: 128
Number of epochs: 20
DataLoader: 
Learning rate: 0.0001
Architecture: VisionTransformer
Device: cuda
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0
)
Loss function: CrossEntropyLoss()
Checkpoint directory: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints
Checkpoint path: /content/drive/MyDrive/FL/FederatedLearningProject/checkpoints/dino_vits16_checkpoint.pth
Current epoch: 1

Train Loader Information:
  Number of batches: 313
  Batch size: 128
  Dimension of 1 batch (images): torch.Size([128, 3, 32, 32])
  Dimension of 1 batch (labels): torch.Size([128])


Validation Loader Information:
  Number of batches: 79
  Batch size: 128
  Dimension of 1 batch (images): torch.Size([128, 3, 32, 32])
  Dimension of 1 ba

In [None]:
start_epoch = epoch if epoch is not None else 1
num_epochs = wandb.config.epochs    # retrieve the desired number of training epochs you've previously specified in your WandB setup
checkpoint_interval = 10

# TRAINING
for epoch in range(start_epoch, num_epochs + 1):
    model.train()     # Attiva modalità training
    train_loss = 0.0  # Reset ad ogni epoca
    correct_train = 0
    total_train = 0

    # Loop sui Batch di Training
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device) # sposta i dati su gpu quando disponibile
        outputs = model(images)
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()    # backpropagation
        optimizer.step()   # aggiorna i pesi

        train_loss += loss.item() * images.size(0) # serve per accumulare la loss ad ogni epoca
        _, predicted = torch.max(outputs, 1) # Ignora i valori massimi (_), e tiene solo gli indici (predicted) -> questo perchè poi confrontiamo gli indici con le labels
        total_train += labels.size(0) # Numero di immagini nel batch corrent
        correct_train += (predicted == labels).sum().item() # Crea un tensore booleano poi somma le TRUE e infine converte in un numero

    train_accuracy = 100 * correct_train / total_train
    avg_train_loss = train_loss / total_train

    #  VALIDAZIONE
    model.eval() # Attiva modalità di validation
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad(): # no calcolo gradienti
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device) # sposta i dati su gpu quando disponibile
            outputs = model(images)

            # Calcolo loss
            loss = loss_fn(outputs, labels)
            val_loss += loss.item() * images.size(0) # images.size(0) restituisce la dimensione lungo il primo asse tipo numpy, dovrebbe variare con il batch size
                                                     # potremmo usare anche len(labels) ma con .size(0) gestiamo il caso in cui ultimo batch contenga un numero inferiore di osservazioni

            # Calcolo accuratezza
            _, predicted = torch.max(outputs, 1) # Ignora i valori massimi (_), e tiene solo gli indici (predicted) -> questo perchè poi confrontiamo gli indici con le labels
            total += labels.size(0)   # Numero di immagini nel batch corrent
            correct += (predicted == labels).sum().item() # Crea un tensore booleano poi somma le TRUE e infine converte in un numero

    avg_val_loss = val_loss / total  # Loss media
    val_accuracy = 100 * correct / total

    # LOG SU W&B
    wandb.log({
        "train_loss": avg_train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": avg_val_loss,
        "val_accuracy": val_accuracy,
        "epoch": epoch
    }, step=epoch)

    print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f},  Train Accuracy: {train_accuracy:.2f}%, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

    # SALVATAGGIO CHECKPOINT -> contien stato dell'optimizer per riprendere l'addestramento
    if epoch % checkpoint_interval == 0:
        torch.save({
            "epoch": epoch,
            "model_state_dict": model.state_dict(),  # restituisce un dizionario Python che contiene tutti i parametri apprendibili del modello (pesi e bias)
            "optimizer_state_dict": optimizer.state_dict(),
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
        }, checkpoint_path)
        print(f" Checkpoint salvato su Drive: {checkpoint_path}")

        #  (Opzionale) LOG ARTIFACT SU W&B -> dobbiamo capire se farlo o no, se i checkpoint sono molto grandi (es. >1GB), meglio salvarli localmente
        # potrebbe avere senso salvarci il miglior modello ma ora con solo il modello consigliato si pùò tralasciare

        # artifact = wandb.Artifact(f"{model_name}_checkpoint_ep{epoch}", type="model")
        # artifact.add_file(checkpoint_path)
        # wandb.log_artifact(artifact)

wandb.finish()

[Epoch 1] Train Loss: 0.4402,  Train Accuracy: 88.17%, Val Loss: 0.4505, Val Accuracy: 87.97%
[Epoch 2] Train Loss: 0.2831,  Train Accuracy: 91.53%, Val Loss: 0.5059, Val Accuracy: 86.36%
[Epoch 3] Train Loss: 0.2367,  Train Accuracy: 92.57%, Val Loss: 0.5721, Val Accuracy: 85.18%
[Epoch 4] Train Loss: 0.2128,  Train Accuracy: 93.45%, Val Loss: 0.5689, Val Accuracy: 84.49%
[Epoch 5] Train Loss: 0.2028,  Train Accuracy: 93.60%, Val Loss: 0.6534, Val Accuracy: 82.74%
[Epoch 6] Train Loss: 0.1840,  Train Accuracy: 94.09%, Val Loss: 0.6785, Val Accuracy: 82.39%
[Epoch 7] Train Loss: 0.1799,  Train Accuracy: 94.22%, Val Loss: 0.7283, Val Accuracy: 80.80%
[Epoch 8] Train Loss: 0.1696,  Train Accuracy: 94.59%, Val Loss: 0.7433, Val Accuracy: 80.24%
[Epoch 9] Train Loss: 0.1700,  Train Accuracy: 94.61%, Val Loss: 0.7925, Val Accuracy: 79.85%
[Epoch 10] Train Loss: 0.1486,  Train Accuracy: 95.31%, Val Loss: 0.8560, Val Accuracy: 78.49%


RuntimeError: Parent directory /content/drive/MyDrive/FL/FL_Project_FedeExperiments/checkpoints does not exist.

In [19]:
git = git_manager.GitManager()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
git.commit_and_push()


🔑 Already logged in
