In [2]:
import sys
import os

def running_in_colab():
    """Tell whether this kernel appears to be a Google Colab runtime.

    Returns:
        True when the ``google.colab`` module is already loaded or a
        ``/content`` directory exists on disk; False otherwise.
    """
    # An already-imported google.colab package is enough on its own.
    if 'google.colab' in sys.modules:
        return True
    # Otherwise fall back to checking for the /content directory.
    return os.path.exists('/content')

# GitHub coordinates of the project code
username = "giovanna-brod-zamojska"
repo = "federated-learning-project"
branch = "giovanna/centralized-baseline"

# Passed to clone_repo_if_needed as its is_private flag
is_private = True

def clone_repo_if_needed(exists_ok: bool, username: str, repository: str, is_private: bool, branch: str = None):
    """Clone a GitHub repository into ``/content`` and install its Colab requirements.

    Outside Google Colab this only prints a notice and returns.

    Args:
        exists_ok: When True and ``/content/<repository>/`` already exists, the
            existing copy is kept and the function returns immediately (nothing
            is cloned or installed). When False, any existing copy is deleted
            and the repository is cloned again.
        username: GitHub account that owns the repository; also used as the
            login name when cloning a private repository.
        repository: Name of the GitHub repository to clone.
        is_private: When True, prompt for a GitHub token and authenticate the
            clone with it.
        branch: Branch to check out. When falsy, no ``--branch`` option is
            passed to ``git clone``.

    Returns:
        None.
    """
    import shutil
    import subprocess

    if not running_in_colab():
        print("Not running in Google Colab. Skipping repository cloning.")
        return

    colab_repo_path = f'/content/{repository}/'

    if exists_ok and os.path.exists(colab_repo_path):
        print(f"Repository already exists at {colab_repo_path}")
        return

    def _run(cmd):
        # Arguments are passed as a list (no shell), so values such as the
        # branch name or token are never interpreted by a shell. Output is
        # captured and echoed through print so it shows up in the cell output.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        if proc.stdout:
            print(proc.stdout)
        return proc.returncode

    # Remove any existing repo
    print(f"Removing content of {colab_repo_path}")
    shutil.rmtree(colab_repo_path, ignore_errors=True)
    # List the actual directory entries (os.system("ls") only returned the exit status).
    print("Current directory files and folders:", os.listdir())

    print("Cloning GitHub repo...")

    if is_private:
        from getpass import getpass

        # Prompt for GitHub token (ensure token has access to the repo)
        token = getpass('Enter GitHub token: ')
        # NOTE(review): git records the remote URL, token included, in the
        # clone's configuration; consider a credential helper instead.
        remote_url = f"https://{username}:{token}@github.com/{username}/{repository}.git"
    else:
        remote_url = f"https://github.com/{username}/{repository}.git"

    # Clone into an explicit destination so the result does not depend on the
    # current working directory of the kernel.
    clone_destination = colab_repo_path.rstrip("/")
    clone_command = ["git", "clone"]
    if branch:
        clone_command += ["--branch", branch]
    clone_command += [remote_url, clone_destination]

    exit_code = _run(clone_command)
    if exit_code != 0:
        # Deliberately do not print the command: it may contain the token.
        print(f"git clone failed with exit code {exit_code}; skipping requirements installation.")
        return

    requirements_path = os.path.join(clone_destination, "colab-requirements.txt")
    # Run pip through the kernel's own interpreter so the packages are installed
    # into the environment this notebook is actually using.
    _run([sys.executable, "-m", "pip", "install", "-r", requirements_path])



def setup_notebook(repo_root_name: str = "federated-learning-project"):
    """Put the project root on ``sys.path`` so its modules can be imported.

    In Google Colab the entry ``/content/<repo_root_name>/`` is prepended.
    Elsewhere the directory two levels above the current working directory
    is prepended, and ``repo_root_name`` is not used.

    Args:
        repo_root_name: Name of the repository folder under ``/content``.
    """
    import sys
    from pathlib import Path

    if not running_in_colab():
        # Local run: the project root is taken to be the grandparent of the
        # current working directory.
        project_root = Path().absolute().parent.parent
        root_entry = str(project_root)

        # Only prepend when the entry is missing, so re-running is harmless.
        if root_entry not in sys.path:
            sys.path.insert(0, root_entry)
            print(f"Added {project_root} to Python path")
        return

    print("Sys.path: ", sys.path)

    # Colab run: the repository lives under /content.
    colab_repo_path = f'/content/{repo_root_name}/'
    if colab_repo_path not in sys.path:
        sys.path.insert(0, colab_repo_path)
        print(f"Added {colab_repo_path} to sys.path")

        
# Fetch the code when needed, then make the project importable.
clone_repo_if_needed(
    exists_ok=True,
    username=username,
    repository=repo,
    is_private=is_private,
    branch=branch,
)

setup_notebook()

    

Not running in Google Colab. Skipping repository cloning.
Added /Users/giovanna/Desktop/federated-learning-project to Python path


In [None]:
import os
import json
import torch
import random
import numpy as np
from src.classes.trainer import Trainer as CentralizedBaselineTrainer
from src.classes.cifar100_dataset import CIFAR100Dataset_v2 as CIFAR100Dataset
from src.classes.experiment_manager import ExperimentManager
from itertools import product

# Default output locations, relative to the working directory.
experiments_dir = "./output"
checkpoint_dir = "./checkpoints"

if running_in_colab():
    # In Colab, mount Google Drive and write results and checkpoints there.
    from google.colab import drive
    drive.mount('/content/drive')

    experiments_dir = "/content/drive/MyDrive/polito2025/MLDL/Project/experiments"
    checkpoint_dir = f"{experiments_dir}/checkpoints"


def set_seed(seed):
    """Seed the PyTorch, Python and NumPy random generators.

    Also sets the cuDNN flags ``deterministic=True`` and ``benchmark=False``.

    Args:
        seed: Value handed to every seeding function.
    """
    print(f"Setting random seed to {seed}")

    # Apply the same seed to each random number generator in turn.
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        random.seed,
        np.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def run_experiments(seed: int, resume: str = None):
    """Run the centralized-baseline grid of experiments and save the results as JSON.

    The results are written to
    ``<experiments_dir>/experiment_results_baseline_test_convergence_v2.json``.

    Args:
        seed: Seed applied through ``set_seed`` and stored in every configuration.
        resume: Forwarded unchanged to ``ExperimentManager.run``.
    """
    set_seed(seed)

    # Hyperparameter search space; each key maps to the values to try.
    search_space = {
        "batch_size": [32, 64, 128],
        "lr": [0.1, 0.01, 0.001],
        "weight_decay": [1e-5, 1e-4, 1e-3, 1e-2],
        "momentum": [0.8, 0.9, 0.95],
        "epochs": [4],
        "seed": [seed],
    }

    # One dict per element of the Cartesian product of the value lists.
    names = list(search_space)
    param_grid = [
        dict(zip(names, combo))
        for combo in product(*search_space.values())
    ]

    base_config = {
        "seed": seed,
        "lr": 0.1,
        "momentum": 0.9,
        "weight_decay": 1e-4,
        "epochs": 30,
        "batch_size": 64,
        "num_workers": 4,
    }
    manager = ExperimentManager(
        base_config=base_config,
        param_grid=param_grid,
        use_wandb=True,
        project_name="federated-learning-project",
        group_name="centralized-baseline",
        checkpoint_dir=checkpoint_dir,
    )

    # Only the third element returned by run() is used here.
    _, _, results = manager.run(
        trainer_class=CentralizedBaselineTrainer,
        dataset=CIFAR100Dataset(),
        run_name="baseline",
        run_tags=["test", "convergence", "v0"],
        resume=resume,
    )
    print("Experiments completed.\n")

    # Persist the results next to the other experiment outputs.
    os.makedirs(experiments_dir, exist_ok=True)
    file_path = os.path.join(
        experiments_dir,
        "experiment_results_baseline_test_convergence_v2.json",
    )
    with open(file_path, "w") as out_file:
        json.dump(results, out_file, indent=4)

    print(f"Results saved to {file_path}")


# Start a fresh run of the grid.
run_experiments(seed=42, resume=None)
# To pass a saved checkpoint as `resume` instead:
# run_experiments(seed=42, resume="checkpoints/checkpoint.pth")

# comments:
# Larger batch sizes (128, 256) generally lead to lower accuracy but faster training
#  (256 is faster than 128, which is much faster than 64).
#  - Batch sizes 32 and 64 seem to work better.

# A learning rate of 0.01 is far better than 0.1 or 0.2 (faster convergence), but it can also lead to overfitting if not properly managed (warning sign: the validation loss starts increasing).
# Observed ranking (best to worst): 0.01 > 0.2 > 0.1



Setting random seed to 42
Dataset found at ./data. Loading...

Running experiment 1/192 with config: {'batch_size': 32, 'lr': 0.2, 'weight_decay': 1e-05, 'momentum': 0.8, 'epochs': 5, 'seed': 42}


[34m[1mwandb[0m: Currently logged in as: [33mgiovannabrod[0m ([33mfederated-learning-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Data loaders created.



Using cache found in /Users/giovanna/.cache/torch/hub/facebookresearch_dino_main


Loaded default DINO ViT-S/16 model. Model architecture: VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((384,), eps=1

Training:   6%|▌         | 76/1250 [00:54<07:16,  2.69batch/s, loss=0.182, f1_macro=0.00198, f1_micro=0.00905, accuracy=0.00905] 