# Script per eseguire la grid search sul modello open clip rn50 quickgelu.

Questo script serve per far girare il modello senza grid search

In [1]:
import os
import torch
import open_clip
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import json
import matplotlib.pyplot as plt

# Setup 
device = "cuda" if torch.cuda.is_available() else "cpu"

model, _, preprocess = open_clip.create_model_and_transforms(
    model_name="RN50",
    pretrained="openai",
    device=device
)
tokenizer = open_clip.get_tokenizer("RN50")

# Funzione di utility per caricare le annotazioni
def load_annotations(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Dataset 
class RawMultimodalDataset(Dataset):
    def __init__(self, annotations, img_folder, label_encoder):
        self.annotations = annotations
        self.img_folder = img_folder
        self.label_encoder = label_encoder

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):           
        item = self.annotations[idx]
        text = item["text"].lower().replace("\n", " ").strip()
        
        # Pulizia del testo
        text = text.lower()
        text = re.sub(r"\\n", " ", text).strip() 
        text = re.sub(r"\n", " ", text).strip() 
        text = re.sub(r"\\", " ", text).strip() 
        text = re.sub(r"  ", " ", text).strip()
        
        image_path = os.path.join(self.img_folder, item["label"], item["image"])
        image = Image.open(image_path).convert("RGB")
        label = self.label_encoder.transform([item["label"]])[0]
        return text, image, torch.tensor(label, dtype=torch.float32)

# Compute Embeddings 
def compute_embeddings(dataset):
    features, labels = [], []
    for text, image, label in tqdm(dataset, desc="Computing Embeddings"):
        image_tensor = preprocess(image).unsqueeze(0).to(device)
        text_tokens = tokenizer([text]).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_tensor)
            txt_feat = model.encode_text(text_tokens)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
            fused = torch.cat([img_feat, txt_feat], dim=-1)  # shape: [1, 2048]

        features.append(fused.squeeze(0).cpu())
        labels.append(label)

    return torch.stack(features), torch.tensor(labels)

# Dataset wrapper per le feature pre computate 
class PrecomputedDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Classificatore 
class MultimodalClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=512, dropout=0.1):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.classifier(x)

# Training loop 
def train_classifier(train_loader, val_loader, input_dim, epochs=10, lr=1e-4):
    model = MultimodalClassifier(input_dim=input_dim).to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features).squeeze()
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(device), labels.to(device)
                preds = (model(features).squeeze() > 0.5).float()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}/{epochs} - Validation Accuracy: {acc:.4f}")

    return model

#  Evaluation 
def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            preds = (model(features).squeeze() > 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    rec = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    print(f"Test Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
    return acc, prec, rec, f1


if __name__ == "__main__":
    # Percorsi ai dati
    TRAIN_DATA_DIR = "../pre_processing/dataset/train/"
    VAL_DATA_DIR = "../pre_processing/dataset/val/"
    TEST_DATA_DIR = "../pre_processing/dataset/test/"
    TRAIN_ANNOTATIONS_PATH = '../pre_processing/dataset/train.json'
    VAL_ANNOTATIONS_PATH = '../pre_processing/dataset/val.json'
    TEST_ANNOTATIONS_PATH = '../pre_processing/dataset/test.json'

    train_annotations = load_annotations(TRAIN_ANNOTATIONS_PATH)
    val_annotations = load_annotations(VAL_ANNOTATIONS_PATH)
    test_annotations = load_annotations(TEST_ANNOTATIONS_PATH)

    all_labels = [a["label"] for a in train_annotations + val_annotations + test_annotations]
    label_encoder = LabelEncoder()
    label_encoder.fit(all_labels)

    train_raw = RawMultimodalDataset(train_annotations, TRAIN_DATA_DIR, label_encoder)
    val_raw = RawMultimodalDataset(val_annotations, VAL_DATA_DIR, label_encoder)
    test_raw = RawMultimodalDataset(test_annotations, TEST_DATA_DIR, label_encoder)

    train_features, train_labels = compute_embeddings(train_raw)
    val_features, val_labels = compute_embeddings(val_raw)
    test_features, test_labels = compute_embeddings(test_raw)

    train_dataset = PrecomputedDataset(train_features, train_labels)
    val_dataset = PrecomputedDataset(val_features, val_labels)
    test_dataset = PrecomputedDataset(test_features, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)
    test_loader = DataLoader(test_dataset, batch_size=8)

    input_dim = train_features.shape[1]
    trained_model = train_classifier(train_loader, val_loader, input_dim=input_dim, epochs=10, lr=1e-3)

    evaluate(trained_model, test_loader)

    torch.save(trained_model.state_dict(), "clip_rn50_classifier.pt")


open_clip_model.safetensors:   0%|          | 0.00/408M [00:00<?, ?B/s]

Computing Embeddings: 100%|██████████| 1200/1200 [01:08<00:00, 17.59it/s]
Computing Embeddings: 100%|██████████| 150/150 [00:08<00:00, 16.70it/s]
Computing Embeddings: 100%|██████████| 300/300 [00:18<00:00, 16.36it/s]


Epoch 1/10 - Validation Accuracy: 0.7667
Epoch 2/10 - Validation Accuracy: 0.7933
Epoch 3/10 - Validation Accuracy: 0.7667
Epoch 4/10 - Validation Accuracy: 0.7600
Epoch 5/10 - Validation Accuracy: 0.7333
Epoch 6/10 - Validation Accuracy: 0.7533
Epoch 7/10 - Validation Accuracy: 0.7667
Epoch 8/10 - Validation Accuracy: 0.7600
Epoch 9/10 - Validation Accuracy: 0.7467
Epoch 10/10 - Validation Accuracy: 0.7333
Test Accuracy: 0.7900, Precision: 0.7737, Recall: 0.7350, F1: 0.7476


Questo script esegue invece la grid search e salva i risultati

In [15]:
import os
import re
import torch
import open_clip
import json
import random
import numpy as np
from itertools import product
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn
import csv

# Setup 
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = "cuda" if torch.cuda.is_available() else "cpu"

model, _, preprocess = open_clip.create_model_and_transforms(
    model_name="RN50-quickgelu",
    pretrained="openai",
    device=device
)
tokenizer = open_clip.get_tokenizer("RN50")

# Funzione di utility per caricare le annotazioni
def load_annotations(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Dataset 
class RawMultimodalDataset(Dataset):
    def __init__(self, annotations, img_folder, label_encoder):
        self.annotations = annotations
        self.img_folder = img_folder
        self.label_encoder = label_encoder

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        item = self.annotations[idx]
        text = item["text"].lower()
        text = re.sub(r"\\n|\n|\\|\s+", " ", text).strip()

        image_path = os.path.join(self.img_folder, item["label"], item["image"])
        image = Image.open(image_path).convert("RGB")
        label = self.label_encoder.transform([item["label"]])[0]
        return text, image, torch.tensor(label, dtype=torch.float32)


def compute_embeddings(dataset):
    features, labels = [], []
    for text, image, label in tqdm(dataset, desc="Computing Embeddings"):
        image_tensor = preprocess(image).unsqueeze(0).to(device)
        text_tokens = tokenizer([text]).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_tensor)
            txt_feat = model.encode_text(text_tokens)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
            fused = torch.cat([img_feat, txt_feat], dim=-1)

        features.append(fused.squeeze(0).cpu())
        labels.append(label)

    return torch.stack(features), torch.tensor(labels)


class PrecomputedDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Classifier 
class MultimodalClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=512, dropout=0.0):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.classifier(x)

# Evaluation 
def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            preds = (model(features).squeeze() > 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    rec = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return acc, prec, rec, f1

 
if __name__ == "__main__":
    TRAIN_DATA_DIR = "../pre_processing/dataset/train/"
    VAL_DATA_DIR = "../pre_processing/dataset/val/"
    TEST_DATA_DIR = "../pre_processing/dataset/test/"
    TRAIN_ANNOTATIONS_PATH = '../pre_processing/dataset/train.json'
    VAL_ANNOTATIONS_PATH = '../pre_processing/dataset/val.json'
    TEST_ANNOTATIONS_PATH = '../pre_processing/dataset/test.json'

    train_annotations = load_annotations(TRAIN_ANNOTATIONS_PATH)
    val_annotations = load_annotations(VAL_ANNOTATIONS_PATH)
    test_annotations = load_annotations(TEST_ANNOTATIONS_PATH)

    all_labels = [a["label"] for a in train_annotations + val_annotations + test_annotations]
    label_encoder = LabelEncoder()
    label_encoder.fit(all_labels)

    train_raw = RawMultimodalDataset(train_annotations, TRAIN_DATA_DIR, label_encoder)
    val_raw = RawMultimodalDataset(val_annotations, VAL_DATA_DIR, label_encoder)
    test_raw = RawMultimodalDataset(test_annotations, TEST_DATA_DIR, label_encoder)

    train_features, train_labels = compute_embeddings(train_raw)
    val_features, val_labels = compute_embeddings(val_raw)
    test_features, test_labels = compute_embeddings(test_raw)

    train_dataset = PrecomputedDataset(train_features, train_labels)
    val_dataset = PrecomputedDataset(val_features, val_labels)
    test_dataset = PrecomputedDataset(test_features, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)
    test_loader = DataLoader(test_dataset, batch_size=8)

    input_dim = train_features.shape[1]

    dropouts = [0.0, 0.1, 0.2, 0.3]
    lrs = [0.01, 0.001, 0.0001]
    weight_decays = [0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
    schedulers = ['none', 'StepLR', 'LinearLR', 'ExponentialLR', 'CosineAnnealingLR']
    unlock_settings = [False, True]
    EPOCHS = 10

    results_path = "clip_rn50_grid_results_v3.csv"
    with open(results_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['dropout', 'lr', 'weight_decay', 'scheduler', 'unlocked',
                         'val_accuracy', 'val_precision', 'val_recall', 'val_f1',
                         'test_accuracy', 'test_precision', 'test_recall', 'test_f1'])


    combinations = list(product(dropouts, lrs, weight_decays, schedulers, unlock_settings))
    for dropout, lr, wd, sched_name, unlock in tqdm(combinations, desc="Grid Search", leave=True):
        model = MultimodalClassifier(input_dim=input_dim, dropout=dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
        criterion = nn.BCELoss()

        if sched_name == 'StepLR':
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
        elif sched_name == 'LinearLR':
            scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.9, total_iters=EPOCHS)
        elif sched_name == 'ExponentialLR':
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
        elif sched_name == 'CosineAnnealingLR':
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
        else:
            scheduler = None
            
        train_losses = []
        val_losses = []
            
        if unlock:
            #clip_model = CLIPModel.from_pretrained(NOME_DEL_MODELLO_PRETRAINED).to(DEVICE)
            clip_model, _, preprocess_temp =  open_clip.create_model_and_transforms(
                model_name="RN50-quickgelu",
                pretrained="openai",
                device=device
            )
            clip_model.train()
        
            for param in clip_model.parameters():
                param.requires_grad = False  # Freeza tutto
                
            # Visual: sblocca layer4 della ResNet
            for name, param in clip_model.visual.named_parameters():
                param.requires_grad = "layer4" in name or "attnpool" in name
        
            # Text: sblocca ultimi 3 transformer block
            for name, param in clip_model.transformer.named_parameters():
                param.requires_grad = any(f"resblocks.{i}." in name for i in [9, 10, 11])
        
            # Fusione finale: sblocca projection e logit scale
            for name, param in clip_model.named_parameters():
                if any(k in name for k in ["text_projection", "logit_scale"]):
                    param.requires_grad = True
            

        for epoch in range(10):
            model.train()
            epoch_loss = 0
            
            for features, labels in train_loader:
                features, labels = features.to(device), labels.to(device)
                outputs = model(features).squeeze()
                loss = criterion(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            train_losses.append(epoch_loss / len(train_loader))
            
            
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for features, labels in val_loader:
                    features = features.to(device)
                    labels = labels.to(device)
                    logits = model(features).squeeze()
                    loss = criterion(logits, labels)
                    val_loss += loss.item()
            val_losses.append(val_loss / len(val_loader))
            
            if scheduler:
                scheduler.step()

        val_metrics = evaluate(model, val_loader)
        test_metrics = evaluate(model, test_loader)

        # with open(results_path, 'a') as f:
        #     f.write(f"{dropout},{lr},{wd},{sched_name},{','.join(map(str, val_metrics))},{','.join(map(str, test_metrics))}\n")
            
        with open(results_path, 'a', newline='') as f:
            writer = csv.writer(f)
            #f.write("dropout,lr,weight_decay,scheduler,unlock,val_acc,val_prec,val_rec,val_f1,test_acc,test_prec,test_rec,test_f1\n")
            writer.writerow([dropout, lr, wd, sched_name, unlock, *val_metrics, *test_metrics])
            

        #model_name = f"./saves_model2.2/clip_rn50_d{dropout}_lr{lr}_wd{wd}_sched{sched_name}.pt"
        safe_model_name = "openai_rn50"
        model_name = f"./saves_model2.2/{safe_model_name}_drop{dropout}_lrs{lr}_weights_{wd}_sched_{sched_name}_unlock{unlock}_augmented"
        #torch.save(model.state_dict(), model_name)
        #Salvo anche immagine
        # Save loss plot
        plt.figure()
        plt.plot(train_losses, label="Train Loss")
        plt.plot(val_losses, label="Val Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title(f"Loss Curve - {model_name}")
        plt.legend()
        plt.tight_layout()
        
        plt.savefig(f"{model_name}_loss_curve.png")
        plt.close()


Computing Embeddings: 100%|██████████| 1200/1200 [00:52<00:00, 22.96it/s]
Computing Embeddings: 100%|██████████| 150/150 [00:06<00:00, 21.51it/s]
Computing Embeddings: 100%|██████████| 300/300 [00:15<00:00, 19.64it/s]
Grid Search: 100%|██████████| 720/720 [44:07<00:00,  3.68s/it]  
