Baseline per il confronto con la varianete con data augmentation con t-test

In [1]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import LinearLR
import open_clip


DATA_DIR  = "../pre_processing/dataset_train_e_val_assieme/train/"
TEST_DIR  = "../pre_processing/dataset_train_e_val_assieme/test/"
TRAIN_JSON = "../pre_processing/dataset_train_e_val_assieme/train.json"
TEST_JSON  = "../pre_processing/dataset_train_e_val_assieme/test.json"

BATCH_SIZE = 8
EPOCHS     = 10
CV_K       = 5
SEED       = 42
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)


class MultimodalClassifier(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int = 512):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.classifier(x)

class MultimodalDataset(Dataset):
    def __init__(self, annotations, img_folder, label_encoder, transform=None):
        self.annotations   = annotations
        self.img_folder    = img_folder
        self.label_encoder = label_encoder
        self.transform     = transform or transforms.Compose([
            transforms.Resize((224, 224)), transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        item       = self.annotations[idx]
        text       = item["text"]
        label_name = item["label"]
        img_file   = item["image"]
        img_path   = os.path.join(self.img_folder, label_name, img_file)

        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)
        label = self.label_encoder.transform([label_name])[0]
        return text, image, torch.tensor(label, dtype=torch.float32)


with open(TRAIN_JSON, "r", encoding="utf-8") as f:
    annotations = json.load(f)

all_labels    = [a["label"] for a in annotations]
label_encoder = LabelEncoder().fit(all_labels)


clip_model, _, preprocess = open_clip.create_model_and_transforms(
    model_name="RN50-quickgelu",
    pretrained="openai",
    device=DEVICE
)
tokenizer = open_clip.get_tokenizer("RN50")
clip_model.eval()
for p in clip_model.parameters():
    p.requires_grad = False

# Unfreeze ultimi layer per fine-tuning 
for name, param in list(clip_model.named_parameters())[-10:]:
    param.requires_grad = True

EMBED_DIM = 1024 + 1024 # concat(text | image)


def run_epoch(classifier, loader, criterion, optimizer=None):
    is_train = optimizer is not None
    classifier.train() if is_train else classifier.eval()

    epoch_loss  = 0.0
    all_preds   = []
    all_targets = []

    with torch.set_grad_enabled(is_train):
        for texts, images, labels in loader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            with torch.no_grad():
                text_tokens = tokenizer(texts).to(DEVICE)
                text_embeds = clip_model.encode_text(text_tokens)
                image_embeds = clip_model.encode_image(images)
            feats = torch.cat([text_embeds, image_embeds], dim=1)

            logits = classifier(feats).squeeze()
            loss   = criterion(logits, labels)
            epoch_loss += loss.item()

            preds = (logits > 0.5).float().cpu().numpy()
            all_preds.extend(preds)
            all_targets.extend(labels.cpu().numpy())

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    epoch_loss /= len(loader)
    metrics = {
        "acc" : accuracy_score(all_targets, all_preds),
        "prec": precision_score(all_targets, all_preds, average="macro", zero_division=0),
        "rec" : recall_score(all_targets, all_preds, average="macro", zero_division=0),
        "f1"  : f1_score(all_targets, all_preds, average="macro", zero_division=0),
    }
    return epoch_loss, metrics

def run_test(test_annotations_path, data_dir, classifier):
    with open(test_annotations_path, "r", encoding="utf-8") as f:
        test_ann = json.load(f)

    test_loader = DataLoader(
        MultimodalDataset(test_ann, data_dir, label_encoder, transform=preprocess),
        batch_size=BATCH_SIZE,
    )

    classifier.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for texts, images, labels in tqdm(test_loader, desc="Testing"):
            text_tokens = tokenizer(texts).to(DEVICE)
            text_embeds = clip_model.encode_text(text_tokens)
            image_embeds = clip_model.encode_image(images.to(DEVICE))
            feats = torch.cat([text_embeds, image_embeds], dim=1)
            logits = classifier(feats).squeeze()
            preds = (logits > 0.5).float().cpu()
            all_preds.extend(preds.tolist())
            all_targets.extend(labels.tolist())

    acc  = accuracy_score(all_targets, all_preds)
    prec = precision_score(all_targets, all_preds, average="macro", zero_division=0)
    rec  = recall_score(all_targets, all_preds, average="macro", zero_division=0)
    f1   = f1_score(all_targets, all_preds, average="macro", zero_division=0)

    print("\n===== TEST RESULTS =====")
    print(f"Accuracy          : {acc:.4f}")
    print(f"Precision (macro) : {prec:.4f}")
    print(f"Recall   (macro)  : {rec:.4f}")
    print(f"F1-score (macro)  : {f1:.4f}")

# ========= K-fold training =========
print(f"Running {CV_K}-fold CV on {len(annotations)} samples …")
skf = StratifiedKFold(n_splits=CV_K, shuffle=True, random_state=SEED)
writer = SummaryWriter(log_dir=f"runs/cv_{CV_K}_folds")
fold_results = []
test_metrics_all = []
classifiers = []

def tb_tag(fold, metric):
    return f"Fold{fold}/{metric}"

classifier = None

for fold_idx, (train_ids, val_ids) in enumerate(skf.split(np.zeros(len(all_labels)), all_labels), 1):
    print(f"\n=== Fold {fold_idx}/{CV_K} ===")

    train_ann = [annotations[i] for i in train_ids]
    val_ann   = [annotations[i] for i in val_ids]

    train_loader = DataLoader(MultimodalDataset(train_ann, DATA_DIR, label_encoder, transform=preprocess), batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(MultimodalDataset(val_ann,   DATA_DIR, label_encoder, transform=preprocess), batch_size=BATCH_SIZE)

    classifier = MultimodalClassifier(EMBED_DIM).to(DEVICE)
    criterion  = nn.BCELoss()
    optimizer  = torch.optim.Adam(classifier.parameters(), lr=1e-2, weight_decay=0.0001)
    scheduler  = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.9, total_iters=EPOCHS)
    

    train_losses, val_losses = [], []
    best_f1 = 0.0

    for epoch in range(1, EPOCHS + 1):
        train_loss, train_m = run_epoch(classifier, train_loader, criterion, optimizer)
        val_loss,   val_m   = run_epoch(classifier, val_loader,   criterion)

        writer.add_scalar(tb_tag(fold_idx, "Loss/Train"), train_loss, epoch)
        writer.add_scalar(tb_tag(fold_idx, "Loss/Val"),   val_loss,   epoch)
        writer.add_scalar(tb_tag(fold_idx, "F1/Val"),     val_m["f1"], epoch)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(
            f"[Fold {fold_idx} | Ep {epoch:02d}]  Train-loss: {train_loss:.4f} | Val-loss: {val_loss:.4f} | Val F1: {val_m['f1']:.4f}"
        )

        best_f1 = max(best_f1, val_m["f1"])
        scheduler.step()

    plt.figure()
    plt.plot(range(1, EPOCHS + 1), train_losses, label="Train loss")
    plt.plot(range(1, EPOCHS + 1), val_losses,   label="Val loss")
    plt.title(f"Loss curves – Fold {fold_idx}")
    plt.xlabel("Epoch"); plt.ylabel("Binary cross-entropy")
    plt.legend(); plt.grid(True, linestyle=":", linewidth=0.5)
    plt.savefig(f"loss_fold{fold_idx}.png", dpi=150, bbox_inches="tight")
    plt.close()

    fold_results.append({"fold": fold_idx, **val_m, "best_f1": best_f1})
    classifiers.append(classifier)

writer.close()

print("\n======== Cross-validation summary ========")
print(f"Mean best-F1 across folds: {np.mean([fr['best_f1'] for fr in fold_results]):.4f}")
print(f"Mean accuracy            : {np.mean([fr['acc'] for fr in fold_results]):.4f}")


print("\n======== Test set evaluation (per fold) ========")
for i, clf in enumerate(classifiers, 1):
    print(f"\n--- Fold {i} ---")
    run_test(TEST_JSON, TEST_DIR, clf)

test_metrics_all.append(run_test(TEST_JSON, TEST_DIR, classifier))

Running 5-fold CV on 1350 samples …

=== Fold 1/5 ===
[Fold 1 | Ep 01]  Train-loss: 0.5394 | Val-loss: 0.4442 | Val F1: 0.7690
[Fold 1 | Ep 02]  Train-loss: 0.3707 | Val-loss: 0.5132 | Val F1: 0.7727
[Fold 1 | Ep 03]  Train-loss: 0.3064 | Val-loss: 0.6058 | Val F1: 0.7310
[Fold 1 | Ep 04]  Train-loss: 0.2157 | Val-loss: 0.5251 | Val F1: 0.7596
[Fold 1 | Ep 05]  Train-loss: 0.1773 | Val-loss: 0.5829 | Val F1: 0.7789
[Fold 1 | Ep 06]  Train-loss: 0.1613 | Val-loss: 0.7717 | Val F1: 0.7516
[Fold 1 | Ep 07]  Train-loss: 0.1890 | Val-loss: 0.5467 | Val F1: 0.7753
[Fold 1 | Ep 08]  Train-loss: 0.1339 | Val-loss: 0.6132 | Val F1: 0.7963
[Fold 1 | Ep 09]  Train-loss: 0.1025 | Val-loss: 0.6598 | Val F1: 0.7459
[Fold 1 | Ep 10]  Train-loss: 0.1353 | Val-loss: 1.1022 | Val F1: 0.7619

=== Fold 2/5 ===
[Fold 2 | Ep 01]  Train-loss: 0.5352 | Val-loss: 0.5494 | Val F1: 0.6667
[Fold 2 | Ep 02]  Train-loss: 0.3694 | Val-loss: 0.5148 | Val F1: 0.7491
[Fold 2 | Ep 03]  Train-loss: 0.2838 | Val-loss: 0.5

Testing: 100%|██████████| 38/38 [00:09<00:00,  4.14it/s]



===== TEST RESULTS =====
Accuracy          : 0.8033
Precision (macro) : 0.7983
Recall   (macro)  : 0.7425
F1-score (macro)  : 0.7587

--- Fold 2 ---


Testing: 100%|██████████| 38/38 [00:09<00:00,  4.18it/s]



===== TEST RESULTS =====
Accuracy          : 0.7967
Precision (macro) : 0.7790
Recall   (macro)  : 0.7475
F1-score (macro)  : 0.7587

--- Fold 3 ---


Testing: 100%|██████████| 38/38 [00:09<00:00,  4.05it/s]



===== TEST RESULTS =====
Accuracy          : 0.7900
Precision (macro) : 0.7639
Recall   (macro)  : 0.7725
F1-score (macro)  : 0.7677

--- Fold 4 ---


Testing: 100%|██████████| 38/38 [00:08<00:00,  4.26it/s]



===== TEST RESULTS =====
Accuracy          : 0.7667
Precision (macro) : 0.7374
Recall   (macro)  : 0.7350
F1-score (macro)  : 0.7362

--- Fold 5 ---


Testing: 100%|██████████| 38/38 [00:09<00:00,  3.92it/s]



===== TEST RESULTS =====
Accuracy          : 0.7933
Precision (macro) : 0.7756
Recall   (macro)  : 0.7425
F1-score (macro)  : 0.7540


Testing: 100%|██████████| 38/38 [00:09<00:00,  4.15it/s]



===== TEST RESULTS =====
Accuracy          : 0.7933
Precision (macro) : 0.7756
Recall   (macro)  : 0.7425
F1-score (macro)  : 0.7540


In [1]:
import numpy as np
from scipy.stats import ttest_1samp

# Supponiamo che questi siano i punteggi F1 del modello originale (da k-fold CV)
f1_model_original = np.array([0.7587, 0.7587, 0.7677, 0.7362, 0.7540])  

# Supponiamo che questo sia il punteggio F1 del modello con augmentazione (unico valore)
f1_model_variant = 0.7412  

# Calcola la media e la deviazione standard del primo modello
mean_original = np.mean(f1_model_original)
std_original = np.std(f1_model_original, ddof=1)

# Test t per un campione (compara il valore del modello variante con la media del primo modello)
statistic, p_value = ttest_1samp(f1_model_original, f1_model_variant)

# Stampare i risultati
print(f"T-statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpretare i risultati
alpha = 0.05
if p_value < alpha:
    print("Il modello variante ha una performance significativamente migliore del modello originale.")
else:
    print("Non ci sono prove sufficienti per affermare che il modello variante si comporti meglio del modello originale.")


T-statistic: 2.6597543348225687
P-value: 0.05640480416136819
Non ci sono prove sufficienti per affermare che il modello variante si comporti meglio del modello originale.
