# Evaluation: BLEU and ROUGE


## Import and download libraries

In [1]:
%%capture
import re, torch

# Rileva versione di torch per scegliere xformers compatibile (come nel tuo codice unsloth)
v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
if v == "2.9":
    xformers = "xformers==0.0.33.post1"
elif v == "2.8":
    xformers = "xformers==0.0.32.post2"
else:
    xformers = "xformers==0.0.29.post3"

# --------------------------------------------------------------------
# Stack base per UNSLOTH (stesso schema che hai usato nei training)
# --------------------------------------------------------------------
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl==0.22.2 triton cut_cross_entropy unsloth_zoo evaluate rouge_score
!pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth

# --------------------------------------------------------------------
# Extra richiesti da SmolVLM2 (in aggiunta allo stack sopra)
# --------------------------------------------------------------------
# (accelerate, datasets, peft, bitsandbytes sono già installati sopra)
!pip install tensorboard av num2words sentence-transformers

# --------------------------------------------------------------------
# Versione di transformers comune a tutti i modelli
#   - compatibile con requirement SmolVLM2: ">=4.41.0,<5.0.0"
#   - già usata con unsloth nei tuoi notebook (4.56.2)
# --------------------------------------------------------------------
!pip install "transformers==4.56.2"


In [2]:
from datetime import datetime
from google.colab import drive
import os, zipfile

from datasets import load_dataset
from PIL import Image

import numpy as np
import json
from pathlib import Path
from torch.utils.data import Subset
import io
import base64
from datasets import load_dataset
from PIL import Image
import os
from google.colab import userdata
from openai import AzureOpenAI
import re, gc, torch, json
from pathlib import Path
import evaluate

# ------------------------------------------------------------------
# Mount Drive
# ------------------------------------------------------------------
drive.mount("/content/drive")

# ------------------------------------------------------------------
# Global paths (persist on Drive)
# ------------------------------------------------------------------
THESIS_ROOT = "/content/drive/MyDrive/thesis"
EVAL_ROOT   = os.path.join(THESIS_ROOT, "eval_bt_metrics")
Path(EVAL_ROOT).mkdir(parents=True, exist_ok=True)

TIMESTAMP = datetime.now().strftime("_%d%m%Y_%H%M")
print("TIMESTAMP:", TIMESTAMP)
print("THESIS_ROOT:", THESIS_ROOT)
print("EVAL_ROOT:  ", EVAL_ROOT)

# ------------------------------------------------------------------
# LoRA paths for each model (adjust folder names if needed)
# ------------------------------------------------------------------
LORA_PATHS = {
    "gemma": os.path.join(
        THESIS_ROOT, "gemma", "gemma3_4b_vision_bt_lora"
    ),
    "qwen2dot5": os.path.join(
        THESIS_ROOT, "qwen2dot5", "qwen2dot5-3B-Instruct_bt_lora"
    ),
    "qwen3": os.path.join(
        THESIS_ROOT, "qwen3", "qwen3_vl_8b_bt_lora"
    ),
    "smolvlm2": os.path.join(
        THESIS_ROOT, "smolvlm2", "lora_adapter"
    ),
}

for name, path in LORA_PATHS.items():
    print(name, "→", path, "   exists:", os.path.isdir(path))


Mounted at /content/drive
TIMESTAMP: _11122025_1805
THESIS_ROOT: /content/drive/MyDrive/thesis
EVAL_ROOT:   /content/drive/MyDrive/thesis/eval_bt_metrics
gemma → /content/drive/MyDrive/thesis/gemma/gemma3_4b_vision_bt_lora    exists: True
qwen2dot5 → /content/drive/MyDrive/thesis/qwen2dot5/qwen2dot5-3B-Instruct_bt_lora    exists: True
qwen3 → /content/drive/MyDrive/thesis/qwen3/qwen3_vl_8b_bt_lora    exists: True
smolvlm2 → /content/drive/MyDrive/thesis/smolvlm2/lora_adapter    exists: True


## Load Dataset

In [3]:
os.chdir('/content')

# Unzip dataset_oxe.zip (adjust path if needed)
zip_path = "/content/drive/MyDrive/dataset_oxe.zip"
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall("/content")
    print("✅ Unzipped dataset_oxe.zip into /content")
else:
    print("⚠️ dataset_oxe.zip not found at", zip_path)


✅ Unzipped dataset_oxe.zip into /content


In [4]:
os.chdir('/content')

# Load dataset
train_dataset_raw = load_dataset("json", data_files="dataset_oxe/train/data.jsonl", split="train")
val_dataset_raw = load_dataset("json", data_files="dataset_oxe/val/data.jsonl", split="train")

# ========================================
# FIX PER QWEN3-VL: Convert format
# ========================================
def convert_positions_text_image(example, base_path):
    """
    Qwen3-VL richiede formato specifico per image placeholder
    """
    # Load image
    img_path = os.path.join(base_path, example["messages"][0]["content"][1]["image"])
    image = Image.open(img_path).convert("RGB")

    # Get texts
    user_text = example["messages"][0]["content"][0]["text"]
    assistant_text = example["messages"][1]["content"][0]["text"]

    new_example = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},  # ← Image PRIMA
                    {"type": "text", "text": user_text}  # ← Text DOPO
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": assistant_text}
                ]
            }
        ]
    }

    return new_example

# Convert to list format (no Arrow serialization issues)
print("Converting dataset for Qwen3-VL...")
train_dataset = []
for example in train_dataset_raw:
    converted = convert_positions_text_image(example, "/content/dataset_oxe/train")
    train_dataset.append(converted)

val_dataset = []
for example in val_dataset_raw:
    converted = convert_positions_text_image(example, "/content/dataset_oxe/val")
    val_dataset.append(converted)

print(f"✅ Dataset ready! Train: {len(train_dataset)}, Val: {len(val_dataset)}")


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Converting dataset for Qwen3-VL...
✅ Dataset ready! Train: 1497, Val: 167


In [5]:
from torch.utils.data import Dataset

class MessagesDataset(Dataset):
    def __init__(self, list_examples):
        self.data = list_examples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_ds = MessagesDataset(train_dataset)
eval_ds  = MessagesDataset(val_dataset)

print("train_ds size:", len(train_ds))
print("eval_ds size:", len(eval_ds))


train_ds size: 1497
eval_ds size: 167


In [6]:
SUBSET_SIZE = 50
INDICES_PATH = Path(EVAL_ROOT) / "eval_indices_50.json"

if INDICES_PATH.exists():
    # Reload the same indices used in previous runs
    with open(INDICES_PATH, "r") as f:
        subset_indices = json.load(f)
    print(f"✅ Loaded {len(subset_indices)} indices from {INDICES_PATH}")
else:
    # Generate once a random but deterministic subset
    rng = np.random.RandomState(42)  # fixed seed
    all_indices = np.arange(len(eval_ds))
    subset_indices = rng.choice(all_indices, size=SUBSET_SIZE, replace=False)
    subset_indices = sorted(subset_indices.tolist())

    with open(INDICES_PATH, "w") as f:
        json.dump(subset_indices, f)
    print(f"✅ Generated and saved {len(subset_indices)} indices to {INDICES_PATH}")

# Create the PyTorch subset dataset
eval_subset_ds = Subset(eval_ds, subset_indices)

print("eval_ds size:", len(eval_ds))
print("eval_subset_ds size:", len(eval_subset_ds))
print("First 10 indices used for subset:", subset_indices[:10])


✅ Loaded 50 indices from /content/drive/MyDrive/thesis/eval_bt_metrics/eval_indices_50.json
eval_ds size: 167
eval_subset_ds size: 50
First 10 indices used for subset: [2, 9, 12, 15, 16, 18, 19, 22, 24, 26]


## LOADER

In [7]:
# ========================================
# AZURE OPENAI CLIENT (GPT-5)
# ========================================


print("🔑 Setup Azure OpenAI client (GPT-5)")

azure_openai_key         = userdata.get("azure_openai_key")
azure_openai_endpoint    = userdata.get("azure_openai_endpoint")
azure_openai_api_version = userdata.get("azure_openai_api_version")
azure_openai_region      = userdata.get("azure_openai_region")  # opzionale

if not azure_openai_key or not azure_openai_endpoint or not azure_openai_api_version:
    raise RuntimeError(
        "Missing one of: azure_openai_key, azure_openai_endpoint, azure_openai_api_version "
        "in Colab userdata secrets."
    )

client_azure = AzureOpenAI(
    api_key        = azure_openai_key,
    azure_endpoint = azure_openai_endpoint,
    api_version    = azure_openai_api_version,
)

print("✅ Azure OpenAI client initialised!")

# Nomi dei deployment per i due modelli (adattali ai tuoi deployment Azure)
GPT5_INSTANT_DEPLOYMENT  = "gpt-5.1-mini"      # es: deployment per GPT-5 instant
GPT5_THINKING_DEPLOYMENT = "gpt-5.1-thinking"  # es: deployment per GPT-5 thinking

# Prompt di sistema opzionale (puoi arricchirlo con le stesse istruzioni del training)
GPT_SYSTEM_PROMPT = (
    "You are an assistant that generates BehaviorTree.CPP XML trees for robot tasks. "
    "Given a textual description of an episode, you must output ONLY the XML for the behavior tree."
)


🔑 Setup Azure OpenAI client (GPT-5)
✅ Azure OpenAI client initialised!


In [15]:
from unsloth import FastVisionModel, get_chat_template
from peft import PeftModel
from transformers import AutoProcessor, AutoModelForImageTextToText

# Base model ids (adjust if needed)
GEMMA_BASE_ID    = "unsloth/gemma-3-4b-pt"
QWEN25_BASE_ID   = "unsloth/Qwen2.5-VL-3B-Instruct"
QWEN3_BASE_ID    = "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit"
SMOLVLM2_BASE_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"


def get_tokenizer_from_processor(processor):
    """
    Alcuni processor hanno .tokenizer, altri lo sono direttamente.
    Restituisce sempre un oggetto con .decode(...).
    """
    return getattr(processor, "tokenizer", processor)


def make_cleanup_fn(objects_to_free):
    def cleanup():
        for obj in objects_to_free:
            try:
                del obj
            except Exception:
                pass
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return cleanup

def make_vision_inference_fn(model, processor, device, use_text_images: bool):
    """
    use_text_images = False  → unsloth: processor(image, input_text, ...)
    use_text_images = True   → SmolVLM2: processor(text=input_text, images=image, ...)
    """
    tokenizer = get_tokenizer_from_processor(processor)

    def inference_fn(image, user_text):
        # 1) Messages identici per tutti i modelli
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": user_text},
                ],
            }
        ]

        # 2) Applica il chat template del processor
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # 3) Costruisci gli inputs con la firma corretta
        if use_text_images:
            # SmolVLM2-style
            inputs = processor(
                text   = input_text,
                images = image,
                add_special_tokens=False,
                return_tensors="pt",
            ).to(device)
        else:
            # unsloth FastVisionModel-style
            inputs = processor(
                image,
                input_text,
                add_special_tokens=False,
                return_tensors="pt",
            ).to(device)

        # 4) Generazione deterministica
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens = 512,
                do_sample      = False,
                temperature    = 0.0,
                use_cache      = True,
            )

        # 5) Rimuovi il prompt dagli output prima del decode
        input_len = inputs["input_ids"].shape[-1]
        gen_ids = outputs[0, input_len:]
        text = tokenizer.decode(gen_ids, skip_special_tokens=True)
        return text

    return inference_fn




def load_model(model_key: str):
    """
    model_key ∈ {"gemma", "qwen2dot5", "qwen3", "smolvlm2"}

    Returns:
      inference_fn(image, user_text) -> str
      cleanup_fn() -> None
    """
    model_key = model_key.lower()

    # --------------------------- GEMMA ---------------------------
    if model_key == "gemma":
        lora_dir = LORA_PATHS["gemma"]
        print(f"[gemma] Loading base model + LoRA from: {lora_dir}")

        model, processor = FastVisionModel.from_pretrained(
            model_name   = GEMMA_BASE_ID,
            load_in_4bit = True,
        )

        # In training avevi usato get_chat_template("gemma-3")
        processor = get_chat_template(processor, "gemma-3")

        model = PeftModel.from_pretrained(model, lora_dir)
        FastVisionModel.for_inference(model)
        model.eval()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        inference_fn = make_vision_inference_fn(
            model,
            processor,
            device,
            use_text_images = False,   # unsloth-style: processor(image, text, ...)
        )
        cleanup_fn = make_cleanup_fn([model, processor])
        return inference_fn, cleanup_fn

    # ------------------------ QWEN 2.5 ---------------------------
    if model_key == "qwen2dot5":
        lora_dir = LORA_PATHS["qwen2dot5"]
        print(f"[qwen2dot5] Loading base model + LoRA from: {lora_dir}")

        model, processor = FastVisionModel.from_pretrained(
            model_name   = QWEN25_BASE_ID,
            load_in_4bit = True,
        )

        model = PeftModel.from_pretrained(model, lora_dir)
        FastVisionModel.for_inference(model)
        model.eval()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        inference_fn = make_vision_inference_fn(
            model,
            processor,
            device,
            use_text_images = False,   # come Gemma, usi processor(image, text, ...)
        )
        cleanup_fn = make_cleanup_fn([model, processor])
        return inference_fn, cleanup_fn

    # -------------------------- QWEN 3 ---------------------------
    if model_key == "qwen3":
        lora_dir = LORA_PATHS["qwen3"]
        print(f"[qwen3] Loading base model + LoRA from: {lora_dir}")

        model, processor = FastVisionModel.from_pretrained(
            model_name   = QWEN3_BASE_ID,
            load_in_4bit = True,
        )

        model = PeftModel.from_pretrained(model, lora_dir)
        FastVisionModel.for_inference(model)
        model.eval()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        inference_fn = make_vision_inference_fn(
            model,
            processor,
            device,
            use_text_images = False,    # Qwen3 usa processor(text=..., images=...)
        )
        cleanup_fn = make_cleanup_fn([model, processor])
        return inference_fn, cleanup_fn

    # ------------------------ SMOLVLM2 ---------------------------
    if model_key == "smolvlm2":
        lora_dir = LORA_PATHS["smolvlm2"]
        print(f"[smolvlm2] Loading base model + LoRA from: {lora_dir}")

        processor = AutoProcessor.from_pretrained(lora_dir)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForImageTextToText.from_pretrained(
            SMOLVLM2_BASE_ID,
            torch_dtype          = torch.float16,   # allinea a fp16
            attn_implementation  = "eager",         # evita SDPA integrata che rompe sui dtype misti
        ).to(device)

        model = PeftModel.from_pretrained(model, lora_dir)
        model.eval()

        inference_fn = make_vision_inference_fn(
            model,
            processor,
            device,
            use_text_images = True,    # text=..., images=...
        )
        cleanup_fn = make_cleanup_fn([model, processor])
        return inference_fn, cleanup_fn

          # --------------------- GPT-5 INSTANT ------------------------
    if model_key == "gpt5_instant":
        print("[gpt5_instant] Using Azure OpenAI GPT-5 Instant deployment:",
              GPT5_INSTANT_DEPLOYMENT)

        def inference_fn(image, user_text):
            """
            Usa esattamente lo stesso user_text degli altri modelli
            e la stessa immagine, codificata in base64.
            """
            # Codifica l'immagine PIL in base64 (JPEG)
            buf = io.BytesIO()
            image.save(buf, format="JPEG")
            img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

            # Messaggi: stesso prompt testuale, + immagine come image_url
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_text},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_b64}"
                            },
                        },
                    ],
                }
            ]

            resp = client_azure.chat.completions.create(
                model       = GPT5_INSTANT_DEPLOYMENT,
                messages    = messages,
                max_tokens  = 512,
                temperature = 0.0,
            )
            # Ritorna il testo della prima scelta
            return resp.choices[0].message.content

        # Nessuna GPU locale da liberare
        def cleanup_fn():
            return

        return inference_fn, cleanup_fn

    # -------------------- GPT-5 THINKING ------------------------
    if model_key == "gpt5_thinking":
        print("[gpt5_thinking] Using Azure OpenAI GPT-5 Thinking deployment:",
              GPT5_THINKING_DEPLOYMENT)

        def inference_fn(image, user_text):
            # Codifica l'immagine PIL in base64 (JPEG)
            buf = io.BytesIO()
            image.save(buf, format="JPEG")
            img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_text},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_b64}"
                            },
                        },
                    ],
                }
            ]

            resp = client_azure.chat.completions.create(
                model       = GPT5_THINKING_DEPLOYMENT,
                messages    = messages,
                max_tokens  = 512,
                temperature = 0.0,
            )
            return resp.choices[0].message.content

        def cleanup_fn():
            return

        return inference_fn, cleanup_fn

    # ------------------------------------------------------------
    raise ValueError(f"Unknown model_key: {model_key}")



## EVALUATION METHODS

In [9]:
# -------------------------------------------------------
#  Helper: extract (image, user_text, ground_truth)
# -------------------------------------------------------
def extract_fields(sample):
    """
    Given a sample with the 'messages' structure, returns:
      image       : PIL image
      user_text   : full user prompt string (with INSTRUCTION, etc.)
      ground_truth: BT ground truth (assistant text)
    """
    image = sample["messages"][0]["content"][0]["image"]
    user_text = sample["messages"][0]["content"][1]["text"]
    ground_truth = sample["messages"][1]["content"][0]["text"]
    return image, user_text, ground_truth


# -------------------------------------------------------
#  Helper: simple normalization of BT text
#          (minimize formatting noise for ROUGE/BLEU)
# -------------------------------------------------------
_WHITESPACE_RE = re.compile(r"\s+")

def normalize_bt(text: str) -> str:
    if text is None:
        return ""
    # Strip ends and collapse all whitespace (spaces / newlines / tabs) to single spaces
    return _WHITESPACE_RE.sub(" ", text.strip())


# -------------------------------------------------------
#  Metrics (loaded once, reused for all models)
# -------------------------------------------------------
rouge = evaluate.load("rouge")
bleu  = evaluate.load("bleu")
print("Loaded metrics: ROUGE + BLEU")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Loaded metrics: ROUGE + BLEU


In [10]:
from pathlib import Path
import json
from tqdm.auto import tqdm

def evaluate_model(model_key, eval_indices, dataset, force=False):
    """
    Esegue inference sul subset e calcola ROUGE/BLEU per un singolo modello.

    Args:
        model_key    : "gemma", "qwen2dot5", "qwen3", "smolvlm2",
                       "gpt5_instant", "gpt5_thinking"
        eval_indices : lista di indici nel dataset (subset_indices)
        dataset      : es. eval_ds
        force        : se False e metrics.json esiste, non ricalcola

    Output:
        dict con metriche aggregate.
        Salva anche:
          - EVAL_ROOT/<model_key>/metrics.json
          - EVAL_ROOT/<model_key>/predictions.jsonl
    """
    model_key = model_key.lower()

    model_dir = Path(EVAL_ROOT) / model_key
    model_dir.mkdir(parents=True, exist_ok=True)

    metrics_path = model_dir / "metrics.json"
    preds_path   = model_dir / "predictions.jsonl"

    # Se abbiamo già tutto e force=False, non rifacciamo il lavoro
    if metrics_path.exists() and preds_path.exists() and not force:
        print(f"[{model_key}] metrics already found, skipping (force=False).")
        with open(metrics_path, "r") as f:
            metrics = json.load(f)
        return metrics

    # 1) Carica modello + inference_fn + cleanup_fn
    print(f"\n===== Evaluating model: {model_key} =====")
    print(f"Subset size: {len(eval_indices)} examples")
    inference_fn, cleanup_fn = load_model(model_key)

    predictions        = []  # normalized predictions (per metriche)
    references         = []  # normalized references (per metriche)
    raw_predictions    = []  # raw text output (per salvataggio)
    raw_references     = []  # reference originale (se ti serve in futuro)

    # 2) Loop sul subset con tqdm
    for step, idx in enumerate(tqdm(eval_indices, desc=f"{model_key} eval", unit="ex")):
        sample = dataset[idx]
        image, user_text, ground_truth = extract_fields(sample)

        # Predizione raw dal modello
        pred_raw = inference_fn(image, user_text)

        # Normalizza prediction + reference per le metriche
        pred_norm = normalize_bt(pred_raw)
        ref_norm  = normalize_bt(ground_truth)

        raw_predictions.append(pred_raw)
        raw_references.append(ground_truth)
        predictions.append(pred_norm)
        references.append(ref_norm)

    # 3) Calcolo metriche aggregate
    rouge_res = rouge.compute(
        predictions = predictions,
        references  = references,
        use_stemmer = True,
    )
    # BLEU di HF evaluate si aspetta references come lista di liste
    bleu_res = bleu.compute(
        predictions = predictions,
        references  = [[r] for r in references],
    )

    metrics = {
        "model":        model_key,
        "num_examples": len(eval_indices),
        "rouge1":       rouge_res["rouge1"]    * 100,
        "rouge2":       rouge_res["rouge2"]    * 100,
        "rougeL":       rouge_res["rougeL"]    * 100,
        "rougeLsum":    rouge_res["rougeLsum"] * 100,
        "bleu":         bleu_res["bleu"]       * 100,
    }

    print("\n--- Aggregated metrics ---")
    for k in ["rouge1", "rouge2", "rougeL", "rougeLsum", "bleu"]:
        print(f"{k}: {metrics[k]:.2f}")

    # 4) Salva metrics + predizioni su Drive
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=2)

    with open(preds_path, "w") as f:
        for idx, pred_raw, ref_raw, ref_norm in zip(
            eval_indices, raw_predictions, raw_references, references
        ):
            rec = {
                "idx":         int(idx),
                "prediction":  pred_raw,
                "reference":   ref_raw,
                "reference_n": ref_norm,
            }
            f.write(json.dumps(rec) + "\n")

    print(f"\n[{model_key}] Saved metrics to     {metrics_path}")
    print(f"[{model_key}] Saved predictions to {preds_path}")

    # 5) Cleanup GPU / memoria
    cleanup_fn()

    return metrics


## GEMMA

In [11]:
# ========================================
# EVALUATION – GEMMA
# ========================================

MODEL_KEY = "gemma"
print(f"\n Starting evaluation for: {MODEL_KEY}")

metrics_gemma = evaluate_model(
    model_key    = MODEL_KEY,
    eval_indices = subset_indices,
    dataset      = eval_ds,
    force        = False,  # metti True se vuoi ricalcolare anche se esistono già i file
)

print("\n=== GEMMA – METRICS ===")
for k, v in metrics_gemma.items():
    print(f"{k}: {v}")



 Starting evaluation for: gemma

===== Evaluating model: gemma =====
[gemma] Loading base model + LoRA from: /content/drive/MyDrive/thesis/gemma/gemma3_4b_vision_bt_lora
==((====))==  Unsloth 2025.12.4: Fast Gemma3 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


model.safetensors:   0%|          | 0.00/4.38G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

[gemma] Saved metrics to      /content/drive/MyDrive/thesis/eval_bt_metrics/gemma/metrics.json
[gemma] Saved predictions to  /content/drive/MyDrive/thesis/eval_bt_metrics/gemma/predictions.jsonl

=== GEMMA – METRICS ===
model: gemma
num_examples: 50
rouge1: 87.17959812990112
rouge2: 77.7469804414167
rougeL: 81.17493820726064
rougeLsum: 81.0874489087041
bleu: 84.08782250243982


## QWEN 2.5

In [12]:
# ========================================
# EVALUATION – QWEN2.5
# ========================================

MODEL_KEY = "qwen2dot5"
print(f"\n Starting evaluation for: {MODEL_KEY}")

metrics_qwen2dot5 = evaluate_model(
    model_key    = MODEL_KEY,
    eval_indices = subset_indices,
    dataset      = eval_ds,
    force        = False,
)

print("\n=== QWEN2.5 – METRICS ===")
for k, v in metrics_qwen2dot5.items():
    print(f"{k}: {v}")



 Starting evaluation for: qwen2dot5

===== Evaluating model: qwen2dot5 =====
[qwen2dot5] Loading base model + LoRA from: /content/drive/MyDrive/thesis/qwen2dot5/qwen2dot5-3B-Instruct_bt_lora
==((====))==  Unsloth 2025.12.4: Fast Qwen2_5_Vl patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.79G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

[qwen2dot5] Saved metrics to      /content/drive/MyDrive/thesis/eval_bt_metrics/qwen2dot5/metrics.json
[qwen2dot5] Saved predictions to  /content/drive/MyDrive/thesis/eval_bt_metrics/qwen2dot5/predictions.jsonl

=== QWEN2.5 – METRICS ===
model: qwen2dot5
num_examples: 50
rouge1: 87.11249814624247
rouge2: 77.72788084979751
rougeL: 81.26809346098266
rougeLsum: 81.08625945129941
bleu: 83.81308459884865


## QWEN 3

In [13]:
# ========================================
# EVALUATION – QWEN3
# ========================================

MODEL_KEY = "qwen3"
print(f"\n Starting evaluation for: {MODEL_KEY}")

metrics_qwen3 = evaluate_model(
    model_key    = MODEL_KEY,
    eval_indices = subset_indices,
    dataset      = eval_ds,
    force        = False,
)

print("\n=== QWEN3 – METRICS ===")
for k, v in metrics_qwen3.items():
    print(f"{k}: {v}")



 Starting evaluation for: qwen3

===== Evaluating model: qwen3 =====
[qwen3] Loading base model + LoRA from: /content/drive/MyDrive/thesis/qwen3/qwen3_vl_8b_bt_lora


ValueError: `unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit` is not supported yet in `transformers==4.56.2`.
Please update transformers via `pip install --upgrade transformers` and try again.

## SMOLVLM2

In [16]:
# ========================================
# EVALUATION – SMOLVLM2
# ========================================

MODEL_KEY = "smolvlm2"
print(f"\n Starting evaluation for: {MODEL_KEY}")

metrics_smolvlm2 = evaluate_model(
    model_key    = MODEL_KEY,
    eval_indices = subset_indices,
    dataset      = eval_ds,
    force        = False,
)

print("\n=== SMOLVLM2 – METRICS ===")
for k, v in metrics_smolvlm2.items():
    print(f"{k}: {v}")



 Starting evaluation for: smolvlm2

===== Evaluating model: smolvlm2 =====
[smolvlm2] Loading base model + LoRA from: /content/drive/MyDrive/thesis/smolvlm2/lora_adapter


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[smolvlm2] Saved metrics to      /content/drive/MyDrive/thesis/eval_bt_metrics/smolvlm2/metrics.json
[smolvlm2] Saved predictions to  /content/drive/MyDrive/thesis/eval_bt_metrics/smolvlm2/predictions.jsonl

=== SMOLVLM2 – METRICS ===
model: smolvlm2
num_examples: 50
rouge1: 82.01047285867232
rouge2: 69.57292079822975
rougeL: 71.39221177600295
rougeLsum: 71.31843237565488
bleu: 75.34269154311187


## GPT-5 INSTANT

In [None]:
# ========================================
# EVALUATION – GPT-5 INSTANT
# ========================================
# Richiede che il client Azure (client_azure, GPT5_INSTANT_DEPLOYMENT, ecc.)
# sia già inizializzato correttamente.

MODEL_KEY = "gpt5_instant"
print(f"\n Starting evaluation for: {MODEL_KEY}")

metrics_gpt5_instant = evaluate_model(
    model_key    = MODEL_KEY,
    eval_indices = subset_indices,
    dataset      = eval_ds,
    force        = False,
)

print("\n=== GPT-5 INSTANT – METRICS ===")
for k, v in metrics_gpt5_instant.items():
    print(f"{k}: {v}")


## GPT-5

In [None]:
# ========================================
# EVALUATION – GPT-5 THINKING
# ========================================
# Anche qui serve il client Azure già inizializzato.

MODEL_KEY = "gpt5_thinking"
print(f"\n Starting evaluation for: {MODEL_KEY}")

metrics_gpt5_thinking = evaluate_model(
    model_key    = MODEL_KEY,
    eval_indices = subset_indices,
    dataset      = eval_ds,
    force        = False,
)

print("\n=== GPT-5 THINKING – METRICS ===")
for k, v in metrics_gpt5_thinking.items():
    print(f"{k}: {v}")


## FINAL EVALUATION

In [None]:
# ========================================
# SYNTHESIS PLOT – ROUGE / BLEU per modello
# ========================================
from pathlib import Path
import json
import pandas as pd
import matplotlib.pyplot as plt

# Assicuriamoci che EVAL_ROOT sia un Path
EVAL_ROOT = Path(EVAL_ROOT)

# Mappa per avere nomi leggibili in legenda (modifica se vuoi)
DISPLAY_NAMES = {
    "gemma":        "Gemma-3 4B-Vision",
    "qwen2dot5":    "Qwen2.5-VL-3B",
    "qwen3":        "Qwen3-VL-8B",
    "smolvlm2":     "SmolVLM2-2.2B",
    "gpt5_instant": "GPT-5 Instant",
    "gpt5_thinking":"GPT-5 Thinking",
    # se aggiungi altri modelli, puoi estendere qui
}

rows = []

print(f"🔍 Scanning evaluation directory: {EVAL_ROOT}")

for subdir in sorted(EVAL_ROOT.iterdir()):
    if not subdir.is_dir():
        continue

    metrics_file = subdir / "metrics.json"
    if not metrics_file.exists():
        print(f"⚠️ No metrics.json in {subdir}, skipping.")
        continue

    try:
        with open(metrics_file, "r") as f:
            metrics = json.load(f)

        model_key = subdir.name
        display_name = DISPLAY_NAMES.get(model_key, model_key)

        row = {
            "model_key":  model_key,
            "model":      display_name,
            "rouge1":     metrics.get("rouge1", float("nan")),
            "rouge2":     metrics.get("rouge2", float("nan")),
            "rougeL":     metrics.get("rougeL", float("nan")),
            "rougeLsum":  metrics.get("rougeLsum", float("nan")),
            "bleu":       metrics.get("bleu", float("nan")),
        }
        rows.append(row)
        print(f"✅ Loaded metrics for model: {display_name}")
    except Exception as e:
        print(f"❌ Error reading {metrics_file}: {e}")

if not rows:
    print("⚠️ No metrics found under EVAL_ROOT. Run the per-model evaluations first.")
else:
    # -----------------------------
    # Costruisci DataFrame base
    # -----------------------------
    df_raw = pd.DataFrame(rows)

    # Ordine dei modelli nella legenda (facoltativo: qui uso l'ordine di DISPLAY_NAMES)
    model_order = [DISPLAY_NAMES.get(k, k) for k in DISPLAY_NAMES.keys()
                   if (df_raw["model_key"] == k).any()]
    if model_order:
        df_raw["model"] = pd.Categorical(df_raw["model"], categories=model_order, ordered=True)
        df_raw = df_raw.sort_values("model")

    # Ordine delle metriche (asse X)
    metric_order = ["rouge1", "rouge2", "rougeL", "rougeLsum", "bleu"]

    # Tabella con righe = metriche, colonne = modelli → come nel grafico di BTGenBot
    df_plot = (
        df_raw
        .set_index("model")[metric_order]  # index: model, columns: metrics
        .T                                  # index: metric, columns: model
    )

    # -----------------------------
    # Tabella riepilogativa
    # -----------------------------
    print("\n=== SUMMARY TABLE (ROUGE / BLEU, % units) ===")
    display(df_plot.T)  # righe = modello, colonne = metriche

    # -----------------------------
    # Barplot comparativo
    # -----------------------------
    plt.figure(figsize=(10, 4))  # rapporto simile alla figura che hai mostrato

    ax = df_plot.plot(kind="bar")  # metriche sull'asse X, barre affiancate per modello

    ax.set_ylabel("Score (%)")
    ax.set_xlabel("")  # etichetta X non necessaria
    ax.set_title("Behavior Tree Generation – ROUGE / BLEU")

    # Nomi più leggibili sull'asse X
    ax.set_xticklabels(
        ["ROUGE 1", "ROUGE 2", "ROUGE L", "ROUGE Lsum", "BLEU"],
        rotation=0,
    )

    ax.legend(title="", loc="upper left")  # legenda con nomi modello

    ax.set_ylim(0, 100)  # i punteggi sono in percentuale
    ax.grid(axis="y", linestyle="--", alpha=0.3)

    # Annotazione dei valori sopra ogni barra (come nel grafico di esempio)
    try:
        for container in ax.containers:
            ax.bar_label(container, fmt="%.0f", padding=2)
    except Exception:
        # se bar_label non è disponibile nella versione di matplotlib, semplicemente saltiamo
        pass

    plt.tight_layout()
    plt.show()
