The following code implements the tolerance functions and generates the evaluation metrics.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import unicodedata
from math import radians, sin, cos, sqrt, asin

# ======================================================
# TOLERANCE FUNCTIONS (adapted)
# ======================================================

def normalize_tokens(s):
    """Split a value into lowercase, accent-free alphanumeric tokens.

    The value is stringified and lowercased, combining marks (Unicode
    category ``Mn``) are dropped after NFD decomposition, and every run of
    characters outside ``a-z0-9`` is treated as a separator.

    Returns:
        A list of tokens; an empty list when ``s`` is ``None``.
    """
    if s is None:
        return []

    decomposed = unicodedata.normalize('NFD', str(s).lower())
    # Dropping the combining marks turns e.g. "ã" into a plain "a".
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    return re.sub(r'[^a-z0-9]+', ' ', without_marks).split()

def overlap_coefficient(s1, s2):
    """Return the overlap coefficient of the token sets of two values.

    Both values are tokenized with ``normalize_tokens``; the score is the
    number of shared distinct tokens divided by the size of the smaller
    token set. Returns ``0.0`` when either value yields no tokens.
    """
    left, right = (set(normalize_tokens(value)) for value in (s1, s2))

    if not (left and right):
        return 0.0

    shared = left.intersection(right)
    return len(shared) / min(len(left), len(right))

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The haversine distance on a sphere of radius 6371 km.
    """
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt()/asin() raise
    # ValueError; clamp it back into the valid domain.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return 6371 * c  # km

def relative_similarity(x, y):
    """Return ``1 - |x - y| / max(|x|, |y|)`` for two numeric-like values.

    Args:
        x, y: Anything accepted by ``float()``.

    Returns:
        ``1.0`` for equal values, decreasing as they diverge (negative when
        the signs differ). ``0.0`` when either value cannot be converted to
        a finite float.
    """
    from math import isfinite  # local import keeps the block self-contained

    try:
        x, y = float(x), float(y)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not comparable"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return 0.0

    # NaN / inf would otherwise leak a NaN score to the caller.
    if not (isfinite(x) and isfinite(y)):
        return 0.0

    # The 1e-9 floor avoids a division by zero when both values are 0.
    return 1 - abs(x - y) / max(abs(x), abs(y), 1e-9)

def considerTolerance(correct, predicted, prop):
    """Decide whether ``predicted[prop]`` matches ``correct[prop]`` within tolerance.

    Args:
        correct: Mapping of property name to ground-truth value.
        predicted: Mapping of property name to predicted value.
        prop: Name of the property to compare.

    Returns:
        ``True`` / ``False`` when the pair can be judged, or ``None`` when it
        is not comparable (property missing or ``None`` on either side, the
        other coordinate missing, or non-numeric ground-truth coordinates).

        * ``latitude`` / ``longitude``: both coordinates are compared together
          as one point; correct when the haversine distance is below 10 km.
        * string predictions: correct when the token overlap coefficient is
          at least 0.75.
        * everything else: correct when the relative similarity is at
          least 0.75.
    """
    if prop not in correct or prop not in predicted:
        return None  # not comparable

    ref = correct[prop]
    val = predicted[prop]

    if ref is None or val is None:
        return None

    # Latitude / Longitude handled together
    if prop in ("latitude", "longitude"):
        coords = ("latitude", "longitude")
        if not all(k in correct and k in predicted for k in coords):
            return None

        # Parse before computing the distance so that a malformed coordinate
        # yields a verdict instead of an uncaught ValueError/TypeError.
        try:
            ref_lat, ref_lon = (float(correct[k]) for k in coords)
        except (TypeError, ValueError):
            return None  # unusable ground truth: nothing to compare against
        try:
            pred_lat, pred_lon = (float(predicted[k]) for k in coords)
        except (TypeError, ValueError):
            return False  # a non-numeric predicted coordinate is a wrong answer

        return haversine(pred_lat, pred_lon, ref_lat, ref_lon) < 10  # km tolerance

    # Strings
    if isinstance(val, str):
        return overlap_coefficient(ref, val) >= 0.75

    # Numbers: relative_similarity already maps unparsable input to 0.0,
    # so no extra exception handling is needed here.
    return relative_similarity(val, ref) >= 0.75


# ======================================================
# EVALUATION FUNCTION
# ======================================================

def getEntityDiff(entity):
    """Return the ``'entityDiff:'`` value stored for ``entity`` in ``resultsTasks``.

    ``resultsTasks`` is a module-level collection that is not defined in this
    part of the file (presumably populated by an earlier cell — verify). Each
    element is indexed positionally: item ``0`` is compared with ``entity``
    and item ``3`` is looked up with the key ``'entityDiff:'``.

    Returns:
        The value of the first matching element, or ``None`` if none matches.
    """
    matches = (
        record[3]['entityDiff:']
        for record in resultsTasks
        if record[0] == entity
    )
    return next(matches, None)

def _score_outcomes(outcomes):
    """Compute accuracy/precision/recall/F1 for 0/1 outcomes against all-ones truth."""
    if not outcomes:
        return {
            "accuracy": None,
            "precision": None,
            "recall": None,
            "f1": None,
            "total_comparisons": 0,
        }

    expected = [1] * len(outcomes)
    return {
        "accuracy": accuracy_score(expected, outcomes),
        "precision": precision_score(expected, outcomes, zero_division=0),
        "recall": recall_score(expected, outcomes, zero_division=0),
        "f1": f1_score(expected, outcomes, zero_division=0),
        "total_comparisons": len(outcomes),
    }


def evaluate_results(
    results_csv,
    ground_truth_csv,
    id_column="icao"
):
    """Score predicted properties against the ground truth, with tolerance.

    Args:
        results_csv: CSV from majority vote (one row per entity). Must have a
            ``test`` column whose first four characters are the entity id and
            whose ``_``-separated parts 1 and 2 are the prompt id and the
            difficulty id.
        ground_truth_csv: CSV with the ground truth.
        id_column: Entity identifier column of the ground-truth CSV.

    Returns:
        A tuple ``(metrics, prompt_metrics, detailed_df, diff_metric)``:
        global metrics, metrics keyed by prompt id, one row per comparison,
        and metrics keyed by the value returned by ``getEntityDiff``.

    NOTE: the reference labels are all 1 (every compared property has a
    ground-truth value), so recall equals accuracy and precision is 1.0 as
    soon as one prediction is correct.
    """
    results_df = pd.read_csv(results_csv)
    gt_df = pd.read_csv(ground_truth_csv).set_index(id_column)

    detailed_rows = []

    for _, pred_row in results_df.iterrows():
        test_name = pred_row['test']
        entity_id = test_name[:4]
        name_parts = test_name.split("_")
        prompt_id = name_parts[1]
        diff_id = name_parts[2]
        diff_entity_id = getEntityDiff(entity_id)

        if entity_id not in gt_df.index:
            continue

        predicted = pred_row.dropna().to_dict()
        correct = gt_df.loc[entity_id].dropna().to_dict()

        for prop in predicted:
            if prop == id_column:
                continue
            if prop not in correct:
                continue  # do not penalize missing GT or missing result

            tol = considerTolerance(correct, predicted, prop)
            if tol is None:
                continue

            detailed_rows.append({
                "id": entity_id,
                "test": test_name,
                "prompt": prompt_id,
                "diff": diff_id,
                "diff_ent": diff_entity_id,
                "property": prop,
                "predicted": predicted[prop],
                "ground_truth": correct[prop],
                "correct": tol
            })

    # Global metrics over every comparison that could be judged.
    metrics = _score_outcomes([1 if row["correct"] else 0 for row in detailed_rows])

    # Explicit columns keep the frame groupable even when no comparison was
    # produced (an empty, column-less frame would raise KeyError below).
    detailed_df = pd.DataFrame(
        detailed_rows,
        columns=[
            "id", "test", "prompt", "diff", "diff_ent",
            "property", "predicted", "ground_truth", "correct",
        ],
    )

    # =====================
    # Metrics per prompt
    # =====================
    prompt_metrics = {
        prompt: _score_outcomes(group["correct"].astype(int).tolist())
        for prompt, group in detailed_df.groupby("prompt")
    }

    # =====================
    # Metrics per entity difficulty (value returned by getEntityDiff)
    # =====================
    diff_metric = {
        diff: _score_outcomes(group["correct"].astype(int).tolist())
        for diff, group in detailed_df.groupby("diff_ent")
    }

    return metrics, prompt_metrics, detailed_df, diff_metric


# ======================================================
# RUN
# ======================================================

# Evaluate the majority-vote predictions against the ground truth.
global_metrics, prompt_metrics, detailed, diff_metric = evaluate_results(
    results_csv="gpt5_airports_majority_vote.csv",
    ground_truth_csv="ground_truth.csv",
)

print("=== Global metrics ===")
for k, v in global_metrics.items():
    print(f"{k}: {v}")

print("\n=== Metrics per prompt ===")
for prompt, metrics in prompt_metrics.items():
    print(f"\nPrompt {prompt}")
    for k, v in metrics.items():
        print(f"  {k}: {v}")

# This section is keyed by the `diff_ent` value (what getEntityDiff returned
# for the entity), so it needs its own heading rather than the per-prompt one.
print("\n=== Metrics per entity difficulty ===")
for diff, metrics in diff_metric.items():
    print(f"\ndiff {diff}")
    for k, v in metrics.items():
        print(f"  {k}: {v}")

# Persist one row per comparison for later inspection.
detailed.to_csv("evaluation_detailed.csv", index=False)

In [None]:
# ======================================================
# ALL-IN-ONE CELL: Evaluation + Difficulty vs F1 Scatter
# ======================================================

import pandas as pd
import numpy as np
import re
import unicodedata
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from math import radians, sin, cos, sqrt, asin

# ======================================================
# TOLERANCE FUNCTIONS
# ======================================================

def normalize_tokens(s):
    """Tokenize a value into lowercase ASCII alphanumeric words.

    Steps: stringify and lowercase, NFD-decompose and discard combining
    marks (Unicode category ``Mn``), replace each run of characters outside
    ``a-z0-9`` with a space, then split on whitespace.

    Returns ``[]`` for ``None``.
    """
    if s is None:
        return []

    text = unicodedata.normalize('NFD', str(s).lower())
    base_chars = [ch for ch in text if unicodedata.category(ch) != 'Mn']
    text = re.sub(r'[^a-z0-9]+', ' ', ''.join(base_chars))
    return text.split()

def overlap_coefficient(s1, s2):
    """Score token overlap as ``|A ∩ B| / min(|A|, |B|)``.

    ``A`` and ``B`` are the distinct tokens produced by ``normalize_tokens``
    for ``s1`` and ``s2``. The score is ``0.0`` if either set is empty.
    """
    tokens_a = set(normalize_tokens(s1))
    tokens_b = set(normalize_tokens(s2))
    smaller = min(len(tokens_a), len(tokens_b))
    if smaller == 0:
        return 0.0
    return len(tokens_a & tokens_b) / smaller

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The haversine distance on a sphere of radius 6371 km.
    """
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    # Rounding can leave `a` a hair above 1 (or below 0) for (near-)antipodal
    # points; clamp so sqrt()/asin() never see an out-of-domain value.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return 6371 * c  # km

def relative_similarity(x, y):
    """Return ``1 - |x - y| / max(|x|, |y|)`` for two numeric-like values.

    Args:
        x, y: Anything accepted by ``float()``.

    Returns:
        ``1.0`` for equal values, decreasing as they diverge (negative when
        the signs differ). ``0.0`` when either value cannot be converted to
        a finite float.
    """
    from math import isfinite  # local import keeps the block self-contained

    try:
        x, y = float(x), float(y)
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only, so that interrupts and real bugs
        # are not silently turned into a 0.0 score.
        return 0.0

    # NaN / inf would otherwise produce a NaN score.
    if not (isfinite(x) and isfinite(y)):
        return 0.0

    # The 1e-9 floor avoids a division by zero when both values are 0.
    return 1 - abs(x - y) / max(abs(x), abs(y), 1e-9)

def considerTolerance(correct, predicted, prop):
    """Return whether ``predicted[prop]`` is correct under tolerance.

    Args:
        correct: Mapping of property name to ground-truth value.
        predicted: Mapping of property name to predicted value.
        prop: Name of the property to compare.

    Returns:
        ``None`` when the pair is not comparable (property missing or ``None``
        on either side, the other coordinate missing, or non-numeric
        ground-truth coordinates); otherwise a bool:

        * ``latitude`` / ``longitude``: judged together as one point, correct
          when the haversine distance is below 10 km.
        * string predictions: token overlap coefficient of at least 0.75.
        * anything else: relative similarity of at least 0.75.
    """
    if prop not in correct or prop not in predicted:
        return None

    ref = correct[prop]
    val = predicted[prop]

    if ref is None or val is None:
        return None

    if prop in ("latitude", "longitude"):
        coords = ("latitude", "longitude")
        if not all(k in correct and k in predicted for k in coords):
            return None

        # Convert up front: float() on a malformed or None coordinate would
        # otherwise abort the whole evaluation with ValueError/TypeError.
        try:
            ref_lat, ref_lon = (float(correct[k]) for k in coords)
        except (TypeError, ValueError):
            return None  # ground truth itself is unusable
        try:
            pred_lat, pred_lon = (float(predicted[k]) for k in coords)
        except (TypeError, ValueError):
            return False  # unparsable prediction counts as wrong

        return haversine(pred_lat, pred_lon, ref_lat, ref_lon) < 10

    if isinstance(val, str):
        return overlap_coefficient(ref, val) >= 0.75

    # relative_similarity returns 0.0 for unparsable input, so it does not
    # need to be wrapped in a try/except here.
    return relative_similarity(val, ref) >= 0.75

# ======================================================
# LOAD DIFFICULTY CSV
# ======================================================

def load_difficulty_csv(path):
    """Load the ``;``-separated difficulty table and parse its numeric columns.

    Column names are stripped of surrounding whitespace, and the ``SpecH``,
    ``SpecG`` and ``diff`` columns are converted from decimal-comma text
    (e.g. ``"0,75"``) to floats.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The loaded ``DataFrame``.
    """
    table = pd.read_csv(path, sep=";")
    table.columns = [label.strip() for label in table.columns]

    def _decimal_comma_to_float(series):
        # Go through str so both textual and already-numeric columns work.
        as_text = series.astype(str)
        return as_text.str.replace(",", ".", regex=False).astype(float)

    for name in ("SpecH", "SpecG", "diff"):
        table[name] = _decimal_comma_to_float(table[name])

    return table

# ======================================================
# EVALUATION FUNCTION
# ======================================================

def evaluate_results(results_csv, ground_truth_csv, id_column="icao"):
    """Score predicted properties against the ground truth, with tolerance.

    Args:
        results_csv: CSV of predictions. Must have a ``test`` column whose
            first four characters are the entity id and whose
            ``_``-separated parts 1 and 2 are the prompt id and the
            difficulty id.
        ground_truth_csv: CSV with the ground truth.
        id_column: Entity identifier column of the ground-truth CSV.

    Returns:
        A tuple ``(global_metrics, detailed_df, entity_metrics_df)``: overall
        metrics, one row per comparison, and per-entity F1/accuracy with
        columns ``Entity``, ``f1``, ``accuracy``, ``total_comparisons``.
        When nothing could be compared, the global scores are ``None`` and
        both frames are empty but keep their columns.

    NOTE: the reference labels are all 1, so recall equals accuracy and
    precision is 1.0 as soon as one prediction is correct.
    """
    results_df = pd.read_csv(results_csv)
    gt_df = pd.read_csv(ground_truth_csv).set_index(id_column)

    detailed_rows = []

    for _, pred_row in results_df.iterrows():
        test_name = pred_row["test"]
        entity_id = test_name[:4]
        name_parts = test_name.split("_")
        prompt_id, diff_id = name_parts[1], name_parts[2]

        if entity_id not in gt_df.index:
            continue

        predicted = pred_row.dropna().to_dict()
        correct = gt_df.loc[entity_id].dropna().to_dict()

        for prop in predicted:
            if prop == id_column or prop not in correct:
                continue

            tol = considerTolerance(correct, predicted, prop)
            if tol is None:
                continue

            detailed_rows.append({
                "id": entity_id,
                "prompt": prompt_id,
                "diff": diff_id,
                "property": prop,
                "correct": tol
            })

    # Explicit columns keep the frame groupable when there are no rows
    # (a column-less frame would raise KeyError in groupby below).
    detailed_df = pd.DataFrame(
        detailed_rows,
        columns=["id", "prompt", "diff", "property", "correct"],
    )

    y_pred = [1 if row["correct"] else 0 for row in detailed_rows]
    y_true = [1] * len(y_pred)
    has_data = bool(y_true)

    # Scores are undefined without any comparison; report None instead of
    # calling the scorers on empty input.
    global_metrics = {
        "accuracy": accuracy_score(y_true, y_pred) if has_data else None,
        "precision": precision_score(y_true, y_pred, zero_division=0) if has_data else None,
        "recall": recall_score(y_true, y_pred, zero_division=0) if has_data else None,
        "f1": f1_score(y_true, y_pred, zero_division=0) if has_data else None,
        "total_comparisons": len(y_true)
    }

    # =====================
    # Metrics per entity
    # =====================
    entity_rows = []

    for entity, group in detailed_df.groupby("id"):
        y_pred_e = group["correct"].astype(int).tolist()
        y_true_e = [1] * len(y_pred_e)

        entity_rows.append({
            "Entity": entity,
            "f1": f1_score(y_true_e, y_pred_e, zero_division=0),
            "accuracy": accuracy_score(y_true_e, y_pred_e),
            "total_comparisons": len(y_true_e)
        })

    entity_metrics_df = pd.DataFrame(
        entity_rows,
        columns=["Entity", "f1", "accuracy", "total_comparisons"],
    )

    return global_metrics, detailed_df, entity_metrics_df

# ======================================================
# RUN
# ======================================================

# Evaluate the majority-vote predictions and print the overall scores.
(global_metrics, detailed_df, entity_metrics_df) = evaluate_results(
    ground_truth_csv="ground_truth.csv",
    results_csv="claude_sonnet_airports_majority_vote.csv",
)

print("=== Global metrics ===")
for k in global_metrics:
    v = global_metrics[k]
    print(f"{k}: {v}")

# ======================================================
# DIFFICULTY vs F1 SCATTER
# ======================================================

# NOTE: no scatter plot is drawn here; this section builds the
# F1-per-(entity, difficulty) table and exports it to CSV.
difficulty_df = load_difficulty_csv("diff_per_task.csv")

# Average difficulty per entity
difficulty_entity = (
    difficulty_df.groupby("Entity", as_index=False)["diff"]
    .mean()
)

# Raw difficulty table used for the (Entity, diffProps) -> difficulty lookup.
diff_df = pd.read_csv("diff_per_task.csv", sep=";")
# Strip header whitespace, as load_difficulty_csv does, so the column
# accesses below cannot fail on a padded header.
diff_df.columns = [c.strip() for c in diff_df.columns]

# Normalize keys
diff_df["Entity"] = diff_df["Entity"].astype(str).str.strip().str.upper()
diff_df["diffProps"] = diff_df["diffProps"].astype(str).str.strip().str.lower()

# Parse numeric diff (decimal comma -> float)
diff_df["diff_numeric"] = (
    diff_df["diff"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

# Build a lookup dict: (Entity, diffProps) -> diff_numeric.
# On duplicate keys the last row wins.
diff_lookup = dict(
    zip(
        zip(diff_df["Entity"], diff_df["diffProps"]),
        diff_df["diff_numeric"],
    )
)

# ======================================================
# F1 per (Entity, diff) + numeric diff
# ======================================================

entity_diff_metrics = []

for (entity, diff), group in detailed_df.groupby(["id", "diff"]):
    y_true_ed = [1] * len(group)
    y_pred_ed = group["correct"].astype(int).tolist()

    entity_diff_metrics.append({
        "Entity": entity,
        "diff": diff,
        "f1": f1_score(y_true_ed, y_pred_ed, zero_division=0),
        # NaN when the (entity, diff) pair is absent from the difficulty table.
        "diff_numeric": diff_lookup.get(
            (entity.strip().upper(), diff.strip().lower()),
            np.nan
        )
    })

entity_diff_metrics_df = pd.DataFrame(entity_diff_metrics)
entity_diff_metrics_df.to_csv('entity_diff_f1.csv', index=False, sep=';', decimal=',')

In [None]:
# Metrics for every (entity, prompt, difficulty id) combination found in
# the detailed comparison table.
entity_prompt_diff_metrics = []

for (entity, prompt, diff_props), group in detailed_df.groupby(["id", "prompt", "diff"]):
    y_pred = group["correct"].astype(int).tolist()
    # Every compared property has a ground-truth value, so the reference
    # labels are all 1.
    y_true = [1] * len(y_pred)

    entity_prompt_diff_metrics.append(dict(
        entity=entity,
        prompt=prompt,
        diffProps=diff_props,
        accuracy=accuracy_score(y_true, y_pred) if y_true else None,
        precision=precision_score(y_true, y_pred, zero_division=0),
        recall=recall_score(y_true, y_pred, zero_division=0),
        f1=f1_score(y_true, y_pred, zero_division=0),
    ))

entity_prompt_diff_df = pd.DataFrame(entity_prompt_diff_metrics)

# Same table with the difficulty column under its reporting name.
entity_prompt_diff_dfNew = entity_prompt_diff_df.rename(columns={"diffProps": "diffPropQuartile"})

def map_prompt_name(prompt_id: str) -> str:
    """Translate a prompt id (``"p0"`` .. ``"p8"``) into its descriptive name.

    Raises:
        ValueError: If ``prompt_id`` is not one of the known ids.
    """
    names_by_id = dict(
        p0="compact",
        p1="few5",
        p2="few3",
        p3="few1",
        p4="zero",
        p5="5cot",
        p6="zerocot",
        p7="3cot",
        p8="1cot",
    )

    if prompt_id in names_by_id:
        return names_by_id[prompt_id]

    raise ValueError(f"Unknown prompt id: {prompt_id}")

# Replace the prompt ids with their descriptive names; an unknown id raises
# ValueError from map_prompt_name.
entity_prompt_diff_dfNew["prompt"] = entity_prompt_diff_dfNew["prompt"].map(map_prompt_name)

# Display the resulting table as the cell output.
entity_prompt_diff_dfNew