In [2]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import average_precision_score, roc_auc_score

In [20]:
def make_split(test_df: pd.DataFrame, pos_classes: list[str], score_type: str) -> tuple:
    """Build binary targets and scores for one "in" vs. positive-classes split.

    Rows whose ``labels`` value is neither ``"in"`` nor a member of
    ``pos_classes`` are dropped.

    Args:
        test_df: Frame with a ``labels`` column and a ``score_type`` column.
        pos_classes: Label values that count as the positive class.
        score_type: Name of the column holding the scores to evaluate.

    Returns:
        ``(y_true, y_score)``: a boolean Series that is True where the label
        is in ``pos_classes``, and the matching score Series. Both keep the
        index of the retained rows.
    """
    labels = test_df["labels"]
    is_positive = labels.isin(pos_classes)
    # Keep the in-distribution rows plus every positive row.
    keep = is_positive | labels.isin(["in"])
    return is_positive[keep], test_df.loc[keep, score_type]


def read_baselines(base: Path, suffix: str = "baselines.jsonl"):
    """Evaluate the baseline score columns on every in-vs-OOD split.

    Loads the JSON-lines file ``base / suffix`` and, for each grouping of
    OOD labels (far only, near only, both) and each score column
    (``mean_sp``, ``min_sp``), computes AUROC and average precision on the
    targets and scores returned by ``make_split``.

    Returns:
        DataFrame with columns ``split``, ``score_type``, ``AUROC`` and
        ``PRAUC``; the two metric columns hold strings formatted to three
        decimals.
    """
    frame = pd.read_json(base / suffix, lines=True)
    ood_groups = (["far-ood"], ["near-ood"], ["far-ood", "near-ood"])
    score_columns = ("mean_sp", "min_sp")

    rows = []
    for ood_classes in ood_groups:
        label = "in vs " + " + ".join(ood_classes)
        for column in score_columns:
            targets, scores = make_split(frame, ood_classes, column)
            rows.append(
                {
                    "split": label,
                    "score_type": column,
                    "AUROC": format(roc_auc_score(targets, scores), ".3f"),
                    "PRAUC": format(average_precision_score(targets, scores), ".3f"),
                }
            )

    return pd.DataFrame(rows)


def read_uq_sequence(base: Path):
    """Evaluate the sequence-level result files on every in-vs-OOD split.

    For each model (``mcdropout``, ``flipout``) and metric (``f1``,
    ``bertscore``) the file ``{model}-large-{metric}.jsonl`` under ``base``
    is loaded, and the column named after the metric is used as the score.

    Returns:
        DataFrame with columns ``split``, ``score_type`` (``{model}-{metric}``),
        ``AUROC`` and ``PRAUC``; the two metric columns hold strings formatted
        to three decimals.
    """
    ood_groups = (["far-ood"], ["near-ood"], ["far-ood", "near-ood"])
    runs = [
        (model, metric)
        for model in ("mcdropout", "flipout")
        for metric in ("f1", "bertscore")
    ]

    rows = []
    for model, metric in runs:
        frame = pd.read_json(base / f"{model}-large-{metric}.jsonl", lines=True)
        for ood_classes in ood_groups:
            targets, scores = make_split(frame, ood_classes, metric)
            auroc = roc_auc_score(targets, scores)
            prauc = average_precision_score(targets, scores)
            rows.append(
                {
                    "split": "in vs " + " + ".join(ood_classes),
                    "score_type": f"{model}-{metric}",
                    "AUROC": format(auroc, ".3f"),
                    "PRAUC": format(prauc, ".3f"),
                }
            )

    return pd.DataFrame(rows)


def read_uq_token(base: Path):
    """Evaluate the token-level result file on every in-vs-OOD split.

    Loads ``mcdropout-large-token.jsonl`` under ``base`` and scores the
    ``mean_mi`` and ``max_mi`` columns for each grouping of OOD labels.

    Returns:
        DataFrame with columns ``split``, ``score_type``
        (``{model}-{score column}``), ``AUROC`` and ``PRAUC``; the two metric
        columns hold strings formatted to three decimals.
    """
    ood_groups = (["far-ood"], ["near-ood"], ["far-ood", "near-ood"])
    score_columns = ("mean_mi", "max_mi")

    rows = []
    for model in ("mcdropout",):
        frame = pd.read_json(base / f"{model}-large-token.jsonl", lines=True)
        for ood_classes in ood_groups:
            label = "in vs " + " + ".join(ood_classes)
            for column in score_columns:
                targets, scores = make_split(frame, ood_classes, column)
                auroc = roc_auc_score(targets, scores)
                prauc = average_precision_score(targets, scores)
                rows.append(
                    {
                        "split": label,
                        "score_type": f"{model}-{column}",
                        "AUROC": format(auroc, ".3f"),
                        "PRAUC": format(prauc, ".3f"),
                    }
                )

    return pd.DataFrame(rows)


def main(
    base: Path = Path("../results"),
    dataset: str = "webquestions",
    *,
    include_uq: bool = False,
):
    """Print per-split rankings of every score type by AUROC, then by PRAUC.

    Args:
        base: Directory that contains one sub-directory per dataset.
        dataset: Name of the sub-directory of ``base`` to evaluate.
        include_uq: When True, the rows from ``read_uq_sequence`` and
            ``read_uq_token`` are appended to the baseline rows. The default
            (False) evaluates the baselines only.

    For each metric, one table per split is written to stdout, sorted by that
    metric in descending order.
    """
    results_dir = base / dataset

    frames = [read_baselines(results_dir)]
    if include_uq:
        frames.append(read_uq_sequence(results_dir))
        frames.append(read_uq_token(results_dir))
    combined_df = pd.concat(frames).reset_index(drop=True)

    for metric in ("AUROC", "PRAUC"):
        # The readers return metrics as formatted strings; cast to float so
        # the sort below is numeric rather than lexicographic.
        combined_df[metric] = combined_df[metric].astype(float)

        for split, group in combined_df.groupby("split"):
            print(f"\n=== {split} ===")
            ranked = group.sort_values(metric, ascending=False)
            print(ranked[["score_type", metric]].to_string(index=False))


# Run the evaluation and print the ranking tables when this cell executes.
main()



=== in vs far-ood ===
score_type  AUROC
   mean_sp  0.723
    min_sp  0.721

=== in vs far-ood + near-ood ===
score_type  AUROC
   mean_sp  0.711
    min_sp  0.701

=== in vs near-ood ===
score_type  AUROC
   mean_sp  0.675
    min_sp  0.644

=== in vs far-ood ===
score_type  PRAUC
    min_sp  0.879
   mean_sp  0.876

=== in vs far-ood + near-ood ===
score_type  PRAUC
   mean_sp  0.900
    min_sp  0.898

=== in vs near-ood ===
score_type  PRAUC
   mean_sp  0.673
    min_sp  0.633


Removed:
- ROC curves
- EM scores and performance (#correct) per split, using the most common answer as the hard prediction and including answer normalization.
