In [20]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import roc_auc_score

In [52]:
def make_split(test_df: pd.DataFrame, pos_classes: list[str], score_type: str) -> tuple:
    """Build binary targets and scores for one "in vs. positive classes" split.

    Only rows whose ``labels`` value is ``"in"`` or one of ``pos_classes`` are
    kept; every other row is dropped.

    Args:
        test_df: Frame with a ``labels`` column and a column named ``score_type``.
        pos_classes: Label values that count as the positive class.
        score_type: Name of the column that holds the scores.

    Returns:
        A ``(y_true, y_score)`` pair of Series sharing the same index:
        ``y_true`` is True where the label is one of ``pos_classes``, and
        ``y_score`` holds the corresponding values of the ``score_type`` column.
    """
    labels = test_df["labels"]
    # Keep the in-distribution rows together with the requested positive classes.
    keep = labels.isin(["in"] + list(pos_classes))
    is_positive = labels.loc[keep].isin(pos_classes)
    scores = test_df.loc[keep, score_type]
    return is_positive, scores


def read_baselines(base: Path, suffix: str = "baselines.jsonl"):
    """Compute AUROC for the baseline scores on every split.

    Reads the JSON-lines file ``base / suffix`` and, for each of the three
    splits (far-ood, near-ood, and both combined) and each of the columns
    ``mean_sp`` and ``min_sp``, evaluates ``roc_auc_score`` on the output of
    ``make_split``.

    Args:
        base: Directory that contains the baseline file.
        suffix: File name of the JSON-lines file inside ``base``.

    Returns:
        A DataFrame with one row per (split, score column) and the columns
        ``split``, ``score_type`` and ``AUROC``; ``AUROC`` is a string
        formatted to three decimals.
    """
    frame = pd.read_json(base / suffix, lines=True)
    ood_groups = (["far-ood"], ["near-ood"], ["far-ood", "near-ood"])
    score_columns = ("mean_sp", "min_sp")

    rows = []
    for ood_labels in ood_groups:
        title = "in vs " + " + ".join(ood_labels)
        for column in score_columns:
            targets, scores = make_split(frame, ood_labels, column)
            auroc = roc_auc_score(targets, scores)
            rows.append(
                {
                    "split": title,
                    "score_type": column,
                    "AUROC": format(auroc, ".3f"),
                }
            )

    return pd.DataFrame(rows)


def read_uq_sequence(base: Path):
    """Compute AUROC for the sequence-level uncertainty scores on every split.

    For each model (``mcdropout``, ``flipout``) and metric (``f1``,
    ``bertscore``) the file ``base / "{model}-large-{metric}.jsonl"`` is read
    as JSON lines, and the column named after the metric is scored with
    ``roc_auc_score`` on each of the three splits (far-ood, near-ood, and both
    combined).

    Args:
        base: Directory that contains the per-model result files.

    Returns:
        A DataFrame with the columns ``split``, ``score_type``
        (``"{model}-{metric}"``) and ``AUROC``; ``AUROC`` is a string
        formatted to three decimals.
    """
    ood_groups = (["far-ood"], ["near-ood"], ["far-ood", "near-ood"])
    rows = []

    for model_name in ("mcdropout", "flipout"):
        for metric_name in ("f1", "bertscore"):
            source = base / f"{model_name}-large-{metric_name}.jsonl"
            frame = pd.read_json(source, lines=True)
            tag = f"{model_name}-{metric_name}"
            for ood_labels in ood_groups:
                targets, scores = make_split(frame, ood_labels, metric_name)
                auroc = roc_auc_score(targets, scores)
                rows.append(
                    {
                        "split": "in vs " + " + ".join(ood_labels),
                        "score_type": tag,
                        "AUROC": format(auroc, ".3f"),
                    }
                )

    return pd.DataFrame(rows)


def read_uq_token(base: Path):
    """Compute AUROC for the token-level uncertainty scores on every split.

    Reads ``base / "{model}-large-token.jsonl"`` as JSON lines for each model
    (currently only ``mcdropout``) and scores the columns ``mean_mi`` and
    ``max_mi`` with ``roc_auc_score`` on each of the three splits (far-ood,
    near-ood, and both combined).

    Args:
        base: Directory that contains the per-model result files.

    Returns:
        A DataFrame with the columns ``split``, ``score_type``
        (``"{model}-{column}"``) and ``AUROC``; ``AUROC`` is a string
        formatted to three decimals.
    """
    ood_groups = (["far-ood"], ["near-ood"], ["far-ood", "near-ood"])
    score_columns = ("mean_mi", "max_mi")
    rows = []

    for model_name in ("mcdropout",):
        frame = pd.read_json(base / f"{model_name}-large-token.jsonl", lines=True)
        for ood_labels in ood_groups:
            title = "in vs " + " + ".join(ood_labels)
            for column in score_columns:
                targets, scores = make_split(frame, ood_labels, column)
                auroc = roc_auc_score(targets, scores)
                rows.append(
                    {
                        "split": title,
                        "score_type": f"{model_name}-{column}",
                        "AUROC": format(auroc, ".3f"),
                    }
                )

    return pd.DataFrame(rows)


def main(base: Path = Path("../results"), dataset: str = "webquestions"):
    """Print, per split, every score type ranked by AUROC.

    Collects the tables produced by ``read_baselines``, ``read_uq_sequence``
    and ``read_uq_token`` for ``base / dataset``, then prints one block per
    split with the score types sorted from highest to lowest AUROC.

    Args:
        base: Root directory that holds one sub-directory per dataset.
        dataset: Name of the dataset sub-directory to evaluate.
    """
    results_dir = Path(base) / dataset

    combined_df = pd.concat(
        [
            read_baselines(results_dir),
            read_uq_sequence(results_dir),
            read_uq_token(results_dir),
        ],
        ignore_index=True,
    )
    # The readers store AUROC as a formatted string; convert it so that the
    # sort below is numeric rather than lexicographic.
    combined_df["AUROC"] = combined_df["AUROC"].astype(float)

    for split, group in combined_df.groupby("split"):
        print(f"\n=== {split} ===")
        group_sorted = group.sort_values("AUROC", ascending=False)
        print(group_sorted[["score_type", "AUROC"]].to_string(index=False))

# Run the report when this cell is executed; its printed output follows.
main()



=== in vs far-ood ===
         score_type  AUROC
  mcdropout-mean_mi  0.796
   mcdropout-max_mi  0.773
mcdropout-bertscore  0.760
       mcdropout-f1  0.751
  flipout-bertscore  0.740
         flipout-f1  0.731
             min_sp  0.723
            mean_sp  0.718

=== in vs far-ood + near-ood ===
         score_type  AUROC
  mcdropout-mean_mi  0.775
   mcdropout-max_mi  0.757
mcdropout-bertscore  0.743
       mcdropout-f1  0.740
  flipout-bertscore  0.720
         flipout-f1  0.718
            mean_sp  0.708
             min_sp  0.706

=== in vs near-ood ===
         score_type  AUROC
  mcdropout-mean_mi  0.715
       mcdropout-f1  0.710
   mcdropout-max_mi  0.710
mcdropout-bertscore  0.695
         flipout-f1  0.682
            mean_sp  0.679
  flipout-bertscore  0.662
             min_sp  0.658


Removed:
- ROC curves
- EM scores and performance (#correct) per split, using the most common answer as the hard prediction and including answer normalization.
