In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

from src.benchmark import load_benchmark

# Benchmark definition that is handed to load_benchmark().
benchmark_path = Path("../benchmarks/benchmark-difficult")
# Prediction files are read from <runs_path>/<experiment>/<screen_file>.
runs_path = Path("runs-benchmark-difficult")
# Output directory for the CSV tables written by this notebook.
figs_path = Path("figs-benchmark-difficult")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before any cell tries to write into it.
figs_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the benchmark. Later cells treat the result as a DataFrame with a
# "screen_file" column (used for grouping) and a truthy/falsy "hit" column.
screens = load_benchmark(benchmark_path)

In [None]:
# Per-screen class balance, keyed by the screen file name up to its first ".".
# NOTE: the "hits" entry holds the total number of rows of the screen; the
# positives are stored under "yes" and the remainder under "no".
screen_counts = {}
for screen_file, hit_flags in screens.groupby("screen_file")["hit"]:
    screen_name = screen_file.partition(".")[0]
    total = len(hit_flags)
    positives = hit_flags.sum()
    negatives = total - positives
    percent_positive = positives / total * 100
    print(f"{screen_name}: {positives: >3} / {total: >3} ({percent_positive:0.2f}%)")
    screen_counts[screen_name] = dict(hits=total, yes=positives, no=negatives)

In [None]:
# One row per screen, with the "hits" / "yes" / "no" counts as columns.
pd.DataFrame(screen_counts).transpose()

In [None]:
# Persist the per-screen counts table (one row per screen) next to the other outputs.
pd.DataFrame(screen_counts).transpose().to_csv(figs_path.joinpath("data_counts.csv"))

In [None]:
def eval(y_true, y_pred):
    """Compute binary classification metrics from "Yes"/"No" labels.

    NOTE: the name shadows the builtin ``eval``; it is kept unchanged so the
    existing callers in this notebook keep working.

    Parameters
    ----------
    y_true : sequence of str
        Ground-truth labels, paired with ``y_pred`` by position.
    y_pred : sequence of str
        Predicted labels. Only pairs in which both labels are exactly "Yes"
        or "No" are counted; any other value (e.g. a missing answer) is left
        out of every count, which is why "n-binary-pred" can be smaller than
        the number of inputs.

    Returns
    -------
    dict
        "n-binary-pred" (number of counted pairs) plus "f1", "ppv", "npv",
        "sensitivity", "specificity" and "fpr". A metric whose denominator
        is zero is reported as NaN.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different lengths. Without this
        check the element-wise comparison would silently align on the index
        and produce misleading counts.
    """
    # Drop any incoming index so the two inputs are compared position by position.
    y_true = pd.Series(y_true).reset_index(drop=True)
    y_pred = pd.Series(y_pred).reset_index(drop=True)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    def _count(true_label, pred_label):
        # Number of positions with the given (truth, prediction) combination.
        return int(((y_true == true_label) & (y_pred == pred_label)).sum())

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN explicitly instead of relying on numpy's
        # warn-and-return-nan behaviour for 0 / 0.
        return numerator / denominator if denominator else float("nan")

    tp = _count("Yes", "Yes")
    tn = _count("No", "No")
    fp = _count("No", "Yes")
    fn = _count("Yes", "No")

    metrics = {
        "n-binary-pred": tp + tn + fp + fn,
        "f1": _ratio(2 * tp, 2 * tp + fp + fn),
        "ppv": _ratio(tp, tp + fp),
        "npv": _ratio(tn, fn + tn),
        "sensitivity": _ratio(tp, tp + fn),
        "specificity": _ratio(tn, fp + tn),
        "fpr": _ratio(fp, fp + tn),
    }

    return metrics

In [None]:
# Few-shot prompt variants: (label used in the result tables, filename fragment).
_FEWSHOT_VARIANTS = [
    ("0-shot",       "0-shot"),
    ("1-shot (+)",   "1-shot-pos"),
    ("1-shot (-)",   "1-shot-neg"),
    ("2-shot (+/-)", "2-shot-pos-neg"),
    ("2-shot (-/+)", "2-shot-neg-pos"),
]

# Evaluated models: (display name, run-directory prefix, CoT flags in table order).
# Flag "Y" adds the "-cot" suffix to the run name, "N" does not; the o1 models
# carry the single flag "I" and have no "-cot" variant.
_MODEL_SPECS = [
    ("Llama-2-7B",    "llama-2-7B-chat",        "NY"),
    ("Llama-2-13B",   "llama-2-13B-chat",       "NY"),
    ("Llama-2-70B",   "llama-2-70B-chat",       "NY"),
    ("Llama-3-8B",    "llama-3-8B-instruct",    "NY"),
    ("Llama-3-70B",   "llama-3-70B-instruct",   "NY"),
    ("Llama-3.1-8B",  "llama-3.1-8B-instruct",  "NY"),
    ("Llama-3.1-70B", "llama-3.1-70B-instruct", "NY"),
    ("Llama-3.2-1B",  "llama-3.2-1B-instruct",  "NY"),
    ("Llama-3.2-3B",  "llama-3.2-3B-instruct",  "NY"),
    ("Llama-3.3-70B", "llama-3.3-70B-instruct", "NY"),
    ("o1",            "o1-2024-12-17",          "I"),
    ("o1-mini",       "o1-mini-2024-09-12",     "I"),
    ("GPT-4o",        "gpt-4o-2024-11-20",      "NY"),
    ("GPT-4o-mini",   "gpt-4o-mini-2024-07-18", "NY"),
    ("GPT-4-turbo",   "gpt-4-turbo-2024-04-09", "NY"),
    ("GPT-4",         "gpt-4-0125-preview",     "NY"),
    ("GPT-3.5-turbo", "gpt-3.5-turbo-0125",     "NY"),
]

# Each entry is (model, cot, fewshot, run directory name), ordered model by
# model, then by CoT flag, then by few-shot variant.
runs = [
    (model, cot, fewshot, f"{prefix}-{suffix}" + ("-cot" if cot == "Y" else ""))
    for model, prefix, cot_flags in _MODEL_SPECS
    for cot in cot_flags
    for fewshot, suffix in _FEWSHOT_VARIANTS
]
# The random baseline has no prediction files, hence the placeholder fields.
runs.append(("Random", "N/A", "N/A", "N/A"))

In [None]:
# Fixed seed so the "Random" baseline gives the same numbers on every
# Restart & Run All instead of changing with each execution.
RANDOM_BASELINE_SEED = 0
rng = np.random.default_rng(RANDOM_BASELINE_SEED)

# The ground-truth labels of a screen do not depend on the run, so build them
# once (in order of first appearance) instead of once per run.
y_true_by_screen = {
    screen_file: np.where(screens.loc[screens["screen_file"] == screen_file, "hit"], "Yes", "No")
    for screen_file in screens["screen_file"].unique()
}

# Metrics per run, keyed by (model, fewshot, cot).
experiments = dict()
for model, cot, fewshot, experiment in runs:
    y_true = []
    y_pred = []
    for screen_file, _y_true in y_true_by_screen.items():
        if model == "Random":
            # Baseline: sample labels with the screen's own Yes/No proportions.
            n_yes = (_y_true == "Yes").sum()
            n_no = (_y_true == "No").sum()
            n = len(_y_true)
            _y_pred = rng.choice(["Yes", "No"], size=n, replace=True, p=[n_yes/n, n_no/n])
        else:
            pred_df = pd.read_csv(runs_path / experiment / screen_file, sep="\t")
            _y_pred = pred_df["answer"]
            # Labels and predictions are paired by position; a row-count
            # mismatch would silently shift every later pairing.
            if len(_y_pred) != len(_y_true):
                raise ValueError(
                    f"{experiment}/{screen_file}: expected {len(_y_true)} predictions, "
                    f"found {len(_y_pred)}"
                )

        y_true.extend(_y_true)
        y_pred.extend(_y_pred)
    metrics = eval(y_true, y_pred)
    experiments[(model, fewshot, cot)] = metrics

In [None]:
# One row per (model, fewshot, cot) key of `experiments`, one column per metric.
results = (
    pd.DataFrame(experiments)
    .transpose()
    .rename_axis(index=["model", "fewshot", "cot"])
)
# Lift the row limit for this one display only, so every run is visible.
with pd.option_context("display.max_rows", None):
    display(results)

In [None]:
# Persist the metrics table; the aggregate cells further down read this file back.
results.to_csv(figs_path.joinpath("experiment_results.csv"))

### Compute aggregates or isolate specific results

In [None]:
import pandas as pd

In [None]:
# Reload the saved metrics as a flat table: the cells below access "model",
# "fewshot" and "cot" as ordinary columns.
# NOTE(review): with read_csv's default NA handling the literal "N/A" stored in
# the "Random" row is presumably parsed as NaN - confirm that this is acceptable.
results = pd.read_csv(figs_path.joinpath("experiment_results.csv"))

In [None]:
# Keep only the GPT-4o rows; `temp` is reused by the LaTeX cell that follows.
temp = results.loc[results["model"].isin(["GPT-4o"])]

# Summarise each metric over those rows as "mean ± std" with two decimals.
_metric_columns = ["f1", "fpr", "ppv", "npv", "sensitivity", "specificity"]
_two_decimals = lambda value: format(value, "0.2f")
temp[_metric_columns].mean().apply(_two_decimals) + " ± " + temp[_metric_columns].std().apply(_two_decimals)

In [None]:
# Emit the selected rows as a LaTeX table (no index column, two-decimal floats).
_latex_columns = ["model", "fewshot", "cot", "f1", "fpr", "ppv", "npv", "sensitivity", "specificity"]
_latex_table = temp[_latex_columns].to_latex(index=False, float_format="%.2f")
print(_latex_table)

In [None]:
# Per-model mean and standard deviation of F1 and FPR across that model's runs.
_f1_fpr_by_model = results.groupby("model")[["f1", "fpr"]]
mean = _f1_fpr_by_model.mean()
std = _f1_fpr_by_model.std()

In [None]:
# Format every cell as "mean±std" with two decimals, then order the rows by
# each model's first appearance in `results`.
_mean_text = mean.map(lambda value: format(value, "0.2f"))
_std_text = std.map(lambda value: format(value, "0.2f"))
(_mean_text + "±" + _std_text).loc[results["model"].drop_duplicates()]

In [None]:
# Same per-model "mean ± std" table, rendered as LaTeX with the rows ordered by
# each model's first appearance in `results`.
_mean_cells = mean.map(lambda value: format(value, "0.2f"))
_std_cells = std.map(lambda value: format(value, "0.2f"))
_summary = (_mean_cells + " ± " + _std_cells).loc[results["model"].drop_duplicates()]
print(_summary.to_latex())