In [1]:
%load_ext autoreload
%autoreload 2

# Add the sibling "src" directory to the module search path; the `benchmark` and
# `utils_ext` imports in the next cell are presumably resolved from there.
import sys
sys.path.append("../src")

In [None]:
import logging
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np

from benchmark import (
    INVALID_ANSWER,
    NO_ANSWER,
    VALID_ANSWER,
    aggregate_responses,
    calibration_curve,
    detect_names_from_dict,
    empirical_distr,
    extract_predictions,
    kl_div,
    load_predictions,
    load_responses_all,
    plot_calibration_curve,
    plot_heatmap,
)
from utils_ext.plot import Plotter
from utils_ext.tools import setup_logging

# Turn off matplotlib's interactive mode; figures are handed to Plotter.finish(...)
# in the plotting cells further down.
plt.ioff()
setup_logging()

logger = logging.getLogger(__name__)

# Root directory for inputs read below (responses / predictions_sampled) and for
# the saved plots (plots_paper).
PATH_OUTPUT = "../results"

# setup plotter

# Font sizes: default for the figures, small for bar labels and the "agg. over" annotation.
FONTSIZE_DEFAULT = 6
FONTSIZE_SMALL = 5

Plotter.setup()
Plotter.configure(
    basewidth=5.5,
    fontsize=FONTSIZE_DEFAULT,
    latex=False,
    rcparams={
        "lines.linewidth": 1, # default: 1.5
        "axes.labelpad": 1, # default: 4
    },
    save_dir=f"{PATH_OUTPUT}/plots_paper",
    save_format="pdf",
)
# NOTE(review): this second call passes latex=True after the call above passed
# latex=False -- presumably later settings override earlier ones, which would make
# this block the switch for LaTeX rendering; confirm against Plotter.configure.
Plotter.configure(
    latex=True,
    latex_preamble="\n".join([
        r"\usepackage[utf8]{inputenc}",
        r"\usepackage[T1]{fontenc}",
        r"\usepackage{microtype}",
        r"\usepackage{amsmath,amssymb,amsfonts,mathrsfs}",
        r"\renewcommand{\rmdefault}{ptm}",
        r"\renewcommand{\sfdefault}{phv}",
    ]),
)

# Give notebook widget outputs a light gray background.
Plotter.display_css(".cell-output-ipywidget-background { background: lightgray !important; }")

In [3]:
# Cache dict handed to load_responses_all(...) below; presumably kept in its own
# cell so that re-running the loading cell does not reset it.
DATASET_CACHE = {}

## Load results

In [None]:
# OPTION 1: load responses and extract predictions
# NOTE: this is the only cell that defines `responses_all`, which the answer-statistics
# cell in the appendix section (`plot(responses_all)`) needs.
responses_all = load_responses_all(f"{PATH_OUTPUT}/responses", dataset_cache=DATASET_CACHE)
y_true_all, y_pred_all = extract_predictions(responses_all, sample=1000)

In [4]:
# OPTION 2: load predictions only
# Rebinds y_true_all / y_pred_all, replacing the values from OPTION 1 if that cell
# was run as well. Does not define `responses_all`.
y_true_all, y_pred_all = load_predictions(f"{PATH_OUTPUT}/predictions_sampled")

## Paper Plots

In [6]:
# One color per plotted metric (used for both the bars and the connecting lines).
COLOR_ACCURACY = "tab:blue"
COLOR_CONFIDENCE = "tab:green"
COLOR_ECE = "tab:red"
COLOR_CONF_N_DISTINCT = "darkorange"
COLOR_CONF_VARIANCE = "orange"
COLOR_KL_DIV = "mediumpurple"

# y-axis limits: calibration figures, then the left (# distinct) and right
# (variance / kl_div) axes of the "others" figures.
YLIM_CALIBRATION = (0, 1.12)
YLIM_OTHERS_1 = (0, 112)
YLIM_OTHERS_2 = (0, 0.23)

# plotting utils

# Model groups used when aggregating; they appear as models[tiny] / models[large]
# in the plot annotations and as -tinymodels / -largemodels in the file names.
MODELS_TINY = [
    # "gemma1.1-2b-it",
    "gemma1.1-7b-it",
    "llama3-8b-instruct",
    "qwen1.5-7b-chat",
]
MODELS_LARGE = [
    "llama3-70b-instruct",
    "qwen1.5-32b-chat",
    "qwen1.5-72b-chat",
    "qwen1.5-110b-chat",
    "gpt3.5-turbo",
    "gpt4o-mini",
    "gpt4o",
]

# Method lists, one per comparison figure. A None entry is a group separator:
# make_plots_methods() strips it via extract_none_indices() and plot_grouped_bar()
# uses its position to break the connecting lines between groups (entries that
# end up alone in their group get a bar but no line).
METHODS_SCORE_RANGE = [
    "basic_1s",
    None,
    "basic_1s_scorefloat",
    None,
    "basic_1s_scoreletter",
    None,
    "basic_1s_scoretext",
]
METHODS_SCORE_FORMULATION = [
    "basic_1s",
    "basic_1s_probscore",
    None,
    "advanced_1s",
    "advanced_1s_probscore",
    None,
    "tian2023just_1s_top1_v3",
    "tian2023just_1s_top1_v2",
    "tian2023just_1s_top1",
]
METHODS_ADVANCED_FORMULATION = [
    "basic_1s",
    "advanced_1s",
    None,
    "basic_1s_probscore",
    "advanced_1s_probscore",
]
METHODS_FEW_SHOT = [
    "basic_1s",
    "basic_1s_1shot",
    "basic_1s_5shot",
]
# "tian2023just_1s_top1" is listed twice here, once in each of the first two groups.
METHODS_OTHERS = [
    "tian2023just_1s_top1",
    "tian2023just_1s_top1_v1",
    None,
    "tian2023just_1s_top1",
    "tian2023just_1s_top4",
    None,
    "xiong2023can_vanilla",
    "xiong2023can_cot",
]
METHODS_COMBO = [
    "basic_1s",
    None,
    "basic_1s_probscore",
    None,
    "advanced_1s",
    None,
    "basic_1s_5shot",
    None,
    "combo_1s_v2",
]

# Internal name -> display name, used for tick and heatmap labels through the
# translate_*() helpers. Every name passed to those helpers must have an entry here.
# The dataset table currently maps each name to itself.
DATASET_NAME_ALIASES = {
    "arc-c":           "arc-c",
    "arc-e":           "arc-e",
    "commonsense_qa":  "commonsense_qa",
    "imdb":            "imdb",
    "logi_qa":         "logi_qa",
    "mmlu":            "mmlu",
    "sciq":            "sciq",
    "social_i_qa":     "social_i_qa",
    "trivia_qa":       "trivia_qa",
    "truthful_qa-mc1": "truthful_qa-mc1",
    "truthful_qa-mc2": "truthful_qa-mc2",
}
# Model display names drop the "-it" / "-instruct" / "-chat" suffixes.
MODEL_NAME_ALIASES = {
    "gemma1.1-2b-it":      "gemma1.1-2b",
    "gemma1.1-7b-it":      "gemma1.1-7b",
    "llama3-8b-instruct":  "llama3-8b",
    "llama3-70b-instruct": "llama3-70b",
    "qwen1.5-7b-chat":     "qwen1.5-7b",
    "qwen1.5-32b-chat":    "qwen1.5-32b",
    "qwen1.5-72b-chat":    "qwen1.5-72b",
    "qwen1.5-110b-chat":   "qwen1.5-110b",
    "gpt3.5-turbo":        "gpt3.5-turbo",
    "gpt4o-mini":          "gpt4o-mini",
    "gpt4o":               "gpt4o",
}
# Method display names drop the "_1s" infix and shorten the citation-style prefixes.
METHOD_NAME_ALIASES = {
    "basic_1s":                  "basic",
    "basic_1s_scorefloat":       "basic_scorefloat",
    "basic_1s_scoreletter":      "basic_scoreletter",
    "basic_1s_scoretext":        "basic_scoretext",
    "basic_1s_probscore":        "basic_probscore",
    "basic_1s_1shot":            "basic_1shot",
    "basic_1s_5shot":            "basic_5shot",
    "advanced_1s":               "advanced",
    "advanced_1s_probscore":     "advanced_probscore",
    "combo_1s_v2":               "combo",
    "tian2023just_1s_top1":      "tian2023_top1",
    "tian2023just_1s_top1_v1":   "tian2023_top1_v1",
    "tian2023just_1s_top1_v2":   "tian2023_top1_v2",
    "tian2023just_1s_top1_v3":   "tian2023_top1_v3",
    "tian2023just_1s_top4":      "tian2023_top4",
    "xiong2023can_vanilla":      "xiong2023_vanilla",
    "xiong2023can_cot":          "xiong2023_cot",
}

def translate(names, aliases):
    """Look up display aliases for one name or for a collection of names.

    A single string is mapped to a single alias; anything else is iterated and
    mapped element-wise into a list. Names missing from `aliases` raise KeyError.
    """
    if not isinstance(names, str):
        return [aliases[entry] for entry in names]
    return aliases[names]
def translate_dataset(names):
    """Display alias(es) for dataset name(s), looked up in DATASET_NAME_ALIASES."""
    return translate(names, aliases=DATASET_NAME_ALIASES)
def translate_model(names):
    """Display alias(es) for model name(s), looked up in MODEL_NAME_ALIASES."""
    return translate(names, aliases=MODEL_NAME_ALIASES)
def translate_method(names):
    """Display alias(es) for method name(s), looked up in METHOD_NAME_ALIASES."""
    return translate(names, aliases=METHOD_NAME_ALIASES)

def extract_none_indices(names):
    """Separate the real names from the None group separators.

    Returns `(names, none_indices)`: the names with every None removed, and for
    each None the index in that filtered list in front of which it stood (its
    original position minus the number of None entries before it). These are
    the indices np.insert() needs to put the separators back.
    """
    kept = []
    separator_positions = []
    for position, name in enumerate(names):
        if name is None:
            separator_positions.append(position)
        else:
            kept.append(name)
    preceding_separators = np.arange(len(separator_positions))
    return kept, list(separator_positions - preceding_separators)

def plot_grouped_bar(axes_scores_args, width=None, alpha=None, with_labels=False, with_lines=False, none_indices=None):
    """Draw several score series as side-by-side bars, optionally with labels and lines.

    Args:
        axes_scores_args: sequence of `(ax, scores, args)` triples, one per series.
            `scores` holds one value per x position; `args` is a dict with the keys
            "label" and "color" and, optionally, "fmt" (bar label format).
        width: width of a single bar; defaults to `1 / (n_series + 1.5)`.
        alpha: bar opacity; defaults to 0.5 when lines are drawn, else 1.
        with_labels: annotate every bar with its value (rotated by 90 degrees).
        with_lines: connect the bar tops of each series with a line. The line is
            broken at the group separators and skipped for positions that are
            alone in their group.
        none_indices: indices (into the score sequence) in front of which a group
            separator sits, as returned by `extract_none_indices`. None means no
            separators.

    An empty `axes_scores_args` draws nothing.
    """
    def grouped_line_indices(is_gap):
        # Indices into the separator-expanded arrays that the line should visit.
        # A position is kept only if a direct neighbour is a real value too; a None
        # entry marks the break between two groups.
        indices = []
        n_positions = len(is_gap)
        for i in range(n_positions):
            if is_gap[i]:
                continue
            has_left = i > 0 and not is_gap[i - 1]
            has_right = i + 1 < n_positions and not is_gap[i + 1]
            if has_left:
                indices.append(i)
            elif has_right:
                if indices:
                    indices.append(None)
                indices.append(i)
        return indices

    def take(values, indices):
        # Gather `values` at `indices`, writing NaN (a line break) for None.
        return np.asarray([np.nan if j is None else values[j] for j in indices], dtype=float)

    if len(axes_scores_args) == 0:
        return
    if none_indices is None:
        none_indices = []

    n_bars = len(axes_scores_args)
    n_scores = len(axes_scores_args[0][1])
    if width is None:
        width = 1 / (n_bars + 1.5)
    if alpha is None:
        alpha = 0.5 if with_lines else 1

    x = np.arange(n_scores, dtype=float)
    # center the group of bars on each integer x position
    offset = (n_bars - 1) / 2
    # plot bars
    for i, (ax, scores, args) in enumerate(axes_scores_args):
        bar_container = ax.bar(x+(i-offset)*width, scores, width, color=args["color"], alpha=alpha, label=args["label"])
        if with_labels:
            # "%g" is matplotlib's own default for bar_label when no format is given
            ax.bar_label(bar_container, fmt=args.get("fmt", "%g"), fontsize=FONTSIZE_SMALL, padding=3, rotation=90)
    # plot lines
    if with_lines:
        x_expanded = np.insert(x, none_indices, np.nan)
        # The grouping is derived from the separator positions only and then applied
        # to x and to the scores alike, so a NaN score cannot desynchronize the two.
        indices = grouped_line_indices(np.isnan(x_expanded))
        x_line = take(x_expanded, indices)
        for i, (ax, scores, args) in enumerate(axes_scores_args):
            # cast to float so that NaN separators can also be inserted into integer scores
            scores_expanded = np.insert(np.asarray(scores, dtype=float), none_indices, np.nan)
            ax.plot(x_line+(i-offset)*width, take(scores_expanded, indices), color=args["color"], marker="o", markersize=2)

def annotate_agg_over(ax, agg_over):
    """Write "agg. over <agg_over>" right-aligned just above the top edge of `ax`."""
    note = f"agg. over {agg_over}"
    ax.text(
        1.0, 1.02, note,
        ha="right", va="bottom",
        fontsize=FONTSIZE_SMALL,
        transform=ax.transAxes,  # position is given in axes coordinates
    )

# plotting functions

def make_plots_datasets(y_true_all, y_pred_all, agg_over, dataset_names=None, model_names=None, method_names=None, sort_by=None, label_rotation=45, n_bins=20):
    """Create the per-dataset calibration figure and the per-dataset "others" figure.

    For every dataset the responses are aggregated over `model_names` and
    `method_names`, and six scores are computed: ECE, accuracy (mean of y_true),
    confidence (mean of y_pred), number of distinct confidence values, spread of
    the confidence values, and a mean kl_div value (see the inline comments).

    Args:
        y_true_all, y_pred_all: nested result containers as returned by the loading
            cells; only passed on to `detect_names_from_dict` / `aggregate_responses`.
        agg_over: text shown in the "agg. over ..." annotation of both figures.
        dataset_names, model_names, method_names: names to include; passed to
            `detect_names_from_dict` (presumably None means "all names found").
        sort_by: key of the score to sort the datasets by (descending), or None
            to keep the given order.
        label_rotation: rotation of the x tick labels in degrees.
        n_bins: number of bins for `calibration_curve` and `empirical_distr`.

    Returns:
        `(fig1, fig2)`: the accuracy/confidence/ECE figure and the
        "# distinct"/"variance"/"kl_div" figure (the latter with a twin y-axis).
    """
    dataset_names, model_names, method_names = detect_names_from_dict(y_true_all, dataset_names=dataset_names, model_names=model_names, method_names=method_names)

    # compute
    scores = {
        "ece": np.zeros(len(dataset_names)),
        "accuracy": np.zeros(len(dataset_names)),
        "confidence": np.zeros(len(dataset_names)),
        "confidence_n_distinct": np.zeros(len(dataset_names)),
        "confidence_variance": np.zeros(len(dataset_names)),
        "kl_div_over_dataset": np.zeros(len(dataset_names)),
    }
    for i, dataset_name in enumerate(dataset_names):
        y_true = aggregate_responses(y_true_all, dataset_name, model_names, method_names)
        y_pred = aggregate_responses(y_pred_all, dataset_name, model_names, method_names)

        prob_true, prob_pred, bins, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
        # ECE: |prob_true - prob_pred| per bin, weighted by the bin's share of samples;
        # `where` leaves empty bins out of the sum.
        ece = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
        scores["ece"][i] = ece
        scores["accuracy"][i] = np.mean(y_true)
        scores["confidence"][i] = np.mean(y_pred)
        scores["confidence_n_distinct"][i] = len(np.unique(y_pred))
        # NOTE(review): stored and plotted as "variance", but np.std is the standard
        # deviation -- confirm which one is intended.
        scores["confidence_variance"][i] = np.std(y_pred)
        # Mean over all (model, method) pairs of kl_div between the confidence
        # distribution on this dataset and the one on all `dataset_names` together.
        # NOTE(review): the second distribution does not depend on `dataset_name`
        # and is recomputed for every dataset.
        scores["kl_div_over_dataset"][i] = np.mean([
            kl_div(
                empirical_distr(aggregate_responses(y_pred_all, dataset_name, model_name, method_name), n_bins),
                empirical_distr(aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins),
            )
            for model_name in model_names
            for method_name in method_names
        ])

    # sort
    if sort_by is not None:
        # Sort all score series together, descending by the chosen score (the sort
        # key is the extra first tuple element). Afterwards `dataset_names` and the
        # score entries are tuples rather than lists/arrays.
        scores_zipped = zip(scores[sort_by], dataset_names, scores["accuracy"], scores["confidence"], scores["ece"], scores["confidence_n_distinct"], scores["confidence_variance"], scores["kl_div_over_dataset"])
        scores_zipped = sorted(scores_zipped, reverse=True)
        _, dataset_names, scores["accuracy"], scores["confidence"], scores["ece"], scores["confidence_n_distinct"], scores["confidence_variance"], scores["kl_div_over_dataset"] = zip(*scores_zipped)

    # plot
    x = np.arange(len(dataset_names))

    # figure 1: accuracy / confidence / ECE on one axis
    fig1, ax = Plotter.create()
    Plotter.set(
        ax,
        ylim=YLIM_CALIBRATION,
        xticks=dict(ticks=x, labels=translate_dataset(dataset_names), rotation=label_rotation, ha="right"),
    )
    plot_grouped_bar([
        (ax, scores["accuracy"], dict(label="accuracy", color=COLOR_ACCURACY)),
        (ax, scores["confidence"], dict(label="confidence", color=COLOR_CONFIDENCE)),
        (ax, scores["ece"], dict(label="ECE", color=COLOR_ECE)),
    ], with_lines=True)
    annotate_agg_over(ax, agg_over)

    # figure 2: "# distinct" on the left axis, "variance" and "kl_div" on the twin right axis
    fig2, ax1 = Plotter.create()
    ax2 = ax1.twinx()
    Plotter.set(
        ax1,
        ylim=YLIM_OTHERS_1,
        xticks=dict(ticks=x, labels=translate_dataset(dataset_names), rotation=label_rotation, ha="right"),
    )
    Plotter.set(
        ax2,
        ylim=YLIM_OTHERS_2,
    )
    plot_grouped_bar([
        (ax1, scores["confidence_n_distinct"], dict(label="# distinct", color=COLOR_CONF_N_DISTINCT)),
        (ax2, scores["confidence_variance"], dict(label="variance", color=COLOR_CONF_VARIANCE)),
        (ax2, scores["kl_div_over_dataset"], dict(label="kl_div", color=COLOR_KL_DIV)),
    ], with_lines=True)
    annotate_agg_over(ax1, agg_over)

    return fig1, fig2

def make_plots_models(y_true_all, y_pred_all, agg_over, model_names=None, method_names=None, sort_by=None, label_rotation=45, n_bins=20):
    """Create the per-model calibration figure and the per-model "others" figure.

    For every model the responses are aggregated over all detected datasets and
    over `method_names`, and six scores are computed: ECE, accuracy (mean of
    y_true), confidence (mean of y_pred), number of distinct confidence values,
    spread of the confidence values, and a mean kl_div value (see inline comments).

    Args:
        y_true_all, y_pred_all: nested result containers as returned by the loading
            cells; only passed on to `detect_names_from_dict` / `aggregate_responses`.
        agg_over: text shown in the "agg. over ..." annotation of both figures.
        model_names: models to include, in plot order. None entries act as group
            separators (they break the connecting lines). None for the whole
            argument is passed on to `detect_names_from_dict`.
        method_names: methods to aggregate over; passed to `detect_names_from_dict`.
        sort_by: key of the score to sort the models by (descending), or None.
        label_rotation: rotation of the x tick labels in degrees.
        n_bins: number of bins for `calibration_curve` and `empirical_distr`.

    Returns:
        `(fig1, fig2)`: the accuracy/confidence/ECE figure and the
        "# distinct"/"variance"/"kl_div" figure (the latter with a twin y-axis).
    """
    if model_names is not None:
        # strip the None separators and remember where they were
        model_names, none_indices = extract_none_indices(model_names)
    else:
        none_indices = []

    dataset_names, model_names, method_names = detect_names_from_dict(y_true_all, model_names=model_names, method_names=method_names)

    # compute
    scores = {
        "ece": np.zeros(len(model_names)),
        "accuracy": np.zeros(len(model_names)),
        "confidence": np.zeros(len(model_names)),
        "confidence_n_distinct": np.zeros(len(model_names)),
        "confidence_variance": np.zeros(len(model_names)),
        "kl_div_over_dataset": np.zeros(len(model_names)),
    }
    for i, model_name in enumerate(model_names):
        y_true = aggregate_responses(y_true_all, dataset_names, model_name, method_names)
        y_pred = aggregate_responses(y_pred_all, dataset_names, model_name, method_names)

        prob_true, prob_pred, bins, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
        # ECE: |prob_true - prob_pred| per bin, weighted by the bin's share of samples;
        # `where` leaves empty bins out of the sum.
        ece = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
        scores["ece"][i] = ece
        scores["accuracy"][i] = np.mean(y_true)
        scores["confidence"][i] = np.mean(y_pred)
        scores["confidence_n_distinct"][i] = len(np.unique(y_pred))
        # NOTE(review): stored and plotted as "variance", but np.std is the standard
        # deviation -- confirm which one is intended.
        scores["confidence_variance"][i] = np.std(y_pred)
        # Mean over all (dataset, method) pairs of kl_div between this model's
        # confidence distribution on one dataset and the one on all datasets together.
        # NOTE(review): the second distribution does not depend on `dataset_name`
        # and is recomputed for every dataset.
        scores["kl_div_over_dataset"][i] = np.mean([
            kl_div(
                empirical_distr(aggregate_responses(y_pred_all, dataset_name, model_name, method_name), n_bins),
                empirical_distr(aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins),
            )
            for dataset_name in dataset_names
            for method_name in method_names
        ])

    # sort
    if sort_by is not None:
        # Sort all score series together, descending by the chosen score (the sort
        # key is the extra first tuple element).
        # NOTE(review): `none_indices` refers to the unsorted order, so separators
        # and sorting do not combine meaningfully.
        scores_zipped = zip(scores[sort_by], model_names, scores["accuracy"], scores["confidence"], scores["ece"], scores["confidence_n_distinct"], scores["confidence_variance"], scores["kl_div_over_dataset"])
        scores_zipped = sorted(scores_zipped, reverse=True)
        _, model_names, scores["accuracy"], scores["confidence"], scores["ece"], scores["confidence_n_distinct"], scores["confidence_variance"], scores["kl_div_over_dataset"] = zip(*scores_zipped)

    # plot
    x = np.arange(len(model_names), dtype=float)

    # figure 1: accuracy / confidence / ECE on one axis
    fig1, ax = Plotter.create()
    Plotter.set(
        ax,
        ylim=YLIM_CALIBRATION,
        xticks=dict(ticks=x, labels=translate_model(model_names), rotation=label_rotation, ha="right"),
    )
    plot_grouped_bar([
        (ax, scores["accuracy"], dict(label="accuracy", color=COLOR_ACCURACY, fmt="{:.2f}")),
        (ax, scores["confidence"], dict(label="confidence", color=COLOR_CONFIDENCE, fmt="{:.2f}")),
        (ax, scores["ece"], dict(label="ECE", color=COLOR_ECE, fmt="{:.2f}")),
    ], with_lines=True, none_indices=none_indices)
    annotate_agg_over(ax, agg_over)

    # figure 2: "# distinct" on the left axis, "variance" and "kl_div" on the twin right axis
    fig2, ax1 = Plotter.create()
    ax2 = ax1.twinx()
    Plotter.set(
        ax1,
        ylim=YLIM_OTHERS_1,
        xticks=dict(ticks=x, labels=translate_model(model_names), rotation=label_rotation, ha="right"),
    )
    Plotter.set(
        ax2,
        ylim=YLIM_OTHERS_2,
    )
    plot_grouped_bar([
        (ax1, scores["confidence_n_distinct"], dict(label="# distinct", color=COLOR_CONF_N_DISTINCT)),
        (ax2, scores["confidence_variance"], dict(label="variance", color=COLOR_CONF_VARIANCE)),
        (ax2, scores["kl_div_over_dataset"], dict(label="kl_div", color=COLOR_KL_DIV)),
    ], with_lines=True, none_indices=none_indices)
    annotate_agg_over(ax1, agg_over)

    return fig1, fig2

def make_plots_methods(y_true_all, y_pred_all, agg_over, model_names=None, method_names=None, label_rotation=45, single_plot=False, n_bins=20):
    """Create the per-method calibration figure and the per-method "others" figure.

    For every method the responses are aggregated over all detected datasets and
    over `model_names`, and six scores are computed: ECE, accuracy (mean of
    y_true), confidence (mean of y_pred), number of distinct confidence values,
    spread of the confidence values, and a mean kl_div value (see inline comments).
    Every bar is labelled with its value.

    Args:
        y_true_all, y_pred_all: nested result containers as returned by the loading
            cells; only passed on to `detect_names_from_dict` / `aggregate_responses`.
        agg_over: text shown in the "agg. over ..." annotation.
        model_names: models to aggregate over; passed to `detect_names_from_dict`.
        method_names: methods to include, in plot order. None entries act as group
            separators (they break the connecting lines). None for the whole
            argument is passed on to `detect_names_from_dict`.
        label_rotation: rotation of the x tick labels in degrees.
        single_plot: if True, the calibration figure gets no x ticks and the
            "others" figure gets no "agg. over" annotation (the callers in this
            notebook place the two figures in the same grid column).
        n_bins: number of bins for `calibration_curve` and `empirical_distr`.

    Returns:
        `(fig1, fig2)`: the accuracy/confidence/ECE figure and the
        "n_distinct"/"variance"/"kl_div" figure (the latter with a twin y-axis).
    """
    if method_names is not None:
        # strip the None separators and remember where they were
        method_names, none_indices = extract_none_indices(method_names)
    else:
        none_indices = []

    dataset_names, model_names, method_names = detect_names_from_dict(
        y_true_all,
        model_names=model_names,
        method_names=method_names,
    )

    # compute
    scores = {
        "ece": np.zeros(len(method_names)),
        "accuracy": np.zeros(len(method_names)),
        "confidence": np.zeros(len(method_names)),
        "confidence_n_distinct": np.zeros(len(method_names)),
        "confidence_variance": np.zeros(len(method_names)),
        "kl_div_over_dataset": np.zeros((len(method_names))),
    }
    for i, method_name in enumerate(method_names):
        y_true = aggregate_responses(y_true_all, dataset_names, model_names, method_name)
        y_pred = aggregate_responses(y_pred_all, dataset_names, model_names, method_name)

        prob_true, prob_pred, bins, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
        # ECE: |prob_true - prob_pred| per bin, weighted by the bin's share of samples;
        # `where` leaves empty bins out of the sum.
        ece = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
        scores["ece"][i] = ece
        scores["accuracy"][i] = np.mean(y_true)
        scores["confidence"][i] = np.mean(y_pred)
        scores["confidence_n_distinct"][i] = len(np.unique(y_pred))
        # NOTE(review): stored and plotted as "variance", but np.std is the standard
        # deviation -- confirm which one is intended.
        scores["confidence_variance"][i] = np.std(y_pred)
        # Mean over all (dataset, model) pairs of kl_div between the confidence
        # distribution on one dataset and the one on all datasets together.
        # NOTE(review): the second distribution does not depend on `dataset_name`
        # and is recomputed for every dataset.
        scores["kl_div_over_dataset"][i] = np.mean([
            kl_div(
                empirical_distr(aggregate_responses(y_pred_all, dataset_name, model_name, method_name), n_bins),
                empirical_distr(aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins),
            )
            for dataset_name in dataset_names
            for model_name in model_names
        ])

    # plot
    x = np.arange(len(method_names), dtype=float)

    # both figures are created up front: fig1 for calibration, fig2 (twin axes) for the others
    fig1, ax = Plotter.create()
    fig2, ax1 = Plotter.create()
    ax2 = ax1.twinx()

    if single_plot:
        # no x ticks on the calibration figure; the method labels appear on fig2 only
        Plotter.set(ax, xticks=[])
    else:
        Plotter.set(ax, xticks=dict(ticks=x, labels=translate_method(method_names), rotation=label_rotation, ha="right"))
    Plotter.set(
        ax,
        ylim=YLIM_CALIBRATION,
    )
    plot_grouped_bar([
        (ax, scores["accuracy"], dict(label="accuracy", color=COLOR_ACCURACY, fmt="{:.2f}")),
        (ax, scores["confidence"], dict(label="confidence", color=COLOR_CONFIDENCE, fmt="{:.2f}")),
        (ax, scores["ece"], dict(label="ECE", color=COLOR_ECE, fmt="{:.2f}")),
    ], with_labels=True, with_lines=True, none_indices=none_indices)
    annotate_agg_over(ax, agg_over)

    Plotter.set(
        ax1,
        ylim=YLIM_OTHERS_1,
        xticks=dict(ticks=x, labels=translate_method(method_names), rotation=label_rotation, ha="right"),
    )
    Plotter.set(
        ax2,
        ylim=YLIM_OTHERS_2,
    )
    plot_grouped_bar([
        (ax1, scores["confidence_n_distinct"], dict(label="n_distinct", color=COLOR_CONF_N_DISTINCT, fmt="{:.0f}")),
        (ax2, scores["confidence_variance"], dict(label="variance", color=COLOR_CONF_VARIANCE, fmt="{:.2f}")),
        (ax2, scores["kl_div_over_dataset"], dict(label="kl_div", color=COLOR_KL_DIV, fmt="{:.2f}")),
    ], with_labels=True, with_lines=True, none_indices=none_indices)
    if not single_plot:
        annotate_agg_over(ax1, agg_over)

    return fig1, fig2

In [23]:
# Switch on Plotter's save_always option for the plotting cells below; the last
# cell of the notebook switches it off again.
Plotter.configure(save_always=True)

### Main plots

In [None]:
def plot(y_true_all, y_pred_all):
    """Main figure: calibration per dataset (left) and per model (right)."""
    # datasets in x-axis order
    datasets_ordered = [
        "sciq",
        "arc-e",
        "arc-c",
        "commonsense_qa",
        "social_i_qa",
        "mmlu",
        "trivia_qa",
        "truthful_qa-mc1",
        "truthful_qa-mc2",
        "logi_qa",
    ]
    # models in x-axis order; None entries split the list into groups
    models_grouped = [
        "gemma1.1-2b-it",
        "gemma1.1-7b-it",
        None,
        "llama3-8b-instruct",
        "llama3-70b-instruct",
        None,
        "qwen1.5-7b-chat",
        "qwen1.5-32b-chat",
        "qwen1.5-72b-chat",
        "qwen1.5-110b-chat",
        None,
        "gpt3.5-turbo",
        "gpt4o-mini",
        "gpt4o",
    ]

    # only the calibration figure of each pair is used here; the "others" figure is dropped
    fig_datasets, _ = make_plots_datasets(
        y_true_all, y_pred_all, "models[all], methods[all]",
        dataset_names=datasets_ordered, label_rotation=35,
    )
    fig_datasets.axes[0].legend(loc="center left")

    fig_models, _ = make_plots_models(
        y_true_all, y_pred_all, "datasets[all], methods[all]",
        model_names=models_grouped,
    )

    Plotter.finish(
        [
            (fig_datasets, "datasets-calibration"),
            (fig_models, "models-calibration"),
        ],
        figwidth=0.5,
        grid_ncols=2, consistent_size=True,
        # save=True,
    )

plot(y_true_all, y_pred_all)

In [None]:
def plot(y_true_all, y_pred_all):
    """Combo-method comparison: tiny models (left column) vs. large models (right column)."""
    # tiny models: the legend goes on the "others" figure, split over its two y-axes
    fig_cal_tiny, fig_oth_tiny = make_plots_methods(y_true_all, y_pred_all, "datasets[all], models[tiny]", model_names=MODELS_TINY, method_names=METHODS_COMBO, label_rotation=25, single_plot=True)
    handles_left, labels_left = fig_oth_tiny.axes[0].get_legend_handles_labels()
    handles_right, labels_right = fig_oth_tiny.axes[1].get_legend_handles_labels()
    # the first right-axis entry is listed together with the left-axis entries
    fig_oth_tiny.axes[0].legend(handles_left + handles_right[:1], labels_left + labels_right[:1], loc="upper left")
    fig_oth_tiny.axes[1].legend(handles_right[1:], labels_right[1:], loc="upper right")

    # large models: the legend goes on the calibration figure
    fig_cal_large, fig_oth_large = make_plots_methods(y_true_all, y_pred_all, "datasets[all], models[large]", model_names=MODELS_LARGE, method_names=METHODS_COMBO, label_rotation=25, single_plot=True)
    fig_cal_large.axes[0].legend(loc="center right")

    # grid order (2 columns): calibration row first, "others" row second
    plots = [
        (fig_cal_tiny, "methods-calibration-combo-tinymodels"),
        (fig_cal_large, "methods-calibration-combo-largemodels"),
        (fig_oth_tiny, "methods-others-combo-tinymodels"),
        (fig_oth_large, "methods-others-combo-largemodels"),
    ]

    Plotter.finish(
        plots, figwidth=0.5,
        grid_ncols=2, consistent_size=True,
        # save=True,
    )

plot(y_true_all, y_pred_all)

In [None]:
def plot(y_true_all, y_pred_all):
    """Calibration curves of basic vs. combo prompting for llama3-8b and gpt4o."""
    def make_plot(y_true_all, y_pred_all, agg_over, model_name, method_names):
        # one figure for `model_name` with one calibration-curve panel per method,
        # aggregated over all datasets
        dataset_names, _, _ = detect_names_from_dict(y_true_all)

        fig, axes = Plotter.create(ncols=len(method_names), sharey=True)
        fig.supxlabel("confidence")
        axes[0].set_ylabel("accuracy")

        for ax, method_name in zip(axes, method_names):
            selection = (dataset_names, model_name, method_name)
            y_true = aggregate_responses(y_true_all, *selection)
            y_pred = aggregate_responses(y_pred_all, *selection)
            plot_calibration_curve(ax, y_true, y_pred, n_bins=20, labelfmt="{:.2f}", labelsize=FONTSIZE_DEFAULT)

        # the annotation goes on the right-most panel
        annotate_agg_over(axes[-1], agg_over)

        return fig

    models_and_tags = [
        ("llama3-8b-instruct", "llama3_8b"),
        ("gpt4o", "gpt4o"),
    ]

    plots = []
    for model_name, tag in models_and_tags:
        fig = make_plot(y_true_all, y_pred_all, "datasets[all]", model_name, ["basic_1s", "combo_1s_v2"])
        plots.append((fig, f"methods-calibration_curve-basic_vs_combo-{tag}"))

    Plotter.finish(
        plots, figwidth=0.5, axratio=1,
        consistent_size=True,
        # save=True,
    )

plot(y_true_all, y_pred_all)

### Appendix plots

In [None]:
def plot(responses_all):
    """Heatmap of the share of valid answers per (method, model), pooled over all datasets.

    Requires `responses_all`, i.e. the raw responses, not only the predictions.
    """
    dataset_names, model_names, method_names = detect_names_from_dict(responses_all)

    # compute
    def count_answers(model_name, method_name):
        # (valid, no, invalid) answer counts summed over all datasets;
        # combinations without responses (None) contribute nothing
        n_valid = n_none = n_invalid = 0
        for dataset_name in dataset_names:
            responses = responses_all[dataset_name][model_name][method_name]
            if responses is None:
                continue
            n_valid += len(responses[VALID_ANSWER])
            n_none += len(responses[NO_ANSWER])
            n_invalid += len(responses[INVALID_ANSWER])
        return n_valid, n_none, n_invalid

    answer_counts = {}
    for model_name in model_names:
        for method_name in method_names:
            answer_counts[model_name, method_name] = count_answers(model_name, method_name)

    # rows: methods, columns: models; cells without any answer stay NaN
    shape = (len(method_names), len(model_names))
    percentage_valid_answer = np.full(shape, np.nan)
    percentage_no_answer = np.full(shape, np.nan)
    for i, method_name in enumerate(method_names):
        for j, model_name in enumerate(model_names):
            n_valid, n_none, n_invalid = answer_counts[model_name, method_name]
            n_total = n_valid + n_none + n_invalid
            if n_total > 0:
                percentage_valid_answer[i, j] = n_valid / n_total
                percentage_no_answer[i, j] = n_none / n_total

    plots = []

    fig, ax = Plotter.create()
    plot_heatmap(ax, percentage_valid_answer, translate_method(method_names), translate_model(model_names), plot_mean=False, format="{:.2f}", vmin=0, vmax=1.3, cmap="Greens")
    plots.append((fig, "answer_statistics"))

    # fig, ax = Plotter.create()
    # plot_heatmap(ax, percentage_no_answer, method_names, model_names, plot_mean=False, format="{:.2f}", vmin=0, vmax=0.3, cmap="Reds")
    # plots.append((fig, "answer_statistics-no_answers"))

    Plotter.finish(
        plots, figwidth=0.59, axratio=len(method_names) / len(model_names),
        save=True,
    )

plot(responses_all)

In [None]:
def plot(y_true_all, y_pred_all):
    """Per-dataset calibration, separately for tiny models (left) and large models (right)."""
    # datasets in x-axis order
    datasets_ordered = [
        "sciq",
        "arc-e",
        "arc-c",
        "commonsense_qa",
        "social_i_qa",
        "mmlu",
        "trivia_qa",
        "truthful_qa-mc1",
        "truthful_qa-mc2",
        "logi_qa",
    ]

    # only the calibration figure of each pair is used; the "others" figure is dropped
    fig_tiny, _ = make_plots_datasets(
        y_true_all, y_pred_all, "models[tiny], methods[all]",
        dataset_names=datasets_ordered, model_names=MODELS_TINY, label_rotation=35,
    )
    # the legend is shown on the left figure only
    fig_tiny.axes[0].legend(loc="center left")

    fig_large, _ = make_plots_datasets(
        y_true_all, y_pred_all, "models[large], methods[all]",
        dataset_names=datasets_ordered, model_names=MODELS_LARGE, label_rotation=35,
    )

    Plotter.finish(
        [
            (fig_tiny, "datasets-calibration-tinymodels"),
            (fig_large, "datasets-calibration-largemodels"),
        ],
        figwidth=0.5,
        grid_ncols=2, consistent_size=True,
        # save=True,
    )

plot(y_true_all, y_pred_all)

In [None]:
def plot(y_true_all, y_pred_all):
    """Calibration curve of gemma1.1-2b-it, aggregated over all datasets and methods."""
    model_name = "gemma1.1-2b-it"
    dataset_names, _, method_names = detect_names_from_dict(y_true_all)

    fig, ax = Plotter.create()
    ax.set_xlabel("confidence")
    ax.set_ylabel("accuracy")

    selection = (dataset_names, model_name, method_names)
    y_true = aggregate_responses(y_true_all, *selection)
    y_pred = aggregate_responses(y_pred_all, *selection)
    plot_calibration_curve(ax, y_true, y_pred, n_bins=20, labelfmt="{:.2f}", labelsize=FONTSIZE_DEFAULT)
    annotate_agg_over(ax, "datasets[all], methods[all]")

    Plotter.finish(
        (fig, "models-calibration_curve-gemma1_1_2b"), figwidth=0.4, axratio=1,
        # save=True,
    )

plot(y_true_all, y_pred_all)

: 

In [None]:
def plot(y_true_all, y_pred_all):
    """Appendix method comparisons: every method group for tiny and for large models."""
    # (method list, tag used in the file name), in plot order
    method_groups = [
        (METHODS_SCORE_RANGE, "score_range"),
        (METHODS_SCORE_FORMULATION, "score_formulation"),
        (METHODS_ADVANCED_FORMULATION, "advanced_formulation"),
        (METHODS_FEW_SHOT, "few_shot"),
        (METHODS_OTHERS, "others"),
    ]

    def make_plots_all(model_names, agg_over, suffix=""):
        # for each method group: calibration figure followed by its "others" figure
        plots = []
        for method_names, tag in method_groups:
            fig_calibration, fig_others = make_plots_methods(y_true_all, y_pred_all, agg_over, model_names=model_names, method_names=method_names, label_rotation=30, single_plot=True)
            plots.append((fig_calibration, f"methods-calibration-{tag}{suffix}"))
            plots.append((fig_others, f"methods-others-{tag}{suffix}"))
        return plots

    # tiny models: legends on the "others" figures (odd positions), split over both y-axes
    plots_tiny = make_plots_all(MODELS_TINY, "datasets[all], models[tiny]", suffix="-tinymodels")
    for fig_others, _ in plots_tiny[1::2]:
        handles_left, labels_left = fig_others.axes[0].get_legend_handles_labels()
        handles_right, labels_right = fig_others.axes[1].get_legend_handles_labels()
        fig_others.axes[0].legend(handles_left + handles_right[:1], labels_left + labels_right[:1], loc="upper left")
        fig_others.axes[1].legend(handles_right[1:], labels_right[1:], loc="upper right")

    # large models: legends on the calibration figures (even positions)
    plots_large = make_plots_all(MODELS_LARGE, "datasets[all], models[large]", suffix="-largemodels")
    for fig_calibration, _ in plots_large[::2]:
        fig_calibration.axes[0].legend(loc="center right")

    # interleave so that, with two grid columns, tiny is on the left and large on the right
    plots = [entry for pair in zip(plots_tiny, plots_large) for entry in pair]

    Plotter.finish(
        plots, figwidth=0.5,
        grid_ncols=2, consistent_size=True,
        save=True,
    )

plot(y_true_all, y_pred_all)

In [31]:
# Switch Plotter's save_always option (enabled before the plotting cells) back off
# and close the current matplotlib figure.
Plotter.configure(save_always=False)
plt.close()