In [None]:

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import torchmetrics
import torch
import scipy

In [None]:
# Read the perplexity table passed in as the Snakemake input `all_perplexities`;
# the first CSV column becomes the row index.
df = pd.read_csv(snakemake.input.all_perplexities, index_col=0)

# Fall back to the row index as question identifier as soon as any id is missing.
# `Series.isna()` is used rather than `np.isnan`, because the latter raises a
# TypeError on object-dtype (e.g. string) columns.
if df["question_id"].isna().any():
    df["question_id"] = df.index

In [None]:
# Compute probabilities for individual classes


def ppl_to_proba(group, sharpness=3):
    """Convert the perplexities of one group of responses into probabilities.

    The negated perplexities are used as softmax logits, so the response with
    the lowest perplexity receives the highest probability.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to normalise jointly; must provide the columns
        ``eval_all_perplexities`` and ``response``.
    sharpness : float, default 3
        Factor the logits are multiplied by before the softmax. Values above 1
        make the distribution more peaked (more confident); this corresponds
        to a softmax temperature of ``1 / sharpness``.

    Returns
    -------
    pandas.Series
        Probabilities summing to 1, indexed by the ``response`` values of
        ``group`` (in row order).
    """
    # Work on a plain array so the probabilities are assigned to the responses
    # by position and can never be re-aligned against the frame's row index.
    logits = -group["eval_all_perplexities"].to_numpy() * sharpness
    probas = scipy.special.softmax(logits)

    return pd.Series(index=group["response"], data=probas)


# Build the (question x response) probability table.
# `groupby(...).apply` returns a MultiIndex Series when the per-question
# response indexes differ, but an already wide DataFrame when every group
# yields an identically indexed Series. Unstacking is only correct in the
# first case; in the second it would collapse the table back into a Series.
probas = df.groupby("question_id").apply(ppl_to_proba)
if isinstance(probas, pd.Series):
    probas_df = probas.unstack(level=1)
else:
    # Already one column per response; sort them for a deterministic class order.
    probas_df = probas.sort_index(axis=1)
probas_df.to_csv(snakemake.output.predictions_raw)
probas_df

In [None]:
# Ground-truth response of every question, taken from the rows typed "correct".
labels_text = df[df["type"] == "correct"].set_index("question_id")["response"]
# Bring the labels into the row order of `probas_df`. Its rows come from a
# groupby on `question_id` (sorted group keys), whereas the selection above
# keeps the file's row order, so the two would be misaligned on unsorted input.
# A question without a correct row raises a KeyError here.
labels_text = labels_text.loc[probas_df.index]
# Encode each label as the column position of its class in `probas_df`.
labels = torch.from_numpy(labels_text.map(probas_df.columns.get_loc).values)
labels

In [None]:
# Copy the probability table into a tensor (rows and columns as in `probas_df`)
# and show its dimensions.
predictions = torch.tensor(probas_df.to_numpy())
predictions.size()

In [None]:
# Display the encoded label tensor again, next to the prediction tensor built above.
labels

In [None]:
# Keyword arguments shared by every torchmetrics call in this cell.
multiclass_kwargs = dict(task="multiclass", num_classes=probas_df.shape[1])
macro_kwargs = dict(average="macro", **multiclass_kwargs)

# NOTE(review): with average="macro" the multiclass accuracy is presumably the
# mean of the per-class accuracies and may therefore coincide with the macro
# recall below - confirm this is the intended definition of "accuracy".
accuracy = torchmetrics.functional.accuracy(predictions, labels, **macro_kwargs)
precision = torchmetrics.functional.precision(predictions, labels, **macro_kwargs)
recall = torchmetrics.functional.recall(predictions, labels, **macro_kwargs)
f1 = torchmetrics.functional.f1_score(predictions, labels, **macro_kwargs)
# AUROC is called without an explicit `average`, i.e. with the library default.
auroc = torchmetrics.functional.auroc(
    torch.tensor(probas_df.values), labels, **multiclass_kwargs
)

# Collect the scalar value of every metric into one Series and write it out as
# a two-column CSV ("metric", "value").
metric_tensors = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "auroc": auroc,
}
performance = pd.Series(
    {metric: tensor.item() for metric, tensor in metric_tensors.items()},
    name="value",
).rename_axis("metric")
performance.to_csv(snakemake.output.performance)

performance

In [None]:
def _predict_for_question(question_id, group):
    """Pick the lowest-perplexity response of one question and score it.

    Parameters
    ----------
    question_id
        Group key; becomes the name (row label) of the returned Series.
    group : pandas.DataFrame
        Rows of that question with the columns ``type``, ``response`` and
        ``eval_all_perplexities``.

    Returns
    -------
    pandas.Series
        ``predicted_labels``, ``valid_prediction`` (always True), ``label``
        and ``is_correct``.

    Raises
    ------
    ValueError
        If the question has no row whose ``type`` is ``"correct"``.
    """
    correct_responses = group.loc[group["type"] == "correct", "response"]
    if correct_responses.empty:
        raise ValueError(
            f"question {question_id!r} has no response of type 'correct'"
        )
    # If several rows are flagged as correct, the first one is used.
    label = correct_responses.iloc[0]

    # `argmin` returns a position, so it is paired with `.iloc`; a label-based
    # lookup could be ambiguous because the row index may not be unique (it
    # can double as the question id).
    best_position = group["eval_all_perplexities"].argmin()
    predicted = group["response"].iloc[best_position]

    return pd.Series(
        dict(
            predicted_labels=predicted,
            valid_prediction=True,
            label=label,
            is_correct=(label == predicted),
        ),
        name=question_id,
    )


# One row per question, labelled by question id.
# NOTE: this rebinds `predictions` (the probability tensor of an earlier cell)
# to a DataFrame; cells that need the tensor must run before this one.
predictions = pd.DataFrame(
    [
        _predict_for_question(question_id, group)
        for question_id, group in df.groupby("question_id")
    ]
)

predictions.to_csv(snakemake.output.predictions)
predictions