In [62]:
import os
import json
import pandas as pd
import glob
import matplotlib.pyplot as plt
import re
import ast
import sklearn
import numpy as np
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# Recursively list every CSV file below outputs/.
# NOTE(review): `results` is not referenced by the other visible cells - confirm it is still needed.
results = glob.glob("outputs/**/*.csv", recursive=True)

In [64]:
# Run names to load per experiment; get_experiment_name compares them with the
# path segment(s) that follow "final/" in each results.csv path.
brewer_sources = ["paper/exp1", "e3"]
other_sources = ["e1", "e2", "e3"]
experiments = ["brewer", "delatorre", "gerrig", "lehne"]
ROOT = os.path.dirname("outputs/")

# [min, max] of the rating scale of each experiment; read by normalize_scalar.
LIKERT_BOUNDS = {
    "delatorre": [1, 9],
    "brewer": [1, 7],
    "gerrig": [1, 7],
    "lehne": [1, 10]
}

def normalize_scalar(n, experiment):
    """Rescale a rating from an experiment's scale onto 0-10.

    Args:
        n: Rating expressed on the scale of ``experiment``.
        experiment: Key into ``LIKERT_BOUNDS`` selecting the (min, max) pair.

    Returns:
        ``n`` mapped linearly so that the scale minimum becomes 0 and the
        scale maximum becomes 10.
    """
    low, high = LIKERT_BOUNDS[experiment]
    span = high - low
    return (n - low) / span * 10

def get_experiment_name(experiment, runs):
    """Load one experiment's model responses, averaged per item and model.

    Despite its name, this returns data rather than a name. It reads every
    ``outputs/{experiment}_experiment/final/<run>/<model>/<dir>/results.csv``
    whose ``<run>`` is listed in ``runs``, expands the stringified ``response``
    column into one column per answer, averages those answers over all rows
    sharing the same ``experiment_name``, ``version`` and model, and rescales
    them with ``normalize_scalar``.

    Files whose path does not follow the expected layout, that contain no
    rows, or whose ``response`` column cannot be parsed are skipped (the
    latter with a warning) instead of aborting the whole load.

    Args:
        experiment: Experiment key, e.g. ``"brewer"``; selects the directory
            and the experiment-specific handling below.
        runs: Container of run names to keep.

    Returns:
        DataFrame with columns ``id`` (``"<experiment_name>,<version>"``),
        ``model`` and ``response`` (list of normalised ratings).

    Raises:
        ValueError: If no usable ``results.csv`` was found for ``runs``.
    """
    data_blocks = glob.glob(f"outputs/{experiment}_experiment/final/**/results.csv", recursive=True)
    brewer_columns = ["0", "3", "6", "9", "12"]

    frames = []
    # Response columns of the last block that was loaded; used below to pick
    # the columns that are cast to float and collapsed into ``response``.
    keys = None
    for block_path in data_blocks:
        mat = re.search(r"outputs\/(.+)_experiment\/final\/(.+)\/(.+)\/(.+)\/results.csv", block_path)
        if mat is None:
            # Unexpected directory depth or path separator: skip the file
            # rather than failing on mat.group().
            warnings.warn(f"Skipping {block_path}: unexpected path layout")
            continue
        run, model = mat.group(2), mat.group(3)
        if run not in runs:
            continue

        block = pd.read_csv(block_path)
        block['model'] = model
        block['run'] = run

        if experiment == "brewer":
            block = block[block['version'].str.contains("chunks", case=False)]

        if block.empty:
            # Nothing to contribute, and the expansion below needs at least one row.
            continue

        try:
            responses = block['response'].apply(ast.literal_eval)
        except (KeyError, ValueError, SyntaxError, TypeError) as err:
            warnings.warn(f"Skipping {block_path}: could not parse the 'response' column ({err!r})")
            continue
        responses = responses.apply(pd.Series)
        if experiment == "brewer":
            # Keep exactly these columns, in this order; absent ones become NaN.
            responses = responses.reindex(columns=brewer_columns)

        frames.append(pd.concat([block, responses], axis=1).drop(columns=['response']))
        keys = responses.columns

    if not frames:
        raise ValueError(
            f"No usable results.csv found for experiment {experiment!r} and runs {runs!r}"
        )

    # Concatenate once instead of growing a frame inside the loop.
    out = pd.concat(frames, axis=0)
    out['id'] = out['experiment_name'] + "," + out['version']

    for key in keys:
        out[key] = out[key].astype(float)

    out = out.drop(columns=['experiment_name', 'version'])

    out = out.groupby(['id', 'model']).mean(numeric_only=True).reset_index()

    out['response'] = out[keys].apply(lambda row: row.to_list(), axis=1)
    out = out.drop(columns=keys)

    out['response'] = out['response'].apply(
        lambda ratings: [normalize_scalar(n, experiment) for n in ratings]
    )

    if experiment == "gerrig":
        # Only the second answer is kept - presumably the Q2 suspense question
        # that the human `gerrig` ratings refer to; confirm.
        out['response'] = out['response'].apply(lambda ratings: [ratings[1]])
    if experiment == "brewer":
        def reform_id(item_id):
            # Strip everything from " Chunks" onwards in both halves of the id.
            parts = item_id.split(",")
            return parts[0].split(" Chunks")[0] + "," + parts[1].split(" Chunks")[0]
        out['id'] = out['id'].apply(reform_id)

    return out[['id', 'model', 'response']]

In [65]:
# Human reference ratings for the "lehne" experiment, keyed by the
# "<experiment_name>,<version>" id that get_metrics looks up. Values are on the
# raw scale; normalize_human_ratings rescales them via LIKERT_BOUNDS["lehne"].
lehne = {
    'Experiment,Normal': [5.565217391, 5, 4.826086957, 5.739130435, 5.52173913, 6.826086957, 7.304347826, 5.434782609, 6.391304348, 7.47826087, 7.043478261, 5.869565217, 6.739130435, 6.956521739, 6.47826087, 5.956521739, 4.652173913, 4.260869565, 5.173913043, 4.086956522, 4.173913043, 4.304347826, 5, 4.043478261, 4.217391304, 4.434782609, 5.347826087, 6.217391304, 5.434782609, 4.782608696, 6.173913043, 5.956521739, 6.47826087, 5, 4.739130435, 5.173913043, 6.304347826, 6.434782609, 5.260869565, 5.304347826, 5.956521739, 4.304347826, 5.260869565, 4.391304348, 4.956521739, 5.695652174, 5.043478261, 5.826086957, 5.043478261, 4.913043478, 5.217391304, 6.217391304, 6.391304348, 6.52173913, 7.217391304, 6.565217391, 5.52173913, 4.347826087, 3.869565217, 7, 7.565217391, 6.52173913, 6.260869565, 6.043478261, 4.913043478]
}

# One list of 12 ratings is reused for all eight Delatorre conditions: every key
# of `delatorre` below maps to this same list object.
delatorre_global_ratings = [3.34, 3.725, 3.705, 3.89, 4.08, 5.02, 4.87, 4.81, 5.84, 5.77, 6.44, 4.685]
delatorre_unique_categories = ['Experiment,Journalistic Bad Not Revealed',
       'Experiment,Journalistic Bad Revealed',
       'Experiment,Journalistic Good Not Revealed',
       'Experiment,Journalistic Good Revealed',
       'Experiment,Novel Bad Not Revealed',
       'Experiment,Novel Bad Revealed',
       'Experiment,Novel Good Not Revealed',
       'Experiment,Novel Good Revealed']

delatorre = {
    k: delatorre_global_ratings for k in delatorre_unique_categories
}

# One scalar rating per story id; normalize_human_ratings wraps each scalar in a
# one-element list.
brewer = {
    'Experiment A,American Story Birthday' : 3.2,
    'Experiment A,American Story Flying' : 3.6,
    'Experiment A,American Story Lottery' : 4.5,
    'Experiment A,American Story Old Phoebe' : 3.4,
    'Experiment A,American Story Ylla' : 5.1,
}

# Some entries are the average of two reported values.
gerrig = { # Standard suspense / Q2 ratings
    "Experiment A,Pen Not Mentioned": (3.78 + 3.43) / 2,
    "Experiment A,Pen Mentioned Removed": (4.38 + 4.06) / 2,
    "Experiment A,Pen Mentioned Not Removed": 3.47,
    "Experiment B,Unused Comb": 3.96,
    "Experiment B,Used Comb": 3.41,
    "Experiment C,Prior Solution Not Mentioned": (3.76 + 3.34) / 2,
    "Experiment C,Prior Solution Mentioned and Removed": (4.61 + 3.99) / 2,
    "Experiment C,Prior Solution Mentioned Not Removed": 4.14
}

def normalize_human_ratings(experiment):
    """Return the human ratings of ``experiment`` rescaled with ``normalize_scalar``.

    The ratings are read from the module-level dict whose variable name equals
    ``experiment`` (e.g. ``gerrig``). The source dict is left untouched and a
    new dict is returned, so calling this repeatedly (for instance by
    re-running a notebook cell) always yields the same result instead of
    normalising already-normalised values again.

    Args:
        experiment: Name of the module-level ratings dict; also the key passed
            to ``normalize_scalar``.

    Returns:
        New dict mapping each id to a list of normalised ratings. Scalar
        ratings become one-element lists.

    Raises:
        KeyError: If no module-level variable named ``experiment`` exists.
    """
    raw_ratings = globals()[experiment]
    normalized = {}
    for key, value in raw_ratings.items():
        values = value if isinstance(value, (list, tuple)) else [value]
        normalized[key] = [normalize_scalar(n, experiment) for n in values]
    return normalized

In [66]:
# Human reference ratings per experiment, as returned by normalize_human_ratings
# (each id maps to a list of ratings rescaled by normalize_scalar).
gerrig_human_ratings = normalize_human_ratings("gerrig")
brewer_human_ratings = normalize_human_ratings("brewer")
lehne_human_ratings = normalize_human_ratings("lehne")
delatorre_human_ratings = normalize_human_ratings("delatorre")

In [67]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Load the averaged, normalised model responses of every experiment.
# NOTE(review): the sklearn.metrics names imported above are not referenced by
# the visible cells - confirm whether that import is still needed.

gerrig_experiment_data = get_experiment_name("gerrig", other_sources)
brewer_experiment_data = get_experiment_name("brewer", brewer_sources)
lehne_experiment_data = get_experiment_name("lehne", other_sources)
delatorre_experiment_data = get_experiment_name("delatorre", other_sources)

def get_metrics(experiment_data, human_ratings, experiment_name, fn):
    """Score each row against the human ratings and summarise the scores per model.

    For every row of ``experiment_data`` the row's ``response`` list is
    compared with ``human_ratings[row['id']]`` using ``fn``. The score and the
    number of non-NaN / NaN predictions are written back into
    ``experiment_data`` (columns ``metric``, ``num_valid``, ``num_invalid``),
    i.e. the passed frame is modified in place.

    Args:
        experiment_data: DataFrame with ``id``, ``model`` and ``response`` columns.
        human_ratings: Dict mapping an id to a list of reference ratings.
        experiment_name: Not used by this function.
        fn: Callable ``fn(predictions, ground_truth)`` returning the score.

    Returns:
        DataFrame indexed by model with the mean and std of ``metric`` and the
        sums of ``num_valid`` / ``num_invalid``, plus a final ``'Average'`` row
        holding the column-wise mean of that summary.
    """
    for idx, record in experiment_data.iterrows():
        ground_truth = human_ratings[record['id']]
        predictions = record['response']

        # A single reference rating is repeated to line up with every prediction.
        if len(ground_truth) == 1 and len(predictions) > 1:
            ground_truth = ground_truth * len(predictions)

        score = fn(predictions, ground_truth)
        n_missing = sum(1 for prediction in predictions if pd.isna(prediction))

        experiment_data.at[idx, 'metric'] = score
        experiment_data.at[idx, 'num_valid'] = len(predictions) - n_missing
        experiment_data.at[idx, 'num_invalid'] = n_missing

    aggregations = {
        'metric': ['mean', 'std'],
        'num_valid': ['sum'],
        'num_invalid': ['sum'],
    }
    per_model = experiment_data.groupby(['model']).agg(aggregations)

    # Extra row: column-wise mean over the per-model rows.
    per_model.loc['Average'] = per_model.mean()

    return per_model

def mse(x, y):
    """Mean squared error between ``x`` and ``y``, skipping NaN entries of ``x``.

    Args:
        x: Indexable sequence of predictions; NaN entries are ignored.
        y: Indexable sequence of reference values, read at the same positions.

    Returns:
        Mean of ``(x[i] - y[i]) ** 2`` over the positions where ``x[i]`` is not
        NaN, or NaN when ``x`` is empty or contains only NaN.
    """
    if all(pd.isna(value) for value in x):
        return float('nan')

    squared_errors = [
        (x[pos] - y[pos]) ** 2
        for pos in range(len(x))
        if not pd.isna(x[pos])
    ]
    return sum(squared_errors) / len(squared_errors)

def rmse(x, y):
    """Root of ``mse(x, y)``; NaN is propagated unchanged."""
    mean_squared = mse(x, y)
    return np.sqrt(mean_squared)

def l1(x, y):
    """Mean absolute error between ``x`` and ``y``, skipping NaN entries of ``x``.

    Args:
        x: Indexable sequence of predictions; NaN entries are ignored.
        y: Indexable sequence of reference values, read at the same positions.

    Returns:
        Mean of ``abs(x[i] - y[i])`` over the positions where ``x[i]`` is not
        NaN, or NaN when ``x`` is empty or contains only NaN.
    """
    if all(pd.isna(value) for value in x):
        return float('nan')

    absolute_errors = [
        abs(x[pos] - y[pos])
        for pos in range(len(x))
        if not pd.isna(x[pos])
    ]
    return sum(absolute_errors) / len(absolute_errors)

def all_metrics(fn):
    """Run ``get_metrics`` with ``fn`` on all four experiments.

    Args:
        fn: Scoring callable forwarded to ``get_metrics``.

    Returns:
        Tuple of summaries in the order (brewer, gerrig, delatorre, lehne).
    """
    jobs = (
        (brewer_experiment_data, brewer_human_ratings, "brewer"),
        (gerrig_experiment_data, gerrig_human_ratings, "gerrig"),
        (delatorre_experiment_data, delatorre_human_ratings, "delatorre"),
        (lehne_experiment_data, lehne_human_ratings, "lehne"),
    )
    return tuple(get_metrics(data, ratings, name, fn) for data, ratings, name in jobs)

# Score every experiment with the mean absolute error (l1).
brewer_metrics, gerrig_metrics, delatorre_metrics, lehne_metrics = all_metrics(l1)

In [68]:
def format_model_name(model : str):
    """Turn a raw model identifier into a short display name.

    Keeps the part after the last underscore, cuts everything from the first
    ``-Instruct``, ``-chat`` or ``-it`` onwards (applied in that order),
    replaces the remaining hyphens with spaces and upper-cases the first
    character, e.g. ``"google_gemma-2-27b-it"`` -> ``"Gemma 2 27b"``.

    Args:
        model: Raw identifier, or one of the summary labels ``"Average"`` /
            ``"Consensus"``, which are returned unchanged.

    Returns:
        The display name. An identifier that reduces to an empty string yields
        ``""`` instead of raising ``IndexError``.
    """
    if model in ("Average", "Consensus"):
        return model
    model_name = model.split("_")[-1]
    for marker in ("-Instruct", "-chat", "-it"):
        model_name = model_name.split(marker)[0]
    model_name = model_name.replace("-", " ")
    # Slicing instead of indexing keeps an empty name from raising.
    return model_name[:1].upper() + model_name[1:]

def prettify_table(table):
    """Build a display version of a ``get_metrics`` summary table.

    Works on a copy of ``table``. The ``('metric', 'mean')`` and
    ``('metric', 'std')`` columns are merged into one ``"mean ± std"`` text
    column, the second column level is dropped, and the ``model`` index is
    passed through ``format_model_name``.

    Args:
        table: DataFrame with two-level columns containing at least
            ``('metric', 'mean')`` and ``('metric', 'std')``, indexed by ``model``.

    Returns:
        New DataFrame indexed by the formatted model name.
    """
    pretty = table.copy()

    def _mean_pm_std(stats):
        # A missing std (NaN) is printed as "0.0".
        std_text = "0.0" if pd.isna(stats['std']) else f"{stats['std']:.2f}"
        return f"{stats['mean']:.2f} ± {std_text}"

    for name in ('metric',):
        for stat in ('mean', 'std'):
            pretty[name, stat] = pretty[name, stat].apply(lambda value: round(value, 2))
        # Assigning under the top-level key overwrites both sub-columns with the
        # text; the now-duplicated 'std' one is removed right after.
        pretty[name] = pretty[name].apply(_mean_pm_std, axis=1)
        pretty = pretty.drop(columns=[(name, 'std')])

    pretty.columns = pretty.columns.droplevel(1)
    pretty = pretty.reset_index()
    pretty['model'] = pretty['model'].apply(format_model_name)

    return pretty.set_index('model')

In [69]:
# Display-ready versions of the four per-model summaries.
pretty_gerrig = prettify_table(gerrig_metrics)
pretty_brewer = prettify_table(brewer_metrics)
pretty_delatorre = prettify_table(delatorre_metrics)
pretty_lehne = prettify_table(lehne_metrics)

In [70]:
# Show the formatted Brewer summary.
pretty_brewer

Unnamed: 0_level_0,metric,num_valid,num_invalid
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Qwen2 72B,1.79 ± 0.50,25.0,0.0
DeepSeek V3,2.44 ± 0.77,25.0,0.0
Gemma 2 27b,1.98 ± 0.54,25.0,0.0
Gemma 2 9b,1.59 ± 0.83,25.0,0.0
Llama 2 7b,3.94 ± 1.82,16.0,9.0
Llama 3 70b,1.77 ± 0.90,25.0,0.0
Llama 3 8b,1.26 ± 0.34,25.0,0.0
WizardLM 2 8x22B,2.15 ± 1.26,25.0,0.0
Mistral 7B,1.03 ± 0.22,25.0,0.0
Mixtral 8x7B,2.40 ± 1.36,23.0,2.0


In [71]:
def join_with_single_metric(tables : list[pd.DataFrame], metric : str, labels : list[str] = None):
    """Place the ``metric`` column of several tables side by side.

    The tables are concatenated column-wise on their index and only the
    columns named ``metric`` are kept (one per input table).

    Args:
        tables: DataFrames sharing the same index (e.g. ``model``).
        metric: Name of the column to keep from every table.
        labels: Optional replacement column names, one per table.

    Returns:
        DataFrame whose first column is ``index`` (a 0..n-1 counter), followed
        by the former index as a regular column and one column per table.
        ``as_latex`` expects and removes the ``index`` column, so both
        ``reset_index`` calls are kept.
    """
    out = pd.concat(tables, axis=1)
    out = out[[metric]]
    if labels:
        out.columns = labels
    # The former loop `for col in out.columns: col = col.upper()` only rebound a
    # local variable (and failed on non-string labels), so it was removed.
    return out.reset_index().reset_index()

def as_latex(table):
    """Render a joined metrics table as a LaTeX tabular.

    Removes the ``index`` column, renames ``model`` to ``Model`` and returns
    the result of ``to_latex`` without the row index and without escaping.
    """
    return (
        table
        .drop(columns=['index'])
        .rename(columns={'model': 'Model'})
        .to_latex(index=False, escape=False)
    )

# One "mean ± std" column per experiment, one row per model.
joined = join_with_single_metric([pretty_gerrig, pretty_brewer, pretty_delatorre, pretty_lehne], 'metric', labels=['Gerrig', 'Brewer', 'Delatorre', 'Lehne'])

In [72]:
# print() so the LaTeX source is emitted verbatim rather than as a quoted string repr.
print(as_latex(joined))

\begin{tabular}{lllll}
\toprule
Model & Gerrig & Brewer & Delatorre & Lehne \\
\midrule
Qwen2 72B & 3.61 ± 0.60 & 1.79 ± 0.50 & 2.60 ± 0.32 & 1.88 ± 0.0 \\
DeepSeek V3 & 3.61 ± 0.60 & 2.44 ± 0.77 & 2.21 ± 0.16 & 1.73 ± 0.0 \\
Gemma 2 27b & 3.41 ± 0.70 & 1.98 ± 0.54 & 1.97 ± 0.17 & 2.07 ± 0.0 \\
Gemma 2 9b & 2.99 ± 1.11 & 1.59 ± 0.83 & 2.36 ± 0.36 & 2.12 ± 0.0 \\
Llama 2 7b & 3.61 ± 0.60 & 3.94 ± 1.82 & 3.22 ± 0.22 & 3.31 ± 0.0 \\
Llama 3 70b & 3.61 ± 0.60 & 1.77 ± 0.90 & 3.15 ± 0.23 & 2.54 ± 0.0 \\
Llama 3 8b & 5.07 ± 0.70 & 1.26 ± 0.34 & 2.87 ± 0.19 & 2.66 ± 0.0 \\
WizardLM 2 8x22B & 3.61 ± 0.60 & 2.15 ± 1.26 & 2.46 ± 0.67 & 1.75 ± 0.0 \\
Mistral 7B & 2.64 ± 0.99 & 1.03 ± 0.22 & 2.59 ± 0.52 & 3.00 ± 0.0 \\
Mixtral 8x7B & 2.71 ± 1.32 & 2.40 ± 1.36 & 2.86 ± 0.42 & 2.04 ± 0.0 \\
Average & 3.49 ± 0.78 & 2.03 ± 0.85 & 2.63 ± 0.33 & 2.31 ± 0.0 \\
\bottomrule
\end{tabular}



In [73]:
# Inspect the per-row Delatorre data; get_metrics wrote the metric, num_valid
# and num_invalid columns into this frame in place.
delatorre_experiment_data

Unnamed: 0,id,model,response,metric,num_valid,num_invalid
0,"Experiment,Journalistic Bad Not Revealed",Qwen_Qwen2-72B-Instruct,"[7.5, 4.166666666666666, 1.666666666666667, 5....",2.384201,12.0,0.0
1,"Experiment,Journalistic Bad Not Revealed",deepseek-ai_DeepSeek-V3,"[7.5, 3.75, 5.0, 6.25, 7.5, 8.75, 5.0, 2.5, 7....",2.116493,12.0,0.0
2,"Experiment,Journalistic Bad Not Revealed",google_gemma-2-27b-it,"[5.0, 2.916666666666667, 3.333333333333333, 6....",2.206076,12.0,0.0
3,"Experiment,Journalistic Bad Not Revealed",google_gemma-2-9b-it,"[6.25, 3.75, 5.0, 7.5, 6.25, 9.166666666666668...",2.672049,12.0,0.0
4,"Experiment,Journalistic Bad Not Revealed",meta-llama_Llama-2-7b-chat-hf,"[7.5, 5.416666666666666, 7.5, 6.25, 7.08333333...",3.072049,12.0,0.0
...,...,...,...,...,...,...
75,"Experiment,Novel Good Revealed",meta-llama_Llama-3-70b-chat-hf,"[6.25, 2.916666666666667, 8.75, 0.833333333333...",3.065104,12.0,0.0
76,"Experiment,Novel Good Revealed",meta-llama_Llama-3-8b-chat-hf,"[6.25, 5.0, 8.75, 0.4166666666666666, 5.833333...",3.261285,12.0,0.0
77,"Experiment,Novel Good Revealed",microsoft_WizardLM-2-8x22B,"[7.083333333333334, 5.0, 8.333333333333334, 2....",3.014410,12.0,0.0
78,"Experiment,Novel Good Revealed",mistralai_Mistral-7B-Instruct-v0.3,"[7.5, 5.0, 7.5, 0.0, 6.25, 8.75, 8.33333333333...",3.049132,12.0,0.0
