In [None]:
import os
import json
import pandas as pd
import glob
import matplotlib.pyplot as plt
import re
import ast
import sklearn
import numpy as np
import warnings

# Silence FutureWarning messages for the rest of the kernel session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# IPython autoreload extension; mode 2 reloads imported modules before running each cell.
%load_ext autoreload
%autoreload 2

In [None]:
# Paths of every CSV file found anywhere below the shared outputs directory.
results = list(glob.iglob("../../outputs/**/*.csv", recursive=True))

In [None]:
# Directory that holds every experiment's output folder.
ROOT = os.path.dirname("../../outputs/")

# Experiment names; the same names key LIKERT_BOUNDS below.
experiments = ["brewer", "delatorre", "gerrig", "lehne"]

# Run directories (below <experiment>_experiment/final/) to load: one list for
# the brewer experiment, one shared by all other experiments.
other_sources = ["e1", "e2", "e3"]
brewer_sources = ["paper/exp1", "e3"]

# [lowest, highest] rating of the Likert scale used by each experiment.
LIKERT_BOUNDS = dict(
    delatorre=[1, 9],
    brewer=[1, 7],
    gerrig=[1, 7],
    lehne=[1, 10],
)


def get_experiment_name(experiment, runs):
    """Load model responses for one experiment and binarize them around the Likert midpoint.

    Despite its name, this returns data rather than a name. It reads every
    ``results.csv`` stored as
    ``../../outputs/<experiment>_experiment/final/<run>/<model>/<dir>/results.csv``
    whose ``<run>`` is listed in ``runs``, averages the numeric response values
    per (id, model) pair, and maps each average to 1 (above the scale midpoint),
    0 (below it) or NaN (exactly at the midpoint, or missing).

    Args:
        experiment: Key of ``LIKERT_BOUNDS``; also selects the
            ``<experiment>_experiment`` output folder. "brewer" and "gerrig"
            get extra handling (see inline comments).
        runs: Collection of run names to include, written with forward slashes.
            A run may span several directory levels, e.g. "paper/exp1".

    Returns:
        DataFrame with columns ``id`` ("<experiment_name>,<version>"),
        ``model`` and ``response`` (list of 1 / 0 / NaN). When nothing could be
        loaded, a warning is issued and an empty frame with those columns is
        returned.
    """
    low, high = LIKERT_BOUNDS[experiment]
    # The midpoint of a [low, high] scale is the mean of its bounds (4 on a 1-7
    # scale). (high - low) / 2 is the half-range (3) and would put the
    # threshold one point too low.
    likert_midpoint = (low + high) / 2

    def sign(rating):
        if rating > likert_midpoint:
            return 1
        if rating < likert_midpoint:
            return 0
        return float('nan')  # exactly at the midpoint, or NaN

    path_pattern = re.compile(
        r"\.\./\.\./outputs[\\/](.+)_experiment[\\/]final[\\/](.+)[\\/](.+)[\\/](.+)[\\/]results\.csv"
    )
    # Sorted so the result does not depend on the filesystem's listing order.
    data_blocks = sorted(
        glob.glob(f"../../outputs/{experiment}_experiment/final/**/results.csv", recursive=True)
    )

    frames = []
    keys = None
    for block_path in data_blocks:
        mat = path_pattern.search(block_path)
        if mat is None:
            # Not nested as final/<run>/<model>/<dir>/results.csv, so there is
            # no run or model to attribute it to.
            continue
        # Normalise separators so multi-level runs such as "paper/exp1" also
        # match when the OS reports backslashes.
        run = mat.group(2).replace("\\", "/")
        if run not in runs:
            continue

        block = pd.read_csv(block_path)
        block['model'] = mat.group(3)
        block['run'] = run

        if experiment == "brewer":
            # Only the "chunks" versions are used; rows without a version are dropped.
            block = block[block['version'].str.contains("chunks", case=False, na=False)]
        if block.empty:
            continue

        try:
            responses = block['response'].apply(ast.literal_eval)
        except (KeyError, ValueError, SyntaxError, TypeError) as err:
            # Best effort: an unreadable block is skipped, but not silently.
            warnings.warn(f"Skipping {block_path}: cannot parse the 'response' column ({err!r})")
            continue

        # One column per response entry (dict keys or list positions).
        responses = responses.apply(pd.Series)
        if experiment == "brewer":
            # Always expose the five expected keys, in order; absent ones become NaN.
            responses = responses.reindex(columns=["0", "3", "6", "9", "12"])
        frames.append(pd.concat([block, responses], axis=1).drop(columns=['response']))

        # As before, the response columns of the last loaded block decide which
        # values end up in the result.
        keys = responses.columns

    if not frames:
        warnings.warn(f"No results loaded for experiment {experiment!r} with runs {list(runs)!r}")
        return pd.DataFrame(columns=['id', 'model', 'response'])

    out = pd.concat(frames, axis=0)
    out['id'] = out['experiment_name'] + "," + out['version']

    for key in keys:
        out[key] = out[key].astype(float)

    out = out.drop(columns=['experiment_name', 'version'])

    # Average every numeric column over all rows sharing the same (id, model).
    out = out.groupby(['id', 'model']).mean(numeric_only=True).reset_index()

    out['response'] = pd.Series(
        [[sign(value) for value in row] for row in out[keys].to_numpy()],
        index=out.index,
        dtype=object,
    )
    out = out.drop(columns=keys)

    if experiment == "gerrig":
        # Keep only the second response value of each row.
        out['response'] = out['response'].apply(lambda values: [values[1]])
    if experiment == "brewer":
        def reform_id(raw_id):
            # Strip the " Chunks" suffix from both halves of "<experiment_name>,<version>".
            parts = raw_id.split(",")
            return parts[0].split(" Chunks")[0] + "," + parts[1].split(" Chunks")[0]
        out['id'] = out['id'].apply(reform_id)

    return out[['id', 'model', 'response']]

In [None]:
# Human ratings for the Lehne experiment, keyed by "<experiment_name>,<version>"
# (the same id format that get_experiment_name builds for model responses).
# Values are raw ratings; normalize_human_ratings("lehne") binarizes them.
lehne = {
    'Experiment,Normal': [5.565217391, 5, 4.826086957, 5.739130435, 5.52173913, 6.826086957, 7.304347826, 5.434782609, 6.391304348, 7.47826087, 7.043478261, 5.869565217, 6.739130435, 6.956521739, 6.47826087, 5.956521739, 4.652173913, 4.260869565, 5.173913043, 4.086956522, 4.173913043, 4.304347826, 5, 4.043478261, 4.217391304, 4.434782609, 5.347826087, 6.217391304, 5.434782609, 4.782608696, 6.173913043, 5.956521739, 6.47826087, 5, 4.739130435, 5.173913043, 6.304347826, 6.434782609, 5.260869565, 5.304347826, 5.956521739, 4.304347826, 5.260869565, 4.391304348, 4.956521739, 5.695652174, 5.043478261, 5.826086957, 5.043478261, 4.913043478, 5.217391304, 6.217391304, 6.391304348, 6.52173913, 7.217391304, 6.565217391, 5.52173913, 4.347826087, 3.869565217, 7, 7.565217391, 6.52173913, 6.260869565, 6.043478261, 4.913043478]
}

# Workaround: the same global rating series is used as the human ratings for
# every Delatorre condition listed below.
delatorre_global_ratings = [3.34, 3.725, 3.705, 3.89, 4.08, 5.02, 4.87, 4.81, 5.84, 5.77, 6.44, 4.685]
delatorre_unique_categories = [
    'Experiment,Journalistic Bad Not Revealed',
    'Experiment,Journalistic Bad Revealed',
    'Experiment,Journalistic Good Not Revealed',
    'Experiment,Journalistic Good Revealed',
    'Experiment,Novel Bad Not Revealed',
    'Experiment,Novel Bad Revealed',
    'Experiment,Novel Good Not Revealed',
    'Experiment,Novel Good Revealed',
]

# Each condition gets its own copy of the list, so an in-place edit of one
# condition's ratings can never leak into the other seven.
delatorre = {
    category: list(delatorre_global_ratings) for category in delatorre_unique_categories
}

# One human rating per Brewer story, keyed by "<experiment_name>,<version>".
brewer = dict([
    ('Experiment A,American Story Birthday', 3.2),
    ('Experiment A,American Story Flying', 3.6),
    ('Experiment A,American Story Lottery', 4.5),
    ('Experiment A,American Story Old Phoebe', 3.4),
    ('Experiment A,American Story Ylla', 5.1),
])

# Standard suspense / Q2 ratings, keyed by "<experiment_name>,<version>".
# Conditions written as (a + b) / 2 are the mean of two listed ratings.
gerrig = dict([
    ("Experiment A,Pen Not Mentioned", (3.78 + 3.43) / 2),
    ("Experiment A,Pen Mentioned Removed", (4.38 + 4.06) / 2),
    ("Experiment A,Pen Mentioned Not Removed", 3.47),
    ("Experiment B,Unused Comb", 3.96),
    ("Experiment B,Used Comb", 3.41),
    ("Experiment C,Prior Solution Not Mentioned", (3.76 + 3.34) / 2),
    ("Experiment C,Prior Solution Mentioned and Removed", (4.61 + 3.99) / 2),
    ("Experiment C,Prior Solution Mentioned Not Removed", 4.14),
])

def normalize_scalar(n, experiment):
    """Binarize one rating against the midpoint of the experiment's Likert scale.

    Args:
        n: A single numeric rating (may be NaN / pd.NA).
        experiment: Key of ``LIKERT_BOUNDS`` giving the scale's [low, high] bounds.

    Returns:
        1 if ``n`` lies above the midpoint, 0 if below, and ``pd.NA`` if it is
        exactly at the midpoint or missing.
    """
    low, high = LIKERT_BOUNDS[experiment]
    # The midpoint of a [low, high] scale is the mean of its bounds (4 on a 1-7
    # scale); (high - low) / 2 is the half-range (3) and would put the
    # threshold one point too low.
    likert_midpoint = (low + high) / 2
    if pd.isna(n):
        # Comparing pd.NA is ambiguous in an `if`; treat missing as undecided.
        return pd.NA
    if n > likert_midpoint:
        return 1
    if n < likert_midpoint:
        return 0
    return pd.NA

def normalize_human_ratings(experiment):
    """Return the binarized human ratings of one experiment.

    The raw ratings are read from the module-level dict named after the
    experiment (e.g. ``gerrig``). That dict is left untouched, so calling this
    function again (for example by re-running the cell) gives the same result
    instead of binarizing already-binarized values.

    Args:
        experiment: Name of the module-level ratings dict; also passed to
            ``normalize_scalar`` to select the Likert scale.

    Returns:
        New dict mapping each key of the raw dict to a list of values produced
        by ``normalize_scalar``. A scalar rating becomes a one-element list.
    """
    exp_dict = globals()[experiment]
    normalized = {}
    for key, rating in exp_dict.items():
        values = rating if isinstance(rating, (list, tuple)) else [rating]
        normalized[key] = [normalize_scalar(value, experiment) for value in values]
    return normalized

In [None]:
# Binarized human ratings, one dict per experiment.
gerrig_human_ratings = normalize_human_ratings(experiment="gerrig")
brewer_human_ratings = normalize_human_ratings(experiment="brewer")
lehne_human_ratings = normalize_human_ratings(experiment="lehne")
delatorre_human_ratings = normalize_human_ratings(experiment="delatorre")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Calculate accuracy, precision, recall, and F1 score

# Binarized model responses per experiment; brewer reads its own list of runs.
gerrig_experiment_data = get_experiment_name(experiment="gerrig", runs=other_sources)
brewer_experiment_data = get_experiment_name(experiment="brewer", runs=brewer_sources)
lehne_experiment_data = get_experiment_name(experiment="lehne", runs=other_sources)
delatorre_experiment_data = get_experiment_name(experiment="delatorre", runs=other_sources)

def get_metrics(experiment_data, human_ratings, experiment_name):
    """Score binarized model responses against binarized human ratings.

    Each row's ``response`` list is compared position by position with the
    human ratings stored under the row's ``id``. Positions where either side is
    missing (NaN / pd.NA) are skipped. The per-row results are written into
    ``experiment_data`` in place as the columns ``accuracy``, ``precision``,
    ``recall``, ``f1``, ``num_valid`` and ``num_invalid``.

    Args:
        experiment_data: DataFrame with columns ``id``, ``model`` and
            ``response`` (list of 1 / 0 / NaN). Modified in place.
        human_ratings: Dict mapping an id to a list of 1 / 0 / NA ratings. A
            one-element list is applied to every prediction of the row.
        experiment_name: Unused; kept so existing calls keep working.

    Returns:
        DataFrame indexed by model with the mean and std of accuracy,
        precision, recall and f1, the summed num_valid / num_invalid counts,
        and a final ``'Average'`` row holding the mean of every column.

    Raises:
        KeyError: If a row's ``id`` has no entry in ``human_ratings``.
    """
    metric_columns = ['accuracy', 'precision', 'recall', 'f1', 'num_valid', 'num_invalid']
    metric_rows = []

    for _, row in experiment_data.iterrows():
        ground_truth = human_ratings[row['id']]
        predictions = row['response']

        # A single human rating is the reference for every prediction of the row.
        if len(ground_truth) == 1 and len(predictions) > 1:
            ground_truth = ground_truth * len(predictions)

        tp = tn = fp = fn = 0
        # zip() compares the positions both lists have. Indexing predictions by
        # the ground-truth length raised IndexError whenever a model returned
        # fewer values than there are human ratings.
        for truth, predicted in zip(ground_truth, predictions):
            if pd.isna(truth) or pd.isna(predicted):
                continue
            if truth == 1 and predicted == 1:
                tp += 1
            elif truth == 0 and predicted == 0:
                tn += 1
            elif truth == 0 and predicted == 1:
                fp += 1
            elif truth == 1 and predicted == 0:
                fn += 1

        compared = tp + tn + fp + fn
        if compared != 0:
            accuracy = (tp + tn) / compared
            # Undefined ratios (empty denominator) are reported as 0.
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            num_invalid = sum(1 for value in predictions if pd.isna(value))
            num_valid = len(predictions) - num_invalid
        else:
            # Nothing could be compared: no metrics, every prediction counts as invalid.
            accuracy = precision = recall = f1 = float('nan')
            num_valid = 0
            num_invalid = len(predictions)

        metric_rows.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'num_valid': num_valid,
            'num_invalid': num_invalid,
        })

    # Write whole columns at once instead of cell by cell.
    per_row = pd.DataFrame(metric_rows, index=experiment_data.index, columns=metric_columns, dtype=float)
    for column in metric_columns:
        experiment_data[column] = per_row[column]

    metrics_summary = experiment_data.groupby(['model']).agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'num_valid': ['sum'],
        'num_invalid': ['sum']
    })

    # Add a summary row with the mean of every column across models.
    metrics_summary.loc['Average'] = metrics_summary.mean()

    return metrics_summary


# Per-model metric summaries; each call also adds per-row metric columns to its
# *_experiment_data frame.
brewer_metrics = get_metrics(
    experiment_data=brewer_experiment_data, human_ratings=brewer_human_ratings, experiment_name="brewer"
)
gerrig_metrics = get_metrics(
    experiment_data=gerrig_experiment_data, human_ratings=gerrig_human_ratings, experiment_name="gerrig"
)
delatorre_metrics = get_metrics(
    experiment_data=delatorre_experiment_data, human_ratings=delatorre_human_ratings, experiment_name="delatorre"
)
lehne_metrics = get_metrics(
    experiment_data=lehne_experiment_data, human_ratings=lehne_human_ratings, experiment_name="lehne"
)

In [None]:
def format_model_name(model : str):
    """Turn a model identifier into a short display name.

    Keeps the part after the last underscore, cuts everything from the first
    "-Instruct", "-chat" or "-it" onwards, replaces hyphens with spaces and
    upper-cases the first character. The summary labels "Average" and
    "Consensus" are returned unchanged.

    Args:
        model: Model identifier, or one of the summary labels.

    Returns:
        The display name; an empty string if nothing is left after stripping.
    """
    excluded = ["Average", "Consensus"]
    if model in excluded:
        return model
    model_name = model.split("_")[-1]
    for marker in ("-Instruct", "-chat", "-it"):
        model_name = model_name.split(marker)[0]
    model_name = model_name.replace("-", " ")
    # Slicing instead of indexing [0]: an empty name yields "" rather than IndexError.
    return model_name[:1].upper() + model_name[1:]

def prettify_table(table):
    """Format a metrics summary for display.

    For accuracy, precision, recall and f1 the ('mean', 'std') column pair is
    merged into one "mean ± std" string with two decimals each. All other
    columns are kept as they are under their top-level name. Row labels are
    passed through ``format_model_name``. The input table is not modified.

    Args:
        table: DataFrame with two-level columns (metric, statistic), indexed by
            model, as returned by ``get_metrics``.

    Returns:
        DataFrame with single-level columns, indexed by the formatted model
        name (index name ``'model'``).
    """
    numerics = ['accuracy', 'precision', 'recall', 'f1']

    def format_mean_std(mean, std):
        # A missing std is shown as zero spread, with the same two decimals
        # as every other cell.
        spread = 0.0 if pd.isna(std) else std
        return f"{mean:.2f} ± {spread:.2f}"

    # Build the output column by column instead of assigning one Series across
    # both sub-columns of a metric and then dropping one of them.
    columns = {}
    for metric, stat in table.columns:
        if metric not in numerics:
            columns[metric] = table[(metric, stat)].to_numpy()
        elif stat == 'mean':
            columns[metric] = [
                format_mean_std(mean, std)
                for mean, std in zip(table[(metric, 'mean')], table[(metric, 'std')])
            ]
        # The 'std' sub-column is already folded into the formatted string.

    pretty = pd.DataFrame(columns, index=[format_model_name(model) for model in table.index])
    pretty.index.name = 'model'
    return pretty

In [None]:
# Display-ready version of each experiment's metric summary.
pretty_gerrig = prettify_table(table=gerrig_metrics)
pretty_brewer = prettify_table(table=brewer_metrics)
pretty_delatorre = prettify_table(table=delatorre_metrics)
pretty_lehne = prettify_table(table=lehne_metrics)

In [None]:
# Shows one formatted table; the commented names are the other tables built above.
# pretty_gerrig
# pretty_brewer
# pretty_delatorre
pretty_lehne

In [None]:
def join_with_single_metric(tables : list[pd.DataFrame], metric : str, labels : list[str] = None):
    """Place one metric column from several tables side by side.

    Args:
        tables: DataFrames sharing a row index (e.g. model names); each is
            expected to contain a column named ``metric``.
        metric: Name of the column to take from every table.
        labels: Optional new names for the selected columns, one per table and
            in the same order as ``tables``.

    Returns:
        DataFrame whose columns are a positional ``'index'`` counter, the
        former row index as a column, then one column per table.
    """
    out = pd.concat(tables, axis=1)
    # The concatenated frame has one column called `metric` per table; selecting
    # with a list returns all of them.
    out = out[[metric]]
    if labels:
        out.columns = labels
    # Two resets on purpose: the first turns the shared row index into a column,
    # the second adds the positional 'index' column that as_latex() drops.
    # (A former loop rebinding `col = col.upper()` changed nothing and could
    # only fail on non-string labels, so it is gone.)
    return out.reset_index().reset_index()

def as_latex(table):
    """Render a joined metrics table as LaTeX source.

    Args:
        table: DataFrame, typically from ``join_with_single_metric``. A
            positional ``'index'`` column is removed if present, and a
            ``'model'`` column is shown as ``'Model'``.

    Returns:
        The LaTeX ``tabular`` source as a string. Row labels are omitted and
        cell text is not escaped.
    """
    # errors='ignore': tables that never got the extra 'index' column render too.
    table = table.drop(columns=['index'], errors='ignore')
    table = table.rename(columns={'model': 'Model'})
    return table.to_latex(index=False, escape=False)

# F1 column of every experiment's formatted table, side by side.
experiment_labels = ['Gerrig', 'Brewer', 'Delatorre', 'Lehne']
pretty_tables = [pretty_gerrig, pretty_brewer, pretty_delatorre, pretty_lehne]
joined = join_with_single_metric(pretty_tables, 'f1', labels=experiment_labels)

In [None]:
# Emit the LaTeX source of the joined F1 table.
latex_source = as_latex(joined)
print(latex_source)

In [None]:
# Per-row view of the Delatorre data; get_metrics() added the metric columns to this frame in place.
delatorre_experiment_data