In [None]:
import matplotlib.pyplot as plt
import mpl_lego as mplego
import numpy as np
import pandas as pd

from mpl_lego.labels import bold_text, apply_subplot_labels
from pyprojroot import here

from tueplots.constants.color import palettes

from normative_evaluation_llms_everyday_dilemmas import keys


In [None]:
# Switch matplotlib to mpl_lego's LaTeX style for all figures created below.
# A working LaTeX installation is required for this call and later rendering.
mplego.style.use_latex_style()

In [None]:
# Color palette for plots (Paul Tol "muted" palette from tueplots).
# Entries are indexed per series and prefixed with '#' when passed to
# matplotlib in the plotting cell below.
cycle = palettes.paultol_muted

In [None]:
# Load the dilemma dataset; `here` resolves the path relative to the project root
dataset_path = here('data/normative_evaluation_everyday_dilemmas_dataset.csv')
df = pd.read_csv(dataset_path)

In [None]:
# Throw out extra labels (the 4th and 5th gemma label columns).
# NOTE(review): presumably dropped so gemma has the same number of runs as the
# other models, because later cells select columns by the '<llm>_label' prefix
# -- confirm.
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so this cell can be re-run on the same kernel without a KeyError.
df = df.drop(columns=['gemma_label_4', 'gemma_label_5'], errors='ignore')

In [None]:
# For every row, tally how many of the label columns in keys.LABEL_COLS[1:]
# equal each AITA label, and store the tally as a '<LABEL>_votes' column.
# NOTE(review): the first entry of keys.LABEL_COLS is skipped; presumably it is
# not a model label -- confirm.
model_label_columns = df[keys.LABEL_COLS[1:]]
for AITA_LABEL in keys.AITA_LABELS:
    vote_column = f'{AITA_LABEL}_votes'
    matches_label = model_label_columns.eq(AITA_LABEL)
    df[vote_column] = matches_label.sum(axis=1)

In [None]:
# Extract subsets of columns for each model
def _votes_per_model(frame, label, llms):
    """Count, per model, how many of its label columns equal ``label`` in each row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns include the ``<llm>_label...`` columns of each model.
    label : str
        Label value to count (e.g. ``'NTA'``).
    llms : iterable of str
        Model names; each is used as the prefix of that model's label columns.

    Returns
    -------
    dict
        Maps each model name to a Series holding, for every row of ``frame``,
        the number of that model's label columns equal to ``label``.
    """
    votes = {}
    for llm in llms:
        prefix = f'{llm}_label'
        # Match the prefix literally. Interpolating the model name into a regex
        # would treat characters such as '.' or '+' in the name as
        # metacharacters and could select the wrong columns.
        run_columns = [col for col in frame.columns if str(col).startswith(prefix)]
        votes[llm] = (frame[run_columns] == label).sum(axis=1)
    return votes


nta_votes = _votes_per_model(df, 'NTA', keys.LLMs)

yta_votes = _votes_per_model(df, 'YTA', keys.LLMs)

esh_votes = _votes_per_model(df, 'ESH', keys.LLMs)

nah_votes = _votes_per_model(df, 'NAH', keys.LLMs)

In [None]:
# Mean comment agreement among rows with a given vote count
def categorical_proportion(records, AITA_LABEL, n_votes):
    """Average the weighted comment agreement over rows with ``n_votes`` votes.

    Parameters
    ----------
    records : pandas.DataFrame
        Must contain ``'<AITA_LABEL>_votes'`` and
        ``'comments_<aita_label>_agreement_weighted'`` columns.
    AITA_LABEL : str
        Label name; used verbatim for the votes column and lower-cased for the
        agreement column.
    n_votes : int
        Vote count to select rows by.

    Returns
    -------
    float
        Mean of the agreement column over the selected rows (NaN when no row
        has exactly ``n_votes`` votes).
    """
    vote_column = f'{AITA_LABEL}_votes'
    agreement_column = f'comments_{AITA_LABEL.lower()}_agreement_weighted'
    has_n_votes = records[vote_column] == n_votes
    selected_agreement = records[agreement_column][has_n_votes]
    return np.mean(selected_agreement)


In [None]:
# 2x2 grid of panels (7x5 inches) that all share a single y-axis.
# Three panels are drawn into; the top-right one is switched off further down.
fig, axes = plt.subplots(2, 2, figsize=(7, 5), sharey=True)

# Styling constants used by the plotting and labelling calls below
LINE_WIDTH = 2
X_LABEL_FONT_SIZE = 12
Y_LABEL_FONT_SIZE = 12

# Subplot (a): average label rate as a function of how many label columns
# (keys.LABEL_COLS[1:]) carry each of the first four AITA labels
all_label_columns = df[keys.LABEL_COLS[1:]]
for idx, label in enumerate(keys.AITA_LABELS[:4]):
    # Per-row count of label columns equal to this label
    votes = all_label_columns.eq(label).sum(axis=1)
    rate_column = f'comments_{label.lower()}_agreement_weighted'
    # Mean label rate among rows with exactly n_votes votes, for 0..7 votes
    centers = [
        np.mean(df.loc[votes == n_votes, rate_column])
        for n_votes in range(8)
    ]

    # No error values are supplied, so errorbar draws only the marked line
    axes[0, 0].errorbar(
        np.arange(8),
        centers,
        marker='o',
        linewidth=LINE_WIDTH,
        color=f'#{cycle[idx]}',
        label=bold_text(label))

def _plot_self_consistency(ax, frame, votes_by_llm, rate_column, add_legend_labels=False):
    """Draw one self-consistency curve per model on ``ax``.

    For each model in ``keys.LLMs`` the curve shows the mean of ``rate_column``
    over the rows where that model voted for the label 0, 1, 2 and 3 times.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Panel to draw into.
    frame : pandas.DataFrame
        Table containing ``rate_column``.
    votes_by_llm : dict
        Maps model name to a per-row vote-count Series aligned with ``frame``.
    rate_column : str
        Column of ``frame`` to average.
    add_legend_labels : bool, optional
        When True, attach each model's display name so a legend can be built
        from this panel.
    """
    for idx, llm in enumerate(keys.LLMs):
        model_votes = votes_by_llm[llm]
        agreements = [np.mean(frame.loc[model_votes == n_votes, rate_column])
                      for n_votes in range(4)]
        legend_kwargs = {}
        if add_legend_labels:
            # NOTE(review): both the color cycle and MODEL_LABELS_PLOT are
            # offset by one; presumably index 0 is reserved for a non-model
            # entry -- confirm.
            legend_kwargs['label'] = bold_text(keys.MODEL_LABELS_PLOT[idx + 1])
        ax.plot(np.arange(4),
                agreements,
                marker='o',
                linewidth=LINE_WIDTH,  # shared constant, so all panels stay in sync
                color=f'C{idx+1}',
                **legend_kwargs)


# Subplot (b): self-consistency for NTA
_plot_self_consistency(axes[1, 0], df, nta_votes, 'comments_nta_agreement_weighted')

# Subplot (c): self-consistency for YTA (this panel carries the model legend labels)
_plot_self_consistency(axes[1, 1], df, yta_votes, 'comments_yta_agreement_weighted',
                       add_legend_labels=True)

# Panel (a) cosmetics: one tick per vote count (0-7), horizontal grid lines
# drawn beneath the data, and a legend positioned in figure coordinates.
axes[0, 0].set_xticks(np.arange(8))
axes[0, 0].grid(axis='y')
axes[0, 0].set_axisbelow(True)
axes[0, 0].set_ylabel(bold_text('Average Label Rate'), fontsize=Y_LABEL_FONT_SIZE)
# NOTE(review): the anchor (0.49, 0.80) is in figure coordinates and presumably
# places the legend over the blank top-right panel -- confirm visually.
axes[0, 0].legend(loc='center left',
                  bbox_to_anchor=(0.49, 0.80),
                  bbox_transform=fig.transFigure)

# Top row: pad the 0-7 vote range slightly on both sides
for ax in axes[0]:
    ax.set_xlim([-0.25, 7.25])

# Bottom row: the x-axis shows 0-3 votes, labelled as fractions of 3
for ax in axes[1]:
    ax.set_xlim([-0.25, 3.25])
    ax.set_ylim([0, 1])
    ax.set_xticks([0, 1, 2, 3])
    ax.set_xticklabels(bold_text(['0/3', '1/3', '2/3', '3/3']))
    ax.grid(axis='y')
    ax.set_axisbelow(True)

# NOTE(review): the figure was created with sharey=True, so this limit likely
# propagates to every panel and supersedes the [0, 1] limits set in the loop
# above -- confirm that is intended.
axes[0, 0].set_ylim([0, 1.03])
# Hide the top-right panel, which has no data
axes[0, 1].axis('off')
# Raise panel (c) in the draw order
# NOTE(review): presumably so its figure-anchored legend is not covered by
# other axes -- confirm.
axes[1, 1].set_zorder(1000)
axes[1, 1].legend(loc='center left', bbox_to_anchor=(0.67, 0.57), bbox_transform=fig.transFigure)
axes[1, 0].set_xlabel(bold_text('Number of Votes for NTA\n(Self-Consistency)'), fontsize=X_LABEL_FONT_SIZE)
axes[1, 1].set_xlabel(bold_text('Number of Votes for YTA\n(Self-Consistency)'), fontsize=X_LABEL_FONT_SIZE)


axes[0, 0].set_xlabel(bold_text('Number of Votes Across Models'), fontsize=X_LABEL_FONT_SIZE)
axes[1, 0].set_ylabel(bold_text('Average Label Rate'), fontsize=Y_LABEL_FONT_SIZE)

# Add bold subplot labels to the three populated panels, offset above each
# panel's top-left corner
apply_subplot_labels(
    [axes[0, 0], axes[1, 0], axes[1, 1]],
    x=-0.05,
    y=1.14,
    bold=True)

# Spacing between panels: extra vertical room for the two-line x-labels
plt.subplots_adjust(hspace=0.6, wspace=0.15)

# Write the figure as a PDF; the relative path resolves against the current
# working directory, and bbox_inches='tight' trims the surrounding whitespace
plt.savefig('fig3_repeated_runs.pdf', bbox_inches='tight')