In [None]:
import glob
import json
import math
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
from tqdm import tqdm

def get_cruxeval(type, temp='temp0.2'):
    """Collect per-example results from every matching evaluation file.

    Reads each ``evaluation_results/<model>_<temp>_<type>.json`` file, takes its
    ``raw_scored_generations`` mapping and emits one record per example id,
    using the first element of that example's entry as the result.

    Parameters
    ----------
    type : str
        Suffix of the result files to load (the notebook calls this with
        ``'input'`` and ``'output'``). The name shadows the builtin but is
        kept for compatibility with existing callers.
    temp : str, optional
        Hyperparameter tag embedded in the file name; defaults to the
        ``'temp0.2'`` tag the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``benchmark``, ``model``, ``example``, ``result`` and
        ``hyperparams``. Empty (but with those columns) when no file matches.

    Notes
    -----
    Relies on the notebook-provided ``display`` to show a summary table.
    """
    columns = ['benchmark', 'model', 'example', 'result', 'hyperparams']
    suffix = f"_{temp}_{type}.json"
    records = []
    # Sorted so the row order does not depend on directory listing order.
    for fname in sorted(glob.glob(f"evaluation_results/*{suffix}")):
        name = os.path.basename(fname)
        # Strip the known suffix instead of splitting on '_': model names may
        # themselves contain underscores.
        model = name[:-len(suffix)]
        print(model, temp, type)

        with open(fname) as f:
            res = json.load(f)['raw_scored_generations']
        for exid, generations in res.items():
            records.append({
                'benchmark': f'cruxeval_{type}',
                'model': model,
                'example': f'{exid}_{type}',
                # Only the first entry of each example is used.
                'result': generations[0],
                'hyperparams': temp
            })
    df = pd.DataFrame.from_records(records, columns=columns)
    # describe() is pointless on an empty frame; skip it rather than risk an error.
    if not df.empty:
        display(df.describe())
    return df
        

# Build both result frames BEFORE opening any output file: opening with 'w'
# truncates immediately, so loading inside the `with` would wipe a previous
# good export whenever get_cruxeval raised.
dfi = get_cruxeval('input')
dfo = get_cruxeval('output')

# One JSON record per line for each task type.
with open('cruxeval_input.jsonl', 'w') as f:
    f.write(dfi.to_json(orient='records', lines=True))

with open('cruxeval_output.jsonl', 'w') as f:
    f.write(dfo.to_json(orient='records', lines=True))

# Combined frame consumed by the following cells.
result = pd.concat([dfi, dfo])
display(result)


In [None]:
def result_to_battle(result: pd.DataFrame):
    """Pair every two rows that share an ``example`` and label the outcome.

    The frame is merged with itself on ``example``; left-hand columns get the
    ``_a`` suffix and right-hand ones ``_b``. A ``winner`` column is added:

    * ``'model_a'`` / ``'model_b'`` - exactly that side's result is True
    * ``'both'`` / ``'neither'``    - both sides True / both sides False
    * ``'a'``                       - placeholder left on rows matching none
      of the above (e.g. a result that is neither True nor False)

    Because it is a self-merge, each model is also paired with itself and
    every pair shows up in both orders.
    """
    pairs = result.merge(result, on=['example'], suffixes=["_a", "_b"], how='outer')

    a_right = pairs['result_a'] == True
    a_wrong = pairs['result_a'] == False
    b_right = pairs['result_b'] == True
    b_wrong = pairs['result_b'] == False

    # The four masks are mutually exclusive, so assignment order is irrelevant.
    outcome_masks = {
        'model_a': a_right & b_wrong,
        'model_b': a_wrong & b_right,
        'neither': a_wrong & b_wrong,
        'both': a_right & b_right,
    }

    pairs['winner'] = 'a'
    for label, mask in outcome_masks.items():
        pairs.loc[mask, 'winner'] = label
    return pairs

# Self-join the combined results into one row per (model_a, model_b, example)
# with a `winner` label; later cells read `battles`.
battles = result_to_battle(result)

In [None]:


def visualize_battle_count(battles, title, show_num_models=30):
    """Heatmap of how many rows exist for each (model_a, model_b) pair.

    Parameters
    ----------
    battles : pandas.DataFrame
        Must contain ``model_a`` and ``model_b`` columns.
    title : str
        Figure title.
    show_num_models : int, optional
        Number of models to show; models are ordered by their column total,
        smallest first, and the first ``show_num_models`` are kept.

    Returns
    -------
    The figure produced by ``px.imshow``.
    """
    counts = pd.pivot_table(
        battles,
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )

    # Column totals, ascending, truncated to the requested number of models.
    shown = counts.sum().sort_values(ascending=True).index[:show_num_models]

    heatmap = px.imshow(counts.loc[shown, shown], title=title, text_auto=True)
    heatmap.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        height=800,
        width=800,
        title_y=0.07,
        title_x=0.5,
        font=dict(size=10),
    )
    heatmap.update_traces(
        hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>"
    )
    return heatmap

# Decisive rows only: keep battles whose winner label contains "model_"
# (i.e. 'model_a' / 'model_b'), dropping the tie and placeholder labels.
battles_no_ties = battles.loc[battles["winner"].str.contains("model_")]
# fig = visualize_battle_count(battles_no_ties, "No ties")
# # fig

In [None]:

def compute_pairwise_win_fraction(battles, max_num_models=30):
    """Pairwise win rates and McNemar-style p-values between models.

    Parameters
    ----------
    battles : pandas.DataFrame
        Needs ``model_a``, ``model_b`` and ``winner`` columns; a row counts as
        a win for the row model when ``winner == "model_a"``.
    max_num_models : int, optional
        Keep only this many models, ranked by mean win rate (best first).

    Returns
    -------
    row_beats_col : pandas.DataFrame
        Fraction of (row, column) battles won by the row model.
    mcnemar : pandas.DataFrame
        Upper-tail chi-square (1 d.o.f.) probability of ``diffs**2 / sums``;
        NaN where ``sums`` is 0.
    diffs : pandas.DataFrame
        ``wins - wins.T``, where ``wins[r, c]`` counts row-beats-column.
    sums : pandas.DataFrame
        ``wins + wins.T``.
    """
    # Table counting number of A-B pairs
    num_battles_ptbl = pd.pivot_table(
        battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)

    # Times each model wins as Model A. A model that never wins has no row
    # here (and one that never loses has no column), so align the table to the
    # battle-count table and fill the gaps with 0. Without this the later
    # .loc lookup raises KeyError and the affected win rates are NaN, not 0.
    a_wins = battles[battles['winner'] == "model_a"]
    if a_wins.empty:
        a_win_ptbl = pd.DataFrame(
            0, index=num_battles_ptbl.index, columns=num_battles_ptbl.columns)
    else:
        a_win_ptbl = pd.pivot_table(
            a_wins, index="model_a", columns="model_b", aggfunc="size", fill_value=0
        ).reindex(index=num_battles_ptbl.index,
                  columns=num_battles_ptbl.columns,
                  fill_value=0)

    # Proportion of battles won by the row model against each column model
    # (0/0 for pairs that never met stays NaN).
    row_beats_col_freq = a_win_ptbl / num_battles_ptbl

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.iloc[:max_num_models].index)
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]

    wins = a_win_ptbl.loc[model_names, model_names]
    diffs = wins - wins.T
    sums = wins + wins.T
    statistic = diffs ** 2 / sums
    # sf() is the survival function 1 - cdf, evaluated without the
    # cancellation error of the explicit subtraction for small p-values.
    mcnemar = pd.DataFrame(
        stats.chi2.sf(statistic.to_numpy(dtype=float), 1),
        index=statistic.index, columns=statistic.columns)

    return row_beats_col, mcnemar, diffs, sums

def visualize_pairwise_win_fraction(battles, title, max_num_models=30):
    """Heatmap of pairwise p-values with win-rate details on hover.

    The image itself shows the ``mcnemar`` table returned by
    ``compute_pairwise_win_fraction``; the hover box additionally shows the
    win rate, the win difference and the win sum for the hovered pair.

    Parameters
    ----------
    battles : pandas.DataFrame
        Passed through to ``compute_pairwise_win_fraction``.
    title : str
        Figure title.
    max_num_models : int, optional
        Passed through to ``compute_pairwise_win_fraction``.

    Returns
    -------
    The figure produced by ``px.imshow``.
    """
    row_beats_col, mcnemar, diffs, sums = compute_pairwise_win_fraction(battles, max_num_models)
    fig = px.imshow(mcnemar,
                    text_auto=".2f", title=title)
    fig.update_layout(xaxis_title=" Model B: Loser",
                  yaxis_title="Model A: Winner",
                  xaxis_side="top", height=900, width=900,
                  title_y=0.07, title_x=0.5)

    # Stack the four tables into an (n_rows, n_cols, 4) array aligned with the
    # plotted image. The previous concat/stack/groupby(tuple) construction
    # could drop NaN entries while stacking, which shortened the tuples and
    # made customdata[i] refer to the wrong quantity for those cells.
    layers = (row_beats_col, mcnemar, diffs, sums)
    extra_info = np.dstack([
        layer.loc[mcnemar.index, mcnemar.columns].to_numpy(dtype=float)
        for layer in layers
    ])

    fig.update_traces(customdata=extra_info, hovertemplate=
        "Model A: %{y}<br>Model B: %{x}<br>p-value: %{customdata[1]}<br> A beats B rate: %{customdata[0]} <br>diffs %{customdata[2]} <br>sums %{customdata[3]}")

    return fig

# Render the pairwise p-value heatmap using decisive (non-tied) battles only;
# the bare `fig` on the last line makes the notebook display it.
fig = visualize_pairwise_win_fraction(battles_no_ties, 'test', max_num_models=60)
fig
# fig.to_html('mcnemar_test.html')

In [None]:
# Scatter of win difference (x) against win sum (y) for every ordered model
# pair, computed on all battles (ties included).
row_beats_col, mcnemar, diffs, sums = compute_pairwise_win_fraction(battles, max_num_models=65)
# display(diffs)

# Long format: one row per (model_a, model_b); after the merge `value_x`
# holds the entry from `diffs` and `value_y` the entry from `sums`.
df = diffs.reset_index().melt(id_vars='model_a').merge(sums.reset_index().melt(id_vars='model_a'), on=['model_a', 'model_b'])
# diffs is antisymmetric, so keeping positive values shows each pair once,
# from the side of the model with more wins.
df = df[df['value_x'] > 0]
fig = px.scatter(df[df['model_a'] != df['model_b']], x='value_x', y='value_y',
                 custom_data=['model_a', 'model_b'],
                 labels={'value_x': 'wins(A) - wins(B)', 'value_y': 'wins(A) + wins(B)'})
display(df)
# Hover label previously read "dModel A" (stray leading character).
fig.update_traces(hovertemplate=
        "Model A: %{customdata[0]}<br>Model B: %{customdata[1]}<br>")
fig