In [None]:
# data formatting scripts

import glob
import json
import math
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import seaborn as sns
from tqdm import tqdm

In [None]:
# Tab-separated model metadata file read by load_info().
modelinfo_path = 'model_info.tsv'

def load_info(path=None):
    """Load the model metadata table and fill in missing display names.

    :param path: tab-separated file to read; when omitted, the module-level
        ``modelinfo_path`` is used.
    :return: the DataFrame parsed by ``pd.read_csv`` in which every missing
        ``display_name`` has been replaced by that row's ``model`` value.
    :raises KeyError: if the file has no ``display_name`` or ``model`` column.
    """
    if path is None:
        path = modelinfo_path
    df_info = pd.read_csv(path, sep='\t')

    # Column-wise fill instead of a row-wise apply that mutates each row:
    # pandas does not support mutating the object handed to apply().
    # Casting to object first lets an all-empty (float NaN) column receive strings.
    display_name = df_info['display_name'].astype(object)
    df_info['display_name'] = display_name.where(display_name.notna(), df_info['model'])
    return df_info

# Load the model metadata once and show it.
df_info = load_info()
display(df_info)

# Dictionary keyed by the `model` column whose values are the matching `display_name`.
name_translation = df_info.set_index('model')['display_name'].to_dict()
print(name_translation)

In [None]:
def pass_at_k(n, c, k):
    """
    Unbiased pass@$k$ estimate, 1 - C(n - c, k) / C(n, k), evaluated as a
    running product so no large binomial coefficients are formed.

    :param n: total number of samples
    :param c: number of correct samples
    :param k: k in pass@$k$
    :return: 1.0 when fewer than k samples are incorrect, otherwise the estimate
    """
    # Counts may arrive as floats carrying a rounding error (e.g. n * mean).
    # A non-integral start would make np.arange yield the wrong number of
    # factors, so snap both counts to the nearest integer first.
    n = int(round(n))
    c = int(round(c))
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

def get_cruxeval(task, temp, results_dir="../evaluation_results"):
    """Load per-example scores for one task/temperature and compute pass@1 and pass@5.

    :param task: task name as spelled in the result file names (e.g. 'input')
    :param temp: sampling temperature exactly as spelled in the file names
        (a string such as '0.2'); stored in the records as ``float(temp)``
    :param results_dir: directory searched for ``*_temp{temp}_{task}.json`` files
    :return: list of dicts, one per (model, example), with keys
        ``model, id, pass1, pass5, task, t, n, c``; ``pass5`` is ``None`` when an
        example has fewer than 5 scored generations
    :raises ValueError: if an example has no scored generations at all
    """
    suffix = f"_temp{temp}_{task}.json"
    records = []
    # Sorted so the record order does not depend on directory listing order.
    for fname in sorted(glob.glob(os.path.join(results_dir, f"*{suffix}"))):
        # Everything in front of the fixed suffix is the model name, so model
        # names that themselves contain '_' are handled.
        model = os.path.basename(fname)[:-len(suffix)]
        print(model, temp, task)

        with open(fname) as f:
            res = json.load(f)['raw_scored_generations']

        for exid, scores in res.items():
            n = len(scores)
            if n == 0:
                raise ValueError(f"no scored generations for example {exid!r} in {fname}")
            # Count the successes directly rather than rebuilding them as n * mean,
            # which can be off by a rounding error.
            # NOTE(review): assumes each score is a boolean / 0-1 value -- confirm.
            c = int(np.sum(scores))
            pass1 = pass_at_k(n, c, 1)
            pass5 = pass_at_k(n, c, 5) if n >= 5 else None

            records.append({
                'model': model,
                'id': str(exid),
                'pass1': pass1,
                'pass5': pass5,
                'task': task,
                't': float(temp),
                'n': n,
                'c': c,
            })
    return records
# Collect the records of both tasks at both temperatures, in the order
# input/0.2, input/0.8, output/0.2, output/0.8.
records = [
    record
    for task_name, temperature in [('input', '0.2'), ('input', '0.8'), ('output', '0.2'), ('output', '0.8')]
    for record in get_cruxeval(task_name, temperature)
]
df_results = pd.DataFrame(records)

# Model names found in the results, and the display names of those that have a translation.
models = set(df_results['model'])
models_trans = {name_translation[m] for m in models if m in name_translation}

In [None]:
def get_wide(df):
    """Pivot the long results table into one row per example id.

    pass1, n and c are taken from the rows with t == 0.2 and pass5 from the
    rows with t == 0.8; each becomes a (value, model, task) column and the two
    pivots are joined on 'id'.
    """
    key_cols = ['id', 'task', 'model']
    by_model_task = ['model', 'task']

    lowt_rows = df.loc[df['t'] == 0.2, key_cols + ['pass1', 'n', 'c']]
    hight_rows = df.loc[df['t'] == 0.8, key_cols + ['pass5']]

    wide_lowt = lowt_rows.pivot(index=['id'], columns=by_model_task, values=['pass1', 'n', 'c'])
    wide_hight = hight_rows.pivot(index=['id'], columns=by_model_task, values=['pass5'])
    return wide_lowt.merge(wide_hight, on='id')

# Wide table built from df_results by get_wide: one row per example id.
df_wide_hat = get_wide(df_results)

In [None]:
def doresample(r, task, n_draws=10, rng=None):
    """Re-draw the pass1 entries of one wide-table row from a binomial model.

    For every model that has an ('n', model, task) entry in the row, the
    success rate p = c / n is used to draw ``n_draws`` Bernoulli trials, and
    ('pass1', model, task) is replaced by the fraction of successes.

    :param r: one row of the wide table, indexed by (value, model, task) tuples
    :param task: task whose pass1 entries are re-drawn
    :param n_draws: number of trials drawn per model (10 by default)
    :param rng: optional object with a ``binomial`` method (e.g. a
        ``np.random.Generator``); the global ``np.random`` state is used when omitted
    :return: a copy of ``r`` with the re-drawn pass1 values; ``r`` itself is not modified
    """
    sampler = np.random if rng is None else rng
    # Work on a copy: mutating the row that DataFrame.apply passes in is not supported.
    out = r.copy()

    # Take the model names from the row itself so they always match the keys
    # actually present, rather than relying on a separately maintained global
    # list that may hold different (translated) names.
    row_models = [key[1] for key in r.index if key[0] == 'n' and key[2] == task]
    for model in row_models:
        n = r[('n', model, task)]
        c0 = r[('c', model, task)]
        if pd.isna(n) or pd.isna(c0) or n == 0:
            # No data for this model on this example: leave pass1 as it is.
            continue
        p = c0 / n
        out[('pass1', model, task)] = sampler.binomial(n_draws, p) / n_draws
    return out

def main_results_bars(df_wide, task = 'output', passk='pass1',  B=10000, resample=False):
    """Bootstrap the column means of the wide table for one metric and task.

    Draws B replicates of len(df_wide) rows with replacement, takes the mean
    of every column in each replicate, and keeps the `passk` columns that
    belong to `task`. With resample=True every sampled row is additionally
    passed through doresample before averaging.

    Returns a DataFrame with one row per replicate and one column per model.
    """
    n_rows = len(df_wide)

    def one_replicate():
        # Column means of a single bootstrap draw.
        boot = df_wide.sample(n_rows, replace=True)
        if resample:
            boot = boot.apply(lambda row: doresample(row, task), axis=1)
        return boot.agg('mean', axis=0)

    replicate_means = pd.concat([one_replicate() for _ in range(B)], axis=1).T

    # Selecting the metric leaves (model, task) columns; keep this task only,
    # in sorted order, and drop the now-constant task level.
    metric_means = replicate_means[passk]
    task_columns = sorted(col for col in metric_means.columns if col[1] == task)
    per_model = metric_means[task_columns]
    per_model.columns = per_model.columns.droplevel(1)
    return per_model
    
    
def baseline_plot(data, base=None, task=None, passk=None, suffix=''):
    """Turn a replicates-by-model table into long form for plotting.

    :param data: table with one row per replicate and one column per model
    :param base: optional column name; when given, every other column is
        replaced by its row-wise difference to ``base`` plus the mean of
        ``base``, and ``base`` itself is set to that mean
    :param task: accepted for call compatibility; not used here
    :param passk: label stored in the ``passk`` column of the result
    :param suffix: accepted for call compatibility; not used here
    :return: long DataFrame with columns ``model, value, passk, baseline``,
        models ordered by ascending mean value; columns whose name contains
        ``'_py_'`` are left out
    """
    # Copy so the re-centring below can never write through to the caller's frame.
    main_bars = pd.DataFrame(data).copy()
    if base is not None:
        mean = main_bars[base].mean()
        print(mean)
        base_values = main_bars[base]
        for c in main_bars.columns:
            if c != base:
                main_bars[c] = main_bars[c] - base_values + mean
        main_bars[base] = mean

    means = main_bars.mean().sort_values(ascending=True)
    display(means.index)

    res = main_bars[means.index]
    res = res.drop(columns=[c for c in res.columns if '_py_' in c])

    # Name the index and the melted variable explicitly instead of relying on
    # the caller having set index.name / columns.name beforehand.
    res = res.rename_axis('sample_id').reset_index()
    res = res.melt(id_vars='sample_id', var_name='model', value_name='value')

    return res[['model', 'value']].assign(
        passk=passk,
        baseline=base if base is not None else 'none',
    )

def main_barplot(res, name):
    """Draw horizontal box plots of `value` per `model`, coloured by `passk`.

    Whiskers span the 2.5th-97.5th percentiles and outliers are hidden. The
    figure is titled `name` and written to main_box_{name}.pdf.
    """
    fig, ax = plt.subplots(figsize=(6, 8))
    sns.boxplot(
        data=res,
        y='model',
        x='value',
        hue='passk',
        showfliers=False,
        whis=(2.5, 97.5),
        width=0.5,
        ax=ax,
    )
    ax.grid(True)

    ax.set_title(f'{name}')
    ax.set_xlabel("accuracy")
    ax.set_ylabel("")
    ax.legend(frameon=False)
    fig.savefig(f'main_box_{name}.pdf', bbox_inches='tight')

# display(annotated_res)
# df_wide_hat.describe()

# One figure per task, each combining the bootstrap distributions of pass1 and pass5.
for task in ('output', 'input'):
    res = []
    for passk in ('pass1', 'pass5'):
        main_bars = main_results_bars(df_wide_hat, task=task, passk=passk, B=10000)
        # baseline_plot works with a 'sample_id' column taken from the index.
        main_bars.index.name = 'sample_id'
        res.append(baseline_plot(main_bars, base=None, task=task, passk=passk))

    final = pd.concat(res)
    main_barplot(final, f'{task}')

# for task in ['output']: