In [1]:
from collections import defaultdict
import json, math, glob
import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
from tqdm import tqdm
import math
import os
from report_agg import result_table, pass1_to_battle


In [22]:
def get_anchor(benchmark_id: str, example_id: str):
    # supporting {'humaneval+', 'CRUXEval-input', 'mbpp+', 'CRUXEval-output'}
    def get_link():
        if benchmark_id in ['humaneval', 'humaneval+', 'mbpp', 'mbpp+']:
            dir, id = example_id.split('/') # expecting HumanEval/93 and Mbpp/622 etc.
            return f'https://crux-eval.github.io/eval-arena/evalplus/{dir}/{id}.html'
        elif benchmark_id in ['CRUXEval-input', 'CRUXEval-output']:
            id = example_id.replace(benchmark_id + '/', '')
            return f'https://crux-eval.github.io/demo.html?id={int(id) + 1}'
    return f'<a href="{get_link()}">{example_id}</a>'


def gen_example_table(result, all_stats):
    records = []
    ids = set(result['example_id']) 
    len_data = len(set(result['example_id']))
    print(np.mean(all_stats['elo']))
    
    for current_id in list(ids):
        example_data = result[result['example_id'] == current_id][['model', 'pass1']]
        ex = example_data.merge(all_stats[['model', 'elo']], left_on = 'model', right_on = 'model')
        # fit_data['result'] = fit_data['result']
        from sklearn.linear_model import LogisticRegression
        lr = LogisticRegression()
        ex['correct'] = np.where(ex['pass1'] > 0, 1, 0)
        model_elos = ex[ex['correct'] == 1]['elo']
        # print(model_elos.describe())
        r = model_elos.describe().to_dict()
        r['example_id'] = current_id
        r['models'] = ex[ex['correct'] == 1]['model'].to_numpy()
        r['acc'] = len(ex[ex['correct'] == 1]) / len(ex)
        records.append(r)

    return pd.DataFrame(records)


In [24]:

def get_example_level_results(benchmark_id):
    result = eval_results[eval_results['benchmark_id'] == benchmark_id]

    battles = pass1_to_battle(result)
    battles_no_ties = battles[battles["winner"].str.contains("model_")]
    all_stats = result_table(battles_no_ties, result)
    example_table = gen_example_table(result, all_stats)
    example_table['example_link'] = example_table['example_id'].apply(lambda x: get_anchor(benchmark_id, x))

    outputs = {}
    outputs['result table'] = all_stats.sort_values(by='elo', ascending=False).to_html(float_format='%10.3f')
    outputs['fig_min_elo_solve'] = px.histogram(example_table, x='min', marginal='rug', title='min ELO to solve').to_html(full_html=False)
    
    outputs['table_histogram_accs'] = px.histogram(example_table, x='acc', marginal='rug', title='accuracy on examples').to_html(full_html=False)

    no_solve = example_table[example_table['count'] == 0]
    outputs['list_no_solve'] = sorted(no_solve['example_link'].to_list())
    one_solve = example_table[example_table['count'] == 1]
    display(one_solve)
    one_solve['model'] = one_solve['models'].apply(lambda x: x[0])
    one_solve = one_solve.sort_values(by='max', ascending=False)
    one_solve = one_solve[['example_link', 'model', 'max']]
    display(one_solve)
    outputs['table_one_solve'] = one_solve.to_html(escape=False)

    elo75 = all_stats['elo'].quantile(0.75)
    print(elo75)
    list_suspect = example_table[example_table['max'] < elo75]
    outputs['table_suspect'] = list_suspect[['example_link', 'models', 'max']].to_html(escape=False)
    

    print(outputs.keys())
    return outputs

records = []
for fname in glob.glob(f"data/*.jsonl"):
    with open(fname, 'rt') as f:
        records.extend([json.loads(l) for l in f.readlines()])

eval_results = pd.DataFrame(records)
display(eval_results.describe())

def gen_report(benchmark_id: str):
    outputs = get_example_level_results(benchmark_id)
    from jinja2 import Template
    template_path = r"examplelevel_template.html"
    output_path = rf"crux-eval.github.io/eval-arena/ex_{benchmark_id}.html"
    with open(output_path, "w", encoding="utf-8") as output_file:
        with open(template_path) as template_file:
            j2_template = Template(template_file.read())
            output_file.write(j2_template.render({'benchmark_id': benchmark_id, 'outputs': outputs}))

for b in ['CRUXEval-output', 'CRUXEval-input', 'humaneval+', 'mbpp+']:
    gen_report(b)
    # outputs['fig_unique_solves'] = px.histogram(one_solve, x='model').update_xaxes(categoryorder='total descending')

# outputs['fig_max_elo_solve'] = px.histogram(example_table, x='max', marginal='rug', title='max ELO to solve')
# outputs['fig_median_elo_solve'] = px.histogram(example_table, x='50%', marginal='rug', title='max ELO to solve')

Unnamed: 0,pass1
count,73280.0
mean,0.470794
std,0.47413
min,0.0
25%,0.0
50%,0.3
75%,1.0
max,1.0


997.8160997963109


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models,example_link
0,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/220,[gpt-4-turbo-2024-04-09+cot],"<a href=""https://crux-eval.github.io/demo.html..."
18,1.0,1323.342634,,1323.342634,1323.342634,1323.342634,1323.342634,1323.342634,CRUXEval-output/568,[gpt-3.5-turbo-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
89,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/458,[gpt-4-turbo-2024-04-09+cot],"<a href=""https://crux-eval.github.io/demo.html..."
92,1.0,1215.286056,,1215.286056,1215.286056,1215.286056,1215.286056,1215.286056,CRUXEval-output/211,[gpt-4-turbo-2024-04-09],"<a href=""https://crux-eval.github.io/demo.html..."
96,1.0,1404.875733,,1404.875733,1404.875733,1404.875733,1404.875733,1404.875733,CRUXEval-output/501,[claude-3-opus-20240229+cot],"<a href=""https://crux-eval.github.io/demo.html..."
161,1.0,1215.286056,,1215.286056,1215.286056,1215.286056,1215.286056,1215.286056,CRUXEval-output/556,[gpt-4-turbo-2024-04-09],"<a href=""https://crux-eval.github.io/demo.html..."
182,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/35,[gpt-4-turbo-2024-04-09+cot],"<a href=""https://crux-eval.github.io/demo.html..."
188,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/444,[gpt-4-turbo-2024-04-09+cot],"<a href=""https://crux-eval.github.io/demo.html..."
196,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/484,[gpt-4-turbo-2024-04-09+cot],"<a href=""https://crux-eval.github.io/demo.html..."
198,1.0,1519.130146,,1519.130146,1519.130146,1519.130146,1519.130146,1519.130146,CRUXEval-output/310,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_link,model,max
0,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
182,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
539,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
299,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
541,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
188,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
196,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
608,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
681,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108
89,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-turbo-2024-04-09+cot,1544.180108


1116.515476680056
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
999.4048389512561


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models,example_link
117,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/647,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
177,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/250,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
229,1.0,993.535506,,993.535506,993.535506,993.535506,993.535506,993.535506,CRUXEval-input/770,[phind],"<a href=""https://crux-eval.github.io/demo.html..."
414,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/754,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
419,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/232,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
462,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/294,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
533,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/391,[gpt-4-0613+cot],"<a href=""https://crux-eval.github.io/demo.html..."
572,1.0,1000.0,,1000.0,1000.0,1000.0,1000.0,1000.0,CRUXEval-input/314,[gpt-3.5-turbo-0613],"<a href=""https://crux-eval.github.io/demo.html..."
700,1.0,965.540626,,965.540626,965.540626,965.540626,965.540626,965.540626,CRUXEval-input/119,[mixtral-8x7b],"<a href=""https://crux-eval.github.io/demo.html..."
747,1.0,1203.367688,,1203.367688,1203.367688,1203.367688,1203.367688,1203.367688,CRUXEval-input/474,[gpt-4-0613],"<a href=""https://crux-eval.github.io/demo.html..."




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_link,model,max
117,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613+cot,1540.236561
177,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613+cot,1540.236561
414,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613+cot,1540.236561
419,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613+cot,1540.236561
462,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613+cot,1540.236561
533,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613+cot,1540.236561
747,"<a href=""https://crux-eval.github.io/demo.html...",gpt-4-0613,1203.367688
572,"<a href=""https://crux-eval.github.io/demo.html...",gpt-3.5-turbo-0613,1000.0
229,"<a href=""https://crux-eval.github.io/demo.html...",phind,993.535506
700,"<a href=""https://crux-eval.github.io/demo.html...",mixtral-8x7b,965.540626


1066.5064869214837
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
1000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models,example_link
42,1.0,753.146161,,753.146161,753.146161,753.146161,753.146161,753.146161,HumanEval/122,[mistralai--Mistral-7B-Instruct-v0.2],"<a href=""https://crux-eval.github.io/eval-aren..."
88,1.0,1226.816246,,1226.816246,1226.816246,1226.816246,1226.816246,1226.816246,HumanEval/83,[meta-llama-3-70b-instruct],"<a href=""https://crux-eval.github.io/eval-aren..."
95,1.0,1177.236459,,1177.236459,1177.236459,1177.236459,1177.236459,1177.236459,HumanEval/93,[xwincoder-34b],"<a href=""https://crux-eval.github.io/eval-aren..."
96,1.0,903.425392,,903.425392,903.425392,903.425392,903.425392,903.425392,HumanEval/120,[open-hermes-2.5-code-290k-13b],"<a href=""https://crux-eval.github.io/eval-aren..."
136,1.0,1035.047031,,1035.047031,1035.047031,1035.047031,1035.047031,1035.047031,HumanEval/137,[Qwen--Qwen1.5-72B-Chat],"<a href=""https://crux-eval.github.io/eval-aren..."
142,1.0,1248.264043,,1248.264043,1248.264043,1248.264043,1248.264043,1248.264043,HumanEval/22,[opencodeinterpreter-ds-33b],"<a href=""https://crux-eval.github.io/eval-aren..."




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_link,model,max
142,"<a href=""https://crux-eval.github.io/eval-aren...",opencodeinterpreter-ds-33b,1248.264043
88,"<a href=""https://crux-eval.github.io/eval-aren...",meta-llama-3-70b-instruct,1226.816246
95,"<a href=""https://crux-eval.github.io/eval-aren...",xwincoder-34b,1177.236459
136,"<a href=""https://crux-eval.github.io/eval-aren...",Qwen--Qwen1.5-72B-Chat,1035.047031
96,"<a href=""https://crux-eval.github.io/eval-aren...",open-hermes-2.5-code-290k-13b,903.425392
42,"<a href=""https://crux-eval.github.io/eval-aren...",mistralai--Mistral-7B-Instruct-v0.2,753.146161


1154.4551336742863
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
1000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models,example_link
6,1.0,926.871039,,926.871039,926.871039,926.871039,926.871039,926.871039,Mbpp/759,[mixtral-8x7b-instruct],"<a href=""https://crux-eval.github.io/eval-aren..."
92,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/279,[gpt-4-1106-preview],"<a href=""https://crux-eval.github.io/eval-aren..."
120,1.0,982.765676,,982.765676,982.765676,982.765676,982.765676,982.765676,Mbpp/576,[microsoft--Phi-3-mini-4k-instruct],"<a href=""https://crux-eval.github.io/eval-aren..."
158,1.0,1138.260781,,1138.260781,1138.260781,1138.260781,1138.260781,1138.260781,Mbpp/74,[mixtral-8x22b-instruct-v0.1],"<a href=""https://crux-eval.github.io/eval-aren..."
188,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/622,[gpt-4-1106-preview],"<a href=""https://crux-eval.github.io/eval-aren..."
199,1.0,929.118121,,929.118121,929.118121,929.118121,929.118121,929.118121,Mbpp/124,[octocoder],"<a href=""https://crux-eval.github.io/eval-aren..."
222,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/780,[gpt-4-1106-preview],"<a href=""https://crux-eval.github.io/eval-aren..."
226,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/435,[gpt-4-1106-preview],"<a href=""https://crux-eval.github.io/eval-aren..."
236,1.0,1149.482313,,1149.482313,1149.482313,1149.482313,1149.482313,1149.482313,Mbpp/440,[bigcode--starcoder2-15b-instruct-v0.1],"<a href=""https://crux-eval.github.io/eval-aren..."
242,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/84,[gpt-4-1106-preview],"<a href=""https://crux-eval.github.io/eval-aren..."




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_link,model,max
92,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
188,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
222,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
226,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
242,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
286,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
317,"<a href=""https://crux-eval.github.io/eval-aren...",gpt-4-1106-preview,1285.135157
270,"<a href=""https://crux-eval.github.io/eval-aren...",opencodeinterpreter-ds-6.7b,1193.917275
236,"<a href=""https://crux-eval.github.io/eval-aren...",bigcode--starcoder2-15b-instruct-v0.1,1149.482313
158,"<a href=""https://crux-eval.github.io/eval-aren...",mixtral-8x22b-instruct-v0.1,1138.260781


1143.5920423754237
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
