In [None]:
from collections import defaultdict
import json, math, glob
import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
from tqdm import tqdm
import math
import os
from report_agg import result_table, pass1_to_battle


In [83]:
def gen_example_table(result, all_stats):
    records = []
    ids = set(result['example_id']) 
    len_data = len(set(result['example_id']))
    print(np.mean(all_stats['elo']))
    
    for current_id in list(ids):
        example_data = result[result['example_id'] == current_id][['model', 'pass1']]
        ex = example_data.merge(all_stats[['model', 'elo']], left_on = 'model', right_on = 'model')
        # fit_data['result'] = fit_data['result']
        from sklearn.linear_model import LogisticRegression
        lr = LogisticRegression()
        ex['correct'] = np.where(ex['pass1'] > 0, 1, 0)
        model_elos = ex[ex['correct'] == 1]['elo']
        # print(model_elos.describe())
        r = model_elos.describe().to_dict()
        r['example_id'] = current_id
        r['models'] = ex[ex['correct'] == 1]['model'].to_numpy()
        records.append(r)

    return pd.DataFrame(records)


In [96]:



def get_example_level_results(benchmark_id):
    result = eval_results[eval_results['benchmark_id'] == benchmark_id]

    battles = pass1_to_battle(result)
    battles_no_ties = battles[battles["winner"].str.contains("model_")]
    all_stats = result_table(battles_no_ties, result)
    example_table = gen_example_table(result, all_stats)

    outputs = {}
    outputs['result table'] = all_stats.sort_values(by='elo').to_html(float_format='%10.3f')
    outputs['fig_min_elo_solve'] = px.histogram(example_table, x='min', marginal='rug', title='min ELO to solve').to_html(full_html=False)

    no_solve = example_table[example_table['count'] == 0]
    outputs['list_no_solve'] = no_solve['example_id'].to_numpy()
    one_solve = example_table[example_table['count'] == 1]
    display(one_solve)
    one_solve['model'] = one_solve['models'].apply(lambda x: x[0])
    one_solve = one_solve.sort_values(by='max', ascending=False)
    one_solve = one_solve[['example_id', 'model', 'max']]
    display(one_solve)

    outputs['table_one_solve'] = one_solve.to_html()

    elo75 = all_stats['elo'].quantile(0.75)
    print(elo75)
    list_suspect = example_table[example_table['max'] < elo75]
    outputs['table_suspect'] = list_suspect[['example_id', 'models', 'max']].to_html()

    print(outputs.keys())
    return outputs

records = []
for fname in glob.glob(f"data/*.jsonl"):
    with open(fname, 'rt') as f:
        records.extend([json.loads(l) for l in f.readlines()])

eval_results = pd.DataFrame(records)
display(eval_results.describe())

def gen_report(benchmark_id: str):
    outputs = get_example_level_results(benchmark_id)
    from jinja2 import Template
    template_path = r"examplelevel_template.html"
    output_path = rf"crux-eval.github.io/reports/ex_{benchmark_id}.html"
    with open(output_path, "w", encoding="utf-8") as output_file:
        with open(template_path) as template_file:
            j2_template = Template(template_file.read())
            output_file.write(j2_template.render({'benchmark_id': benchmark_id, 'outputs': outputs}))

for b in ['CRUXEval-output', 'CRUXEval-input', 'humaneval+', 'mbpp+']:
    gen_report(b)
    # outputs['fig_unique_solves'] = px.histogram(one_solve, x='model').update_xaxes(categoryorder='total descending')

# outputs['fig_max_elo_solve'] = px.histogram(example_table, x='max', marginal='rug', title='max ELO to solve')
# outputs['fig_median_elo_solve'] = px.histogram(example_table, x='50%', marginal='rug', title='max ELO to solve')

Unnamed: 0,pass1
count,73280.0
mean,0.470794
std,0.47413
min,0.0
25%,0.0
50%,0.3
75%,1.0
max,1.0


997.8160997963109


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models
11,1.0,1404.875733,,1404.875733,1404.875733,1404.875733,1404.875733,1404.875733,CRUXEval-output/391,[claude-3-opus-20240229+cot]
18,1.0,1519.130146,,1519.130146,1519.130146,1519.130146,1519.130146,1519.130146,CRUXEval-output/631,[gpt-4-0613+cot]
23,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/220,[gpt-4-turbo-2024-04-09+cot]
49,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/599,[gpt-4-turbo-2024-04-09+cot]
77,1.0,1070.950698,,1070.950698,1070.950698,1070.950698,1070.950698,1070.950698,CRUXEval-output/571,[codellama-7b+cot]
92,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/484,[gpt-4-turbo-2024-04-09+cot]
97,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/698,[gpt-4-turbo-2024-04-09+cot]
104,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/169,[gpt-4-turbo-2024-04-09+cot]
109,1.0,1544.180108,,1544.180108,1544.180108,1544.180108,1544.180108,1544.180108,CRUXEval-output/444,[gpt-4-turbo-2024-04-09+cot]
210,1.0,1235.862699,,1235.862699,1235.862699,1235.862699,1235.862699,1235.862699,CRUXEval-output/438,[gpt-4-0613]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_id,model,max
454,CRUXEval-output/591,gpt-4-turbo-2024-04-09+cot,1544.180108
97,CRUXEval-output/698,gpt-4-turbo-2024-04-09+cot,1544.180108
572,CRUXEval-output/458,gpt-4-turbo-2024-04-09+cot,1544.180108
303,CRUXEval-output/35,gpt-4-turbo-2024-04-09+cot,1544.180108
511,CRUXEval-output/126,gpt-4-turbo-2024-04-09+cot,1544.180108
104,CRUXEval-output/169,gpt-4-turbo-2024-04-09+cot,1544.180108
109,CRUXEval-output/444,gpt-4-turbo-2024-04-09+cot,1544.180108
92,CRUXEval-output/484,gpt-4-turbo-2024-04-09+cot,1544.180108
49,CRUXEval-output/599,gpt-4-turbo-2024-04-09+cot,1544.180108
23,CRUXEval-output/220,gpt-4-turbo-2024-04-09+cot,1544.180108


1116.515476680056
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
999.4048389512561


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models
97,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/391,[gpt-4-0613+cot]
165,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/647,[gpt-4-0613+cot]
189,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/250,[gpt-4-0613+cot]
242,1.0,1000.0,,1000.0,1000.0,1000.0,1000.0,1000.0,CRUXEval-input/314,[gpt-3.5-turbo-0613]
424,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/754,[gpt-4-0613+cot]
463,1.0,965.540626,,965.540626,965.540626,965.540626,965.540626,965.540626,CRUXEval-input/119,[mixtral-8x7b]
466,1.0,993.535506,,993.535506,993.535506,993.535506,993.535506,993.535506,CRUXEval-input/770,[phind]
550,1.0,1203.367688,,1203.367688,1203.367688,1203.367688,1203.367688,1203.367688,CRUXEval-input/474,[gpt-4-0613]
552,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/294,[gpt-4-0613+cot]
799,1.0,1540.236561,,1540.236561,1540.236561,1540.236561,1540.236561,1540.236561,CRUXEval-input/232,[gpt-4-0613+cot]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_id,model,max
97,CRUXEval-input/391,gpt-4-0613+cot,1540.236561
165,CRUXEval-input/647,gpt-4-0613+cot,1540.236561
189,CRUXEval-input/250,gpt-4-0613+cot,1540.236561
424,CRUXEval-input/754,gpt-4-0613+cot,1540.236561
552,CRUXEval-input/294,gpt-4-0613+cot,1540.236561
799,CRUXEval-input/232,gpt-4-0613+cot,1540.236561
550,CRUXEval-input/474,gpt-4-0613,1203.367688
242,CRUXEval-input/314,gpt-3.5-turbo-0613,1000.0
466,CRUXEval-input/770,phind,993.535506
463,CRUXEval-input/119,mixtral-8x7b,965.540626


1066.5064869214837
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
1000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models
21,1.0,1226.816246,,1226.816246,1226.816246,1226.816246,1226.816246,1226.816246,HumanEval/83,[meta-llama-3-70b-instruct]
51,1.0,1035.047031,,1035.047031,1035.047031,1035.047031,1035.047031,1035.047031,HumanEval/137,[Qwen--Qwen1.5-72B-Chat]
53,1.0,1177.236459,,1177.236459,1177.236459,1177.236459,1177.236459,1177.236459,HumanEval/93,[xwincoder-34b]
70,1.0,1248.264043,,1248.264043,1248.264043,1248.264043,1248.264043,1248.264043,HumanEval/22,[opencodeinterpreter-ds-33b]
110,1.0,903.425392,,903.425392,903.425392,903.425392,903.425392,903.425392,HumanEval/120,[open-hermes-2.5-code-290k-13b]
142,1.0,753.146161,,753.146161,753.146161,753.146161,753.146161,753.146161,HumanEval/122,[mistralai--Mistral-7B-Instruct-v0.2]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_id,model,max
70,HumanEval/22,opencodeinterpreter-ds-33b,1248.264043
21,HumanEval/83,meta-llama-3-70b-instruct,1226.816246
53,HumanEval/93,xwincoder-34b,1177.236459
51,HumanEval/137,Qwen--Qwen1.5-72B-Chat,1035.047031
110,HumanEval/120,open-hermes-2.5-code-290k-13b,903.425392
142,HumanEval/122,mistralai--Mistral-7B-Instruct-v0.2,753.146161


1154.4551336742863
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])
1000.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,example_id,models
9,1.0,1149.482313,,1149.482313,1149.482313,1149.482313,1149.482313,1149.482313,Mbpp/440,[bigcode--starcoder2-15b-instruct-v0.1]
11,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/435,[gpt-4-1106-preview]
100,1.0,982.765676,,982.765676,982.765676,982.765676,982.765676,982.765676,Mbpp/576,[microsoft--Phi-3-mini-4k-instruct]
128,1.0,1138.260781,,1138.260781,1138.260781,1138.260781,1138.260781,1138.260781,Mbpp/74,[mixtral-8x22b-instruct-v0.1]
145,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/279,[gpt-4-1106-preview]
154,1.0,929.118121,,929.118121,929.118121,929.118121,929.118121,929.118121,Mbpp/124,[octocoder]
205,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/781,[gpt-4-1106-preview]
209,1.0,1056.786461,,1056.786461,1056.786461,1056.786461,1056.786461,1056.786461,Mbpp/125,[deepseek-coder-6.7b-base]
236,1.0,1285.135157,,1285.135157,1285.135157,1285.135157,1285.135157,1285.135157,Mbpp/622,[gpt-4-1106-preview]
245,1.0,966.646025,,966.646025,966.646025,966.646025,966.646025,966.646025,Mbpp/593,[codegemma-7b]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,example_id,model,max
11,Mbpp/435,gpt-4-1106-preview,1285.135157
145,Mbpp/279,gpt-4-1106-preview,1285.135157
205,Mbpp/781,gpt-4-1106-preview,1285.135157
236,Mbpp/622,gpt-4-1106-preview,1285.135157
324,Mbpp/72,gpt-4-1106-preview,1285.135157
331,Mbpp/84,gpt-4-1106-preview,1285.135157
365,Mbpp/780,gpt-4-1106-preview,1285.135157
330,Mbpp/294,opencodeinterpreter-ds-6.7b,1193.917275
9,Mbpp/440,bigcode--starcoder2-15b-instruct-v0.1,1149.482313
128,Mbpp/74,mixtral-8x22b-instruct-v0.1,1138.260781


1143.5920423754237
dict_keys(['result table', 'fig_min_elo_solve', 'list_no_solve', 'table_one_solve', 'table_suspect'])


In [None]:
import sys
sys.path.append('/private/home/sida/git/CodeGen/scripts/sida/arena/data/evalplus')

from evalplus.data import get_mbpp_plus, write_jsonl

samples = [
    dict(task_id=task_id, problem=problem['prompt'])
    for task_id, problem in get_mbpp_plus().items()
]
print(samples[77])
df = pd.DataFrame(samples)
# weird ones 77, 233
print(df[df['task_id'] == 'Mbpp/233']['problem'].iloc[0])