In [None]:
# data formatting scripts
from collections import defaultdict
import json, math, glob
import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
from tqdm import tqdm

In [None]:
def process_swe(swetype, id_path):
    """Export per-example pass@1 records for every model of one SWE-bench split.

    Args:
        swetype: Name of the split; used both in the glob pattern
            ``raw-data/swebench-experiments/evaluation/{swetype}/*/results/results.json``
            and in the output name ``data/swebench-{swetype}.jsonl``.
        id_path: Path to a reference ``results.json``. The union of its
            ``generated`` and ``no_generation`` lists defines the example ids
            every model is scored on.

    Raises:
        AssertionError: if a model's ``generated`` + ``no_generation`` lists do
            not contain as many entries as there are reference ids.

    Side effects:
        Prints a per-model summary, displays the resulting frame and writes it
        as JSON lines to ``data/swebench-{swetype}.jsonl``.
    """
    with open(id_path) as id_file:
        reference = json.load(id_file)
    ids = set(reference['generated']) | set(reference['no_generation'])
    print(f'len of {swetype}', len(ids))

    # First pass: load and validate every model's results file exactly once.
    loaded = []
    print('model', 'total', 'deduped')
    for fname in glob.glob(f"raw-data/swebench-experiments/evaluation/{swetype}/*/results/results.json"):
        mname = fname.split('/')[-3]
        try:
            # Read the model's own file; reading id_path here would make the
            # completeness check below compare the reference against itself.
            with open(fname) as result_file:
                result = json.load(result_file)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print('not json', fname)
            continue
        total_list = result.get("generated", []) + result.get("no_generation", [])
        print(mname, len(total_list), len(set(total_list)))
        assert len(total_list) == len(ids), f"results for {fname} is incomplete"
        loaded.append((mname, result))

    # Sorted so the exported row order does not depend on set iteration order.
    ordered_ids = sorted(ids)

    # Second pass: one record per (model, example id).
    records = []
    for mname, result in loaded:
        print(mname)
        resolved = set(result.get('resolved', []))
        for example_id in ordered_ids:
            records.append({
                'benchmark_id': f'swebench-{swetype}',
                'model': mname,
                'example_id': example_id,
                'pass1': 1 if example_id in resolved else 0
            })
    dfo = pd.DataFrame(records)
    display(dfo)
    dfo.to_json(f'data/swebench-{swetype}.jsonl', orient='records', lines=True)

# Export each SWE-bench split in turn. For every split the 20231010_rag_claude2
# results file is passed as the reference list of example ids.
for swetype in ('lite', 'verified', 'test'):
    id_path = f'raw-data/swebench-experiments/evaluation/{swetype}/20231010_rag_claude2/results/results.json'
    process_swe(swetype, id_path)

In [None]:
def get_cruxeval(type):
    """Collect per-example CRUXEval scores for every model evaluated at temp0.2.

    Args:
        type: Task suffix used in the file names and ids (the notebook calls
            this with ``'input'`` and ``'output'``).

    Returns:
        A DataFrame with one row per (model, example) and the columns
        ``benchmark_id``, ``model``, ``example_id``, ``pass1`` and
        ``hyperparams``. ``pass1`` is the mean of the example's entries in
        ``raw_scored_generations``.

    Side effects:
        Prints one line per results file and displays ``df.describe()``.
    """
    records = []
    for fname in glob.glob(f"crux-eval.github.io/evaluation_results/*_temp0.2_{type}.json"):
        name = fname.split('/')[-1]
        # Split from the right: only the last two fields are fixed by the glob
        # pattern, while the model name itself may contain underscores.
        model, temp, _suffix = name.rsplit('_', 2)
        print(model, temp, type)

        with open(fname) as f:
            scored = json.load(f)['raw_scored_generations']
        for exid, scores in scored.items():
            actualid = exid.replace('sample_', '')
            records.append({
                'benchmark_id': f'CRUXEval-{type}',
                'model': model,
                'example_id': f"CRUXEval-{type}/{actualid}",
                'pass1': np.mean(scores),
                'hyperparams': 'temp0.2'
            })
    df = pd.DataFrame(records)
    display(df.describe())
    return df
        
# Build each frame BEFORE opening its output file: opening with 'w' truncates
# the file immediately, so a failure inside get_cruxeval would otherwise leave
# an empty export behind.
dfi = get_cruxeval('input')
with open('data/cruxeval_input.jsonl', 'w') as f:
    f.write(dfi.to_json(orient='records', lines=True))

dfo = get_cruxeval('output')
with open('data/cruxeval_output.jsonl', 'w') as f:
    f.write(dfo.to_json(orient='records', lines=True))

# Combined view of both tasks, for inspection only (not written to disk).
pass1 = pd.concat([dfi, dfo])
display(pass1)


In [None]:
# generate evalplus links

import sys
from jinja2 import Template

sys.path.append('/private/home/sida/git/CodeGen/scripts/sida/arena/data/evalplus')

from evalplus.data import get_mbpp_plus, get_human_eval_plus, write_jsonl
# Each problem returned by these loaders is a dict with the keys: ['task_id', 'prompt', 'entry_point', 'canonical_solution', 'test', 'contract', 'base_input', 'atol', 'plus_input']

# Page template for a single problem: the problem statement, its canonical
# solution, the base/plus inputs, and every model generation with its
# fail_base / fail_plus flags.
#
# autoescape=True is required because everything rendered here is source code
# or raw model output: unescaped '<', '>' and '&' would be interpreted as HTML
# and corrupt (or inject markup into) the generated page.
j2_template = Template("""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />   <!--It is necessary to use the UTF-8 encoding with plotly graphics to get e.g. negative signs to render correctly -->
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link
  rel="stylesheet"
  href="https://crux-eval.github.io/static/css/bulma.min.css"
>
</head>

<body>
<section class="section">
<h1> {{ id }} </h1>
<pre><code> {{ problem }} </code></pre>
<h2>solution</h2>
<pre><code> {{ canonical_solution }} </code></pre>
<h2>base input</h2>
<pre><code> {{ base_input }} </code></pre>
<h2>plus input</h2>
<pre><code> {{ plus_input }} </code></pre>
<br>
<h1>Generations</h1>
{% for key,value in outputs.iterrows() %}
      <p> {{ value['model'] }}. fail_base: {{ value['fail_base'] }}, fail_plus: {{ value['fail_plus'] }} </p>
      <pre><code>  {{ value['raw_genereation'] }}</code></pre>
{% endfor %}

</section>

</body>
</html>
""", autoescape=True)

def write_benchmark(benchmark_id):
    """Render one HTML page per problem of an EvalPlus benchmark.

    Args:
        benchmark_id: ``'humaneval+'`` selects ``get_human_eval_plus``; any
            other value selects ``get_mbpp_plus``. The same value is used to
            filter the evaluation records on their ``benchmark_id`` column.

    Side effects:
        Reads every ``data/noise_analysis/*/*.jsonl`` file, displays a summary
        of the loaded records, and writes
        ``crux-eval.github.io/eval-arena/evalplus/{example_id}.html`` for each
        example that has at least one matching evaluation record.
    """
    # Local import so this cell does not depend on editing the import cell.
    import os

    get_bench = get_human_eval_plus if benchmark_id == 'humaneval+' else get_mbpp_plus

    samples = [
        dict(example_id=task_id,
            problem=problem['prompt'],
            canonical_solution=problem['canonical_solution'],
            base_input=problem['base_input'],
            plus_input=problem['plus_input'],
        )
        for task_id, problem in get_bench().items()
    ]
    df_prob = pd.DataFrame(samples)

    records = []
    for fname in glob.glob("data/noise_analysis/*/*.jsonl"):
        with open(fname, 'rt') as f:
            records.extend(json.loads(line) for line in f)

    eval_results = pd.DataFrame(records)
    display(eval_results.describe())

    benchmark_results = eval_results[eval_results['benchmark_id'] == benchmark_id]

    # Inner join: only problems with at least one evaluation record get a page.
    df = df_prob.merge(benchmark_results, on='example_id')

    out_root = 'crux-eval.github.io/eval-arena/evalplus'
    # groupby visits each example's rows once instead of re-scanning the whole
    # frame with a boolean mask for every id.
    for example_id, current_prob in df.groupby('example_id', sort=False):
        first = current_prob.iloc[0]
        generation = current_prob[['model', 'raw_genereation', 'sanitized_generation', 'fail_base', 'fail_plus']]

        out_path = f'{out_root}/{example_id}.html'
        # An example id containing '/' (e.g. 'HumanEval/122') places the page in
        # a sub-directory, which must exist before the file can be opened.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # The template declares charset utf-8, so write the file as UTF-8
        # regardless of the platform's default encoding.
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(j2_template.render({'id': example_id, 'problem': first['problem'],
                                        'canonical_solution': first['canonical_solution'],
                                        'base_input': first['base_input'],
                                        'plus_input': first['plus_input'],
                                        'outputs': generation}))

# Generate the per-problem pages for both EvalPlus benchmarks.
for _benchmark in ('humaneval+', 'mbpp+'):
    write_benchmark(_benchmark)

In [None]:
def inspect(benchmark_id, example_id):
    """Print the stored fields of one EvalPlus problem, one value per line.

    Args:
        benchmark_id: ``'humaneval+'`` loads problems with
            ``get_human_eval_plus``; any other value uses ``get_mbpp_plus``.
        example_id: Task id to look up (for example ``'HumanEval/122'``).

    For every matching problem the values are printed in this order:
    example id, prompt, canonical solution, test, plus_input.
    Nothing is printed when no task id matches.
    """
    loader = get_human_eval_plus if benchmark_id == 'humaneval+' else get_mbpp_plus

    rows = []
    for task_id, problem in loader().items():
        rows.append({
            'example_id': task_id,
            'problem': problem['prompt'],
            'solution': problem['canonical_solution'],
            'test': problem['test'],
            'plus_input': problem['plus_input'],
        })
    table = pd.DataFrame(rows)

    matches = table[table['example_id'] == example_id]
    for row in matches.to_numpy():
        for value in row:
            print(value)

# Spot-check a single HumanEval+ problem.
inspect(benchmark_id='humaneval+', example_id='HumanEval/122')

In [None]:
# Convert data/lcb_arena.csv to the four-column JSON-lines format used by the
# other exports in this notebook.
dfi = pd.read_csv('data/lcb_arena.csv')
display(dfi)
# Select the columns BEFORE opening the output: open(..., 'w') truncates the
# file immediately, so a missing column would otherwise leave it empty.
dfi = dfi[['benchmark_id', 'example_id', 'model', 'pass1']]
with open('data/lcb_arena.jsonl', 'w') as f:
    # orient='records' never writes the index. Passing index=False here raises
    # ValueError on pandas < 2.1 and is a no-op on later versions, so it is omitted.
    f.write(dfi.to_json(orient='records', lines=True))


In [None]:
# Reduce the full DS-1000 dump to the four columns shared by every export and
# write it back out as JSON lines.
dfi = pd.read_json('raw-data/ds1000-full.jsonl', lines=True)
with open('data/ds1000.jsonl', 'w') as f:
    dfi = dfi.loc[:, ['benchmark_id', 'example_id', 'model', 'pass1']]
    f.write(
        dfi.to_json(lines=True, orient='records')
    )

In [None]:
# process evalplus ones
def evalplus(name, isplus):
    """Load EvalPlus evaluation records and derive a 0/1 ``pass1`` column.

    Args:
        name: Sub-directory of ``data/noise_analysis`` whose ``*.jsonl`` files
            are read; also the base of the reported ``benchmark_id``.
        isplus: When truthy, ``pass1`` is derived from ``fail_plus`` and the
            benchmark id is ``f'{name}+'``; otherwise ``fail_base`` and
            ``name`` are used.

    Returns:
        A DataFrame with the columns ``benchmark_id``, ``model``,
        ``example_id`` and ``pass1`` (0 where the fail column is truthy,
        1 otherwise).
    """
    rows = []
    for path in glob.glob(f"data/noise_analysis/{name}/*.jsonl"):
        with open(path, 'rt') as handle:
            for line in handle.readlines():
                rows.append(json.loads(line))
    frame = pd.DataFrame(rows)

    fail_column = 'fail_plus' if isplus else 'fail_base'
    frame['pass1'] = np.where(frame[fail_column], 0, 1)
    frame['benchmark_id'] = f'{name}+' if isplus else f'{name}'

    wanted = ['benchmark_id', 'model', 'example_id', 'pass1']
    return frame[wanted]

# Write the base and plus variants of both EvalPlus benchmarks.
# Each entry is (benchmark name, use plus tests?, output path).
for bench_name, use_plus, out_path in (
    ('humaneval', False, 'data/humaneval.jsonl'),
    ('humaneval', True, 'data/humaneval+.jsonl'),
    ('mbpp', False, 'data/mbpp.jsonl'),
    ('mbpp', True, 'data/mbpp+.jsonl'),
):
    # Build the frame before opening the output: open(..., 'w') truncates the
    # file immediately, so a failure in evalplus would otherwise leave it empty.
    dfi = evalplus(bench_name, use_plus)
    with open(out_path, 'w') as f:
        # orient='records' never writes the index. Passing index=False here
        # raises ValueError on pandas < 2.1 and is a no-op on later versions.
        f.write(dfi.to_json(orient='records', lines=True))