In [2]:
import json, math, glob
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm

import plotly.express as px
import plotly.graph_objects as go

import arena

# Load every result file under data/ into a single DataFrame (one JSON object per line).
# glob returns paths in arbitrary order, so sort them to make the row order of
# `eval_results` (and everything derived from it) identical on every run.
records = []
for fname in sorted(glob.glob("data/*.jsonl")):
    with open(fname, 'rt', encoding='utf-8') as f:
        # Skip blank lines (e.g. a stray trailing newline) instead of letting
        # json.loads fail on an empty string.
        records.extend(json.loads(line) for line in f if line.strip())
eval_results = pd.DataFrame(records)

def _int_or_nan(value):
    """Return ``int(value)``, or NaN when ``value`` is missing.

    ``Series.min()`` / ``Series.max()`` of an empty selection is NaN, and ``int(NaN)``
    raises ``ValueError``; this keeps the summary loop alive for benchmarks where
    no pair (or every pair) of models is significantly different.
    """
    return int(value) if pd.notna(value) else float('nan')


# Build one summary row per benchmark from the pairwise battle statistics.
# The `arena` helpers are defined outside this notebook; this cell only relies on
# `summary` exposing 'total', 'pvalue', 'diff' and 'sum' columns and on `ex`
# exposing 'acc' and 'tau' columns.
benchmarks = set(eval_results['benchmark_id'])
records = []
# Iterate in sorted order so the printed / displayed log is stable across runs
# (set iteration order is not).
for b in sorted(benchmarks):
    result = eval_results[eval_results['benchmark_id'] == b]
    battles = arena.pass1_to_battle(result)
    summary = arena.battle_summary(battles)
    agg_results = arena.result_table(battles, result)
    ex = arena.example_table(result, agg_results)

    data_sz = int(summary.iloc[0]['total'])
    num_tested = len(set(result['model']))  # only used by the optional columns commented out below
    abs_diff = summary['diff'].abs()
    # Smallest |diff| among pairs with p < 0.05, largest |diff| among pairs with p > 0.05.
    min_p5 = _int_or_nan(abs_diff[summary['pvalue'] < 0.05].min())
    max_p5 = _int_or_nan(abs_diff[summary['pvalue'] > 0.05].max())
    min_dist = int(summary['sum'].abs().min())
    print(f'{b}\t N={data_sz},\t diff_min/max={min_p5}/{max_p5}')
    r = {
        'benchmark_id': b,
        'size': data_sz,
        # 'models_tested': num_tested,
        'p5_min': min_p5,
        'p5_max': max_p5,
        'min_dist': min_dist,
        'no_solve': (ex['acc'] == 0).to_numpy().sum(),  # examples with acc exactly 0
        # '#solved_by_1': ((ex['acc'] > 0) & (ex['acc'] <= 1/num_tested+ 1e-10)).to_numpy().sum(),
        'neg_tau': (ex['tau'] < 0).to_numpy().sum(),  # examples with negative tau
    }
    display(r)
    records.append(r)

summary_counts = pd.DataFrame(records).sort_values(by='benchmark_id')
display(summary_counts)


In [158]:
# Rebuild the per-benchmark count table from `records`, ordered by benchmark id.
summary_count = pd.DataFrame(records)
summary_count = summary_count.sort_values(by='benchmark_id')
def links(b):
    """Build the HTML snippet for the "link to details" cell of benchmark ``b``.

    Returns a string holding two anchors separated by `` | ``: one pointing at
    ``model_{b}.html`` and one pointing at ``ex_{b}.html``.
    """
    anchors = (
        f'by <a href="model_{b}.html">models </a>',
        f'<a href="ex_{b}.html"> examples </a>',
    )
    return " | ".join(anchors)
# Attach the per-benchmark HTML links as an extra column.
summary_count['link to details'] = summary_count['benchmark_id'].map(links)

def normalize(counts, includes):
    """Convert raw count columns into fractions of the benchmark size.

    Args:
        counts: table (a DataFrame, or anything ``pd.DataFrame`` accepts) holding
            a ``'size'`` column and every column named in ``includes``.
        includes: names of the columns to divide by ``'size'``.

    Returns:
        A new DataFrame in which each column of ``includes`` has been divided
        element-wise by ``'size'``; all other columns are carried over unchanged.
        ``counts`` itself is never modified.
    """
    # pd.DataFrame(frame) does not copy its input by default, so take an explicit
    # copy before assigning columns; otherwise the writes below can leak into the
    # caller's table.
    percent = pd.DataFrame(counts).copy()
    # Capture the divisor once so every column is scaled by the original sizes.
    size = percent['size'].copy()
    for c in includes:
        percent[c] = percent[c] / size
    return percent

# Columns shown on the summary page, and the subset that is expressed as a
# fraction of the benchmark size in the percentage table.
includes_cols = ['benchmark_id', 'size', 'p5_min', 'p5_max', 'no_solve', 'neg_tau', 'link to details']
percent_cols = ['p5_min', 'p5_max', 'no_solve', 'neg_tau']
summary_percent = normalize(summary_count, percent_cols)
display(summary_percent)

from jinja2 import Template
template_path = "summary.html"
output_path = "crux-eval.github.io/eval-arena/index.html"

# Read the template and render the whole page BEFORE opening the output file:
# opening with "w" truncates it immediately, so a missing template or a rendering
# error would otherwise leave an empty index.html behind.
with open(template_path, encoding="utf-8") as template_file:
    j2_template = Template(template_file.read())

# Only the normalized columns are formatted as percentages.
percent_formatters = {col: '{:.1%}'.format for col in percent_cols}
rendered_page = j2_template.render({
    # escape=False keeps the <a> tags of 'link to details' as live HTML.
    'count_table': summary_count[includes_cols].to_html(escape=False, index=False),
    'percent_table': summary_percent[includes_cols].to_html(
        escape=False,
        index=False,
        formatters=percent_formatters,
    ),
})

with open(output_path, "w", encoding="utf-8") as output_file:
    output_file.write(rendered_page)


Unnamed: 0,benchmark_id,size,p5_min,p5_max,min_dist,no_solve,neg_tau,link to details
0,CRUXEval-input,800,0.03125,0.03875,79,0.02375,0.09375,"by <a href=""model_CRUXEval-input.html"">models ..."
1,CRUXEval-output,800,0.03125,0.0325,67,0.035,0.06125,"by <a href=""model_CRUXEval-output.html"">models..."
4,DS1000,1000,0.021,0.032,74,0.159,0.039,"by <a href=""model_DS1000.html"">models </a> | <..."
7,humaneval,164,0.060976,0.097561,20,0.036585,0.018293,"by <a href=""model_humaneval.html"">models </a> ..."
3,humaneval+,164,0.067073,0.097561,23,0.042683,0.018293,"by <a href=""model_humaneval+.html"">models </a>..."
2,lcb_codegen,400,0.0325,0.0475,27,0.2425,0.015,"by <a href=""model_lcb_codegen.html"">models </a..."
5,mbpp,378,0.037037,0.058201,38,0.02381,0.039683,"by <a href=""model_mbpp.html"">models </a> | <a ..."
6,mbpp+,378,0.042328,0.055556,40,0.095238,0.058201,"by <a href=""model_mbpp+.html"">models </a> | <a..."


In [None]:
# Distinct benchmark ids present in the loaded results.
benchmarks = set(eval_results['benchmark_id'].unique())
# Model pairs of interest per benchmark, as 2-tuples of model names.
# NOTE(review): each tuple appears to list the larger / newer model first - confirm.
_codegen_pairs = [
    ('claude-3-sonnet-20240229', 'claude-3-haiku-20240307'),
    ('claude-3-opus-20240229', 'claude-3-sonnet-20240229'),
    ('code-llama-multi-34b', 'code-llama-multi-13b'),
    ('wizardcoder-34b', 'wizardcoder-15b'),
]
_cruxeval_pairs = [
    ('deepseek-base-33b', 'deepseek-base-6.7b'),
    ('deepseek-instruct-33b', 'deepseek-instruct-6.7b'),
    ('codellama-34b', 'codellama-13b'),
    ('codellama-13b', 'codellama-7b')
]

pairs = {}
# Benchmarks in the same family share one list object (not copies).
for _bench in ('humaneval', 'humaneval+', 'mbpp', 'mbpp+'):
    pairs[_bench] = _codegen_pairs
for _bench in ('CRUXEval-input', 'CRUXEval-output'):
    pairs[_bench] = _cruxeval_pairs

def subsample(results, n=100):
    """Keep all rows belonging to ``n`` example ids drawn at random without replacement.

    Args:
        results: DataFrame with an ``'example_id'`` column.
        n: number of distinct example ids to keep.

    Returns:
        The rows of ``results`` whose ``'example_id'`` is one of the sampled ids.

    Raises:
        ValueError: raised by ``np.random.choice`` when ``n`` is larger than the
            number of distinct example ids.
    """
    # De-duplicate while keeping first-appearance order. Going through set() makes
    # the candidate order depend on string hashing, which changes between
    # interpreter sessions, so a fixed np.random seed would not reproduce the
    # same subsample.
    eids = list(dict.fromkeys(results['example_id']))
    include_ids = np.random.choice(eids, n, replace=False)
    return results[results['example_id'].isin(include_ids)]

def sample_table(results, n=164):
    """Aggregate results on a random subsample of ``n`` examples.

    Args:
        results: per-(model, example) results of one benchmark; must carry an
            ``'example_id'`` column (used by ``subsample``).
        n: number of distinct examples to keep; defaults to 164, the value that
            was previously hard-coded here.

    Returns:
        Whatever ``arena.result_table`` returns for the subsampled battles.
    """
    results = subsample(results, n)
    # pass1_to_battle lives in the `arena` module; the bare name is never imported
    # in this notebook and raised NameError.
    battles = arena.pass1_to_battle(results)
    # battles= battles[battles["winner"].str.contains("model_")]
    result_tbl = arena.result_table(battles, results)
    return result_tbl

# Display the aggregated table for one random subsample of each listed benchmark.
# (The opening quote of 'humaneval+' was missing, which made the cell a SyntaxError.)
for b in ['humaneval+']:
    results = eval_results[eval_results['benchmark_id'] == b]
    result1 = sample_table(results)
    display(result1)
    # result2 = sample_table(results)
    # result_tblo = result1.merge(result2, on='model', suffixes=['_1', '_2'])
    # display(result_tblo)


In [130]:
def varf(pass1):
    """Summarise the spread of one model's per-example pass rates.

    Args:
        pass1: pandas Series of per-example pass rates for a single model.

    Returns:
        dict with two entries:
          * ``'true_diff'``: ``2 * sum(p * (1 - p))`` over the examples;
          * ``'acc_std'``: ``2 * N * mean(p) * (1 - mean(p))`` with ``N`` the
            number of examples.
        NOTE(review): despite the key name, no square root is taken for ``'acc_std'``.
    """
    scores = pass1.values
    count = len(scores)
    mean_score = np.mean(scores)
    per_example = scores * (1 - scores)

    out = {}
    out['true_diff'] = 2 * np.sum(per_example)
    out['acc_std'] = 2 * count * mean_score * (1 - mean_score)
    return out

# For each listed benchmark: show the per-model variance summary and the
# per-example table, then print sqrt(sum(p * (1 - p))) over the example accuracies.
for b in ['mbpp']:
    result = eval_results[eval_results['benchmark_id'] == b]
    battles = arena.pass1_to_battle(result)
    agg_results = arena.result_table(battles, result)

    # varf returns a dict per model; expand those dicts into columns.
    per_model = result[['model', 'pass1']].groupby('model').aggregate(varf)
    summary = per_model['pass1'].apply(pd.Series)

    ex = arena.example_table(result, agg_results)
    display(summary)
    display(ex)

    p = ex['acc'].values
    print('example std', np.sqrt(np.sum(p*(1-p))))


1000.0


Unnamed: 0_level_0,true_diff,acc_std
model,Unnamed: 1_level_1,Unnamed: 2_level_1
CohereForAI--c4ai-command-r-plus,0.0,144.216931
HuggingFaceH4--starchat2-15b-v0.1,0.0,142.248677
Qwen--Qwen1.5-72B-Chat,0.0,150.772487
bigcode--starcoder2-15b-instruct-v0.1,0.0,129.550265
claude-3-haiku-20240307,0.0,120.238095
claude-3-opus-20240229,0.0,71.534392
claude-3-sonnet-20240229,0.0,103.661376
code-llama-13b,0.0,175.238095
code-llama-34b,0.0,160.804233
code-llama-multi-13b,0.0,176.296296


Unnamed: 0,example_id,models,acc,tau
0,Mbpp/579,"[code-llama-multi-7b, databricks--dbrx-instruc...",0.932203,0.146907
1,Mbpp/127,"[databricks--dbrx-instruct, CohereForAI--c4ai-...",0.932203,0.267697
2,Mbpp/792,"[code-llama-multi-7b, CohereForAI--c4ai-comman...",0.949153,0.168112
3,Mbpp/269,"[code-llama-multi-7b, databricks--dbrx-instruc...",0.966102,0.208615
4,Mbpp/554,"[CohereForAI--c4ai-command-r-plus, claude-3-ha...",0.542373,0.014826
...,...,...,...,...
373,Mbpp/594,"[databricks--dbrx-instruct, CohereForAI--c4ai-...",0.627119,0.426841
374,Mbpp/80,"[databricks--dbrx-instruct, CohereForAI--c4ai-...",0.677966,0.500586
375,Mbpp/725,"[CohereForAI--c4ai-command-r-plus, claude-3-ha...",0.457627,0.403599
376,Mbpp/119,"[CohereForAI--c4ai-command-r-plus, claude-3-ha...",0.406780,0.305740


example std 7.117312224079408


In [None]:
from scipy.integrate import dblquad
from scipy.special import gamma

def beta_n(x, ax, bx):
    """Evaluate ``Gamma(ax+bx) / (Gamma(ax) * Gamma(bx)) * x**(ax-1) * (1-x)**(bx-1)``.

    This is the density of a Beta(ax, bx) distribution at ``x``.
    """
    normalizer = gamma(ax + bx) / gamma(ax) / gamma(bx)
    return normalizer * x**(ax-1) * (1-x)**(bx-1)
def beta_coef(y, x, ax, bx, ay, by):
    """Product of ``beta_n(x, ax, bx)`` and ``beta_n(y, ay, by)``.

    Note the argument order: ``y`` comes first, then ``x``.
    """
    density_x = beta_n(x, ax, bx)
    density_y = beta_n(y, ay, by)
    return density_x * density_y
def beta(y, x):
    """Integrand for ``dblquad``: ``beta_coef`` with x ~ (10, 10) and y ~ (11, 9) parameters."""
    return beta_coef(y, x, ax=10, bx=10, ay=11, by=9)

# Integrate over x in [0, 1] and y in [0, x], i.e. the mass of the region y < x.
# dblquad calls the integrand as func(y, x), which is why `beta` takes y first.
dblquad(beta, 0, 1, gfun=0, hfun=lambda x: x)


In [None]:
!pip install py-irt


In [None]:
# Download the `minitest.jsonlines` fixture from the py-irt repository (pinned to a
# specific commit) into the current directory and print its contents.
!wget https://raw.githubusercontent.com/nd-ball/py-irt/d2a27dd55a84459782a5514e752ee48d9a63626e/test_fixtures/minitest.jsonlines
!cat minitest.jsonlines

# Train with the py-irt CLI on that fixture, writing to test-1pl/
# (learning rate 0.02, 100 epochs).
# NOTE(review): `1pl` is presumably the one-parameter logistic model - confirm in the py-irt docs.
!py-irt train 1pl minitest.jsonlines test-1pl/ --lr 0.02 --epochs 100


In [None]:
import arena
import importlib
importlib.reload(arena)

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy.random as rng

# Simulate the distribution of p-values returned by arena.sign_test_niid when both
# responses are drawn at random: 100 items with tie probability in (0.95, 1] and
# 100 items with tie probability 0.
tie_probs = np.concatenate((1 - 0.05 * np.random.rand(100), 0*np.random.rand(100)))
weights = rng.rand(tie_probs.size)
# print(tie_probs)

p = tie_probs.size  # number of items; loop-invariant, so computed once
samps = []
for _ in range(1000):
    # response_a is 0 with probability tie_probs, otherwise a random sign.
    response_a = (rng.rand(p) > tie_probs) * np.sign(rng.randn(p))
    # response_b is an independent random sign per item. An earlier
    # `response_b = response_a * -1` was overwritten before use and is dropped.
    response_b = np.sign(rng.randn(p))
    # NOTE(review): weights / tie_probs are passed positionally here but by keyword
    # elsewhere in this notebook - confirm the positional order matches the signature.
    cdf, pvalue = arena.sign_test_niid(response_a, response_b, weights, tie_probs)
    samps.append(pvalue)

plt.hist(samps)

In [None]:
# Inspect the `cdf` object left over from the last simulation round: print it,
# plot it on fresh axes, and evaluate it on either side of zero.
print(cdf)
ax = plt.subplot()
cdf.plot(ax)
for point in (-0.1, 0.1):
    print(cdf.evaluate(point))

In [None]:
import arena
import importlib
importlib.reload(arena)
def trinomial(na, nb, n0):
    """Run ``arena.sign_test_niid`` on outcomes built from win/loss/tie counts.

    Args:
        na: number of items where only the first response scores 1.
        nb: number of items where only the second response scores 1.
        n0: number of items where both responses score 0.

    Returns:
        The p-value from the second ``sign_test_niid`` call (``sample_all=True``,
        constant tie probability ``n0 / n``). As a side effect, prints the p-value
        of the exact binomial test of ``na`` successes in ``na + nb`` trials at
        ``p=0.5`` for comparison.
    """
    n = na + nb + n0

    def a_flags():
        # Fresh 0/1 list for the first response on every call.
        return [1]*na + [0]*nb + [0]*n0

    def b_flags():
        # Fresh 0/1 array for the second response on every call.
        return np.array([0]*na + [1]*nb + [0]*n0)

    # NOTE(review): this first call receives a plain list as its first argument
    # and its result is overwritten by the call below; kept because any side
    # effects of sign_test_niid are not visible from this notebook.
    cdf, pvalue = arena.sign_test_niid(
        a_flags(), b_flags(), tie_probs=None, weights=None, sample_all=False)

    uniform_tie_probs = n0 / n * np.array([1] * n)
    cdf, pvalue = arena.sign_test_niid(
        np.array(a_flags()), b_flags(),
        tie_probs=uniform_tie_probs, weights=None, sample_all=True)

    print('binom', stats.binomtest(na, na + nb, p=0.5).pvalue)
    return pvalue

# trinomial(20, 12, 133)

def bootstrap_consistency(battles: pd.Series, num_round=1000, interpolation='nearest'):
    """Fraction of bootstrap resamples whose winner sign disagrees with the observed one.

    Args:
        battles: Series of outcome labels; only ``'model_a'`` and ``'model_b'``
            entries contribute to the difference, other labels count as zero.
        num_round: number of bootstrap resamples (same size as ``battles``,
            drawn with replacement).
        interpolation: accepted but not used by this function.

    Returns:
        ``1 - mean(sign(resampled diff) == sign(observed diff))``, where diff is
        ``#model_a - #model_b``.
    """
    observed = Counter(battles)
    sign = np.sign(observed['model_a'] - observed['model_b'])

    def resampled_diff():
        # One bootstrap replicate of the model_a minus model_b win count.
        tally = Counter(battles.sample(frac=1.0, replace=True))
        return tally['model_a'] - tally['model_b']

    diffs = [resampled_diff() for _ in range(num_round)]
    return 1 - np.mean(np.sign(diffs) == sign)



# Smoke-test the bootstrap helper defined in this cell. The original call used
# `bootstrap_ci`, a name that is not defined anywhere in the visible notebook.
print(bootstrap_consistency(pd.Series(['model_a', 'model_b', 'model_a', 'both']*2)))

# Sanity check of the chi-square(1) 90% quantile: the share of squared standard
# normal draws above the threshold should come out close to 0.1.
thres = stats.chi2.ppf(1-0.1, 1)
print(thres, np.mean(np.random.randn(100000)**2 > thres))

In [None]:
def fig_diff_vs_sum(battles):
    data_sz = len(set(battles['example_id']))
    bmname = set(battles['benchmark_id_a']).pop()

    print(data_sz)
    def aggfunc(input: pd.Series):
        sufs = Counter(input.values) # model_a, model_b, neither, both
        res = {} 
        res['diff'] = sufs['model_a'] - sufs['model_b']
        res['sum'] = sufs['model_a'] + sufs['model_b'] 
        # res['pvalue-chi2'] = 1 if res['diff'] == 0 else (1 - stats.chi2.cdf( (np.abs(res['diff']) - 1)**2 / res['sum'], 1))
        res['pvalue'] = stats.binomtest(sufs['model_a'], res['sum'], p=0.5).pvalue
        total = sufs.total()
        pa = sufs['model_a'] / total
        pb = sufs['model_b'] / total
        res['std'] = np.sqrt(total * (pa*(1-pa) + pb*(1-pb) + 2*pa*pb))
        return res

    diffvsum = battles[['model_a', 'model_b', 'winner']]\
        .groupby(['model_a', 'model_b'])\
        .aggregate(aggfunc)\
        ['winner'].apply(pd.Series)\
        .reset_index(drop=False)