In [None]:
import json, math, glob
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm


import plotly.express as px
import plotly.graph_objects as go

from arena import result_table, pass1_to_battle, compute_pairwise_win_fraction, compute_pvalues 

def load_jsonl_records(pattern):
    """Load every JSON-lines file matching ``pattern`` into one flat list.

    Args:
        pattern: glob pattern selecting the ``.jsonl`` files to read.

    Returns:
        A list with one decoded JSON value per non-blank line, files taken in
        sorted path order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    loaded = []
    # Sort so the record order does not depend on filesystem enumeration order.
    for fname in sorted(glob.glob(pattern)):
        with open(fname, 'rt', encoding='utf-8') as f:
            # Stream the file line by line and skip blank lines (e.g. a trailing
            # newline at the end of the file), which json.loads would reject.
            loaded.extend(json.loads(line) for line in f if line.strip())
    return loaded


records = load_jsonl_records("data/*.jsonl")

eval_results = pd.DataFrame(records)


In [None]:

def pfunc(input: pd.Series):
    """Summarise the outcomes of one model pair into sign-test statistics.

    Args:
        input: the ``winner`` labels of one (model_a, model_b) group. Only the
            labels ``'model_a'`` and ``'model_b'`` are counted as decisive;
            every other label is ignored here.

    Returns:
        dict with keys, in this order:
          * ``'diff'``: #model_a wins minus #model_b wins
          * ``'sum'``: number of decisive outcomes
          * ``'pvalue-binom'``: two-sided binomial p-value of the model_a win
            count against a fair coin
          * ``'wrong_chance'``: probability that a Binomial(sum, p_hat) draw
            gives at most ``sum // 2`` successes, where ``p_hat`` is the
            empirical win rate of the leading model
    """
    sufs = Counter(input.values)  # model_a, model_b, neither, both
    wins_a = sufs['model_a']
    wins_b = sufs['model_b']
    res = {}
    res['diff'] = wins_a - wins_b
    res['sum'] = wins_a + wins_b
    if res['sum'] == 0:
        # No decisive outcome at all: binomtest rejects n == 0 and the win rate
        # below would be 0/0. There is no evidence either way, so report the
        # degenerate values (P(X <= 0) for zero trials is 1).
        res['pvalue-binom'] = 1.0
        res['wrong_chance'] = 1.0
        return res
    res['pvalue-binom'] = stats.binomtest(wins_a, res['sum'], p=0.5).pvalue
    # Empirical win rate of whichever model is ahead (always >= 0.5).
    p = wins_a / res['sum']
    p = p if p > 0.5 else 1 - p
    res['wrong_chance'] = stats.binomtest(res['sum'] // 2, res['sum'], p=p, alternative='less').pvalue
    return res
#suf_stats

# for b in ['mbpp', 'humaneval', 'CRUXEval-input']:
for b in ['humaneval', 'humaneval+']:
    # Restrict the evaluation records to the current benchmark.
    results = eval_results[eval_results['benchmark_id'] == b]
    battles = pass1_to_battle(results)
    df = result_table(battles, results)
    # One row per (model_a, model_b) pair: pfunc returns a dict per pair, which
    # .apply(pd.Series) expands into one column per key.
    diffvsum = battles[['model_a', 'model_b', 'winner']].groupby(['model_a', 'model_b']).aggregate({'winner': pfunc})
    diffvsum = diffvsum['winner'].apply(pd.Series)
    diffvsum = diffvsum.reset_index(drop=False)
    display(diffvsum)

    # Scatter of |diff| against the number of decisive outcomes for every pair.
    figs = px.scatter(diffvsum, x=diffvsum['diff'].abs(), y='sum', custom_data=['model_a', 'model_b', 'sum', 'diff', 'pvalue-binom', 'wrong_chance'])
    figs.update_traces(hovertemplate=
        "<br>".join([
        "Model A: %{customdata[0]}",
        "Model B: %{customdata[1]}",
        # customdata[3] is the signed difference, so label it as such.
        "A - B: %{customdata[3]}",
        "A + B: %{customdata[2]}",
        "p-value: %{customdata[4]:.4f}",
        "wrong prob.: %{customdata[5]:.4f}",
        ])  + '<extra></extra>')

    dsz = len(set(results['example_id']))
    maxy = diffvsum['sum'].max()
    print('maxy', maxy)

    # Reference curves: for each alpha, the locus where (x - 1)**2 / y equals
    # the chi-square(1) quantile at level 1 - alpha.
    refs = []
    for alpha in [0.05, 0.1]:
        thres = stats.chi2.ppf(1-alpha, 1)
        print('thres', thres)
        y = np.linspace(1, maxy, 200)
        refs.append(pd.DataFrame({'x': 1 + np.sqrt(y * thres), 'y': y, 'type': f'pvalue={alpha}'}))
    # Diagonal x = y: the difference can never exceed the number of decisive outcomes.
    x = np.linspace(0, dsz / 2, 100)
    refs.append(pd.DataFrame({'x': x, 'y': x, 'type': 'x=y'}))
    df_ref = pd.concat(refs, axis=0)
    figl = px.line(df_ref, x='x', y='y', color='type', hover_data=[])

    # Overlay the reference lines and the per-pair scatter in one figure.
    fig = go.Figure(data=figl.data + figs.data)
    fig.update_layout(
        width=600, height=600, title=b,
        xaxis_title="|#A_win - #B_win|",
        yaxis_title="#A_win + #B_win"
    )
    # Nothing is auto-rendered inside a loop, so show the figure explicitly.
    display(fig)


In [None]:
def pfunc(input: pd.Series):
    """Summarise the outcomes of one model pair into accuracies and sign-test statistics.

    Args:
        input: the ``winner`` labels of one (model_a, model_b) group. The labels
            read here are ``'model_a'``, ``'model_b'`` and ``'both'``; any other
            label only contributes to the total count.

    Returns:
        dict with keys, in this order:
          * ``'diff'``: #model_a wins minus #model_b wins
          * ``'sum'``: number of decisive outcomes
          * ``'accA'`` / ``'accB'``: (own wins + ``'both'``) / total outcomes
          * ``'p_value'``: two-sided binomial p-value of the model_a win count
            against a fair coin, capped at 0.2
          * ``'wrong_chance'``: probability that a Binomial(sum, p_hat) draw
            gives at most ``sum // 2`` successes, where ``p_hat`` is the
            empirical win rate of the leading model
    """
    sufs = Counter(input.values)  # model_a, model_b, neither, both
    wins_a = sufs['model_a']
    wins_b = sufs['model_b']
    # Same value as Counter.total(), without requiring Python 3.10+.
    total = sum(sufs.values())
    res = {}
    res['diff'] = wins_a - wins_b
    res['sum'] = wins_a + wins_b
    res['accA'] = (wins_a + sufs['both']) / total
    res['accB'] = (wins_b + sufs['both']) / total
    if res['sum'] == 0:
        # No decisive outcome at all: binomtest rejects n == 0 and the win rate
        # below would be 0/0. The uncapped p-value would be 1, so report the cap;
        # P(X <= 0) for zero trials is 1.
        res['p_value'] = 0.2
        res['wrong_chance'] = 1.0
        return res
    pv = stats.binomtest(wins_a, res['sum'], p=0.5).pvalue
    # Cap the value at 0.2 (everything at or above the cap is reported as 0.2).
    res['p_value'] = pv if pv < 0.2 else 0.2
    # Empirical win rate of whichever model is ahead (always >= 0.5).
    p = wins_a / res['sum']
    p = p if p > 0.5 else 1 - p
    res['wrong_chance'] = stats.binomtest(res['sum'] // 2, res['sum'], p=p, alternative='less').pvalue
    return res
#suf_stats

# for b in ['mbpp', 'humaneval', 'CRUXEval-input']:
for b in ['humaneval', 'humaneval+']:
    # Keep only the rows that belong to the benchmark being plotted.
    results = eval_results[eval_results['benchmark_id'] == b]
    battles = pass1_to_battle(results)
    df = result_table(battles, results)

    # Aggregate the winner labels of every (model_a, model_b) pair with pfunc
    # and spread the returned dict over one column per key.
    diffvsum = (
        battles[['model_a', 'model_b', 'winner']]
        .groupby(['model_a', 'model_b'])
        .aggregate({'winner': pfunc})['winner']
        .apply(pd.Series)
        .reset_index(drop=False)
    )
    display(diffvsum)

    # Hover text; the indices refer to positions in custom_data below.
    hover_lines = [
        "Model A: %{customdata[0]}",
        "Model B: %{customdata[1]}",
        "acc(A): %{customdata[4]:.3f}",
        "acc(B): %{customdata[5]:.3f}",
        "p-value: %{customdata[2]:.4f}",
        "wrong prob.: %{customdata[3]:.4f}",
    ]
    figs = px.scatter(
        diffvsum,
        x='accA',
        y='accB',
        color='p_value',
        custom_data=['model_a', 'model_b', 'p_value', 'wrong_chance', 'accA', 'accB'],
    )
    figs.update_traces(hovertemplate="<br>".join(hover_lines) + '<extra></extra>')

    dsz = len(set(results['example_id']))
    maxy = diffvsum['sum'].max()
    print('maxy', maxy)

    figs.update_layout(
        width=600,
        height=600,
        title=b,
        xaxis_title="acc(Model A)",
        yaxis_title="acc(Model B)",
        legend_title='p_value',
    )
    display(figs)


In [None]:
from scipy.integrate import dblquad
from scipy.special import gamma

def beta_n(x, ax, bx):
    """Probability density of the Beta(ax, bx) distribution at ``x``.

    Delegates to ``scipy.stats.beta.pdf`` instead of forming
    ``gamma(ax + bx) / (gamma(ax) * gamma(bx))`` directly: the gamma function
    overflows double precision once its argument exceeds roughly 171, which
    turned the hand-written normalising constant into inf/inf = nan for
    moderately large shape parameters.

    Args:
        x: point(s) at which to evaluate the density.
        ax: first shape parameter (exponent ``ax - 1`` on ``x``).
        bx: second shape parameter (exponent ``bx - 1`` on ``1 - x``).

    Returns:
        The density value(s); 0 for ``x`` outside ``[0, 1]``.
    """
    return stats.beta.pdf(x, ax, bx)
def beta_coef(y, x, ax, bx, ay, by):
    """Product of two ``beta_n`` values: one at ``x`` with (ax, bx), one at ``y`` with (ay, by).

    ``y`` comes first in the signature, so the function can be passed directly
    as a ``func(y, x)`` integrand.
    """
    value_at_x = beta_n(x, ax, bx)
    value_at_y = beta_n(y, ay, by)
    return value_at_x * value_at_y
def beta(y, x):
    """``beta_coef`` with the shapes fixed to (10, 10) for ``x`` and (11, 9) for ``y``."""
    return beta_coef(y, x, ax=10, bx=10, ay=11, by=9)

# Integrate over the triangle 0 <= y <= x <= 1 (x runs over [0, 1], y from 0 up to x).
dblquad(beta, a=0, b=1, gfun=0, hfun=lambda x: x)


In [None]:

# This cell plots, but matplotlib is otherwise only imported by a later cell,
# so import it here to keep the cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

# Per-match probability (drawn uniformly from [0, 0.1)) that a match is decisive, i.e. not a tie.
ps = 0.1 * np.random.rand(10, 1)
scores = []
num_noties = []
for _ in range(10000):
    # Boolean mask of the matches that are decisive in this round.
    match_notie = np.random.rand(*ps.shape) < ps
    num_noties.append(match_notie.sum())

    # Each decisive match contributes a random sign; tied matches are dropped by the mask.
    signs = np.sign(np.random.randn(*ps.shape))[match_notie]
    s = signs.sum()
    scores.append(s)

# Distribution of the signed score, then (in a second figure) of the number of decisive matches.
plt.hist(scores, bins=50)
plt.figure()
plt.hist(num_noties, bins=20)


In [None]:
# Install py-irt with the %pip magic so the package lands in the environment of
# the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install py-irt


In [None]:
# Download the `minitest.jsonlines` fixture from the nd-ball/py-irt repository,
# pinned to a specific commit so the file contents stay fixed.
!wget https://raw.githubusercontent.com/nd-ball/py-irt/d2a27dd55a84459782a5514e752ee48d9a63626e/test_fixtures/minitest.jsonlines
# Print the downloaded file to inspect its format.
!cat minitest.jsonlines

# Run the py-irt CLI `train` command with model type `1pl` on the fixture, using
# learning rate 0.02 and 100 epochs. `test-1pl/` is presumably the output
# directory — confirm against the py-irt CLI documentation.
!py-irt train 1pl minitest.jsonlines test-1pl/ --lr 0.02 --epochs 100


In [None]:
import arena
import importlib
importlib.reload(arena)

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy.random as rng

# Per-item tie probabilities: the first 100 items lie in (0.95, 1], the last 100 are exactly 0.
tie_probs = np.concatenate((1 - 0.05 * np.random.rand(100), 0*np.random.rand(100)))
weights = rng.rand(tie_probs.size)
# print(tie_probs)

# Number of items; constant across rounds, so computed once outside the loop.
p = tie_probs.size

samps = []
for _ in range(1000):
    # response_a is 0 where the item ties (uniform draw <= tie_probs), otherwise a random sign.
    response_a = (rng.rand(p) > tie_probs) * np.sign(rng.randn(p))
    # response_b is an independent random sign per item. A previous assignment
    # `response_b = response_a * -1` was overwritten by this line on every
    # round and has been dropped as a dead store.
    response_b = np.sign(rng.randn(p))
    # NOTE(review): the return value is unpacked as (cdf, pvalue); the exact semantics
    # of arena.sign_test_niid are not visible here — confirm against the arena module.
    cdf, pvalue = arena.sign_test_niid(response_a, response_b, weights, tie_probs)
    samps.append(pvalue)

# Histogram of the p-values collected over all rounds.
plt.hist(samps)

In [None]:
# `cdf` is not defined in this cell; it must already exist in the kernel.
print(cdf)
ax = plt.subplot()
cdf.plot(ax)
# Evaluate the CDF object slightly below and slightly above zero.
for probe in (-0.1, 0.1):
    print(cdf.evaluate(probe))

In [None]:
def remove_uppercase(s):
    """Return ``s`` with every uppercase character removed.

    All characters for which ``str.isupper()`` is false are kept in their
    original order, e.g. ``'cAstyoUrFavoRitETVshoWs'`` -> ``'cstyoravoitshos'``.
    """
    return ''.join(ch for ch in s if not ch.isupper())


# Run the function on the example from its docstring.
print(remove_uppercase('cAstyoUrFavoRitETVshoWs'))
# An empty string is falsy, so this branch never prints.
if bool(''):
   print('true')

In [None]:
import arena
import importlib
importlib.reload(arena)
def trinomial(na, nb, n0):
    """Return the p-value from ``arena.sign_test_niid`` for na A-wins, nb B-wins and n0 ties.

    The outcomes are encoded as two 0/1 sequences of length ``na + nb + n0``:
    the first marks the A-wins, the second the B-wins, and ties are 0 in both.
    The plain two-sided binomial p-value of ``na`` out of ``na + nb`` is also
    printed for comparison.
    """
    total = na + nb + n0
    a_marks = [1] * na + [0] * nb + [0] * n0
    b_marks = [0] * na + [1] * nb + [0] * n0

    # NOTE(review): the result of this first call (no tie probabilities,
    # sample_all=False, first argument a plain list) is overwritten by the
    # second call below — confirm whether it is still needed.
    cdf, pvalue = arena.sign_test_niid(
        list(a_marks), np.array(b_marks),
        tie_probs=None, weights=None, sample_all=False,
    )
    # Second call: every item gets the same tie probability n0 / total.
    uniform_tie_probs = n0 / total * np.array([1] * total)
    cdf, pvalue = arena.sign_test_niid(
        np.array(a_marks), np.array(b_marks),
        tie_probs=uniform_tie_probs, weights=None, sample_all=True,
    )
    print('binom', stats.binomtest(na, na + nb, p=0.5).pvalue)
    return pvalue

# trinomial(20, 12, 133)

def bootstrap_consistency(battles: pd.Series, num_round=1000, interpolation='nearest'):
    """Fraction of bootstrap resamples whose win margin has a different sign than the observed one.

    Args:
        battles: outcome labels; ``'model_a'`` and ``'model_b'`` are counted,
            every other label is ignored.
        num_round: number of bootstrap resamples (each the size of ``battles``,
            drawn with replacement).
        interpolation: accepted but not used.

    Returns:
        ``1 - mean(sign(resampled margin) == sign(observed margin))``.
    """
    def _win_margin(outcomes):
        # #model_a wins minus #model_b wins among the given labels.
        tally = Counter(outcomes)
        return tally['model_a'] - tally['model_b']

    observed_sign = np.sign(_win_margin(battles))
    resampled_margins = [
        _win_margin(battles.sample(frac=1.0, replace=True))
        for _ in range(num_round)
    ]
    agrees = np.sign(resampled_margins) == observed_sign
    return 1 - np.mean(agrees)



# Smoke-test the bootstrap helper defined above on a small hand-made sample.
# (The previous call targeted `bootstrap_ci`, a name that is neither defined nor
# imported in this notebook.)
print(bootstrap_consistency(pd.Series(['model_a', 'model_b', 'model_a', 'both']*2)))

# Sanity check: the share of squared standard-normal draws above the
# chi-square(1) 90% quantile should be close to 0.1.
thres = stats.chi2.ppf(1-0.1, 1)
print(thres, np.mean(np.random.randn(100000)**2 > thres))

In [None]:
# Compare the two-sided p-value for 61 successes in 100 fair-coin trials with
# twice the one-sided ('greater') p-value.
two_sided = stats.binomtest(61, 100, p=0.5)
one_sided = stats.binomtest(61, 100, p=0.5, alternative='greater')
print(two_sided.pvalue)
print(one_sided.pvalue * 2)