In [None]:
from collections import defaultdict
import json, math, glob
import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
from tqdm import tqdm
import math
import os
from report_agg import result_table, pass1_to_battle

In [None]:
import plotly.graph_objects as go

def get_example_ratings(benchmark_id):
    """Estimate a difficulty rating for every example of one benchmark.

    Model ratings are taken from ``result_table`` (its output must expose the
    ``model`` and ``elo`` columns). For each example a one-feature logistic
    regression is fitted that predicts "model solved the example" (``pass1 > 0``)
    from the model's Elo.

    Side effects: displays the model rating table and prints the mean model Elo.

    Args:
        benchmark_id: value of ``benchmark_id`` used to filter the global
            ``eval_results`` frame.

    Returns:
        A DataFrame with one row per example (in order of first appearance) and
        the columns:

        * ``sample_id`` - the example id.
        * ``elo`` - the Elo at which the fitted success probability is 0.5
          (``-intercept / coef``). Examples that every model solved get the
          sentinel -3500, examples that no model solved get 3500.
        * ``elo_var`` - ``1 / coef`` of the fit (0 for the sentinel rows).
        * ``score`` - training accuracy of the fit (1 for the sentinel rows).
        * ``acc`` - fraction of models that solved the example.
    """
    # Imported once per call instead of once per example; kept local so that
    # scikit-learn is only required when this function is actually used.
    from sklearn.linear_model import LogisticRegression

    result = eval_results[eval_results['benchmark_id'] == benchmark_id]

    battles = pass1_to_battle(result)
    battles_no_ties = battles[battles["winner"].str.contains("model_")]
    all_stats = result_table(battles_no_ties, result)
    display(all_stats)
    print(np.mean(all_stats['elo']))

    model_elo = all_stats[['model', 'elo']]
    records = []
    # A single groupby pass replaces one full-frame scan per example and gives a
    # reproducible row order (iterating a set of string ids does not).
    for current_id, example_rows in result.groupby('example_id', sort=False):
        fit_data = example_rows[['model', 'pass1']].merge(model_elo, on='model')
        correct = np.where(fit_data['pass1'] > 0, 1, 0)

        all_pass = bool(correct.all())
        if all_pass or not correct.any():
            # Logistic regression needs both classes; fall back to sentinel values.
            mean = 1 if all_pass else 0
            records.append({
                'sample_id': current_id,
                'elo': -3500 if mean > 0 else 3500,
                'elo_var': 0,
                'score': 1,
                'acc': mean,
            })
            continue

        Xd = fit_data['elo'].to_numpy().reshape(-1, 1)
        lrm = LogisticRegression().fit(Xd, correct)
        slope = lrm.coef_[0][0]
        records.append({
            'sample_id': current_id,
            # sigmoid(slope * x + intercept) == 0.5  <=>  x == -intercept / slope
            'elo': -lrm.intercept_[0] / slope,
            'elo_var': 1 / slope,
            'score': lrm.score(Xd, correct),
            'acc': correct.mean(),
        })
    return pd.DataFrame(records)

# Collect every JSON line found under data/ into one flat list of dicts.
records = []
for fname in glob.glob(f"data/*.jsonl"):
    with open(fname, 'rt') as f:
        records += list(map(json.loads, f.readlines()))

# One row per record; get_example_ratings reads this frame as a global.
eval_results = pd.DataFrame(records)
for b in ['humaneval+']:
    df = get_example_ratings(b)
    display(df)

from report_example import gen_example_table

def get_example_level_results(benchmark_id):
    """Build the example-level table for a single benchmark.

    Filters the global ``eval_results`` to ``benchmark_id``, converts the rows to
    pairwise battles, drops every battle whose winner does not contain
    ``"model_"``, and hands the resulting model statistics to
    ``gen_example_table``.

    Args:
        benchmark_id: value of the ``benchmark_id`` column to select.

    Returns:
        Whatever ``gen_example_table`` returns for the filtered results.
    """
    is_benchmark = eval_results['benchmark_id'] == benchmark_id
    result = eval_results[is_benchmark]

    battles = pass1_to_battle(result)
    has_winner = battles["winner"].str.contains("model_")
    all_stats = result_table(battles[has_winner], result)

    return gen_example_table(result, all_stats)

# Example-level table for the CRUXEval-input benchmark; `table` is read again by
# the histogram cell that follows.
table = get_example_level_results('CRUXEval-input')
display(table)

In [None]:
# `plt` is otherwise first imported in a later cell, so without this import the
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
import matplotlib.pyplot as plt

# Distribution of the per-example `tau` column of the table built above.
plt.hist(table['tau'])

In [None]:
# Re-read all JSONL files under data/ so this cell does not depend on earlier ones.
records = []
for fname in glob.glob(f"data/*.jsonl"):
    with open(fname, 'rt') as f:
        lines = f.readlines()
    records.extend(map(json.loads, lines))

# Keep only the four columns used by the IRT / rating analysis below.
all_records = pd.DataFrame(records)
eval_results = all_records[['benchmark_id', 'model', 'example_id', 'pass1']]
# display(eval_results)

def get_result(benchmark_id):
    """Return the rows of the global ``eval_results`` whose ``benchmark_id`` matches."""
    is_benchmark = eval_results['benchmark_id'] == benchmark_id
    return eval_results[is_benchmark]

def format_irt(result):
    """Convert long-format results into nested per-model response records.

    Each model becomes one record of the form
    {"subject_id": "pedro", "responses": {"q1": 1, "q2": 0, "q3": 1, "q4": 0}}
    where the keys of ``responses`` are example ids and the values are ``pass1``.
    The records are printed before being returned.

    Args:
        result: DataFrame with ``example_id``, ``model`` and ``pass1`` columns.

    Returns:
        List of dicts, one per model, in ``groupby('model')`` order.
    """
    per_model = result[['example_id', 'model', 'pass1']].groupby('model')
    records = [
        {'subject_id': model_name, 'responses': dict(zip(rows['example_id'], rows['pass1']))}
        for model_name, rows in per_model
    ]
    print(records)
    return records

# Pull the humaneval+ rows out of eval_results and show them for inspection.
dfb = get_result('humaneval+')
display(dfb)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def format_irt(results):
    """Convert long-format results into one flat response record per model.

    Output records look like
    {"subject_id": "pedro", "q1": 1, "q2": 0, "q3": 1, "q4": 0}
    i.e. the model name under ``subject_id`` followed by one key per example id
    holding that model's ``pass1`` value.

    Args:
        results: DataFrame with ``example_id``, ``model`` and ``pass1`` columns.

    Returns:
        List of dicts, one per model, in ``groupby('model')`` order.
    """
    rows = []
    for model_name, model_rows in results[['example_id', 'model', 'pass1']].groupby('model'):
        row = {'subject_id': model_name}
        row.update(zip(model_rows['example_id'], model_rows['pass1']))
        rows.append(row)
    return rows

def irt_ability(results):
    """Fit a py_irt '2pl' model and return model abilities and item parameters.

    The results are reshaped with ``format_irt`` (one row per model, one column
    per example) and trained for 5000 epochs on CPU.

    Args:
        results: DataFrame with ``example_id``, ``model`` and ``pass1`` columns.

    Returns:
        ``(df_models, df_items)`` where ``df_models`` has the columns ``model``
        and ``ability`` and ``df_items`` has ``example_id``, ``diff`` and
        ``disc``; all values are read from ``trainer.last_params``.
    """
    from py_irt.config import IrtConfig
    from py_irt.training import IrtModelTrainer
    from py_irt.dataset import Dataset
    from py_irt.models import OneParamLog

    responses = pd.DataFrame(format_irt(results))
    irt_dataset = Dataset.from_pandas(responses, subject_column='subject_id')
    irt_config = IrtConfig(model_type='2pl', log_every=200, dropout=0.2)
    trainer = IrtModelTrainer(config=irt_config, data_path=None, dataset=irt_dataset)
    trainer.train(epochs=5000, device='cpu')

    # last_params keys: 'ability', 'diff', 'irt_model', 'item_ids', 'subject_ids'
    # ('disc' is read as well for the 2pl model).
    params = trainer.last_params
    subject_lookup = params['subject_ids']
    item_lookup = params['item_ids']

    df_models = pd.DataFrame({
        'model': [subject_lookup[key] for key in subject_lookup],
        'ability': params['ability'],
    })
    df_items = pd.DataFrame({
        'example_id': [item_lookup[key] for key in item_lookup],
        'diff': params['diff'],
        'disc': params['disc'],
    })
    return df_models, df_items

def get_ratings(results):
    """Combine the battle-based model statistics with IRT abilities.

    Args:
        results: DataFrame of evaluation rows for the examples to rate.

    Returns:
        The ``result_table`` output (computed from battles whose winner contains
        ``"model_"``) inner-merged on ``model`` with the ability frame returned
        by ``irt_ability``.
    """
    battles = pass1_to_battle(results)
    has_winner = battles["winner"].str.contains("model_")
    table = result_table(battles[has_winner], results)
    # Only the per-model abilities are needed here; item parameters are dropped.
    ability = irt_ability(results)[0]
    return table.merge(ability, on='model')

def subsample(results, n=100, seed=None):
    """Keep all rows belonging to a random subset of example ids.

    Args:
        results: DataFrame with an ``example_id`` column.
        n: number of distinct example ids to keep. Values larger than the number
            of distinct ids are clamped, so small benchmarks are returned whole
            instead of raising ``ValueError``.
        seed: optional seed for a dedicated ``np.random.default_rng``. When
            ``None`` the global ``np.random`` state is used, so ``np.random.seed``
            keeps working as before.

    Returns:
        The rows of ``results`` whose ``example_id`` was drawn, in their original
        order.
    """
    # unique() keeps first-appearance order and the column dtype; a set of string
    # ids is ordered differently in every process, which made draws irreproducible.
    ids = results['example_id'].unique()
    k = min(n, len(ids))
    chooser = np.random if seed is None else np.random.default_rng(seed)
    include_ids = chooser.choice(ids, k, replace=False)
    return results[results['example_id'].isin(include_ids)]


def compare_subsamples(benchmark_id):
    """Rate two independent example subsamples of a benchmark side by side.

    Both subsamples are drawn first, then each is rated with ``get_ratings``.
    The two rating tables are merged on ``model`` (columns of the first draw get
    the ``_x`` suffix, those of the second ``_y``), displayed and returned.

    Args:
        benchmark_id: value of ``benchmark_id`` used to filter ``eval_results``.

    Returns:
        The merged DataFrame.
    """
    result = eval_results[eval_results['benchmark_id'] == benchmark_id]
    draws = [subsample(result) for _ in range(2)]
    first, second = (get_ratings(draw) for draw in draws)
    both_stats = first.merge(second, on='model')
    display(both_stats)
    return both_stats

# Export one binarised response matrix per benchmark to irt_data/<benchmark>.jsonl.
# The output directory is created up front so a fresh checkout does not fail.
os.makedirs('irt_data', exist_ok=True)

# unique() gives a stable iteration order, so the `result` / `df_irt` left behind
# by the last iteration (and read by later cells) no longer depend on set ordering.
for b in eval_results['benchmark_id'].unique():
    result = eval_results[eval_results['benchmark_id'] == b]
    # assign() builds a new frame instead of writing into a filtered slice, which
    # triggered SettingWithCopyWarning. pass1 > 0.1 counts as solved.
    result = result.assign(pass1=np.where(result['pass1'] > 0.1, 1, 0))
    df_irt = pd.DataFrame(format_irt(result))
    display(df_irt)
    with open(f'irt_data/{b}.jsonl', 'w') as f:
        # orient='records' never emits the index; passing index=False as well is
        # rejected by pandas releases before 2.1.
        f.write(df_irt.to_json(orient='records', lines=True))
# df_irt.save()

# result['pass1'] = np.where(result['pass1'] > 0.1, 1, 0)
# abilities, items = irt_ability(result)
# display(items)

# sns.scatterplot(items, x='diff', y='disc')

# display(all_stats)

In [None]:
# Rate two random subsamples of mbpp+ and check how well each metric agrees
# between them.
both_stats = compare_subsamples('mbpp+')
display(both_stats)

# One scatter plot per metric: first subsample on x, second subsample on y.
for x_col, y_col in [('elo_x', 'elo_y'), ('pass1_x', 'pass1_y'), ('ability_x', 'ability_y')]:
    plt.figure()
    sns.scatterplot(both_stats, x=x_col, y=y_col)

# Kendall's tau between the two subsamples, per metric.
for prefix in ['ability', 'pass1', 'win_rate', 'elo']:
    first_draw = both_stats[f'{prefix}_x']
    second_draw = both_stats[f'{prefix}_y']
    tau = stats.kendalltau(first_draw, second_draw)
    print(prefix, f'{tau.statistic:.4f}')

In [None]:
# `abilities` and `items` were only ever assigned in commented-out code, so this
# cell raised NameError on a fresh kernel. Fit the IRT model here instead.
# `result` is the frame left behind by the export loop in the previous code cell.
abilities, items = irt_ability(result)

# Item discrimination against item difficulty.
sns.scatterplot(items, y='disc', x='diff')
df_irt = pd.DataFrame(format_irt(result))
battles = pass1_to_battle(result)
battles_no_ties = battles[battles["winner"].str.contains("model_")]
all_stats = result_table(battles_no_ties, result)
display(all_stats)
display(abilities)

# unique() keeps file order, so the slice below picks the same examples each run.
ids = list(result['example_id'].unique())
# The loop variable is not called `id`: at notebook scope that would shadow the
# builtin for every later cell.
for example_id in ids[100:120]:
    plt.figure()
    result_id = result[result['example_id'] == example_id]
    pred_v_ability = (
        result_id
        .merge(all_stats[['model', 'elo']], on='model')
        .merge(abilities[['model', 'ability']], on='model')
    )
    # Left panel: outcome against model Elo; right panel: against IRT ability.
    plt.subplot(1, 2, 1)
    sns.scatterplot(pred_v_ability, x='elo', y='pass1')
    plt.subplot(1, 2, 2)
    sns.scatterplot(pred_v_ability, x='ability', y='pass1')
    # The title shows this example's row of the item table (diff / disc).
    item_info = items[items['example_id'] == example_id]
    plt.title(item_info)

In [None]:
import report_agg
from report_agg import result_table, pass1_to_battle
import numpy.random as rng
import arena
import matplotlib.pyplot as plt
import importlib
importlib.reload(arena)

result = eval_results[eval_results['benchmark_id'] == 'mbpp+']
battles = pass1_to_battle(result)

def estimate_tie_probs(battles: pd.DataFrame):
    """Placeholder with no implementation yet; always returns ``None``."""
    pass

estimate_tie_probs(battles)

# Per-example statistics over all battles on that example.
winners_by_example = battles.groupby(by='example_id')[['winner']]
win_probs = winners_by_example.aggregate(lambda x: 2*np.mean(x == 'model_a'))
# Only 'neither' outcomes count as ties. An earlier variant that also counted
# 'both' was computed and then overwritten immediately; that dead pass is removed.
tie_probs = winners_by_example.aggregate(lambda x: np.mean((x == 'neither')))
display(tie_probs)

m = list(set(battles['model_a']))
display(battles.head())

# Pair of models to compare. Earlier candidate pairs were assigned and then
# overwritten, so only the pair that was actually used is kept.
ma = 'gpt-4-1106-preview'
mb = 'meta-llama-3-70b-instruct'
result_a = result[result['model'] == ma][['example_id', 'pass1']]
result_b = result[result['model'] == mb][['example_id', 'pass1']]
result_ab = result_a.merge(result_b, on='example_id', suffixes=['_a', '_b'])

# reset_index() guarantees that example_id is an ordinary column of the merged
# frame, which the weight alignment below relies on.
res_withprob = pd.merge(tie_probs.reset_index(), result_ab, on='example_id')
display(res_withprob)
res_withprob['winner'].hist()
tie_probs = res_withprob['winner'].to_numpy()
win_prob = (1 - tie_probs)

# Every weight vector must line up row for row with res_withprob, because the
# pass1_a / pass1_b arrays handed to the test come from that frame. Previously the
# 'disc' weights followed result_ab's row order, which can differ from
# res_withprob's (the latter follows the grouped tie_probs order).
# `items` is expected to be the item table with example_id / disc columns.
disc = (
    res_withprob[['example_id']]
    .merge(items[['example_id', 'disc']], on='example_id', how='left')['disc']
    .to_numpy()
)
assert len(disc) == len(res_withprob), "items must have one row per example_id"

weights = {
    'uniform': np.ones(win_prob.shape),
    'win_prob': win_prob,
    'if2': np.where((0.05 < win_prob) & (win_prob < 0.8), 1.5, 1),
    'disc': 0.5 + disc,
    'rand': np.random.rand(*win_prob.shape),
}

# One figure per weighting scheme: test CDF on the left, weight histogram on the right.
for w in weights:
    we = weights[w]
    # Also fails on NaN, i.e. when an example has no row in `items`.
    assert (we >= 0).all(), f"weights for {w!r} must be non-negative and not NaN"
    cdf, pv = arena.sign_test_niid(res_withprob['pass1_a'].to_numpy(), res_withprob['pass1_b'].to_numpy(), None, we, sample_all=False)
    plt.figure()
    ax = plt.subplot(1, 2, 1)
    cdf.plot(ax)
    plt.title(f'{w}\t {pv:.3f}')
    ax = plt.subplot(1, 2, 2)
    ax.hist(we)
    print(w, pv)

In [None]:
# Show the per-item table. `items` is not assigned in this cell; presumably it is
# the second value returned by irt_ability and must already exist in the kernel
# -- TODO confirm.
display(items)

In [None]:
import scipy.stats as stats

# Pick up local edits to arena.py without restarting the kernel.
importlib.reload(arena)

res_a = res_withprob['pass1_a'].to_numpy()
res_b = res_withprob['pass1_b'].to_numpy()

# Same tie probability for every example: the observed share of equal outcomes.
tie_probs = np.full(res_b.size, np.mean(res_a == res_b))
# Uniform weights.
weights = np.ones(res_b.size)
cdf, pv = arena.sign_test_niid(res_a, res_b, tie_probs, weights, sample_all=True)

print(pv)

# Reference: two-sided binomial sign test on the examples where the models differ.
k = (res_a > res_b).sum()
n = (res_a != res_b).sum()
print(k, n)
binom_result = stats.binomtest(k, n, p=0.5, alternative='two-sided')
print('binom', binom_result.pvalue)

ax = plt.subplot()
cdf.plot(ax)

cdf.evaluate(-0.01)