In [None]:
import numpy as np
from numpy import mean, var, std
import pandas as pd
import scipy.stats as stats


from estimators import Paired, Single, PairedExperimental, SingleExperimental
from test_estimators import BernoulliModel, BernoulliModelStratified, BootstrapModel
import plotly.express as px
import importlib
import test_estimators as te
import estimators
# Pick up on-disk edits to the local modules without restarting the kernel.
importlib.reload(estimators)
importlib.reload(te)
# reload() only rebinds attributes on the module objects. Names that were copied
# into this namespace with `from ... import ...` before the reload would keep
# pointing at the pre-reload objects, so bind them again from the fresh modules.
from estimators import Paired, Single, PairedExperimental, SingleExperimental
from test_estimators import BernoulliModel, BernoulliModelStratified, BootstrapModel

In [None]:
def _table_single_model(model):
    """Compare the single-model estimators against ``model``'s own reference values.

    The reference comes from ``model.true_vars()``. The comparison is delegated to
    ``te.TestSingleEstimators().estimator_results`` (called with ``verbose=False``)
    and its return value is passed back unchanged.
    """
    reference = model.true_vars()
    candidates = [
        Single.from_samples,
        estimators.Single.from_samples_unbiasedK,
        SingleExperimental.from_samples_unbiasedNK,
    ]
    harness = te.TestSingleEstimators()
    return harness.estimator_results(reference, model, candidates, verbose=False)

# Two-point toy population; superseded by the Beta draw on the next line.
pA = np.array([[0.1], [0.9]])
# 400 per-example success probabilities drawn from Beta(0.2, 0.8).
pA = np.random.beta(0.2, 0.8, (400, 1))
modelA = BernoulliModelStratified(pA, N=400, K=5)
t = _table_single_model(modelA)

# Rows flagged as unbiased must have |t_score| below 4.
t_unbiased = t.loc[t["unbiased"] == True]
assert (t_unbiased["t_score"].abs() < 4).all()
display(t)

In [None]:
import utils
import plotly.graph_objects as go

# Load the evaluation records into one frame, then keep a single benchmark.
# NOTE(review): the subset is named df_math but the filter selects
# "human_eval_plus" -- confirm which benchmark is intended.
df = pd.DataFrame(utils.load_jsonl_files("data/vllm_evals/highk_temp0.7.jsonl"))
df_math = df.loc[df["benchmark_id"] == "human_eval_plus"]

def result_to_A(df):
    """Expand per-example correct counts into an (N, K) matrix of 0/1 outcomes.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per example, with columns ``example_id`` (must be unique),
        ``count`` (must hold a single value K across all rows) and ``correct``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), K)``. Row ``i`` follows the row order of
        ``df`` and has ones in its first ``correct_i`` columns and zeros after.

    Raises
    ------
    AssertionError
        If ``example_id`` contains duplicates, or ``count`` does not hold exactly
        one distinct value (which includes an empty frame).
    """
    eids = df["example_id"]
    assert len(set(eids)) == len(eids), "example_id values must be unique"

    Ks = set(df["count"])
    assert len(Ks) == 1, Ks
    K = Ks.pop()

    # Ids are unique, so each row's own `correct` value is the one to use; there is
    # no need to re-filter the frame per id. One broadcast comparison of the column
    # index against each row's count builds the whole matrix.
    correct = df["correct"].to_numpy()
    return (np.arange(K)[np.newaxis, :] < correct[:, np.newaxis]).astype(float)

# Model ids noted for this comparison:
#   llama-3.1-70B-instruct, deepseek_r1_distill_qwen_32b, qwen3-32b, qwen3-14b,
#   google_gemma_3_12b_it
# Alternative for B:
#   B = result_to_A(df_math[df_math["model"]=="qwen3-8b"])
A = result_to_A(df_math[df_math["model"] == "deepseek_r1_distill_qwen_32b"])
B = result_to_A(df_math[df_math["model"] == "qwen3-32b"])

# Per-example mean outcome for each model, ordered by decreasing pA.
# NOTE(review): indexing pB with A's ordering assumes both matrices list the
# examples in the same order -- confirm.
pA = A.mean(axis=1)
pB = B.mean(axis=1)
sorted_indices = np.argsort(pA)[::-1]
pA_sorted = pA[sorted_indices]
pB_sorted = pB[sorted_indices]

fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(pA_sorted)), y=pA_sorted, name="pA"))
fig.add_trace(go.Scatter(x=np.arange(len(pB_sorted)), y=pB_sorted, name="pB", mode='markers'))
fig.update_layout(xaxis_title="Index (sorted by pA)", yaxis_title="Probability")
fig.show()

# Shape and overall mean outcome of each matrix.
print(A.shape, A.ravel().mean())
print(B.shape, B.ravel().mean())

# Bootstrap models over the observed matrices. modelB takes over modelA's idx
# attribute (presumably so both resample the same examples -- TODO confirm).
N, K = 100, 10
modelA = BootstrapModel(A, N=N, K=K)
modelB = BootstrapModel(B, N=N, K=K)
modelB.idx = modelA.idx
# fig = px.scatter(x=modelA.sample_preds().mean(axis=1), y=modelB.sample_preds().mean(axis=1))
# display(fig)
def _table_paired(modelA, modelB, truth):
    """Compare the paired estimators against ``truth`` for the pair of models.

    Delegates to ``te.TestPairedEstimators().estimator_results`` with
    ``attempts=100`` and ``verbose=False`` and returns its result unchanged.
    """
    candidates = [
        Paired.from_samples,
        Paired.from_samples_unbiasedK,
        Paired.from_samples_unbiasedK_off1,
    ]
    harness = te.TestPairedEstimators()
    return harness.estimator_results(
        truth, modelA, modelB, candidates, attempts=100, verbose=False
    )
# Three reference values computed from the full A/B matrices: the plain sample
# estimator, the unbiased-K variant, and the one built from per-example means.
truth = Paired.from_samples(A, B)
truth2 = Paired.from_samples_unbiasedK(A, B)
truth3 = Paired.from_bernoulli_prob(
    A.mean(axis=1, keepdims=True),
    B.mean(axis=1, keepdims=True),
)
print("noise A", Single.from_bernoulli_prob(A.mean(axis=1, keepdims=True)))
print(truth, truth2, truth3, sep="\n")

# The bootstrap comparison is scored against the plain sample estimate.
t = _table_paired(modelA, modelB, truth)
display(t)

In [None]:
# Sweep the bootstrap sample size N at fixed K, scoring each setting against
# truth2. Tables are collected in a list and concatenated once, rather than
# re-copying a growing frame (seeded from an empty DataFrame) on every iteration.
tables = []
for K in [100]:
    for N in np.arange(50, 1000, 50):
        # t = te._table_single(pA, K=20, N=N)
        modelA = BootstrapModel(A, N=N, K=K)
        modelB = BootstrapModel(B, N=N, K=K)
        # modelB takes over modelA's idx attribute
        # (presumably so both resample the same examples -- TODO confirm).
        modelB.idx = modelA.idx
        t = _table_paired(modelA, modelB, truth2)
        t["N"] = N
        t["K"] = K
        tables.append(t)
t_all = pd.concat(tables, ignore_index=True)
display(t_all)
# Shorten estimator labels (applied after the display above).
t_all["estimator"] = t_all["estimator"].apply(lambda x: x.replace("from_samples_", ""))


In [None]:

# Plot the `rms` column against N: one line per `comp`, one column per estimator,
# one row per K.
# NOTE(review): the title hard-codes N_pop=500 and "MATH500", and K is whatever
# value the global K currently holds -- confirm these match the data in t_all.
fig = px.line(t_all, x="N", y="rms", color="comp", facet_col="estimator", facet_row="K")
fig.update_layout(
    title=f"\nK={K}, N_pop={500}, MATH500",
    width=1000,
    height=400,
)
fig.update_traces(line={"width": 2})
fig.show(config={'displayModeBar': False})


In [None]:
# Synthetic population: pA ~ Beta(0.2, 0.8); pB copies pA with probability 0.6 and
# is an independent Beta(0.2, 0.8) draw otherwise.
Npop = 5000
# K is defined here so the sweep and the plot title use the same value, rather
# than the title picking up whatever K an earlier cell left in the kernel.
K = 10
pA = np.random.beta(0.2, 0.8, (Npop, 1))
# The original drew a throw-away pB before this line; it was overwritten unread.
pB = np.where(np.random.rand(Npop, 1) < 0.6, pA, np.random.beta(0.2, 0.8, (Npop, 1)))

# Collect one table per N and concatenate once, rather than growing a frame
# (seeded from an empty DataFrame) inside the loop.
tables = []
for N in np.arange(100, 5000, 200):
    # t = te._table_single(pA, K=20, N=N)
    t = te._table_paired(pA, pB, K=K, N=N)
    t["N"] = N
    tables.append(t)
t_all = pd.concat(tables, ignore_index=True)
display(t_all)
fig = px.line(t_all, x="N", y="rms", color="comp", facet_col="estimator")
fig.update_layout(title=f"K={K}, Npop={Npop}, correlated Beta")

In [None]:
def _table_single_model(model):
    """Compare two single-model estimators against ``model.true_vars()``.

    Redefines the helper of the same name from an earlier cell with a shorter
    estimator list (``from_samples`` and ``from_samples_unbiasedK``). The result of
    ``te.TestSingleEstimators().estimator_results`` is returned unchanged.
    """
    reference = model.true_vars()
    # Named `candidates` so the local list does not hide the `estimators` module.
    candidates = [
        Single.from_samples,
        Single.from_samples_unbiasedK,
    ]
    harness = te.TestSingleEstimators()
    return harness.estimator_results(reference, model, candidates, verbose=False)

def _table_paired(modelA, modelB, truth):
    """Compare two paired estimators against ``truth`` for the pair of models.

    Redefines the helper of the same name from an earlier cell: the estimator
    list is ``from_samples`` and ``from_samples_unbiasedK`` only, and no
    ``attempts`` argument is passed. The result of
    ``te.TestPairedEstimators().estimator_results`` is returned unchanged.
    """
    candidates = [
        Paired.from_samples,
        Paired.from_samples_unbiasedK,
    ]
    harness = te.TestPairedEstimators()
    return harness.estimator_results(truth, modelA, modelB, candidates, verbose=False)

# Sample sizes used by the estimators, and the size of the synthetic population.
N = 100
K = 10
Npop = 10000
Kpop = 1000

# pA ~ Beta(0.3, 0.7); pB copies pA with probability 0.6 and is an independent
# Beta(0.2, 0.8) draw otherwise.
pA = np.random.beta(0.3, 0.7, (Npop, 1))
pB = np.where(np.random.rand(Npop, 1) < 0.6, pA, np.random.beta(0.2, 0.8, (Npop, 1)))

# Population-sized outcome samples for both models, and the reference value
# computed directly from the probabilities.
model_bern_all = BernoulliModelStratified(pA, N=Npop, K=Kpop)
model_bern_allb = BernoulliModelStratified(pB, N=Npop, K=Kpop)
A = model_bern_all.sample_preds()
B = model_bern_allb.sample_preds()
truth = Paired.from_bernoulli_prob(pA, pB)

model_bern = BernoulliModel(pA, K=K, N=N)
model_bernB = BernoulliModel(pB, K=K, N=N)

# For each of the first N rows, tabulate every pairwise difference A[i, a] - B[i, b]
# and store the flattened kA*kB values as that row of A_minus_B.
kA = kB = Kpop
A_minus_B = np.zeros((N, kA * kB))
for i in range(N):
    diffs = A[i].reshape(-1, 1) - B[i].reshape(1, -1)
    A_minus_B[i, :] = diffs.ravel()

# Single-model table on the bootstrapped difference matrix ...
model_boot = BootstrapModel(A_minus_B, K=K, N=N)
t = _table_single_model(model_boot)
display(t)
# print(A)
# ... followed by the paired table on the two Bernoulli models.
t = _table_paired(model_bern, model_bernB, truth)
display(t)

In [None]:
# This Beta draw is overridden by the constant on the next line; it is kept so the
# random stream is consumed exactly as before.
pA = np.random.beta(0.2, 0.8, size=(500, 1))
pA = 0.3
# 10000 rows of 500 boolean outcomes; each entry is True when a uniform draw
# exceeds 0.3.
# NOTE(review): the threshold is the literal 0.3 rather than pA, and `> 0.3` makes
# True the 0.7-probability outcome -- confirm this is intended.
A = np.random.rand(10000, 500) > 0.3
# Distribution of the per-row variance, then of the per-row mean.
display(px.histogram(A.var(axis=1, keepdims=True)))
display(px.histogram(A.mean(axis=1, keepdims=True)))


# err_mean = 1/np.sqrt(N) * np.std(np.mean(A, axis=1))
# err_var = 1/np.sqrt(N) * np.var(np.var(A, axis=1) - 0.21)
# print(err_mean, err_var)
# print(pA)
