In [None]:
import os
import json, glob
from dataclasses import dataclass, field
from typing import Dict, Any, Iterator, Optional

import pandas as pd
from jinja2 import Template
from omegaconf import OmegaConf
from pathlib import Path

import arena
from report_example import gen_example_report
from report_model import gen_model_report

import importlib
importlib.reload(arena)

@dataclass
class ReportArgs:
    """Options for the report cells.

    The ``run_arena`` defined right below in this cell only reads ``data``;
    the remaining fields are not used in this cell.
    """
    # Directory that receives generated report files.
    out_dir: Optional[str] = 'gh-pages/'
    # Glob pattern selecting the input JSONL files.
    data: str = "data/*.jsonl"
    recompute: bool = False  # generate results for all data and summary line
    write_summary: bool = True  # use results in out_dir/tmp to generate the summary table

def run_arena(args: ReportArgs):
    """Load every JSONL file matching ``args.data`` into a single DataFrame.

    Each non-blank line of each matched file is parsed as one JSON record.
    The resulting frame is displayed and the set of values in its
    ``benchmark_id`` column is printed.

    Args:
        args: report options; only ``args.data`` (a glob pattern) is read.

    Returns:
        A DataFrame with one row per JSON record.

    Raises:
        FileNotFoundError: if the pattern yields no records at all (the
            original code failed later with a ``KeyError`` on
            ``benchmark_id`` in that situation).
    """
    records = []
    # sorted() makes the row order independent of the filesystem listing order.
    for fname in sorted(glob.glob(args.data)):
        with open(fname, 'rt') as f:
            # Stream the file and skip blank lines (e.g. a stray empty line at
            # the end of a file), which json.loads would reject.
            records.extend(json.loads(line) for line in f if line.strip())
    if not records:
        raise FileNotFoundError(f"no records found for pattern {args.data!r}")
    eval_results = pd.DataFrame(records)
    display(eval_results)
    benchmarks = set(eval_results['benchmark_id'])
    print(benchmarks)
    return eval_results


args = ReportArgs()
# run_arena() already displays the frame, so it is not displayed a second time here.
results = run_arena(args)


In [None]:
import os
import json, glob
from dataclasses import dataclass, field
from typing import Dict, Any, Iterator, Optional

import pandas as pd
from jinja2 import Template
from omegaconf import OmegaConf
from pathlib import Path

import arena
from report_example import gen_example_report
from report_model import gen_model_report, write_summary_table
from signal_noise import signal_to_noise
import json
import numpy as np
import pandas as pd
import scipy.stats as stats

import plotly.express as px
import plotly.graph_objects as go
from jinja2 import Template

@dataclass
class ReportArgs:
    """Options for the summary cell.

    The ``run_arena`` defined below in this cell reads ``out_dir`` and
    ``write_summary``.  This class is defined a second time further down in
    the same cell; the later definition is the one instantiated.
    """
    # Root of the generated report; summaries are read from ``<out_dir>/tmp``.
    out_dir: Optional[str] = 'gh-pages/'
    # Glob pattern for input JSONL files (not read by this cell's run_arena).
    data: str = "data/*.jsonl"
    recompute: bool = True  # generate results for all data and summary line
    write_summary: bool = True  # use results in out_dir/tmp to generate the summary table
    

def run_arena(args: ReportArgs):
    """Collect the per-benchmark summary records and export them as CSV.

    When ``args.write_summary`` is true this
      * parses every ``summary-*.jsonl`` file under ``<out_dir>/tmp``,
      * copies ``templates/custom.css`` to ``<out_dir>/static/custom.css``,
      * writes the collected records to ``<out_dir>/summary.csv``.

    Args:
        args: report options; ``out_dir`` and ``write_summary`` are read.

    Returns:
        A DataFrame with one row per summary record.  When
        ``args.write_summary`` is false nothing is read or written and an
        empty DataFrame is returned (the original fell off the end and
        returned ``None``, which made the caller's ``results.copy()`` fail).
    """
    if not args.write_summary:
        return pd.DataFrame()

    out_dir = Path(args.out_dir)
    tmp_dir = out_dir / 'tmp'

    records = []
    # sorted() keeps the row order stable across filesystems.
    for fname in sorted(glob.glob(f'{tmp_dir}/summary-*.jsonl')):
        with open(fname, 'rt') as f:
            # Skip blank lines, which json.loads would reject.
            records.extend(json.loads(line) for line in f if line.strip())
    print(records)

    # Copy custom.css to the output directory.  The source is read before the
    # destination is opened, so a missing stylesheet never leaves an empty copy.
    static_dir = out_dir / "static"
    static_dir.mkdir(parents=True, exist_ok=True)
    (static_dir / "custom.css").write_bytes(Path("templates/custom.css").read_bytes())

    df_summary = pd.DataFrame(records)
    df_summary.to_csv(out_dir / 'summary.csv')
    # write_summary_table(pd.DataFrame(df_summary), Path(args.out_dir) / 'index.html')
    return df_summary


@dataclass
class ReportArgs:
    """Options for the summary cell (second definition in this cell).

    This rebinds the name ``ReportArgs`` and therefore replaces the class
    defined earlier in the same cell; the two differ only in the default of
    ``recompute``.
    """
    # Root of the generated report; summaries are read from ``<out_dir>/tmp``.
    out_dir: Optional[str] = 'gh-pages/'
    # Glob pattern for input JSONL files.
    data: str = "data/*.jsonl"
    recompute: bool = False  # generate results for all data and summary line
    write_summary: bool = True  # use results in out_dir/tmp to generate the summary table

import ast  # only needed for the safe literal parser used in _as_dict below

args = ReportArgs()
results = run_arena(args)

# Columns whose cells hold a dictionary of summary statistics.
dict_columns = ['std(A)', 'E(std(A))', 'std(A-B)', 'std_signtest', 'corr(A,B)']


def _as_dict(cell):
    """Return ``cell`` as a dict: dicts pass through, strings are parsed, anything else becomes ``{}``."""
    if isinstance(cell, dict):
        return cell
    if isinstance(cell, str):
        # literal_eval only accepts Python literals; eval() would execute any
        # expression that happens to be stored in the data files.
        return ast.literal_eval(cell)
    return {}


# Expand each dictionary column into one flat column per key, named "<column>_<key>".
expanded_results = results.copy()
for col in dict_columns:
    if col not in expanded_results.columns:
        continue
    dict_df = pd.json_normalize(expanded_results[col].map(_as_dict).tolist())
    dict_df.columns = [f"{col}_{key}" for key in dict_df.columns]
    # Reuse the frame's own index so the column-wise concat below pairs rows
    # by position even if `results` does not carry a default RangeIndex.
    dict_df.index = expanded_results.index
    expanded_results = pd.concat([expanded_results.drop(columns=[col]), dict_df], axis=1)

display(expanded_results)
df = expanded_results

# std(A-B) scaled by sqrt(size), against benchmark size, on log-log axes.
fig = px.scatter(df, x="size", y=df["std(A-B)_mean"]*np.sqrt(df["size"]), error_y="std(A-B)_std", hover_data=["benchmark_id", "corr(A,B)_mean", "no_solve", "tau-"])
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")

# Reference curve y = sqrt(0.25 / x); the label now states the formula that is
# actually computed (it used to read 0.25/√x).
# NOTE(review): the scatter's y values are multiplied by sqrt(size) while this
# curve still decays with size — confirm which scaling is intended.
x_trend = np.logspace(np.log10(df['size'].min()), np.log10(df['size'].max()), 100)
y_trend = np.sqrt(0.25 / x_trend)

trend_trace = go.Scatter(
    x=x_trend,
    y=y_trend,
    mode='lines',
    name='y = √(0.25/x)',
    line=dict(color='red', dash='dash'),
    hovertemplate='Trend: √(0.25/x)<extra></extra>'
)

fig.add_trace(trend_trace)
# Shown once, with the reference curve included.
display(fig)


In [None]:
import plotly.graph_objects as go

# Box plot drawn from the precomputed summary statistics of std(A-B):
# one box per row of `df`, positioned at that row's size.
fig = go.Figure()
fig.add_trace(go.Box(
    q1=df["std(A-B)_25%"],
    q3=df["std(A-B)_75%"],
    median=df["std(A-B)_50%"],
    mean=df["std(A-B)_mean"],
    lowerfence=df["std(A-B)_min"],
    upperfence=df["std(A-B)_max"],
    x = df["size"],
    text=df["benchmark_id"],
))
display(df["size"])

# Reference curve y = sqrt(0.25 / x); the label now states the formula that is
# actually computed (it used to read 0.25/√x).
x_trend = np.logspace(np.log10(df['size'].min()), np.log10(df['size'].max()), 100)
y_trend = np.sqrt(0.25 / x_trend)

trend_trace = go.Scatter(
    x=x_trend,
    y=y_trend,
    mode='lines',
    name='y = √(0.25/x)',
    line=dict(color='red', dash='dash'),
    hovertemplate='Trend: √(0.25/x)<extra></extra>'
)
fig.add_trace(trend_trace)
fig.update_xaxes(type="log")
fig.update_layout(xaxis_title="size", yaxis_title="std(A-B)")
fig.show()

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats

import plotly.express as px
import plotly.graph_objects as go
from jinja2 import Template

import importlib

import arena
importlib.reload(arena)



# Benchmark analysed by the cells below.  The cell used to assign `bid` seven
# times in a row, so only the last value ever took effect; the other ids that
# were listed are kept here for quick switching:
#   "gsm8k", "CRUXEval-output-T0.8", "mbpp", "humaneval+", "lcb_codegen_v6", "mmlu"
bid = "swebench-verified"

# NOTE(review): `results` is assigned by two earlier cells (the raw JSONL
# records, then the summary table).  The cells below read example-level
# columns (example_id, pass1, model), so this presumably expects the raw
# records — confirm which frame is meant.
selected = results[results["benchmark_id"] == bid]
from report_model import fig_accs_and_pvalues

def get_sections(result: pd.DataFrame, benchmark_id):
    """Build, display and return the battle summary for ``result``.

    ``result`` is converted with ``arena.BattleSummary.pass1_to_battle`` and
    the outcome is passed to ``arena.BattleSummary.battle_summary``.

    Args:
        result: frame handed to ``pass1_to_battle``.
        benchmark_id: accepted for the caller's convenience; not used here.

    Returns:
        Whatever ``battle_summary`` returns (also displayed).
    """
    summary = arena.BattleSummary.battle_summary(
        arena.BattleSummary.pass1_to_battle(result)
    )
    display(summary)
    return summary
    
# Disabled: compute (and display) the battle summary for the selected benchmark.
# summary = get_sections(selected, bid)

# Disabled: dump the current figure to a standalone HTML file.
# with open("gh-pages/temp.html", "w") as fo:
#     fo.write(fig.to_html())
# Marker output showing that the cell ran to its end.
print("Test")

In [None]:
def trend_df(selected: pd.DataFrame):
    """Tabulate mean-vs-spread curves obtained by raising example pass rates to a power.

    The per-example mean of ``pass1`` is computed (and displayed).  For each
    of 1000 exponents ``alpha`` log-spaced between 1e-5 and 1e5 the rates are
    raised to ``alpha`` and two numbers are recorded: their mean and
    ``sqrt(mean(p * (1 - p)) / N)`` with ``N`` the number of examples.

    Args:
        selected: frame with at least ``example_id`` and ``pass1`` columns.

    Returns:
        DataFrame with columns ``alpha``, ``p_mean``, ``vars``,
        ``p_mean_const`` and ``vars_const``.  The ``*_const`` columns repeat
        the computation after every rate strictly between 0 and 1 has been
        replaced by 0.5.  Note that the ``vars*`` columns hold a square root
        (a standard deviation), not a variance.
    """
    per_example = (
        selected.groupby(["example_id"])
        .agg({'pass1': 'mean'})
        .reset_index()
        .sort_values(by="pass1")
    )
    display(per_example)

    rates = per_example["pass1"].to_numpy().copy()
    # Same rates, except that everything strictly inside (0, 1) becomes 0.5.
    rates_const = np.where((0 < rates) & (rates < 1), 0.5, rates)

    def _std_of_mean(p: np.ndarray) -> float:
        """sqrt(mean(p * (1 - p)) / len(p)); every entry of ``p`` must lie in [0, 1]."""
        count = len(p)
        assert np.all(p >= 0) and np.all(p <= 1)
        return np.sqrt(1 / count * np.mean(p * (1 - p)))

    curves = pd.DataFrame({"alpha": np.logspace(-5, 5, 1000)})
    for suffix, base in (("", rates), ("_const", rates_const)):
        # `base=base` freezes the current array inside each lambda.
        curves["p_mean" + suffix] = curves["alpha"].map(
            lambda a, base=base: np.power(base, a).mean()
        )
        curves["vars" + suffix] = curves["alpha"].map(
            lambda a, base=base: _std_of_mean(np.power(base, a))
        )
    return curves


# Mean pass1 of every example over all selected rows, ordered from lowest to highest.
marginals = selected.groupby(["example_id"]).agg({'pass1': 'mean'}).reset_index().sort_values(by="pass1")

# Build dictionary from model to performance curves, sorted by example_id as marginals
model_curves = {}
for model in selected['model'].unique():
    model_data = selected[selected['model'] == model]
    # Left-merge onto `marginals` so the rows follow its example order.  Both
    # frames carry a `pass1` column, so the model's own values end up in `pass1_y`.
    model_data_sorted = marginals.merge(model_data, on="example_id", how="left")
    model_curves[model] = model_data_sorted["pass1_y"]

# Verify the sorting matches marginals
print(f"Number of models: {len(model_curves)}")
for model, data in list(model_curves.items())[:3]:  # Show first 3 models
    print(f"{model}: {len(data)} examples")
    print(f"{model}: {data}")

# Sorted marginal curve raised to a few powers (legend now reads m^alpha; it
# used to read m*alpha although the curve is m**alpha).
m1 = marginals["pass1"].to_numpy().copy()
fig = go.Figure()
for alpha in [0.2, 1, 2]:
    fig.add_scatter(y=m1**alpha, mode="lines", name=f'm^{alpha}')

# Mean pass1 per model, keyed by model name (first-appearance order).
model_performances = selected.groupby('model', sort=False)['pass1'].mean().to_dict()

# Sort models by performance
sorted_models = sorted(model_performances.items(), key=lambda x: x[1])
print(sorted_models)
n_models = len(sorted_models)

# The two groups were hard-coded to 15 models each, which makes them overlap
# whenever fewer than 30 models are present.  Cap the group size at half of
# the models so the groups stay disjoint; with 30 or more models the result
# is unchanged.
group_size = min(15, max(1, n_models // 2))
bottom_half_models = [model for model, _ in sorted_models[:group_size]]
top_half_models = [model for model, _ in sorted_models[-group_size:]]

# Calculate marginals for bottom half and top half models
bottom_half_data = selected[selected['model'].isin(bottom_half_models)]
top_half_data = selected[selected['model'].isin(top_half_models)]

bottom_half_marginals = bottom_half_data.groupby(["example_id"]).agg({'pass1': 'mean'}).reset_index()
top_half_marginals = top_half_data.groupby(["example_id"]).agg({'pass1': 'mean'}).reset_index()

# Restrict to the examples present in `marginals` ...
bottom_half_marginals_sorted = marginals[['example_id']].merge(bottom_half_marginals, on="example_id", how="left")
top_half_marginals_sorted = marginals[['example_id']].merge(top_half_marginals, on="example_id", how="left")

# ... then order each group by its own pass rate.
# NOTE(review): this re-sort discards the example order inherited from
# `marginals`, so the x positions of the group curves are ranks within each
# group, not the same examples as the m^alpha curves — confirm that is intended.
bottom_half_marginals_sorted = bottom_half_marginals_sorted.sort_values(by="pass1")
top_half_marginals_sorted = top_half_marginals_sorted.sort_values(by="pass1")

# Add to the plot.  The colour of a lines-only trace is taken from `line`, not
# from `marker`, so the top group is now actually drawn in green.
fig.add_scatter(y=bottom_half_marginals_sorted["pass1"], name="Bottom Half Models", mode='lines', line=dict(color='red'))
fig.add_scatter(y=top_half_marginals_sorted["pass1"], name="Top Half Models", mode='lines', line=dict(color='green'))

# Add the first 3 model curves to the plot
# for i, (model_name, curve_data) in enumerate(list(model_curves.items())[:10]):
#     fig.add_scatter(y=curve_data, name=model_name, mode='lines')
fig.update_layout(title=bid)
display(fig)


In [None]:

def fig_cov_baseline(bmname: str, diffvsum: pd.DataFrame, dfmodel):
    """Scatter of std(A-B) for model pairs on top of analytic reference curves.

    Args:
        bmname: used as the figure title.
        diffvsum: one row per model pair.  The columns read here are
            ``accA``, ``accB``, ``std(A-B)``, ``total``, ``model_a``,
            ``model_b``, ``sum(A!=B)``, ``sum(A-B)``, ``pvalue`` and
            ``corr(A,B)``.  The frame is not modified.
        dfmodel: frame providing the columns ``p_mean``, ``vars``,
            ``p_mean_const`` and ``vars_const`` for two extra curves.

    Returns:
        A plotly Figure holding the four reference curves followed by the
        pair scatter.  Only rows with ``accA >= accB`` are plotted, and each
        pair is labelled "close" when |accA - accB| is at most 3 std(A-B).
    """
    close_label = "close: ≤3σ"
    far_label = "not close: >3σ"

    # .copy() gives an independent frame, so adding a column below neither
    # triggers SettingWithCopyWarning nor can it touch the caller's data.
    df = diffvsum[diffvsum["accA"] >= diffvsum["accB"]].copy()
    df["is_close"] = np.where(
        np.abs(df["accA"] - df["accB"]) / df["std(A-B)"] <= 3, close_label, far_label
    )
    color_map = {
        close_label: "blue",
        far_label: "#CCCCCC",  # light gray
    }
    figs = px.scatter(df,
                    x=np.maximum(df["accB"], df["accA"]), y="std(A-B)",
                    color="is_close",
                    color_discrete_map=color_map,
                    custom_data=["model_a", "model_b", "sum(A!=B)", "sum(A-B)", "pvalue", "std(A-B)", "accA", "accB", "corr(A,B)"])
    figs.for_each_trace(lambda trace: trace.update(opacity=0.5)
                   if trace.name == far_label else None)

    figs.update_traces(hovertemplate=
        "<br>".join([
        "Model A: %{customdata[0]} (acc: %{customdata[6]:.1%})",
        "Model B: %{customdata[1]} (acc: %{customdata[7]:.1%})", 
        "total A≠B: %{customdata[2]:.1f}",
        "total A-B: %{customdata[3]:.1f}", 
        "std(A-B): %{customdata[5]:.2%}", 
        "p-value: %{customdata[4]:.3g}",
        "corr(A,B): %{customdata[8]:.3g}",
        ])  + "<extra></extra>")

    figs.update_traces(
        marker=dict(
            size=3,
            opacity=0.5, 
        )
    )

    # Reference curves: sqrt(x (1 - x) / n) and sqrt(2) times that, with n
    # taken from the `total` value of the first row.
    data_sz = diffvsum.iloc[0]["total"]
    x = np.linspace(0, 1, 100)
    y = np.sqrt(x*(1-x) / data_sz)

    figl = go.Figure()

    figl.add_trace(go.Scatter(
        x=x, y=y, name="σ(acc)",
        # hoverinfo="skip",
        line=dict(color="lightgreen")
    ))

    figl.add_trace(go.Scatter(
        x=x, y=np.sqrt(2)*y, name="sqrt(2) σ(acc)",
        # hoverinfo="skip",
        line=dict(color="darkgreen")
    ))

    figl.add_trace(go.Scatter(
        x=dfmodel["p_mean"], y=dfmodel["vars"],
        # hoverinfo="skip",
        line=dict(color="red"),
        name="exp"
    ))
    figl.add_trace(go.Scatter(
        x=dfmodel["p_mean_const"], y=dfmodel["vars_const"],
        # hoverinfo="skip",
        line=dict(color="pink"),
        name="vars_const"
    ))

    fig = go.Figure(data=figl.data + figs.data)
    fig.update_layout(
        width=800, height=600, title=bmname,
        # The scatter's x values are the larger of the two accuracies, so the
        # axis title says max (it used to say mean).
        xaxis_title="max(acc(A), acc(B))",
        yaxis_title="σ(A-B)"
    )
    return fig
# `summary` is not assigned by any earlier cell visible in this notebook, so a
# fresh "Restart & Run All" stopped here with a NameError.  Build it here;
# get_sections() also displays it.
summary = get_sections(selected, bid)
display(fig_cov_baseline(bid, summary, trend_df(selected)))

In [None]:
def summary_stats(s, f=2, percent=True):
    """Render a statistics mapping as ``mean±std | [min--max] | n=count ``.

    Args:
        s: mapping with the keys ``mean``, ``std``, ``min``, ``max`` and ``count``.
        f: accepted but not used; numbers are always shown with two decimals.
        percent: accepted but not used.

    Returns:
        The formatted text, including the trailing space after the count.
    """
    center = f"{s['mean']:.2f}±{s['std']:.2f}"
    span = f"[{s['min']:.2f}--{s['max']:.2f}]"
    count = f"n={s['count']} "
    return " | ".join((center, span, count))

def format_stats_badge(s):
    """Return an HTML ``<span>`` showing the mean in percent with a tooltip.

    ``mean``, ``std``, ``min`` and ``max`` of ``s`` are multiplied by 100; the
    tooltip (``title`` attribute) is ``summary_stats`` of those scaled values
    and the visible text is the scaled mean with two decimals.  ``s`` itself
    is left untouched.
    """
    scaled = dict(s)
    scaled.update({key: 100 * s[key] for key in ("mean", "std", "min", "max")})
    tooltip = summary_stats(scaled)
    headline = 100 * s["mean"]
    return f'<span title="{tooltip}">{headline:.2f}</span>'

def write_summary_table(summary_count: pd.DataFrame, output_path: Path):
    """Render the benchmark summary as HTML through ``templates/summary.html``.

    Two tables are passed to the template: ``count_table`` with the values as
    given and ``percent_table`` in which ``p5_min``, ``p5_max``, ``no_solve``
    and ``tau-`` are divided by ``size``.  Rows are sorted by ``benchmark_id``
    and get a ``link to details`` column pointing at ``model_<id>.html``,
    ``ex_<id>.html`` and ``ex_v_model_<id>.html``.

    Note: this definition has the same name as the ``write_summary_table``
    imported from ``report_model`` in an earlier cell and replaces it once
    this cell has run.

    Args:
        summary_count: one row per benchmark; must provide every column
            listed in ``includes_cols`` except ``link to details``.
        output_path: file that receives the rendered HTML (UTF-8).

    The page is rendered completely before ``output_path`` is opened, so a
    missing template or column no longer leaves a truncated output file.
    """
    summary_count = summary_count.sort_values(by='benchmark_id')

    def link_detail(bid):
        l1 = f"""by <a href="model_{bid}.html">models </a> """
        l2 = f"""<a href="ex_{bid}.html"> examples </a>"""
        l3 = f"""<a href="ex_v_model_{bid}.html"> data </a>"""
        return l1 + '|' + l2 + '|' + l3
    summary_count['link to details'] = summary_count['benchmark_id'].apply(link_detail)

    def normalize(counts, includes):
        # Work on a deep copy so the count table keeps its raw values.
        percent = counts.copy(deep=True)
        for c in includes:
            percent[c] = percent[c] / percent['size']
        return percent

    includes_cols = ['benchmark_id', 'size',  'std(A-B)', 'corr_ab', 'p5_min', 'p5_max', 'no_solve', 'tau-', 'sig_noise','link to details']
    percent_cols = ['p5_min', 'p5_max', 'no_solve', 'tau-']
    summary_percent = normalize(summary_count, percent_cols)

    display(summary_percent)
    template_path = r"templates/summary.html"

    # Read the template with the same encoding that is used for the output.
    with open(template_path, encoding="utf-8") as template_file:
        j2_template = Template(template_file.read())

    html = j2_template.render({
        'count_table': summary_count[includes_cols].to_html(escape=False, index=False),
        'percent_table': summary_percent[includes_cols].to_html(
            escape=False,
            index=False,
            formatters={
                "std(A-B)": lambda x: format_stats_badge(x),
                "corr_ab": lambda x: format_stats_badge(x),
                'p5_min': lambda x: f'{x*100:.2f}',
                'p5_max': lambda x: f'{x*100:.2f}',
                'min_dist': '{:.2}'.format,
                'no_solve': '{:.2}'.format,
                'tau-': '{:.2}'.format,
                'sig_noise': '{:.2f}'.format,
            }),
    })

    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(html)
            
def generate_summary(args: ReportArgs):
    """Write ``<out_dir>/index.html`` from the summary records in ``<out_dir>/tmp``.

    ``<out_dir>/tmp`` is created if needed.  When ``args.write_summary`` is
    true, every ``summary-*.jsonl`` file in it is parsed (one JSON record per
    non-blank line), the records are printed, and they are handed to
    ``write_summary_table``.  If no record is found, a notice is printed and
    nothing is written (the original failed inside ``write_summary_table``
    because an empty frame has no ``benchmark_id`` column).
    """
    tmp_dir = Path(args.out_dir) / 'tmp'
    os.makedirs(tmp_dir, exist_ok=True)

    if not args.write_summary:
        return

    records = []
    # sorted() keeps the processing order stable across filesystems.
    for fname in sorted(glob.glob(f'{tmp_dir}/summary-*.jsonl')):
        with open(fname, 'rt') as f:
            # Skip blank lines, which json.loads would reject.
            records.extend(json.loads(line) for line in f if line.strip())
    print(records)

    if not records:
        print(f"no summary records found under {tmp_dir}; index.html not written")
        return
    write_summary_table(pd.DataFrame(records), Path(args.out_dir) / 'index.html')


generate_summary(args)

In [None]:
import sympy as sp

# Symbolic variables of the expression.
p, q = sp.symbols('p q')

# Two squared terms, weighted by p*q and p*(1 - q); both share p*(2*q - 1).
_shift = p*(2*q - 1)
expr = p*q*(1 - _shift)**2 + p*(1 - q)*(-1 - _shift)**2

# Heading, value and an empty line, each on its own line.
print("Original expression:", expr, "", sep="\n")

# Let sympy simplify and print the result in the same layout.
simplified = sp.simplify(expr)
print("Simplified expression:", simplified, "", sep="\n")

In [None]:
import numpy as np
from scipy.stats import beta

import plotly.graph_objects as go

# Shape parameters to explore; each alpha is paired with beta = 1 - alpha.
alphas = [0.1, 0.3, 0.5, 0.7, 0.9]

# Evaluation grid on [0, 1].
x = np.linspace(0, 1, 1000)

fig = go.Figure()

for alpha in alphas:
    # For beta(alpha, 1-alpha), we need alpha > 0 and 1-alpha > 0, so 0 < alpha < 1
    if not 0 < alpha < 1:
        continue
    beta_param = 1 - alpha
    cdf_values = beta.cdf(x, alpha, beta_param)

    # Beta(0.5, 0.5) is the symmetric U-shaped case.  It is emphasised here
    # with a thicker line and a longer label instead of being drawn a second
    # time after the loop, which put two identical curves into the figure.
    is_symmetric = alpha == 0.5
    label = f'Beta({alpha}, {beta_param:.1f})' + (' - U-shaped' if is_symmetric else '')
    trace_style = dict(line=dict(width=3)) if is_symmetric else {}
    fig.add_trace(go.Scatter(
        x=x,
        y=cdf_values,
        mode='lines',
        name=label,
        **trace_style
    ))

fig.update_layout(
    title='Cumulative Distribution Function of Beta(α, 1-α)',
    xaxis_title='x',
    yaxis_title='CDF(x)',
    width=800,
    height=600
)

fig.show()

In [None]:
import numpy as np
from scipy.integrate import quad
from scipy.special import beta, gamma
import math


In [None]:
# Numerical check that 2 * B(alpha + 1, 2 - alpha) / B(alpha, 1 - alpha)
# agrees with alpha * (1 - alpha): the two printed columns should match.
# `beta` must be the function scipy.special.beta imported in the preceding
# cell, not the scipy.stats.beta distribution an earlier cell binds to the
# same name.
for alpha in np.linspace(0, 1, 20):
    beta_modelpred = 2 * beta((alpha)+1, (1-alpha)+1) / beta(alpha, (1-alpha))
    # Named `bernoulli_var` rather than `var` so the loop does not clobber the
    # numpy `var` function that other cells import under that name.
    bernoulli_var = alpha * (1-alpha)
    print(beta_modelpred, bernoulli_var)

In [None]:

def plot_betapdf(a=0.5, b=0.5):
    """Scatter-plot ``x**a * (1-x)**b / beta(a+1, b+1)`` on 100 evenly spaced points of [0, 1].

    ``a`` and ``b`` are the exponents of ``x`` and ``1 - x``; ``beta`` is the
    module-level name in scope when the function is called.

    Returns:
        The plotly express scatter figure.
    """
    grid = np.linspace(0, 1, 100)
    density = grid**a * (1 - grid)**b / beta(a + 1, b + 1)
    return px.scatter(x=grid, y=density)


# Exponents -alpha and -(1 - alpha).
alpha = 0.3
fig = plot_betapdf(a=-alpha, b=-(1 - alpha))
display(fig)


In [None]:

# Scratch comparison of variance decompositions, first with torch and then
# with numpy, on one random 0/1 matrix.  The statements are order-dependent:
# `mean` and `var` are rebound several times below.
import torch
from torch import mean, var
# 100 x 30 matrix of 0.0/1.0 values: the sign test (> 0) of standard normal
# draws, converted to float32.  No seed is set, so every run differs.
A = torch.randn([100, 30], dtype=torch.float64) > 0
A = A.to(torch.float32)
def var(A, dim=None):
    """Variance via torch.var with unbiased=False; replaces the `var` imported from torch above."""
    return torch.var(A, dim=dim, unbiased=False)
# print(A)
# Overall variance, variance along dim 0, overall mean, then the mean of the
# dim-0 variances and the variance of the dim-0 means.
print(f"{var(A)=}")
print(f"{var(A, dim=0)=}")
print(f"{mean(A)=}")
print(f"{mean(var(A, dim=0))=}")
print(f"{var(mean(A, dim=0))=}")


# From here on `mean` and `var` are the numpy functions and A is a numpy array.
from numpy import mean, var
A = A.numpy()
print(f"{var(A)=}")
print(f"{var(var(A, axis=0))=}")
# print(f"{var(A, axis=0)=}")
# print(f"{mean(A)=}")
# Mean-of-variances and variance-of-means along axis 0, then along axis 1.
print(f"{mean(var(A, axis=0))=}")
print(f"{var(mean(A, axis=0))=}")
print(f"{mean(var(A, axis=1))=}")
print(f"{var(mean(A, axis=1))=}")

In [636]:

# Collects one dict of variance estimates per iteration of the simulation loop below.
res = []
# Shape of the simulated sample matrices: N rows by K columns.
N = 5
K = 5
import numpy as np
from numpy import mean, var, std

def cov(A, B, ddof=0):
    """Covariance between ``A`` and ``B``.

    Takes entry ``[0, 1]`` of the matrix returned by ``np.cov(A, B, ddof=ddof)``;
    for two 1-D inputs that is their cross-covariance.  ``ddof`` defaults to 0.
    """
    # okay, is there ever a time to need ddof=1?
    joint = np.cov(A, B, ddof=ddof)
    return joint[0, 1]



class Paired:
    """Variance decompositions of ``A - B`` for paired sample matrices.

    Every implemented method returns a dict with the keys ``"var(E(A-B))"``,
    ``"E(var(A-B))"`` and ``"var(A-B)"``.  ``A`` and ``B`` must have the same
    number of rows (asserted); per-row statistics are taken along axis 1.
    The methods rely on the module-level ``mean`` and ``var`` (imported from
    numpy in this cell) and on the ``cov`` helper defined above.
    """

    @staticmethod
    def sample_vars(A: np.ndarray, B: np.ndarray, dof=0) -> dict:
        """Plug-in estimates (all variances with numpy's default ddof=0).

        ``dof`` is accepted but not used.
        """
        assert A.shape[0] == B.shape[0] # paired data
        return {
            "var(E(A-B))": var(mean(A-B, axis=1)),
            "E(var(A-B))": mean(var(A, axis=1) + var(B, axis=1)),
            "var(A-B)": var(A) + var(B) - 2 * cov(mean(A, axis=1), mean(B, axis=1)),
            # "_var(A-B)": mean(A**2 + B**2) - 2 * mean(mean(A, axis=1) * mean(B, axis=1)) - mean(A-B)**2,
        }
    
    @staticmethod
    def sample_vars_unbiased(A: np.ndarray, B: np.ndarray, dof=0) -> dict:
        """Like ``sample_vars`` but with a 1/(k-1) correction of the per-row variances.

        The per-row variance of each matrix is scaled by ``1 + 1/(k-1)`` in
        ``"E(var(A-B))"`` and the same amount is subtracted from
        ``"var(E(A-B))"``, so the two still add up to ``"var(A-B)"``, which is
        left as in ``sample_vars``.  ``k`` is the column count of the
        respective matrix: ``kB`` is now taken from ``B`` (it used to be read
        from ``A``) and is used for the ``B`` term in both entries.
        ``dof`` is accepted but not used.
        """
        assert A.shape[0] == B.shape[0] # paired data
        kA = A.shape[1]
        kB = B.shape[1]
        return {
            "var(E(A-B))": var(mean(A-B, axis=1)) - mean(var(A, axis=1)/(kA-1) + var(B, axis=1)/(kB-1)) ,
            "E(var(A-B))": mean(var(A, axis=1)* (1 + 1/(kA-1)) + var(B, axis=1) * (1 + 1/(kB-1))),
            "var(A-B)": var(A) + var(B) - 2 * cov(mean(A, axis=1), mean(B, axis=1)),
            # "_var(A-B)": mean(A**2 + B**2) - 2 * mean(mean(A, axis=1) * mean(B, axis=1)) - mean(A-B)**2,
        }
    
    @staticmethod
    def bernoulli_sample_vars(A: np.ndarray, B: np.ndarray, dof=0) -> dict:
        """Placeholder: not implemented, returns ``None``."""
        ...

    @staticmethod
    def bernoulli_p_vars(pA: np.ndarray, pB: np.ndarray) -> dict:
        """The same three quantities computed from success probabilities.

        ``pA`` and ``pB`` must both have shape ``(N, 1)`` (asserted); they are
        flattened before use.
        """
        assert pA.shape[0] == pB.shape[0]
        assert pA.shape[1] == pB.shape[1] == 1
        pA = pA.flatten()
        pB = pB.flatten()
        return {
            "var(E(A-B))": var(pA - pB),
            "E(var(A-B))": mean(pA*(1-pA) + pB*(1-pB)),
            "var(A-B)": mean(pA)*(1-mean(pA)) + mean(pB)*(1-mean(pB)) - 2 * cov(pA, pB),
            # "_var(A-B)": mean(pA + pB - 2*pA*pB) - mean(pA - pB)**2
        }


def total_variance_test(v: dict):
    """Assert that "var(A-B)" equals "E(var(A-B))" + "var(E(A-B))" in ``v``."""
    assert np.allclose(v["var(A-B)"], v["E(var(A-B))"] + v["var(E(A-B))"]), v


def relative_error(v1, v2):
    # Placeholder, not implemented.
    # for k in "var(A-B)", "E(var(A-B))", "var(E(A-B))"
    ...


# Per-row success probabilities: pB is pA shifted by uniform noise in
# [-0.5, 0.5) and clipped to [0, 1].
pA = np.random.rand(N, 1)
pB = (pA + 1*(np.random.rand(N, 1)-0.5)).clip(0, 1)

# The probability-based values depend only on pA and pB, so they are computed
# and checked once instead of in every iteration.
vstar = Paired.bernoulli_p_vars(pA, pB)
total_variance_test(vstar)

# Start from an empty list so this part can be re-run without piling up rows.
res = []
for i in range(10):
    # Draw N x K Bernoulli samples with the per-row probabilities pA / pB.
    A = np.random.rand(N, K)
    A = np.where(A < pA, 1, 0)

    B = np.random.rand(N, K)
    B = np.where(B < pB, 1, 0)

    v = Paired.sample_vars(A, B)
    total_variance_test(v)

    v_unb = Paired.sample_vars_unbiased(A, B)
    total_variance_test(v_unb)

    # One row per draw; columns are keyed by (estimator, quantity).
    res.append({
        **{("star", k): val for k, val in vstar.items()},
        **{("hat", k): val for k, val in v.items()},
        **{("unb", k): val for k, val in v_unb.items()},
    })

df = pd.DataFrame(res)
# df["diff"] = df["var(A-B)"] - df["E(var(A-B))"] - df["var(E(A-B))"]
# df["diff2"] = df["_var(A-B)"] - df["E(var(A-B))"] - df["var(E(A-B))"]
# df["diff_star"] = df["var(A-B)_star"] - df["E(var(A-B))_star"] - df["var(E(A-B))_star"]
display(df)
# px.scatter(df, x="x", y="y")
display(df.describe())



Unnamed: 0,"(star, var(E(A-B)))","(star, E(var(A-B)))","(star, var(A-B))","(hat, var(E(A-B)))","(hat, E(var(A-B)))","(hat, var(A-B))","(unb, var(E(A-B)))","(unb, E(var(A-B)))","(unb, var(A-B))"
0,0.071759,0.25856,0.330319,0.112,0.192,0.304,0.064,0.24,0.304
1,0.071759,0.25856,0.330319,0.0704,0.256,0.3264,0.0064,0.32,0.3264
2,0.071759,0.25856,0.330319,0.0544,0.24,0.2944,-0.0056,0.3,0.2944
3,0.071759,0.25856,0.330319,0.0576,0.176,0.2336,0.0136,0.22,0.2336
4,0.071759,0.25856,0.330319,0.1824,0.224,0.4064,0.1264,0.28,0.4064
5,0.071759,0.25856,0.330319,0.0736,0.256,0.3296,0.0096,0.32,0.3296
6,0.071759,0.25856,0.330319,0.1216,0.24,0.3616,0.0616,0.3,0.3616
7,0.071759,0.25856,0.330319,0.048,0.256,0.304,-0.016,0.32,0.304
8,0.071759,0.25856,0.330319,0.0736,0.128,0.2016,0.0416,0.16,0.2016
9,0.071759,0.25856,0.330319,0.0864,0.288,0.3744,0.0144,0.36,0.3744


Unnamed: 0,"(star, var(E(A-B)))","(star, E(var(A-B)))","(star, var(A-B))","(hat, var(E(A-B)))","(hat, E(var(A-B)))","(hat, var(A-B))","(unb, var(E(A-B)))","(unb, E(var(A-B)))","(unb, var(A-B))"
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.071759,0.25856,0.330319,0.088,0.2256,0.3136,0.0316,0.282,0.3136
std,0.0,0.0,0.0,0.04082,0.047374,0.06194,0.04272,0.059217,0.06194
min,0.071759,0.25856,0.330319,0.048,0.128,0.2016,-0.016,0.16,0.2016
25%,0.071759,0.25856,0.330319,0.0608,0.2,0.2968,0.0072,0.25,0.2968
50%,0.071759,0.25856,0.330319,0.0736,0.24,0.3152,0.014,0.3,0.3152
75%,0.071759,0.25856,0.330319,0.1056,0.256,0.3536,0.0566,0.32,0.3536
max,0.071759,0.25856,0.330319,0.1824,0.288,0.4064,0.1264,0.36,0.4064


In [None]:
class Single:
    """Variance decompositions for a single sample matrix ``A``.

    Every method returns a dict with the keys ``"var(E(A))"``, ``"E(var(A))"``
    and ``"var(A)"``; per-row statistics are taken along axis 1.  The
    module-level ``mean`` and ``var`` (numpy functions) are used throughout.
    """

    @staticmethod
    def sample_vars(A: np.ndarray, dof=0) -> dict:
        """Plug-in estimates with numpy's default ddof=0; ``dof`` is accepted but not used."""
        row_means = mean(A, axis=1)
        row_vars = var(A, axis=1)
        return {
            "var(E(A))": var(row_means),
            "E(var(A))": mean(row_vars),
            "var(A)": var(A),
        }

    @staticmethod
    def sample_vars_unbiased2(A: np.ndarray) -> dict:
        """Variant of ``sample_vars_unbiased`` built on ddof=1 variances."""
        n_cols = A.shape[1]
        n_rows = A.shape[0]
        row_means = mean(A, axis=1)
        within = mean(var(A, axis=1))
        return {
            "var(E(A))": var(row_means, ddof=1) - 1/(n_cols-1) * within + var(A)*1/(n_rows*n_cols - 1),
            "E(var(A))": mean(var(A, axis=1, ddof=1)),
            # this is still slightly biased
            "var(A)": var(A, ddof=1) + 1/(n_rows-1)*var(row_means),
            # "var(A)": var(A),
        }

    @staticmethod
    def sample_vars_unbiased(A: np.ndarray) -> dict:
        """Plug-in estimates with 1/(k-1) and 1/(N*k-1) correction terms (k columns, N rows)."""
        n_cols = A.shape[1]
        n_rows = A.shape[0]
        within = mean(var(A, axis=1))
        total = var(A)
        return {
            "var(E(A))": var(mean(A, axis=1)) - 1/(n_cols-1) * within + total*1/(n_rows*n_cols - 1),
            "E(var(A))": within * (1 + 1/(n_cols-1)),
            # this is still slightly biased
            "var(A)": total * (1 + 1/(n_rows*n_cols - 1)),
        }

    @staticmethod
    def bernoulli_p_vars(pA: np.ndarray) -> dict:
        """The same three quantities computed from success probabilities ``pA``."""
        overall = mean(pA)
        return {
            "var(E(A))": var(pA),
            "E(var(A))": mean(pA*(1-pA)),
            "var(A)": overall*(1-overall),
        }
    
def total_variance_test(v: dict):
    """Assert that "var(A)" equals "E(var(A))" + "var(E(A))" in ``v``."""
    assert np.allclose(v["var(A)"], v["E(var(A))"] + v["var(E(A))"]), v


def relative_error(v1, v2):
    # Placeholder, not implemented.
    # for k in "var(A-B)", "E(var(A-B))", "var(E(A-B))"
    ...


N, K = 100, 2
# Per-row success probabilities, shared by every simulated draw.
pA = np.random.rand(N, 1)
# pA = 0.5 * np.ones((N, 1))
# pA[:N//2] = 0

# The probability-based values depend only on pA, so they are computed and
# checked once instead of in each of the 10000 iterations.
vstar = Single.bernoulli_p_vars(pA)
total_variance_test(vstar)

res = []
for i in range(10000):
    # Draw N x K Bernoulli samples with the per-row probabilities pA.
    A = np.random.rand(N, K)
    A = np.where(A < pA, 1, 0)
    # A = pA + 0.5 * np.random.randn(N, K)

    v = Single.sample_vars(A)
    total_variance_test(v)

    v_unb = Single.sample_vars_unbiased(A)
    total_variance_test(v_unb)

    # "star" holds the probability-based values; "hat" and "unb" hold the
    # sample estimates divided by the corresponding "star" value.
    res.append({
        **{("star", k): val for k, val in vstar.items()},
        **{("hat", k): val / vstar[k] for k, val in v.items()},
        **{("unb", k): v_unb[k] / vstar[k] for k in vstar},
    })

df = pd.DataFrame(res)
# df["diff"] = df["var(A-B)"] - df["E(var(A-B))"] - df["var(E(A-B))"]
# df["diff2"] = df["_var(A-B)"] - df["E(var(A-B))"] - df["var(E(A-B))"]
# df["diff_star"] = df["var(A-B)_star"] - df["E(var(A-B))_star"] - df["var(E(A-B))_star"]
display(df)
# px.scatter(df, x="x", y="y")
display(df.describe())

Unnamed: 0,"(star, var(E(A)))","(star, E(var(A)))","(star, var(A))","(hat, var(E(A)))","(hat, E(var(A)))","(hat, var(A))","(unb, var(E(A)))","(unb, E(var(A)))","(unb, var(A))"
0,0.102566,0.147282,0.249848,1.510248,0.645022,1.000209,0.596258,1.290044,1.005235
1,0.102566,0.147282,0.249848,1.675751,0.526202,0.998108,0.932357,1.052404,1.003123
2,0.102566,0.147282,0.249848,1.754968,0.475279,1.000609,1.084729,0.950558,1.005637
3,0.102566,0.147282,0.249848,1.593121,0.577125,0.994205,0.776556,1.154250,0.999201
4,0.102566,0.147282,0.249848,1.702319,0.509228,0.999008,0.983311,1.018455,1.004028
...,...,...,...,...,...,...,...,...,...
9995,0.102566,0.147282,0.249848,1.724500,0.458305,0.978095,1.078360,0.916610,0.983010
9996,0.102566,0.147282,0.249848,1.705244,0.509228,1.000209,0.986251,1.018455,1.005235
9997,0.102566,0.147282,0.249848,1.773249,0.458305,0.998108,1.127354,0.916610,1.003123
9998,0.102566,0.147282,0.249848,1.821998,0.424356,0.998108,1.224852,0.848713,1.003123


Unnamed: 0,"(star, var(E(A)))","(star, E(var(A)))","(star, var(A))","(hat, var(E(A)))","(hat, E(var(A)))","(hat, var(A))","(unb, var(E(A)))","(unb, E(var(A)))","(unb, var(A))"
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.102566,0.147282,0.2498478,1.712117,0.499403,0.997239,1.007195,0.998806,1.00225
std,2.7756960000000005e-17,0.0,8.327089000000001e-17,0.104024,0.07243,0.004721,0.207713,0.14486,0.004745
min,0.102566,0.147282,0.2498478,1.291608,0.23764,0.95218,0.158251,0.475279,0.956964
25%,0.102566,0.147282,0.2498478,1.64187,0.441331,0.995706,0.874054,0.882661,1.00071
50%,0.102566,0.147282,0.2498478,1.706219,0.492253,0.999008,1.011238,0.984507,1.004028
75%,0.102566,0.147282,0.2498478,1.779099,0.543176,1.000209,1.134213,1.086353,1.005235
max,0.102566,0.147282,0.2498478,2.096212,0.79779,1.000609,1.767217,1.59558,1.005637


In [None]:
# Scratch inspection of variables left over from the simulation cells above.
# NOTE(review): in a top-to-bottom run `pA` has been reassigned with N = 100
# rows by the preceding cell, while `pB` still has the 5 rows created in the
# paired-simulation cell.  The two expressions below that combine pA and pB
# would then hit a shape mismatch — confirm which pA/pB pair is intended.
print(pA.flatten())
# Twice the covariance between the flattened probabilities (ddof=0).
print(2 * cov(pA.flatten(), pB.flatten(), ddof=0))
# Mean of pA(1-pA) + pB(1-pB).
print(mean(pA*(1-pA) + pB*(1-pB)))
# Variance of the most recently drawn sample matrix A.
print(var(A))
# Elementwise pA(1-pA).
print(pA * (1-pA))