In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
from statsmodels.stats.contingency_tables import mcnemar

In [None]:
# Wilcoxon signed-rank tests on joint-pruned dataframes

# Read the cleaned per-model performance tables (Standard first, then EZ-BF)
std_clean, ez_clean = (
    pd.read_csv(csv_name)
    for csv_name in ("standard_perf_clean.csv", "ez-bf_perf_clean.csv")
)

# Map to original metric names for continuity
def prep(df, model_label):
    """Return a labelled copy of the metric columns of ``df``.

    Keeps only ``dataset``, ``parameter``, ``error`` and ``mse``, renames
    ``error`` to ``bias``, and appends a constant ``model`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the four columns listed above.
    model_label : str
        Value written to every row of the new ``model`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``dataset, parameter, bias, mse, model``;
        the input frame is left untouched.
    """
    wanted = ['dataset','parameter','error','mse']
    return (
        df[wanted]
        .rename(columns={'error':'bias'})
        .assign(model=model_label)
    )

# Label each cleaned table with its model and rename error -> bias
sm = prep(std_clean, "Standard")
ez = prep(ez_clean, "EZ-BF")

# Joint-pruned pairing: the default inner merge keeps only the
# (dataset, parameter) combinations present in both clean dataframes
pairs = sm.merge(ez, on=['dataset','parameter'], suffixes=('_sm','_ez'))

# One Wilcoxon test for every parameter x metric combination
metrics = ["bias", "mse"]
rows = []

for param, grp in pairs.groupby("parameter"):
    for metric in metrics:
        sm_vals = grp[f"{metric}_sm"]
        ez_vals = grp[f"{metric}_ez"]
        try:
            stat, p = wilcoxon(sm_vals, ez_vals, zero_method='wilcox')
        except ValueError:
            # Test could not be computed for this group; record NaN instead
            stat, p = np.nan, np.nan
        rows.append(dict(
            parameter=param,
            metric=metric,
            n_pairs=len(grp),
            median_diff=float((sm_vals - ez_vals).median()),
            # x != x is true only for NaN
            wilcoxon_stat=np.nan if stat != stat else float(stat),
            p_value=np.nan if p != p else float(p),
        ))

wilcoxon_joint = pd.DataFrame(rows).sort_values(["metric","parameter"])
wilcoxon_joint.round(3)

In [None]:
# Wilcoxon signed-rank tests on full dataframes

# Load the full dataframes
std_full = pd.read_csv("standard_perf_full.csv")
ez_full = pd.read_csv("ez-bf_perf_full.csv")

# NOTE: sm, ez, pairs, metrics and rows are rebound here and from now on
# refer to the full-data versions.
sm = prep(std_full, "Standard")
ez = prep(ez_full, "EZ-BF")

# Join the two full dataframes on (dataset, parameter)
pairs = sm.merge(ez, on=['dataset','parameter'], suffixes=('_sm','_ez'))

# Wilcoxon per parameter x metric
metrics = ["bias", "mse"]
rows = []

for param, g in pairs.groupby("parameter"):
    for metric in metrics:
        a = g[f"{metric}_sm"]
        b = g[f"{metric}_ez"]
        try:
            stat, p = wilcoxon(a, b, zero_method='wilcox')
        except ValueError:
            # Test could not be computed for this group; record NaN instead
            stat, p = np.nan, np.nan
        diff = a - b
        rows.append({
            "parameter": param,
            "metric": metric,
            "n_pairs": len(g),
            "median_diff": float(diff.median()),
            # x == x is false only for NaN
            "wilcoxon_stat": float(stat) if stat==stat else np.nan,
            "p_value": float(p) if p==p else np.nan
        })

# Bound to its own name so the joint-pruned table in `wilcoxon_joint`
# is not overwritten by the full-data results.
wilcoxon_full = pd.DataFrame(rows).sort_values(["metric","parameter"])
wilcoxon_full.round(3)


In [None]:
# Wilcoxon signed-rank tests on full dataframes for CrI widths

# Load full-data CrI dataframes
std_ci = pd.read_csv("standard_ci_full.csv")
ez_ci  = pd.read_csv("ez-bf_ci_full.csv")

# Pick whichever coverage column each file has ('covered_95' preferred)
cov_col_std = 'covered' if 'covered_95' not in std_ci.columns else 'covered_95'
cov_col_ez  = 'covered' if 'covered_95' not in ez_ci.columns  else 'covered_95'

# Paired table of 95% widths, one row per (dataset, parameter)
width_cols = ['dataset','parameter','width_95']
pairs_w = pd.merge(
    std_ci[width_cols],
    ez_ci[width_cols],
    on=['dataset','parameter'],
    suffixes=('_std','_ez'),
)

# Number of paired rows available for each parameter
pairs_counts = pairs_w.groupby('parameter').size().reset_index(name='n_pairs')

# Wilcoxon per parameter on paired 95% widths
rows = []
for p, g in pairs_w.groupby('parameter'):
    widths_std = g['width_95_std'].to_numpy()
    widths_ez = g['width_95_ez'].to_numpy()
    try:
        stat, pval = wilcoxon(widths_std, widths_ez, zero_method='wilcox')
    except ValueError:
        # Test could not be computed for this group; record NaN instead
        stat, pval = np.nan, np.nan

    rows.append({
        'parameter': p,
        'median_width_std': float(np.median(widths_std)),
        'median_width_ez':  float(np.median(widths_ez)),
        'median_diff': float(np.median(widths_std - widths_ez)),
        'wilcoxon_stat': np.nan if np.isnan(stat) else float(stat),
        'p_value': np.nan if np.isnan(pval) else float(pval),
        'n_pairs': int(len(g))
    })

wilcoxon_cri_df = pd.DataFrame(rows).sort_values('parameter')

wilcoxon_cri_df.round(3)

In [None]:
# McNemar tests on full dataframes for CrI coverage

# Rename each model's coverage column BEFORE merging. pandas applies merge
# suffixes only to column names that occur in both frames, so renaming
# "<col>_sm"/"<col>_ez" after the merge matches nothing (and the loop below
# raises KeyError) whenever cov_col_std and cov_col_ez differ.
key_cols = ['dataset','parameter']
merged_cov = (
    std_ci[key_cols + [cov_col_std]]
    .rename(columns={cov_col_std: 'covered_sm'})
    .merge(
        ez_ci[key_cols + [cov_col_ez]].rename(columns={cov_col_ez: 'covered_ez'}),
        on=key_cols,
    )
)

mcnemar_rows = []
for p, g in merged_cov.groupby('parameter'):
    # 2x2 paired table: rows = standard covered (1, 0), cols = EZ covered (1, 0)
    a = ((g['covered_sm']==1) & (g['covered_ez']==1)).sum()
    b = ((g['covered_sm']==1) & (g['covered_ez']==0)).sum()
    c = ((g['covered_sm']==0) & (g['covered_ez']==1)).sum()
    d = ((g['covered_sm']==0) & (g['covered_ez']==0)).sum()
    # exact=True requests the exact (binomial) form of the test
    res = mcnemar([[a,b],[c,d]], exact=True)
    mcnemar_rows.append({
        'parameter': p,
        'coverage_std_%': 100.0*float(g['covered_sm'].mean()),
        'coverage_ez_%':  100.0*float(g['covered_ez'].mean()),
        # Discordant pairs: the only cells that differ between the models
        'b (std=1, ez=0)': int(b),
        'c (std=0, ez=1)': int(c),
        'mcnemar_p': float(res.pvalue),
        'n_pairs': int(len(g))
    })

mcnemar_df = pd.DataFrame(mcnemar_rows).sort_values('parameter')

mcnemar_df.round(3)