In [1]:
import pandas as pd
from preprocessing import get_models_and_soups_df
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
# Apply the matplotlib style sheet "style.mplstyle" so every figure drawn
# later in this notebook shares the same look.
plt.style.use("style.mplstyle")

In [3]:
soups, models = get_models_and_soups_df()

# Each gain column is the row's own accuracy minus the larger of its
# matching `_a` / `_b` accuracies (row-wise max over the two columns).
for gain_col, metric in (
    ('soup_gain', 'clean_accuracy'),
    ('corrupted_gain', 'corrupted_accuracy'),
):
    best_of_pair = soups[[f'{metric}_a', f'{metric}_b']].max(axis=1)
    soups[gain_col] = soups[metric] - best_of_pair

In [4]:
import numpy as np
import pandas as pd

# Seed the legacy global NumPy RNG so the resampling below is repeatable.
np.random.seed(42)

B = 1000              # bootstrap resamples drawn per `shared_epochs` group
alpha = 0.05          # interval bounds are the alpha/2 and 1 - alpha/2 quantiles
targets = [200, 250]  # `shared_epochs` values to summarise

tail_probs = [alpha / 2, 1 - alpha / 2]
rows = []

for se in targets:
    # Rows for this group that have both gain columns present.
    subset = soups.loc[soups['shared_epochs'] == se].dropna(
        subset=['soup_gain', 'corrupted_gain']
    )
    gains = subset['soup_gain'].to_numpy()
    corr_gains = subset['corrupted_gain'].to_numpy()
    n = gains.size
    if n == 0:
        continue

    # Point estimates on the observed sample.
    improved = gains > 0
    p_pos = improved.mean()
    if improved.any():
        mean_pos = gains[improved].mean()
        mean_corr_pos = corr_gains[improved].mean()
    else:
        mean_pos = mean_corr_pos = np.nan

    # Bootstrap: resample rows with replacement and recompute each statistic.
    boot_p = np.empty(B)
    boot_m = np.empty(B)
    boot_mc = np.empty(B)

    for b in range(B):
        idx = np.random.randint(0, n, size=n)
        g, cg = gains[idx], corr_gains[idx]
        pos = g > 0

        boot_p[b] = pos.mean()
        if pos.any():
            boot_m[b] = g[pos].mean()
            boot_mc[b] = cg[pos].mean()
        else:
            # No positive gain in this resample: the conditional means are
            # undefined, so record NaN (ignored by nanquantile below).
            boot_m[b] = boot_mc[b] = np.nan

    p_lo, p_hi = np.quantile(boot_p, tail_probs)
    e_lo, e_hi = np.nanquantile(boot_m, tail_probs)
    c_lo, c_hi = np.nanquantile(boot_mc, tail_probs)

    rows.append({
        'shared_epochs': se,
        'P(gain>0)': p_pos,
        'P_lo': p_lo,
        'P_hi': p_hi,
        'E[gain | gain>0]': mean_pos,
        'E_lo': e_lo,
        'E_hi': e_hi,
        'E[corr gain | gain>0]': mean_corr_pos,
        'C_lo': c_lo,
        'C_hi': c_hi,
        'n': n,
    })

out_df = pd.DataFrame(rows).sort_values('shared_epochs')
out_df


Unnamed: 0,shared_epochs,P(gain>0),P_lo,P_hi,E[gain | gain>0],E_lo,E_hi,E[corr gain | gain>0],C_lo,C_hi,n
0,200,0.436242,0.355537,0.510235,0.603231,0.563415,0.642109,0.71315,0.61827,0.800356,149
1,250,0.511628,0.406977,0.604651,0.159318,0.122495,0.197319,0.047237,-0.034104,0.119598,86


In [5]:
# Half-width of each bootstrap interval: (upper bound - lower bound) / 2.
for prefix in ('P', 'E', 'C'):
    out_df[f'{prefix}_pm'] = 0.5 * (out_df[f'{prefix}_hi'] - out_df[f'{prefix}_lo'])

In [6]:
def fmt(m, lo, hi):
    """Format an estimate and its bounds as 'm [lo, hi]' with three decimals."""
    return f'{m:.3f} [{lo:.3f}, {hi:.3f}]'


# Display label -> (estimate, lower bound, upper bound) columns of `out_df`.
interval_specs = {
    'P(gain > 0)': ('P(gain>0)', 'P_lo', 'P_hi'),
    'E[gain | gain > 0]': ('E[gain | gain>0]', 'E_lo', 'E_hi'),
    'E[corr gain | gain > 0]': ('E[corr gain | gain>0]', 'C_lo', 'C_hi'),
}

pretty = pd.DataFrame({
    'shared_epochs': out_df['shared_epochs'],
    **{
        label: [fmt(*values) for values in zip(*(out_df[col] for col in cols))]
        for label, cols in interval_specs.items()
    },
    'n': out_df['n'],
})

pretty


Unnamed: 0,shared_epochs,P(gain > 0),E[gain | gain > 0],E[corr gain | gain > 0],n
0,200,"0.436 [0.356, 0.510]","0.603 [0.563, 0.642]","0.713 [0.618, 0.800]",149
1,250,"0.512 [0.407, 0.605]","0.159 [0.122, 0.197]","0.047 [-0.034, 0.120]",86


In [8]:
def fmt_pm(m, pm):
    """Format an estimate and its half-width as 'm ± pm' with three decimals."""
    return f'{m:.3f} ± {pm:.3f}'


# Display label -> (estimate, half-width) columns of `out_df`.
pm_specs = {
    'P(gain > 0)': ('P(gain>0)', 'P_pm'),
    'E[gain | gain > 0]': ('E[gain | gain>0]', 'E_pm'),
    'E[corr gain | gain > 0]': ('E[corr gain | gain>0]', 'C_pm'),
}

pretty_pm = pd.DataFrame({
    'shared_epochs': out_df['shared_epochs'],
    **{
        label: [
            fmt_pm(estimate, half_width)
            for estimate, half_width in zip(out_df[est_col], out_df[pm_col])
        ]
        for label, (est_col, pm_col) in pm_specs.items()
    },
    'n': out_df['n'],
})

pretty_pm

Unnamed: 0,shared_epochs,P(gain > 0),E[gain | gain > 0],E[corr gain | gain > 0],n
0,200,0.436 ± 0.077,0.603 ± 0.039,0.713 ± 0.091,149
1,250,0.512 ± 0.099,0.159 ± 0.037,0.047 ± 0.077,86


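Note: the ± values in the table above are obtained by bootstrapping a 95% percentile interval, taking half of its width, and centering that half-width on the observed value. The resulting symmetric intervals are nearly identical to the percentile intervals themselves, as can be seen by comparing the two tables above.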

In [10]:
# Show the rows whose `shared_epochs` equals 250.
soups.loc[soups['shared_epochs'].eq(250)]

Unnamed: 0,key_a,key_b,epoch_a,variant_a,epoch_b,variant_b,shared_epochs,clean_accuracy,clean_loss,corrupted_accuracy,...,clean_accuracy_a,clean_loss_b,clean_accuracy_b,corrupted_loss_a,corrupted_loss_b,corrupted_accuracy_a,corrupted_accuracy_b,soup_gain,permutated_gain,corrupted_gain
4410,250_1,250_2,250,1,250,2,250,45.00,2.645135,18.204211,...,78.45,0.990470,78.92,2.286406,2.256323,51.948947,53.024211,-33.92,-1.663497,-34.820000
4411,250_1,250_3,250,1,250,3,250,32.84,3.348385,12.685789,...,78.45,0.986683,79.10,2.286406,2.258532,51.948947,52.967895,-46.26,-2.366457,-40.282105
4412,250_1,250_4,250,1,250,4,250,55.71,2.094840,24.252632,...,78.45,0.989215,79.02,2.286406,2.259283,51.948947,53.150000,-23.31,-1.113460,-28.897368
4413,250_1,260_1,250,1,260,1,250,57.97,1.968709,26.069474,...,78.45,1.015160,78.10,2.286406,2.312074,51.948947,51.882105,-20.48,-0.985862,-25.879474
4414,250_1,260_2,250,1,260,2,250,32.89,3.349775,12.628421,...,78.45,0.990801,79.07,2.286406,2.267169,51.948947,52.924737,-46.18,-2.367961,-40.296316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4551,250_4,290_4,250,4,290,4,250,79.31,0.984477,53.142632,...,79.02,0.991815,79.36,2.259283,2.262247,53.150000,53.022105,-0.05,0.004739,-0.007368
4552,250_4,300_1,250,4,300,1,250,78.71,1.005920,52.437368,...,79.02,1.086659,76.95,2.259283,2.391697,53.150000,50.583158,-0.31,-0.016701,-0.712632
4553,250_4,300_2,250,4,300,2,250,79.38,0.983009,52.942632,...,79.02,0.989941,79.19,2.259283,2.278500,53.150000,52.838947,0.19,0.006206,-0.207368
4554,250_4,300_3,250,4,300,3,250,79.19,0.986447,52.581579,...,79.02,0.997869,79.11,2.259283,2.273815,53.150000,52.807895,0.08,0.002764,-0.568421
