# GPU-Accelerated Robust Statistical Validation
## Paper 1: Mollified Prime-Spectral S(T)

**8 independent tests**; three of them (T2, T5, T7) are GPU-accelerated via CuPy.
Runs in ~5 min on Colab A100.

| Test | Description | GPU-accelerated |
|------|------------|----------------|
| T1 | Honest train/test split | |
| T2 | Permutation 50K + effect size | Yes |
| T3 | Baseline model comparison | |
| T4 | 10-fold cross-validation | |
| T5 | Monte Carlo uniqueness 100K | Yes |
| T6 | Bayesian BIC | |
| T7 | Bootstrap BCa 2K | Yes |
| T8 | Window stability | |


In [None]:
!pip install cupy-cuda12x mpmath -q


In [None]:
import cupy as cp
import numpy as np
from scipy.special import loggamma
from scipy.stats import norm, t as t_dist
import json, time, warnings, os
warnings.filterwarnings('ignore')

# Report the CuPy build and the device the notebook is running on.
print(f'CuPy {cp.__version__}')
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"GPU: {props['name'].decode()}, {props['totalGlobalMem'] / 1e9:.0f} GB")

# ── Configuration ──
# Sample sizes for the individual tests.
N_ZEROS = 10_000            # number of Riemann zeros computed
N_PERM = 50_000             # permutations in Test 2
N_MC = 100_000              # random configurations in Test 5
N_BOOTSTRAP = 2_000         # bootstrap resamples in Test 7
N_KFOLD = 10                # folds in Test 4
N_BASELINE_RANDOM = 200     # random-frequency baselines in Test 3

# Cutoff parameters: L(T) = THETA0_OPT*log(T) + THETA1_OPT is the default
# used by compute_Sw_cpu; THETA_CONST is the constant-theta reference value
# reported as 'theta_star_paper' and checked by the bootstrap test.
THETA0_OPT = 1.4091
THETA1_OPT = -3.9537
THETA_CONST = 0.9941

# Kernel names accepted by compute_Sw_cpu; the list order matches the
# integer kernel ids handled by _apply_kernel_gpu.
KERNELS = [
    'cosine',
    'selberg',
    'sharp',
    'linear',
    'quadratic',
    'gaussian',
    'cubic',
]


In [None]:
from mpmath import zetazero
from multiprocessing import Pool

def _zz(n):
    """Return the imaginary part of the n-th Riemann zero as a float.

    mpmath is imported inside the function so the import also happens in
    each worker process when this is mapped over a multiprocessing Pool.
    """
    import mpmath
    zero = mpmath.zetazero(n)
    return float(zero.imag)

# Compute the first N_ZEROS zero ordinates, in parallel when possible.
print(f'Computing {N_ZEROS} Riemann zeros (parallel)...')
t0 = time.time()
try:
    # Four worker processes, each evaluating zeros via _zz.
    with Pool(4) as pool:
        gammas = np.array(pool.map(_zz, range(1, N_ZEROS+1)))
except Exception:
    # Best-effort fallback: any failure of the pool triggers a plain
    # in-process loop over the same indices.
    print('  Falling back to sequential...')
    gammas = np.array([_zz(i) for i in range(1, N_ZEROS+1)])
elapsed_zeros = time.time()-t0
print(f'  Done in {elapsed_zeros:.1f}s')
del elapsed_zeros
print(f'  Range: [{gammas[0]:.3f}, {gammas[-1]:.3f}]')


In [None]:
def rs_theta(t):
    """Riemann-Siegel theta function.

    Evaluated as Im(log Gamma(1/4 + i t/2)) - (t/2) * log(pi), using SciPy's
    ``loggamma``. Accepts a scalar or a NumPy array.
    """
    gamma_phase = np.imag(loggamma(0.25 + 0.5j*t))
    pi_term = 0.5*t*np.log(np.pi)
    return gamma_phase - pi_term

def rs_theta_deriv(t):
    """Return 0.5 * log(t / (2*pi)), used here as the derivative of rs_theta.

    The value is zero at t = 2*pi and non-positive below it, so callers that
    divide by it need t > 2*pi.
    """
    ratio = t / (2*np.pi)
    return 0.5 * np.log(ratio)

def smooth_zeros(g, n_iter=40):
    """Return "smooth" zero positions obtained by Newton iteration.

    For the n-th entry (1-based) the iteration solves
    rs_theta(x) = (n - 1.5) * pi, starting from g[n-1] and applying
    ``n_iter`` Newton steps with rs_theta_deriv as the derivative.
    The input array is not modified.
    """
    ordinal = np.arange(1, len(g)+1)
    target = (ordinal - 1.5) * np.pi
    estimate = g.copy()
    for _ in range(n_iter):
        newton_step = (rs_theta(estimate) - target) / rs_theta_deriv(estimate)
        estimate -= newton_step
    return estimate

def sieve_primes(n_max):
    """Return all primes <= n_max as a sorted integer NumPy array.

    Uses the sieve of Eratosthenes. For n_max < 2 (including negative
    values, which previously crashed while allocating the sieve) an empty
    array is returned.
    """
    if n_max < 2:
        return np.empty(0, dtype=np.intp)
    is_prime = np.ones(n_max+1, dtype=bool)
    is_prime[:2] = False
    for i in range(2, int(n_max**0.5)+1):
        if is_prime[i]:
            # Multiples below i*i were already struck by smaller factors.
            is_prime[i*i::i] = False
    return np.flatnonzero(is_prime)

# Prime table used by every prime sum below.
primes = sieve_primes(10_000)

# Smooth zero positions and the deviation of the true zeros from them.
gammas_smooth = smooth_zeros(gammas)
delta_true = gammas - gammas_smooth

print(f'Zeros: {len(gammas)}, Primes: {len(primes)}')
print(f'delta: mean={np.mean(delta_true):.6f}, std={np.std(delta_true):.6f}')


In [None]:
# ── CPU functions ──
def compute_Sw_cpu(T, pr, theta0=THETA0_OPT, theta1=THETA1_OPT,
                   k_max=3, kernel='cosine'):
    """Evaluate the mollified prime sum at heights T on the CPU.

    With L = theta0*log(T) + theta1 the returned value is
        -(1/pi) * sum_{m=1..k_max} sum_p w(m*log(p)/L) * sin(T*m*log(p)) / (m * p**(m/2)),
    where w is the weight selected by ``kernel``. Heights with L <= 0
    contribute zero.

    Parameters
    ----------
    T : 1-D float array of heights.
    pr : 1-D array of primes.
    theta0, theta1 : coefficients of the cutoff L(T).
    k_max : highest prime power m included.
    kernel : one of 'cosine', 'selberg', 'sharp', 'linear', 'quadratic',
        'gaussian', 'cubic'.

    Returns
    -------
    1-D float array with len(T) entries.

    Raises
    ------
    ValueError
        If ``kernel`` is not a known name (previously this surfaced as an
        unbound-variable error in the middle of the computation).
    """
    weight_fns = {
        'cosine':    lambda x: np.where(x<1, np.cos(np.pi*x/2)**2, 0.),
        'selberg':   lambda x: np.where(x<1, 1-x**2, 0.),
        'sharp':     lambda x: np.where(x<1, 1., 0.),
        'linear':    lambda x: np.where(x<1, 1-x, 0.),
        'quadratic': lambda x: np.where(x<1, (1-x)**2, 0.),
        # The gaussian weight has no hard cutoff at x = 1.
        'gaussian':  lambda x: np.exp(-x**2/0.32),
        'cubic':     lambda x: np.where(x<1, (1-x)**3, 0.),
    }
    if kernel not in weight_fns:
        raise ValueError(
            f'unknown kernel {kernel!r}; expected one of {sorted(weight_fns)}')
    weight = weight_fns[kernel]

    lp = np.log(pr).astype(np.float64)
    pf = pr.astype(np.float64)
    L = theta0*np.log(T) + theta1
    valid = L > 0
    S = np.zeros(len(T))
    if not np.any(valid):
        return S

    Tv = T[valid]; Lv = L[valid]
    for m in range(1, k_max+1):
        pmh = pf**(m/2.0)                       # p^(m/2)
        x = m*lp[None,:] / Lv[:,None]           # (n_valid, n_primes)
        w = weight(x)
        sv = np.sin(Tv[:,None]*m*lp[None,:])
        S[valid] += np.sum(w*sv/(m*pmh[None,:]), axis=1)
    return -S/np.pi

def compute_predictions(gs, Sw):
    """Convert the prime sum Sw at heights gs into predicted zero offsets.

    Returns -pi * Sw / rs_theta_deriv(gs), element-wise.
    """
    scaled = -np.pi * Sw
    return scaled / rs_theta_deriv(gs)

def compute_R2(yt, yp):
    """Coefficient of determination of predictions yp against targets yt.

    Returns 1 - SS_res/SS_tot as a Python float, or 0.0 when the targets
    have no positive variance.
    """
    residual = yt - yp
    centred = yt - np.mean(yt)
    ss_res = np.sum(residual**2)
    ss_tot = np.sum(centred**2)
    if not ss_tot > 0:
        return 0.
    return float(1 - ss_res/ss_tot)

def compute_alpha_R2(dt, dp):
    """Return (alpha, R2) for predictions dp against observations dt.

    alpha is the least-squares scale sum(dt*dp)/sum(dp**2); R2 is
    compute_R2 of the *unscaled* predictions. When dp is identically zero
    the pair (0.0, -1.0) is returned.
    """
    energy = np.sum(dp**2)
    if energy == 0:
        return 0., -1.
    alpha = float(np.sum(dt*dp)/energy)
    return alpha, compute_R2(dt, dp)

def find_theta_star(gs, dt, pr, rng=(0.8,1.2), n_iter=40):
    """Bisect for the constant theta at which the fitted scale alpha is 1.

    Each step evaluates the model with theta0 = midpoint and theta1 = 0
    (default cosine kernel) and computes alpha = sum(dt*dp)/sum(dp**2).
    The lower bound moves up when alpha > 1 or when the prediction is
    identically zero; otherwise the upper bound moves down. Returns the
    midpoint of the final bracket.
    """
    lower, upper = rng
    for _ in range(n_iter):
        theta = (lower+upper)/2
        dp = compute_predictions(
            gs, compute_Sw_cpu(gs, pr, theta0=theta, theta1=0))
        energy = np.sum(dp**2)
        # Short-circuit keeps the division from running when energy is zero.
        if energy == 0 or np.sum(dt*dp)/energy > 1.:
            lower = theta
        else:
            upper = theta
    return (lower+upper)/2

# ── Precompute optimal Sw ──
# Full-sample prediction with the default (THETA0_OPT, THETA1_OPT) cutoff;
# delta_pred, a_glob and R2_glob are reused by later cells.
t0 = time.time()
Sw_opt = compute_Sw_cpu(gammas_smooth, primes)
delta_pred = compute_predictions(gammas_smooth, Sw_opt)
a_glob, R2_glob = compute_alpha_R2(delta_true, delta_pred)
print(f'Global: alpha={a_glob:.6f}, R2={R2_glob:.6f} ({time.time()-t0:.1f}s)')


In [None]:
# ── GPU precomputation ──
# Device-side constants shared by the GPU routines below.
STEP = 10  # subsample factor for heavy tests
gs_sub = gammas_smooth[::STEP]
dt_sub = delta_true[::STEP]
n_sub = len(gs_sub)

log_primes_g = cp.asarray(np.log(primes), dtype=cp.float32)
n_primes = len(primes)
pm_halves_g = [None]  # index 0 unused
for m in range(1, 4):
    # p^(m/2), computed in float64 on the host and stored as float32.
    pm_halves_g.append(
        cp.asarray(primes.astype(np.float64)**(m/2.), dtype=cp.float32))

# Precompute sin(T*m*log_p) for subsampled zeros (shared by all configs).
# The phase T*m*log(p) is formed in float64: its magnitude is far larger
# than 2*pi, so a float32 product would lose absolute precision before the
# sine is taken. Only the bounded result is stored as float32.
gs_sub_g = cp.asarray(gs_sub, dtype=cp.float32)
_gs_sub_g64 = cp.asarray(gs_sub, dtype=cp.float64)
_log_primes_g64 = cp.asarray(np.log(primes), dtype=cp.float64)
sin_pre = [None]  # index 0 unused
for m in range(1, 4):
    _phase = _gs_sub_g64[:, None] * float(m) * _log_primes_g64[None, :]
    sin_pre.append(cp.sin(_phase).astype(cp.float32))
    # shape: (Z_sub, P)  ~4.8 MB each
del _phase, _gs_sub_g64, _log_primes_g64

print(f'GPU precomp: {n_sub} zeros, {n_primes} primes')


def _apply_kernel_gpu(x, kid):
    """Apply the weight function with integer id ``kid`` to a CuPy array x.

    Ids: 0 cosine, 1 selberg, 2 sharp, 3 linear, 4 quadratic, 5 gaussian,
    6 cubic. All except gaussian are zero for x >= 1.

    Raises
    ------
    ValueError
        If ``kid`` is not in 0..6 (previously any other id was silently
        evaluated as the cubic kernel).
    """
    zero = cp.float32(0)
    if kid == 0: return cp.where(x<1, cp.cos(cp.float32(np.pi)*x/2)**2, zero)
    if kid == 1: return cp.where(x<1, 1-x**2, zero)
    if kid == 2: return cp.where(x<1, cp.float32(1), zero)
    if kid == 3: return cp.where(x<1, 1-x, zero)
    if kid == 4: return cp.where(x<1, (1-x)**2, zero)
    if kid == 5: return cp.exp(-x**2 / cp.float32(0.32))
    if kid == 6: return cp.where(x<1, (1-x)**3, zero)
    raise ValueError(f'unknown kernel id {kid!r}; expected an integer in 0..6')


def gpu_batch_mc(theta0_arr, theta1_arr, kernel_ids,
                 dt_sub_g, batch_size=2000):
    """
    Batch MC: compute alpha, R2 for many configs on GPU.
    Groups by kernel for efficiency.

    Parameters
    ----------
    theta0_arr, theta1_arr : host arrays, one (theta0, theta1) per config.
    kernel_ids : host integer array of kernel ids (0..6), same length.
    dt_sub_g : CuPy array of observed offsets on the subsampled zeros.
    batch_size : number of configs evaluated per GPU batch.

    Returns
    -------
    (alphas, R2s) : float32 host arrays, one entry per config. Entries
    whose kernel id is outside 0..6 keep their initial values (NaN, -1).
    Configs with an identically zero prediction get alpha = 0.

    Relies on the module-level GPU constants gs_sub_g, n_sub,
    log_primes_g, pm_halves_g and sin_pre. The per-batch intermediates x
    and w have shape (batch, n_sub, n_primes), which dominates GPU memory.
    """
    n = len(theta0_arr)
    alphas = np.full(n, np.nan, dtype=np.float32)
    R2s = np.full(n, -1., dtype=np.float32)
    log_T = cp.log(gs_sub_g)  # (Z,)
    td = cp.float32(0.5) * cp.log(gs_sub_g / cp.float32(2*np.pi))  # theta_deriv
    ss_tot = float(cp.sum((dt_sub_g - cp.mean(dt_sub_g))**2))

    for kid in range(7):
        idx = np.where(kernel_ids == kid)[0]
        if len(idx) == 0: continue
        for s in range(0, len(idx), batch_size):
            e = min(s + batch_size, len(idx))
            bi = idx[s:e]; bs = e - s
            t0g = cp.asarray(theta0_arr[bi], dtype=cp.float32)
            t1g = cp.asarray(theta1_arr[bi], dtype=cp.float32)

            L = t0g[:,None]*log_T[None,:] + t1g[:,None]  # (bs, Z)
            valid = (L > 0).astype(cp.float32)
            # 1/L does not depend on m, so compute it once per batch.
            # Clamping keeps non-positive L finite; those entries are
            # zeroed by `valid` below.
            inv_L = cp.float32(1.) / cp.maximum(L, cp.float32(1e-10))
            S = cp.zeros((bs, n_sub), dtype=cp.float32)

            for m in range(1, 4):
                x = cp.float32(m) * log_primes_g[None,None,:] * inv_L[:,:,None]
                w = _apply_kernel_gpu(x, kid)
                S += cp.sum(w * sin_pre[m][None,:,:] /
                            (cp.float32(m) * pm_halves_g[m][None,None,:]),
                            axis=2) * valid
                del x, w

            Sw = -S / cp.float32(np.pi)
            dp = -cp.float32(np.pi) * Sw / td[None,:]
            denom = cp.sum(dp**2, axis=1)
            good = denom > 0
            al = cp.where(good, cp.sum(dt_sub_g[None,:]*dp, axis=1)/denom, cp.float32(0))
            ss_res = cp.sum((dt_sub_g[None,:] - dp)**2, axis=1)
            r2 = 1 - ss_res / cp.float32(ss_tot)

            alphas[bi] = al.get()
            R2s[bi] = r2.get()
            # Release the large per-batch buffers before the next batch.
            del S, Sw, dp, L, inv_L, valid, denom, al, ss_res, r2
            cp.get_default_memory_pool().free_all_blocks()

    return alphas, R2s


def gpu_batch_bisection(gs_batch_g, dt_batch_g, n_iter=40, rng=(0.8, 1.2)):
    """
    Batched bisection to find theta* for many replicates on GPU.
    gs_batch_g: (B, Z) smoothed zeros for each replicate
    dt_batch_g: (B, Z) delta_true for each replicate
    n_iter: number of bisection steps
    rng: (lo, hi) initial bracket for theta, same default as find_theta_star
    Returns: (B,) theta* values (float64 host array)

    Each step evaluates the cosine-kernel model with theta0 = midpoint and
    theta1 = 0, computes alpha = sum(dt*dp)/sum(dp**2) per replicate, and
    moves the lower bound up when alpha > 1, otherwise the upper bound
    down. A replicate whose prediction is identically zero moves the lower
    bound up, matching find_theta_star.

    Relies on the module-level GPU constants log_primes_g and pm_halves_g.
    """
    B = gs_batch_g.shape[0]
    lo = cp.full(B, rng[0], dtype=cp.float32)
    hi = cp.full(B, rng[1], dtype=cp.float32)
    log_T = cp.log(gs_batch_g)  # (B, Z)
    td = cp.float32(0.5) * cp.log(gs_batch_g / cp.float32(2*np.pi))  # (B, Z)

    for _ in range(n_iter):
        mid = (lo + hi) / 2  # (B,)
        L = mid[:,None] * log_T  # (B, Z), theta1=0
        valid = (L > 0).astype(cp.float32)
        # 1/L is the same for every prime power m; compute it once per step.
        inv_L = cp.float32(1.) / cp.maximum(L, cp.float32(1e-10))
        S = cp.zeros_like(gs_batch_g)  # (B, Z)

        for m in range(1, 4):
            x = cp.float32(m) * log_primes_g[None,None,:] * inv_L[:,:,None]
            w = cp.where(x<1, cp.cos(cp.float32(np.pi)*x/2)**2, cp.float32(0))
            sv = cp.sin(gs_batch_g[:,:,None] * cp.float32(m) * log_primes_g[None,None,:])
            S += cp.sum(w * sv / (cp.float32(m) * pm_halves_g[m][None,None,:]),
                        axis=2) * valid
            del x, w, sv

        Sw = -S / cp.float32(np.pi)
        dp = -cp.float32(np.pi) * Sw / td
        denom = cp.sum(dp**2, axis=1)
        alpha = cp.sum(dt_batch_g * dp, axis=1) / cp.maximum(denom, cp.float32(1e-30))
        # Zero-signal replicates follow the CPU rule: raise the lower bound.
        no_signal = denom == 0
        lo = cp.where((alpha > 1) | no_signal, mid, lo)
        hi = cp.where((alpha <= 1) & ~no_signal, mid, hi)
        del S, Sw, dp, L, inv_L, valid, denom, alpha, no_signal

    cp.get_default_memory_pool().free_all_blocks()
    return ((lo + hi) / 2).get().astype(np.float64)

print('GPU functions defined.')

print('GPU functions defined.')


In [None]:
# ── Test 1: fit theta* on the first half of the zeros, evaluate on the second ──
print('='*70)
print('TEST 1: HONEST TRAIN/TEST SPLIT')
print('='*70)
t0 = time.time()

# Positional split: first half is training data, second half is held out.
mid = len(gammas) // 2
gs_tr, gs_te = gammas_smooth[:mid], gammas_smooth[mid:]
dt_tr, dt_te = delta_true[:mid], delta_true[mid:]

# Constant-theta fit (theta1 = 0) on the training half and its in-sample score.
theta_tr = find_theta_star(gs_tr, dt_tr, primes)
Sw_tr = compute_Sw_cpu(gs_tr, primes, theta0=theta_tr, theta1=0)
dp_tr = compute_predictions(gs_tr, Sw_tr)
a_tr, R2_tr = compute_alpha_R2(dt_tr, dp_tr)

# Out-of-sample score: the training theta applied to the held-out half.
Sw_te = compute_Sw_cpu(gs_te, primes, theta0=theta_tr, theta1=0)
dp_te = compute_predictions(gs_te, Sw_te)
a_te, R2_te = compute_alpha_R2(dt_te, dp_te)

# Held-out half scored with the default (THETA0_OPT, THETA1_OPT) cutoff.
Sw_te2 = compute_Sw_cpu(gs_te, primes)
dp_te2 = compute_predictions(gs_te, Sw_te2)
a_te2, R2_te2 = compute_alpha_R2(dt_te, dp_te2)

# theta* refit on the test half; only used to report consistency.
theta_te = find_theta_star(gs_te, dt_te, primes)

# Pass when train and test R2 differ by < 0.05 and test R2 exceeds 0.85.
gap = R2_tr - R2_te
t1_pass = abs(gap) < 0.05 and R2_te > 0.85

result_t1 = {
    'n_train': mid, 'n_test': len(gammas)-mid,
    'theta_star_train': float(theta_tr),
    'theta_star_test': float(theta_te),
    'theta_star_paper': THETA_CONST,
    'theta_consistency': float(abs(theta_tr - theta_te)),
    'train': {'alpha': a_tr, 'R2': R2_tr},
    'test_constant_theta': {'alpha': a_te, 'R2': R2_te},
    'test_adaptive_paper': {'alpha': a_te2, 'R2': R2_te2},
    'generalization_gap': float(gap),
    'passed': bool(t1_pass),
    'runtime_s': time.time()-t0,
}
print(f'  theta_train={theta_tr:.4f}, theta_test={theta_te:.4f}')
print(f'  Train:  a={a_tr:.4f}, R2={R2_tr:.4f}')
print(f'  Test:   a={a_te:.4f}, R2={R2_te:.4f}')
print(f'  Gap: {gap:+.4f}')
print(f'  PASSED: {t1_pass}  ({result_t1["runtime_s"]:.1f}s)')


In [None]:
# ── Test 2: permutation null for R2 (observations shuffled against fixed predictions) ──
print('='*70)
print('TEST 2: PERMUTATION (50K, GPU)')
print('='*70)
t0 = time.time()

# Observed and predicted offsets on the GPU (float32); the reference R2 is
# computed on the host from the original arrays.
dt_g = cp.asarray(delta_true, dtype=cp.float32)
dp_g = cp.asarray(delta_pred, dtype=cp.float32)
R2_real = compute_R2(delta_true, delta_pred)
# Total sum of squares is invariant under permutation, so compute it once.
ss_tot_g = cp.sum((dt_g - cp.mean(dt_g))**2)

rng = np.random.default_rng(42)
R2_null = np.zeros(N_PERM, dtype=np.float32)
PBATCH = 5000  # permutations evaluated per GPU batch

for s in range(0, N_PERM, PBATCH):
    e = min(s+PBATCH, N_PERM); bs = e - s
    # One independent permutation of the zero indices per row (built on the host).
    idx = np.array([rng.permutation(len(delta_true)) for _ in range(bs)])
    idx_g = cp.asarray(idx)
    dt_perm = dt_g[idx_g]  # (bs, N)
    ss_res = cp.sum((dt_perm - dp_g[None,:])**2, axis=1)
    R2_null[s:e] = (1 - ss_res / ss_tot_g).get()
    del idx_g, dt_perm, ss_res
    if (e % 25000) == 0: print(f'    {e}/{N_PERM}...')

cp.get_default_memory_pool().free_all_blocks()

# Empirical p-value: fraction of permuted R2 at least as large as the real one.
p_emp = float(np.mean(R2_null >= R2_real))
null_m = float(np.mean(R2_null)); null_s = float(np.std(R2_null))
# z-score of the real R2 against the null distribution, and its Gaussian tail.
z = (R2_real - null_m) / null_s if null_s > 0 else float('inf')
p_gauss = float(norm.sf(z)) if np.isfinite(z) else 0.
# When no permutation reaches R2_real, report 3/N_PERM as an upper bound.
p_up = 3./N_PERM if p_emp == 0 else p_emp
# Fraction of null values strictly below the real R2.
cles = float(np.mean(R2_null < R2_real))
# Look-elsewhere correction over 700 trials.
# NOTE(review): presumably 7 kernels x 100 parameter settings — confirm.
n_trials = 7 * 100
p_sidak = float(1 - (1 - p_up)**n_trials)
p_bonf = float(min(1., n_trials * p_up))

t2_pass = p_up < 0.001 and z > 10
result_t2 = {
    'R2_original': float(R2_real),
    'R2_null_mean': null_m, 'R2_null_std': null_s,
    'R2_null_max': float(np.max(R2_null)),
    'z_score': float(z),
    'p_empirical': p_emp, 'p_upper_bound': float(p_up),
    'p_gaussian_tail': p_gauss,
    'n_permutations': N_PERM,
    'effect_size': {
        # 'cohens_d' is reported as the same value as the z-score above.
        'cohens_d': float(z), 'cles': cles,
        'interpretation': 'enormous' if z>2 else 'large' if z>0.8 else 'medium',
    },
    'look_elsewhere': {
        'n_trials': n_trials,
        'p_sidak': p_sidak, 'p_bonferroni': p_bonf,
        'still_significant': bool(p_bonf < 0.05),
    },
    'passed': bool(t2_pass),
    'runtime_s': time.time()-t0,
}
print(f'  R2={R2_real:.6f} vs null {null_m:.6f}+/-{null_s:.6f}')
print(f'  Z={z:.1f}, p<={p_up:.2e}, Cohen d={z:.1f}')
print(f'  PASSED: {t2_pass}  ({result_t2["runtime_s"]:.1f}s)')


In [None]:
# ── Test 3: compare the formula's R2 against generic baseline models ──
print('='*70)
print('TEST 3: BASELINE COMPARISON')
print('='*70)
t0 = time.time()
n = len(delta_true)
idx = np.arange(n, dtype=np.float64)

R2_f = compute_R2(delta_true, delta_pred)

# Linear trend in the zero index.
A = np.column_stack([np.ones(n), idx])
c = np.linalg.lstsq(A, delta_true, rcond=None)[0]
R2_lin = compute_R2(delta_true, A@c)

# Polynomials of degree 5 and 10 in the normalised index. Results are kept
# in a dict keyed by degree instead of creating variables through exec().
ix = idx/n
R2_poly = {}
for deg in (5, 10):
    Ap = np.column_stack([ix**k for k in range(deg+1)])
    coef = np.linalg.lstsq(Ap, delta_true, rcond=None)[0]
    R2_poly[deg] = compute_R2(delta_true, Ap@coef)
R2_poly5 = R2_poly[5]
R2_poly10 = R2_poly[10]

# Random-frequency baseline: sums of 50 sinusoids with random frequencies
# and 1/sqrt(k) amplitudes, rescaled to the standard deviation of the data
# and scored on a subsample of the zeros.
rng3 = np.random.default_rng(777)
R2_rand = np.zeros(N_BASELINE_RANDOM)
step3 = max(1, n//2000)
Ts = gammas_smooth[::step3]; dts = delta_true[::step3]
for trial in range(N_BASELINE_RANDOM):
    freqs = rng3.uniform(0.5, 10., 50)
    amps = 1./np.sqrt(np.arange(1,51))
    Sr = np.sum(amps[:,None]*np.sin(freqs[:,None]*Ts[None,:]), axis=0)
    if np.std(Sr)>0: Sr = Sr/np.std(Sr)*np.std(dts)
    # Negative R2 values are clipped to 0.
    R2_rand[trial] = max(0, compute_R2(dts, Sr))

# Pass when the formula beats the degree-10 polynomial and every random trial.
t3_pass = R2_f > R2_poly10 and R2_f > np.max(R2_rand)
result_t3 = {
    'R2_formula': float(R2_f), 'R2_null': 0.,
    'R2_linear': float(R2_lin),
    'R2_polynomial_5': float(R2_poly5),
    'R2_polynomial_10': float(R2_poly10),
    'R2_random_frequency': {
        'n_trials': N_BASELINE_RANDOM,
        'mean': float(np.mean(R2_rand)),
        'max': float(np.max(R2_rand)),
        'p99': float(np.percentile(R2_rand, 99)),
    },
    'formula_beats_all': bool(t3_pass),
    'passed': bool(t3_pass),
    'runtime_s': time.time()-t0,
}
print(f'  Formula:      {R2_f:.6f}')
print(f'  Linear:       {R2_lin:.6f}')
print(f'  Poly-5:       {R2_poly5:.6f}')
print(f'  Poly-10:      {R2_poly10:.6f}')
print(f'  Random (max): {np.max(R2_rand):.6f}')
print(f'  PASSED: {t3_pass}  ({result_t3["runtime_s"]:.1f}s)')


In [None]:
# ── Test 4: K-fold cross-validation over contiguous blocks of zeros ──
print('='*70)
print(f'TEST 4: {N_KFOLD}-FOLD CROSS-VALIDATION')
print('='*70)
t0 = time.time()
n = len(gammas_smooth); fold = n // N_KFOLD
R2_folds = []; alpha_folds = []; theta_folds = []

for k in range(N_KFOLD):
    # Held-out block [ts, te); the last fold absorbs the remainder.
    ts = k*fold; te = (k+1)*fold if k < N_KFOLD-1 else n
    mask = np.ones(n, dtype=bool); mask[ts:te] = False
    gs_tr_ = gammas_smooth[mask]; dt_tr_ = delta_true[mask]
    gs_te_ = gammas_smooth[~mask]; dt_te_ = delta_true[~mask]
    # theta* is fitted on a strided subsample of the training zeros
    # (stride chosen so that roughly 1000 remain).
    step_ = max(1, len(gs_tr_)//1000)
    tk = find_theta_star(gs_tr_[::step_], dt_tr_[::step_], primes)
    # Score the fitted constant theta on the full held-out block.
    Sw_ = compute_Sw_cpu(gs_te_, primes, theta0=tk, theta1=0)
    dp_ = compute_predictions(gs_te_, Sw_)
    ak, rk = compute_alpha_R2(dt_te_, dp_)
    R2_folds.append(rk); alpha_folds.append(ak)
    theta_folds.append(float(tk))
    print(f'  Fold {k+1}: theta={tk:.4f}, a={ak:.4f}, R2={rk:.4f}')

Ra = np.array(R2_folds); Aa = np.array(alpha_folds)
# Pass: mean R2 > 0.85, R2 spread < 0.05, mean |alpha - 1| < 0.05.
t4_pass = np.mean(Ra)>0.85 and np.std(Ra)<0.05 and np.mean(np.abs(Aa-1))<0.05
result_t4 = {
    'n_folds': N_KFOLD,
    'per_fold': {'R2': [float(x) for x in R2_folds],
                 'alpha': [float(x) for x in alpha_folds],
                 'theta_star': theta_folds},
    'R2_mean': float(np.mean(Ra)), 'R2_std': float(np.std(Ra)),
    'alpha_mean': float(np.mean(Aa)), 'alpha_std': float(np.std(Aa)),
    'theta_star_mean': float(np.mean(theta_folds)),
    'theta_star_std': float(np.std(theta_folds)),
    'passed': bool(t4_pass),
    'runtime_s': time.time()-t0,
}
print(f'  R2: {np.mean(Ra):.4f} +/- {np.std(Ra):.4f}')
print(f'  PASSED: {t4_pass}  ({result_t4["runtime_s"]:.1f}s)')


In [None]:
# ── Test 5: how often do random (theta0, theta1, kernel) configs match the optimum? ──
print('='*70)
print('TEST 5: MONTE CARLO UNIQUENESS (100K, GPU)')
print('='*70)
t0 = time.time()

# Random configurations: uniform theta0 and theta1, kernel id drawn from 0..6.
rng5 = np.random.default_rng(123)
theta0_s = rng5.uniform(0.3, 2.5, N_MC).astype(np.float32)
theta1_s = rng5.uniform(-10., 2., N_MC).astype(np.float32)
kernel_ids = rng5.integers(0, 7, N_MC)

dt_sub_g = cp.asarray(dt_sub, dtype=cp.float32)

# All configurations are scored on the subsampled zeros (gs_sub / dt_sub).
alphas_mc, R2s_mc = gpu_batch_mc(
    theta0_s, theta1_s, kernel_ids, dt_sub_g, batch_size=2000)

elapsed5 = time.time() - t0
print(f'  GPU MC completed in {elapsed5:.1f}s')

# Configurations whose alpha was written by gpu_batch_mc (non-NaN).
valid5 = ~np.isnan(alphas_mc)
close_a = valid5 & (np.abs(alphas_mc - 1.) < 0.01)
hi_r2 = valid5 & (R2s_mc > 0.90)
vhi_r2 = valid5 & (R2s_mc > 0.93)
both5 = close_a & vhi_r2

# Optimal on same subsample
Sw_sub = compute_Sw_cpu(gs_sub, primes)
dp_sub = compute_predictions(gs_sub, Sw_sub)
_, R2_opt_sub = compute_alpha_R2(dt_sub, dp_sub)
# Percentage of random configurations with R2 below the default configuration.
pctile = float(np.mean(R2s_mc[valid5] < R2_opt_sub) * 100)

# Pass: fewer than 0.1% of configs meet both criteria and the default
# configuration ranks above the 99th percentile.
t5_pass = np.mean(both5) < 0.001 and pctile > 99.
result_t5 = {
    'n_configurations': N_MC,
    'theta0_range': [0.3, 2.5], 'theta1_range': [-10., 2.],
    'n_kernels': 7,
    'frac_close_alpha': float(np.mean(close_a)),
    'frac_R2_above_90': float(np.mean(hi_r2)),
    'frac_R2_above_93': float(np.mean(vhi_r2)),
    'frac_both_criteria': float(np.mean(both5)),
    'best_random_R2': float(np.nanmax(R2s_mc)),
    'R2_optimal_subsample': float(R2_opt_sub),
    'percentile_rank': pctile,
    'R2_distribution': {
        'mean': float(np.nanmean(R2s_mc[valid5])),
        'std': float(np.nanstd(R2s_mc[valid5])),
        'p50': float(np.nanpercentile(R2s_mc[valid5], 50)),
        'p90': float(np.nanpercentile(R2s_mc[valid5], 90)),
        'p99': float(np.nanpercentile(R2s_mc[valid5], 99)),
    },
    'passed': bool(t5_pass),
    'runtime_s': elapsed5,
}
print(f'  |a-1|<0.01: {np.mean(close_a):.4%}')
print(f'  R2>0.93:    {np.mean(vhi_r2):.4%}')
print(f'  Both:       {np.mean(both5):.4%}')
print(f'  Best random R2: {np.nanmax(R2s_mc):.6f}')
print(f'  Optimal R2:     {R2_opt_sub:.6f} (rank {pctile:.2f}%ile)')
print(f'  PASSED: {t5_pass}')


In [None]:
# ── Test 6: BIC comparison, BIC = n*log(RSS/n) + k*log(n) ──
print('='*70)
print('TEST 6: BAYESIAN BIC')
print('='*70)
t0 = time.time()
n = len(delta_true)
idx_n = np.arange(n, dtype=np.float64) / n  # zero index normalised to [0, 1)
models = {}

# Formula
# The formula is charged k = 2 parameters.
RSS_f = np.sum((delta_true - delta_pred)**2)
BIC_f = n*np.log(RSS_f/n) + 2*np.log(n)
models['formula'] = {'k': 2, 'RSS': float(RSS_f), 'BIC': float(BIC_f)}

# Null
# Constant-mean model, k = 1.
RSS_0 = np.sum((delta_true - np.mean(delta_true))**2)
BIC_0 = n*np.log(RSS_0/n) + 1*np.log(n)
models['null'] = {'k': 1, 'RSS': float(RSS_0), 'BIC': float(BIC_0)}

# Linear
# Straight line in the (unnormalised) zero index, k = 2.
A_ = np.column_stack([np.ones(n), np.arange(n)])
c_ = np.linalg.lstsq(A_, delta_true, rcond=None)[0]
RSS_l = np.sum((delta_true - A_@c_)**2)
BIC_l = n*np.log(RSS_l/n) + 2*np.log(n)
models['linear'] = {'k': 2, 'RSS': float(RSS_l), 'BIC': float(BIC_l)}

# Poly 5, 10
# Polynomials in the normalised index, k = deg + 1 coefficients.
for deg in [5, 10]:
    Ap = np.column_stack([idx_n**k for k in range(deg+1)])
    cp_ = np.linalg.lstsq(Ap, delta_true, rcond=None)[0]
    RSS_p = np.sum((delta_true - Ap@cp_)**2)
    BIC_p = n*np.log(RSS_p/n) + (deg+1)*np.log(n)
    models[f'poly_{deg}'] = {'k': deg+1, 'RSS': float(RSS_p), 'BIC': float(BIC_p)}

# Bayes factors from BIC differences: log10(BF) = delta_BIC / (2*ln 10).
bayes_factors = {}
for nm, m in models.items():
    if nm != 'formula':
        db = m['BIC'] - BIC_f
        lbf = db / (2*np.log(10))
        bayes_factors[f'formula_vs_{nm}'] = {
            'delta_BIC': float(db), 'log10_BF': float(lbf),
            'interpretation': 'decisive' if lbf>2 else 'very_strong' if lbf>1.5
               else 'strong' if lbf>1 else 'substantial' if lbf>0.5 else 'inconclusive'
        }

# Pass when the formula has a strictly lower BIC than every alternative.
best_other = min(m['BIC'] for nm,m in models.items() if nm!='formula')
t6_pass = BIC_f < best_other

result_t6 = {
    'models': models, 'bayes_factors': bayes_factors,
    'formula_has_lowest_BIC': bool(t6_pass),
    'passed': bool(t6_pass),
    'runtime_s': time.time()-t0,
}
for nm, m in models.items():
    tag = ' <- BEST' if nm=='formula' and t6_pass else ''
    print(f'  {nm:>12s} (k={m["k"]:>2d}): BIC={m["BIC"]:>12.1f}{tag}')
for nm, bf in bayes_factors.items():
    print(f'    {nm}: log10(BF)={bf["log10_BF"]:.1f} ({bf["interpretation"]})')
print(f'  PASSED: {t6_pass}  ({result_t6["runtime_s"]:.1f}s)')


In [None]:
# ── Test 7: bootstrap distribution of theta* with a BCa confidence interval ──
print('='*70)
print(f'TEST 7: BOOTSTRAP BCa ({N_BOOTSTRAP}, GPU)')
print('='*70)
t0 = time.time()

# Subsample for bootstrap (every 5th from sub for speed)
bs_step = 5
gs_bs = gs_sub[::bs_step]; dt_bs = dt_sub[::bs_step]
n_bs = len(gs_bs)

# Original theta*
# Point estimate on the un-resampled bootstrap subsample (CPU, float64).
theta_orig = find_theta_star(gs_bs, dt_bs, primes)
Sw_o = compute_Sw_cpu(gs_bs, primes, theta0=theta_orig, theta1=0)
dp_o = compute_predictions(gs_bs, Sw_o)
a_orig, R2_orig = compute_alpha_R2(dt_bs, dp_o)

# Generate all bootstrap indices
# Each row is a with-replacement resample of 0..n_bs-1, sorted ascending.
rng7 = np.random.default_rng(999)
boot_idx = np.array([np.sort(rng7.choice(n_bs, n_bs, replace=True))
                      for _ in range(N_BOOTSTRAP)])

# Batched bisection on GPU
BBATCH = 500  # resamples per GPU batch
theta_boots = np.zeros(N_BOOTSTRAP)
alpha_boots = np.zeros(N_BOOTSTRAP)
R2_boots = np.zeros(N_BOOTSTRAP)

# float32 host copies; these are what is uploaded to the GPU for bisection.
gs_bs_np = gs_bs.astype(np.float32)
dt_bs_np = dt_bs.astype(np.float32)

for s in range(0, N_BOOTSTRAP, BBATCH):
    e = min(s + BBATCH, N_BOOTSTRAP)
    bi = boot_idx[s:e]  # (bs, n_bs)
    gs_b_g = cp.asarray(gs_bs_np[bi])  # (bs, n_bs)
    dt_b_g = cp.asarray(dt_bs_np[bi])  # (bs, n_bs)

    # Batched bisection
    theta_batch = gpu_batch_bisection(gs_b_g, dt_b_g, n_iter=40)
    theta_boots[s:e] = theta_batch

    # Evaluate at found theta
    # alpha and R2 for each resample are computed on the CPU from the
    # float64 arrays, one replicate at a time.
    for i in range(e - s):
        Sw_b = compute_Sw_cpu(gs_bs[bi[i]], primes,
                              theta0=theta_batch[i], theta1=0)
        dp_b = compute_predictions(gs_bs[bi[i]], Sw_b)
        alpha_boots[s+i], R2_boots[s+i] = compute_alpha_R2(
            dt_bs[bi[i]], dp_b)

    del gs_b_g, dt_b_g
    cp.get_default_memory_pool().free_all_blocks()
    print(f'  {e}/{N_BOOTSTRAP} resamples ({time.time()-t0:.0f}s)')

# BCa correction
# Bias correction z0: normal quantile of the fraction of bootstrap theta*
# below the original estimate; falls back to 0 when that is not finite.
z0 = norm.ppf(np.mean(theta_boots < theta_orig))
if not np.isfinite(z0): z0 = 0.

# Jackknife for acceleration
# Delete-a-block jackknife: at most 50 contiguous blocks of jstep points.
n_jack = min(50, n_bs)
jack_v = np.zeros(n_jack)
jstep = max(1, n_bs // n_jack)
for j in range(n_jack):
    m_ = np.ones(n_bs, dtype=bool)
    m_[j*jstep:min((j+1)*jstep, n_bs)] = False
    jack_v[j] = find_theta_star(gs_bs[m_], dt_bs[m_], primes)
jm = np.mean(jack_v)
# Acceleration a = sum(d^3) / (6 * sum(d^2)^1.5) with d = mean - leave-out value.
num_ = np.sum((jm - jack_v)**3)
den_ = 6 * np.sum((jm - jack_v)**2)**1.5
a_acc = num_/den_ if den_ > 0 else 0.

# Adjusted percentile levels for a nominal 95% interval.
z_lo = norm.ppf(0.025); z_hi = norm.ppf(0.975)
a1 = norm.cdf(z0 + (z0+z_lo)/(1-a_acc*(z0+z_lo)))
a2 = norm.cdf(z0 + (z0+z_hi)/(1-a_acc*(z0+z_hi)))
ci_lo = float(np.percentile(theta_boots, 100*max(0,a1)))
ci_hi = float(np.percentile(theta_boots, 100*min(1,a2)))

# Pass: the interval strictly contains THETA_CONST and the bootstrap
# standard deviation of theta* is below 0.01.
t7_pass = ci_lo < THETA_CONST < ci_hi and np.std(theta_boots) < 0.01

result_t7 = {
    'n_bootstrap': N_BOOTSTRAP,
    'theta_star': {
        'original': float(theta_orig),
        'mean': float(np.mean(theta_boots)),
        'std': float(np.std(theta_boots)),
        'ci95_bca': [ci_lo, ci_hi],
        'bca_z0': float(z0), 'bca_acceleration': float(a_acc),
        'contains_paper_value': bool(ci_lo < THETA_CONST < ci_hi),
    },
    'alpha': {'mean': float(np.mean(alpha_boots)),
              'std': float(np.std(alpha_boots))},
    'R2': {'mean': float(np.mean(R2_boots)),
           'std': float(np.std(R2_boots))},
    'passed': bool(t7_pass),
    'runtime_s': time.time()-t0,
}
print(f'  theta*={np.mean(theta_boots):.4f}+/-{np.std(theta_boots):.4f}')
print(f'  BCa CI: [{ci_lo:.4f}, {ci_hi:.4f}]')
print(f'  Contains {THETA_CONST}: {ci_lo < THETA_CONST < ci_hi}')
print(f'  PASSED: {t7_pass}  ({result_t7["runtime_s"]:.1f}s)')


In [None]:
print('='*70)
print('TEST 8: WINDOW STABILITY (10 windows)')
print('='*70)
t0 = time.time()
n = len(gammas); nw = 10; ws = n // nw
wa = []; wr = []; wt = []

for w in range(nw):
    s = w*ws; e = (w+1)*ws if w < nw-1 else n
    aw, rw = compute_alpha_R2(delta_true[s:e], delta_pred[s:e])
    wa.append(aw); wr.append(rw)
    wt.append(float(gammas[s + (e-s)//2]))
    print(f'  Win {w+1}: T~{gammas[s]:.0f}-{gammas[e-1]:.0f}, '
          f'a={aw:.4f}, R2={rw:.4f}')

wa_ = np.array(wa); wr_ = np.array(wr)
ix8 = np.arange(nw, dtype=np.float64)
sl, _ = np.polyfit(ix8, wa_, 1)
res8 = wa_ - (sl*ix8 + _)
se = np.sqrt(np.sum(res8**2)/(nw-2) / np.sum((ix8-ix8.mean())**2))
tst = sl/se if se > 0 else 0.
p_tr = float(2*t_dist.sf(abs(tst), df=nw-2))

t8_pass = (p_tr > 0.05) and np.std(wa_) < 0.03 and np.std(wr_) < 0.05

result_t8 = {
    'n_windows': nw,
    'per_window': {'T_mid': wt, 'alpha': [float(x) for x in wa],
                   'R2': [float(x) for x in wr]},
    'alpha_summary': {
        'mean': float(np.mean(wa_)), 'std': float(np.std(wa_)),
        'range': float(np.ptp(wa_)), 'drift_slope': float(sl),
        'drift_t_stat': float(tst), 'drift_p_value': p_tr,
        'significant_drift': p_tr <= 0.05,
    },
    'R2_summary': {
        'mean': float(np.mean(wr_)), 'std': float(np.std(wr_)),
        'range': float(np.ptp(wr_)), 'min': float(np.min(wr_)),
    },
    'passed': bool(t8_pass),
    'runtime_s': time.time()-t0,
}
print(f'  alpha: {np.mean(wa_):.4f}+/-{np.std(wa_):.4f}')
print(f'  R2:    {np.mean(wr_):.4f}+/-{np.std(wr_):.4f}')
print(f'  Drift p={p_tr:.3f}')
print(f'  PASSED: {t8_pass}  ({result_t8["runtime_s"]:.1f}s)')


In [None]:
# ── Assemble results ──
def _json_default(o):
    """Convert NumPy scalars and arrays to plain Python for ``json.dump``.

    Raises TypeError for any other type, as the ``default`` hook is
    expected to. Returning the object unchanged would instead make the
    encoder fail with a misleading "Circular reference detected" error.
    """
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(
        f'Object of type {type(o).__name__} is not JSON serializable')


results = {
    'metadata': {
        'n_zeros': N_ZEROS,
        'theta0_optimal': THETA0_OPT,
        'theta1_optimal': THETA1_OPT,
        'theta_constant': THETA_CONST,
        'alpha_global': float(a_glob),
        'R2_global': float(R2_glob),
        'n_primes': len(primes),
        'date': time.strftime('%Y-%m-%d %H:%M:%S'),
        'script': 'paper1_gpu_validation.ipynb',
        'gpu': cp.cuda.runtime.getDeviceProperties(0)['name'].decode(),
    },
    'test_1_train_test_split': result_t1,
    'test_2_permutation': result_t2,
    'test_3_baselines': result_t3,
    'test_4_kfold_cv': result_t4,
    'test_5_monte_carlo': result_t5,
    'test_6_bayesian_bic': result_t6,
    'test_7_bootstrap_bca': result_t7,
    'test_8_window_stability': result_t8,
}

# Verdict: all tests passed -> VALIDATED; exactly one failure ->
# PARTIALLY_VALIDATED; otherwise FAILED.
test_keys = [k for k in results if k.startswith('test_')]
n_pass = sum(1 for k in test_keys if results[k].get('passed'))
n_tot = len(test_keys)
verdict = 'VALIDATED' if n_pass == n_tot else (
    'PARTIALLY_VALIDATED' if n_pass >= n_tot-1 else 'FAILED')

results['summary'] = {
    'tests_passed': n_pass, 'tests_total': n_tot,
    'overall_verdict': verdict,
    'per_test': {k: results[k]['passed'] for k in test_keys},
}

# ── Display ──
print()
print('=' * 60)
print('          VALIDATION VERDICT')
print('=' * 60)
labels = {
    'test_1_train_test_split': 'T1 Train/Test Split',
    'test_2_permutation': 'T2 Permutation + LEE',
    'test_3_baselines': 'T3 Baseline Comparison',
    'test_4_kfold_cv': 'T4 K-Fold CV',
    'test_5_monte_carlo': 'T5 Monte Carlo (100K)',
    'test_6_bayesian_bic': 'T6 Bayesian BIC',
    'test_7_bootstrap_bca': 'T7 Bootstrap BCa',
    'test_8_window_stability': 'T8 Window Stability',
}
for k in test_keys:
    st = 'PASS' if results[k]['passed'] else 'FAIL'
    rt = results[k].get('runtime_s', 0)
    print(f'  {labels.get(k,k):<30s} {st:<6s} ({rt:.1f}s)')
print('-' * 60)
print(f'  OVERALL: {n_pass}/{n_tot} -> {verdict}')
print('=' * 60)

# ── Save JSON ──
out_path = 'paper1_robust_results.json'
with open(out_path, 'w') as f:
    json.dump(results, f, indent=2, default=_json_default)
print(f'\nResults saved to {out_path}')

# Download link for Colab
# Outside Colab the import fails and the download step is skipped.
try:
    from google.colab import files
    files.download(out_path)
except ImportError:
    pass
