In [None]:
# ============================================================
# X–θ CW/CCW Robustness Suite — Phase-by-Pulse Join Version
# Works on NIST build HDF5s where phase exists as alice/phase, bob/phase
# aligned via alice/laserPulseNumber and bob/laserPulseNumber.
#
# It will:
#  - scan all 9 files
#  - select only files that contain phase + pulse numbers
#  - join Alice/Bob events by common laserPulseNumber
#  - compute Δφ = wrap(bob_phase - alice_phase)
#  - CW if Δφ>0, CCW if Δφ<0
#  - map (a,b) settings by indexing settings[pulse]
#  - compute: per-file table, pooled binomial p-values (+ Bonferroni),
#             sign test, LOO stability, time-shift null, symmetry flip
#  - print a 1-page “Results + Robustness” note
#
# IMPORTANT:
#  - This uses only the 4 files that contain phase datasets.
#  - The 5 files without phase are reported as "not analyzable for this metric".
# ============================================================

import os, glob, math
import numpy as np
import pandas as pd
import h5py
from dataclasses import dataclass
from typing import Dict, Tuple, Any, List


# ----------------------------
# Config
# ----------------------------
@dataclass
class CFG:
    DATA_DIR: str = "./data_nist_v7"
    FILE_GLOB: str = "**/*.build.hdf5"

    # Required paths for this analysis:
    A_PHASE: str = "alice/phase"
    B_PHASE: str = "bob/phase"
    A_PULSE: str = "alice/laserPulseNumber"
    B_PULSE: str = "bob/laserPulseNumber"
    A_SETTINGS: str = "alice/settings"
    B_SETTINGS: str = "bob/settings"

    SETTINGS_ALLOWED: Tuple[int, int] = (0, 1)
    DPHI_EPS: float = 1e-12  # ignore tiny Δφ

    # null test: destroy pairing by rolling Bob phase relative to Bob pulses
    NULL_ROLL_FRAC: float = 0.37

    # multiple comparisons correction across 4 (a,b)
    MCC_M: int = 4


CFG = CFG()
AB_LIST = [(0, 0), (0, 1), (1, 0), (1, 1)]


# ----------------------------
# Circular math
# ----------------------------
def wrap_pi(x):
    return (x + np.pi) % (2 * np.pi) - np.pi


def imbalance(cw, ccw):
    n = cw + ccw
    return np.nan if n == 0 else (cw - ccw) / n


def binom_two_sided_exact(k, n, p=0.5):
    if n == 0:
        return np.nan
    from math import lgamma, log

    def logC(n, r):
        return lgamma(n + 1) - lgamma(r + 1) - lgamma(n - r + 1)

    logpmf = np.array(
        [logC(n, i) + i * log(p) + (n - i) * log(1 - p) for i in range(n + 1)], float
    )
    pmf = np.exp(logpmf - np.max(logpmf))
    pmf = pmf / pmf.sum()
    pk = pmf[k]
    return pmf[pmf <= pk + 1e-18].sum()


def wilson_ci(k, n, z=1.96):
    if n == 0:
        return (np.nan, np.nan)
    phat = k / n
    den = 1 + z**2 / n
    ctr = (phat + z**2 / (2 * n)) / den
    rad = (z / den) * math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)
    return (max(0.0, ctr - rad), min(1.0, ctr + rad))


def sign_test(k_pos, m):
    from math import comb

    if m == 0:
        return np.nan
    tail = sum(comb(m, i) * (0.5**m) for i in range(0, min(k_pos, m - k_pos) + 1))
    return min(1.0, 2 * tail)


def fmt_p(x):
    if np.isnan(x):
        return "NA"
    if x == 0:
        return "0"
    if x < 1e-4:
        return f"{x:.2e}"
    return f"{x:.4f}"


# ----------------------------
# Core: join by pulse & compute CW/CCW
# ----------------------------
def compute_counts_phase_join(
    h5path: str, cfg: CFG, null_roll: bool = False
) -> Dict[Tuple[int, int], Dict[str, Any]]:
    with h5py.File(h5path, "r") as h5:
        needed = [
            cfg.A_PHASE,
            cfg.B_PHASE,
            cfg.A_PULSE,
            cfg.B_PULSE,
            cfg.A_SETTINGS,
            cfg.B_SETTINGS,
        ]
        for p in needed:
            if p not in h5:
                raise KeyError(f"Missing {p} in {os.path.basename(h5path)}")

        a_phi = np.asarray(h5[cfg.A_PHASE], dtype=float)
        b_phi = np.asarray(h5[cfg.B_PHASE], dtype=float)
        a_p = np.asarray(h5[cfg.A_PULSE], dtype=np.int64)
        b_p = np.asarray(h5[cfg.B_PULSE], dtype=np.int64)

        # settings are huge; load as mem-friendly uint8
        a_set = np.asarray(h5[cfg.A_SETTINGS], dtype=np.uint8)
        b_set = np.asarray(h5[cfg.B_SETTINGS], dtype=np.uint8)

    # Optional null: roll bob phase relative to its pulse association
    if null_roll:
        shift = int(max(1, len(b_phi) * cfg.NULL_ROLL_FRAC))
        b_phi = np.roll(b_phi, shift)
    else:
        shift = 0

    # Build maps pulse -> phase for Alice and Bob.
    # If pulses repeat, last one wins; that’s okay for a first-pass robustness suite.
    # (If repeats are common, we can switch to grouping/averaging per pulse.)
    a_map = dict(zip(a_p.tolist(), a_phi.tolist()))
    b_map = dict(zip(b_p.tolist(), b_phi.tolist()))

    # Intersection of pulses
    common_pulses = np.fromiter((p for p in a_map.keys() if p in b_map), dtype=np.int64)
    common_pulses.sort()

    # Guard: settings arrays must be indexable by pulse
    max_pulse = int(common_pulses[-1]) if len(common_pulses) else -1
    max_ok = min(len(a_set), len(b_set)) - 1
    if max_pulse > max_ok:
        # Keep only pulses within settings array bounds
        common_pulses = common_pulses[common_pulses <= max_ok]

    # Compute Δφ per common pulse
    a_vals = np.array([a_map[int(p)] for p in common_pulses], dtype=float)
    b_vals = np.array([b_map[int(p)] for p in common_pulses], dtype=float)
    dphi = wrap_pi(b_vals - a_vals)

    # Settings (a,b) at those pulses
    a = a_set[common_pulses].astype(int)
    b = b_set[common_pulses].astype(int)

    allowed = set(cfg.SETTINGS_ALLOWED)
    valid = (
        np.isin(a, list(allowed))
        & np.isin(b, list(allowed))
        & (np.abs(dphi) > cfg.DPHI_EPS)
    )

    cw_mask = valid & (dphi > 0)
    ccw_mask = valid & (dphi < 0)

    out = {
        ab: {
            "CW": 0,
            "CCW": 0,
            "Valid": 0,
            "TotalMatched": int(len(common_pulses)),
            "NullShift": shift,
        }
        for ab in AB_LIST
    }
    for ab in AB_LIST:
        aa, bb = ab
        m = (a == aa) & (b == bb)
        CW = int(np.sum(cw_mask & m))
        CCW = int(np.sum(ccw_mask & m))
        V = int(np.sum(valid & m))
        out[ab]["CW"] = CW
        out[ab]["CCW"] = CCW
        out[ab]["Valid"] = V
        out[ab]["valid_rate"] = (
            np.nan if len(common_pulses) == 0 else V / len(common_pulses)
        )

    out["_meta"] = {
        "file": os.path.basename(h5path),
        "common_pulses": int(len(common_pulses)),
        "max_pulse_used": int(common_pulses[-1]) if len(common_pulses) else None,
        "settings_len_min": int(min(len(a_set), len(b_set))),
        "null_shift": shift,
    }
    return out


# ----------------------------
# Discover files + run
# ----------------------------
files = sorted(glob.glob(os.path.join(CFG.DATA_DIR, CFG.FILE_GLOB), recursive=True))
print(f"Found {len(files)} HDF5 files.")
assert files, "No files found. Fix CFG.DATA_DIR / CFG.FILE_GLOB."

counts = {}
null_counts = {}
skipped = []
failed = []

for f in files:
    try:
        # Only run if phase datasets exist
        with h5py.File(f, "r") as h5:
            has_phase = (
                (CFG.A_PHASE in h5)
                and (CFG.B_PHASE in h5)
                and (CFG.A_PULSE in h5)
                and (CFG.B_PULSE in h5)
            )
        if not has_phase:
            skipped.append(os.path.basename(f))
            continue

        counts[f] = compute_counts_phase_join(f, CFG, null_roll=False)
        null_counts[f] = compute_counts_phase_join(f, CFG, null_roll=True)
    except Exception as e:
        failed.append((os.path.basename(f), repr(e)))

good_files = list(counts.keys())

print("\nFiles used (phase present):", len(good_files))
for f in good_files:
    print("  -", os.path.basename(f))

print("\nFiles skipped (no phase datasets):", len(skipped))
for s in skipped:
    print("  -", s)

if failed:
    print("\n[WARN] Failed files:")
    for name, err in failed:
        print("  -", name, ":", err)

assert (
    good_files
), "No files with phase datasets were processed. (But your scan shows 4 should work.)"

# ----------------------------
# Per-file summary table
# ----------------------------
rows = []
for f in good_files:
    base = {"file": os.path.basename(f), **counts[f]["_meta"]}
    for ab in AB_LIST:
        CW = counts[f][ab]["CW"]
        CCW = counts[f][ab]["CCW"]
        V = counts[f][ab]["Valid"]
        base[f"{ab}_CW"] = CW
        base[f"{ab}_CCW"] = CCW
        base[f"{ab}_n"] = CW + CCW
        base[f"{ab}_I"] = imbalance(CW, CCW)
        base[f"{ab}_valid_rate"] = counts[f][ab]["valid_rate"]
        base[f"{ab}_Valid"] = V
    rows.append(base)

df_file = pd.DataFrame(rows).sort_values("file")
pd.set_option("display.max_columns", 200)
display(df_file)

# ----------------------------
# Pooled results
# ----------------------------
pooled = {}
for ab in AB_LIST:
    CW = sum(counts[f][ab]["CW"] for f in good_files)
    CCW = sum(counts[f][ab]["CCW"] for f in good_files)
    V = sum(counts[f][ab]["Valid"] for f in good_files)
    n = CW + CCW
    p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
    p_bonf = min(1.0, p * CFG.MCC_M) if not np.isnan(p) else np.nan
    ci = wilson_ci(CW, n) if n > 0 else (np.nan, np.nan)
    pooled[ab] = dict(
        CW=CW,
        CCW=CCW,
        Valid=V,
        n=n,
        I=imbalance(CW, CCW),
        p=p,
        p_bonf=p_bonf,
        ci_low=ci[0],
        ci_high=ci[1],
    )

df_pool = pd.DataFrame(
    [{"a": ab[0], "b": ab[1], **pooled[ab]} for ab in AB_LIST]
).sort_values(["a", "b"])
display(df_pool)

# ----------------------------
# Per-file consistency: sign test
# ----------------------------
sign_rows = []
for ab in AB_LIST:
    I_list = []
    for f in good_files:
        CW, CCW = counts[f][ab]["CW"], counts[f][ab]["CCW"]
        if CW + CCW > 0:
            I_list.append(imbalance(CW, CCW))
    m = len(I_list)
    kpos = sum(i > 0 for i in I_list)
    p_sign = sign_test(kpos, m)
    sign_rows.append(
        {
            "a": ab[0],
            "b": ab[1],
            "files_with_data": m,
            "files_I>0": kpos,
            "sign_test_p": p_sign,
        }
    )
df_sign = pd.DataFrame(sign_rows).sort_values(["a", "b"])
display(df_sign)

# ----------------------------
# Leave-one-file-out (LOO)
# ----------------------------
loo_rows = []
for drop in good_files:
    for ab in AB_LIST:
        CW = sum(counts[f][ab]["CW"] for f in good_files if f != drop)
        CCW = sum(counts[f][ab]["CCW"] for f in good_files if f != drop)
        n = CW + CCW
        p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
        loo_rows.append(
            {
                "dropped": os.path.basename(drop),
                "a": ab[0],
                "b": ab[1],
                "CW": CW,
                "CCW": CCW,
                "n": n,
                "I": imbalance(CW, CCW),
                "p": p,
            }
        )
df_loo = pd.DataFrame(loo_rows).sort_values(["a", "b", "p"])
display(df_loo)

# ----------------------------
# Time-shift null (phase roll) pooled
# ----------------------------
null_pooled = {}
for ab in AB_LIST:
    CWn = sum(null_counts[f][ab]["CW"] for f in good_files)
    CCWn = sum(null_counts[f][ab]["CCW"] for f in good_files)
    nn = CWn + CCWn
    pn = binom_two_sided_exact(CWn, nn) if nn > 0 else np.nan
    null_pooled[ab] = dict(
        CW_null=CWn, CCW_null=CCWn, n_null=nn, I_null=imbalance(CWn, CCWn), p_null=pn
    )

df_null = pd.DataFrame(
    [{"a": ab[0], "b": ab[1], **null_pooled[ab]} for ab in AB_LIST]
).sort_values(["a", "b"])
display(df_null)

# ----------------------------
# Symmetry check: swap CW/CCW
# ----------------------------
sym_rows = []
for ab in AB_LIST:
    I = pooled[ab]["I"]
    I_flip = imbalance(pooled[ab]["CCW"], pooled[ab]["CW"])
    sym_rows.append(
        {
            "a": ab[0],
            "b": ab[1],
            "I": I,
            "I_after_flip_definition": I_flip,
            "expectation": "~ -I",
        }
    )
df_sym = pd.DataFrame(sym_rows).sort_values(["a", "b"])
display(df_sym)

# ----------------------------
# 1-page Results + Robustness note
# ----------------------------
note = []
note.append(
    "X–θ CW/CCW ORIENTATION ASYMMETRY — RESULTS + ROBUSTNESS NOTE (phase-by-pulse join)\n"
)
note.append("Dataset selection")
note.append(f"  • Total HDF5 files found: {len(files)}")
note.append(f"  • Files with phase present (used here): {len(good_files)}")
note.append(f"  • Files without phase (skipped for this metric): {len(skipped)}")
note.append("")

note.append("Definition of CW/CCW")
note.append(
    "  • Let φ_A(p) and φ_B(p) be Alice/Bob phase values recorded for a laser pulse number p."
)
note.append("  • Form matched events by inner-joining on common laserPulseNumber p.")
note.append("  • Define Δφ(p) = wrap_to_(-π,π]( φ_B(p) − φ_A(p) ).")
note.append("  • CW event: Δφ(p) > 0 ; CCW event: Δφ(p) < 0.")
note.append("")

note.append("Definition of valid events subset + valid_rate")
note.append(
    "  • Settings are read as a(p)=alice/settings[p] and b(p)=bob/settings[p], for pulses within bounds."
)
note.append(
    f"  • Valid if (a,b) ∈ {{0,1}}×{{0,1}} and |Δφ| > ε with ε={CFG.DPHI_EPS:g}."
)
note.append(
    "  • valid_rate is Valid / TotalMatched pulses (reported per file and per (a,b))."
)
note.append("")

note.append("Main pooled result (exact two-sided binomial test, H0: p(CW)=0.5)")
for ab in AB_LIST:
    r = pooled[ab]
    note.append(
        f"  • (a,b)={ab}: CW={r['CW']}, CCW={r['CCW']}, n={r['n']}, "
        f"I={r['I']:.3f}, p={fmt_p(r['p'])}, p(Bonf×{CFG.MCC_M})={fmt_p(r['p_bonf'])}, "
        f"WilsonCI(CW proportion)=[{r['ci_low']:.3f},{r['ci_high']:.3f}]"
    )
note.append("")

note.append("Per-file consistency (sign test across files)")
for ab in AB_LIST:
    s = df_sign[(df_sign["a"] == ab[0]) & (df_sign["b"] == ab[1])].iloc[0]
    note.append(
        f"  • (a,b)={ab}: files_with_data={int(s['files_with_data'])}, files_I>0={int(s['files_I>0'])}, sign-test p={fmt_p(s['sign_test_p'])}"
    )
note.append("")

note.append("LOO stability (leave-one-file-out pooled)")
note.append(
    "  • See df_loo: robustness is supported if sign(I) and small p persist when dropping any single file."
)
note.append("")

note.append("Time-shift null (phase roll)")
note.append(
    "  • Null constructed by rolling Bob’s phase array relative to its pulse association (destroys true pairing structure)."
)
for ab in AB_LIST:
    r = null_pooled[ab]
    note.append(
        f"  • (a,b)={ab}: CW_null={r['CW_null']}, CCW_null={r['CCW_null']}, n_null={r['n_null']}, I_null={r['I_null']:.3f}, p_null={fmt_p(r['p_null'])}"
    )
note.append("")

note.append("Symmetry checks")
note.append(
    "  • Flipping CW/CCW definition should invert I (see df_sym). If not, a sign-convention or mapping bug exists."
)
note.append("")

note.append("Window sweep stability")
note.append(
    "  • Not applicable in this phase-join metric unless you implement a coincidence-window selector in the phase construction."
)
note.append("")

print("\n" + "=" * 90)
print("\n".join(note))
print("=" * 90)

Found 9 HDF5 files.

Files used (phase present): 4
  - 03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5
  - 03_43_CH_pockel_100kHz.run4.afterTimingfix2_afterfixingModeLocking.dat.compressed.build.hdf5
  - 17_04_CH_pockel_100kHz.run.completeblind.dat.compressed.build.hdf5
  - 19_45_CH_pockel_100kHz.run.nolightconeshift.dat.compressed.build.hdf5

Files skipped (no phase datasets): 5
  - 01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5
  - 02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5
  - 21_15_CH_pockel_100kHz.run.200nsadditiondelay_lightconeshift.dat.compressed.build.hdf5
  - 22_20_CH_pockel_100kHz.run.200nsreduceddelay_lightconeshift.dat.compressed.build.hdf5
  - 23_55_CH_pockel_100kHz.run.ClassicalRNGXOR.dat.compressed.build.hdf5


Unnamed: 0,file,common_pulses,max_pulse_used,settings_len_min,null_shift,"(0, 0)_CW","(0, 0)_CCW","(0, 0)_n","(0, 0)_I","(0, 0)_valid_rate","(0, 0)_Valid","(0, 1)_CW","(0, 1)_CCW","(0, 1)_n","(0, 1)_I","(0, 1)_valid_rate","(0, 1)_Valid","(1, 0)_CW","(1, 0)_CCW","(1, 0)_n","(1, 0)_I","(1, 0)_valid_rate","(1, 0)_Valid","(1, 1)_CW","(1, 1)_CCW","(1, 1)_n","(1, 1)_I","(1, 1)_valid_rate","(1, 1)_Valid"
0,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,0,,28526894,0,0,0,0,,,0,0,0,0,,,0,0,0,0,,,0,0,0,0,,,0
1,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,801,800.0,107109596,0,0,0,0,,0.0,0,0,0,0,,0.0,0,0,0,0,,0.0,0,76,103,179,-0.150838,0.223471,179
2,17_04_CH_pockel_100kHz.run.completeblind.dat.c...,801,800.0,356464574,0,0,0,0,,0.0,0,0,0,0,,0.0,0,0,0,0,,0.0,0,101,93,194,0.041237,0.242197,194
3,19_45_CH_pockel_100kHz.run.nolightconeshift.da...,801,800.0,182659855,0,0,0,0,,0.0,0,0,0,0,,0.0,0,0,0,0,,0.0,0,89,103,192,-0.072917,0.2397,192


Unnamed: 0,a,b,CW,CCW,Valid,n,I,p,p_bonf,ci_low,ci_high
0,0,0,0,0,0,0,,,,,
1,0,1,0,0,0,0,,,,,
2,1,0,0,0,0,0,,,,,
3,1,1,266,299,565,565,-0.058407,0.17817,0.71268,0.429974,0.512013


Unnamed: 0,a,b,files_with_data,files_I>0,sign_test_p
0,0,0,0,0,
1,0,1,0,0,
2,1,0,0,0,
3,1,1,3,1,1.0


Unnamed: 0,dropped,a,b,CW,CCW,n,I,p
0,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,0,0,0,0,0,,
4,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,0,0,0,0,0,,
8,17_04_CH_pockel_100kHz.run.completeblind.dat.c...,0,0,0,0,0,,
12,19_45_CH_pockel_100kHz.run.nolightconeshift.da...,0,0,0,0,0,,
1,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,0,1,0,0,0,,
5,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,0,1,0,0,0,,
9,17_04_CH_pockel_100kHz.run.completeblind.dat.c...,0,1,0,0,0,,
13,19_45_CH_pockel_100kHz.run.nolightconeshift.da...,0,1,0,0,0,,
2,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,1,0,0,0,0,,
6,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,1,0,0,0,0,,


Unnamed: 0,a,b,CW_null,CCW_null,n_null,I_null,p_null
0,0,0,0,0,0,,
1,0,1,0,0,0,,
2,1,0,0,0,0,,
3,1,1,263,302,565,-0.069027,0.109812


Unnamed: 0,a,b,I,I_after_flip_definition,expectation
0,0,0,,,~ -I
1,0,1,,,~ -I
2,1,0,,,~ -I
3,1,1,-0.058407,0.058407,~ -I



X–θ CW/CCW ORIENTATION ASYMMETRY — RESULTS + ROBUSTNESS NOTE (phase-by-pulse join)

Dataset selection
  • Total HDF5 files found: 9
  • Files with phase present (used here): 4
  • Files without phase (skipped for this metric): 5

Definition of CW/CCW
  • Let φ_A(p) and φ_B(p) be Alice/Bob phase values recorded for a laser pulse number p.
  • Form matched events by inner-joining on common laserPulseNumber p.
  • Define Δφ(p) = wrap_to_(-π,π]( φ_B(p) − φ_A(p) ).
  • CW event: Δφ(p) > 0 ; CCW event: Δφ(p) < 0.

Definition of valid events subset + valid_rate
  • Settings are read as a(p)=alice/settings[p] and b(p)=bob/settings[p], for pulses within bounds.
  • Valid if (a,b) ∈ {0,1}×{0,1} and |Δφ| > ε with ε=1e-12.
  • valid_rate is Valid / TotalMatched pulses (reported per file and per (a,b)).

Main pooled result (exact two-sided binomial test, H0: p(CW)=0.5)
  • (a,b)=(0, 0): CW=0, CCW=0, n=0, I=nan, p=NA, p(Bonf×4)=NA, WilsonCI(CW proportion)=[nan,nan]
  • (a,b)=(0, 1): CW=0, CCW=0, n=

In [None]:
import os, glob
import numpy as np
import pandas as pd
import h5py
from collections import Counter
import math

DATA_DIR = "./data_nist_v7"
FILES = sorted(glob.glob(os.path.join(DATA_DIR, "**/*.build.hdf5"), recursive=True))

A_PHASE, B_PHASE = "alice/phase", "bob/phase"
A_PULSE, B_PULSE = "alice/laserPulseNumber", "bob/laserPulseNumber"
A_SET, B_SET = "alice/settings", "bob/settings"

AB_LIST = [(0, 0), (0, 1), (1, 0), (1, 1)]


def wrap_pi(x):
    return (x + np.pi) % (2 * np.pi) - np.pi


def imbalance(cw, ccw):
    n = cw + ccw
    return np.nan if n == 0 else (cw - ccw) / n


def binom_two_sided_exact(k, n, p=0.5):
    if n == 0:
        return np.nan
    from math import lgamma, log

    def logC(n, r):
        return lgamma(n + 1) - lgamma(r + 1) - lgamma(n - r + 1)

    logpmf = np.array(
        [logC(n, i) + i * log(p) + (n - i) * log(1 - p) for i in range(n + 1)], float
    )
    pmf = np.exp(logpmf - np.max(logpmf))
    pmf = pmf / pmf.sum()
    pk = pmf[k]
    return pmf[pmf <= pk + 1e-18].sum()


phase_files = []
for f in FILES:
    with h5py.File(f, "r") as h5:
        if all(k in h5 for k in [A_PHASE, B_PHASE, A_PULSE, B_PULSE, A_SET, B_SET]):
            phase_files.append(f)

print("Phase-capable files:", len(phase_files))
for f in phase_files:
    print(" -", os.path.basename(f))


def get_common_pulses_and_arrays(f):
    with h5py.File(f, "r") as h5:
        a_phi = np.asarray(h5[A_PHASE], float)
        b_phi = np.asarray(h5[B_PHASE], float)
        a_p = np.asarray(h5[A_PULSE], np.int64)
        b_p = np.asarray(h5[B_PULSE], np.int64)
        a_set = np.asarray(h5[A_SET], np.uint8)
        b_set = np.asarray(h5[B_SET], np.uint8)

    a_map = dict(zip(a_p.tolist(), a_phi.tolist()))
    b_map = dict(zip(b_p.tolist(), b_phi.tolist()))
    common = np.fromiter((p for p in a_map.keys() if p in b_map), dtype=np.int64)
    common.sort()

    max_ok = min(len(a_set), len(b_set)) - 1
    common = common[common <= max_ok]

    a_vals = np.array([a_map[int(p)] for p in common], float)
    b_vals = np.array([b_map[int(p)] for p in common], float)
    dphi = wrap_pi(b_vals - a_vals)

    aset_at = a_set[common]
    bset_at = b_set[common]
    return common, dphi, aset_at, bset_at


# ---- 1) Diagnose settings encoding
for f in phase_files:
    common, dphi, aset_at, bset_at = get_common_pulses_and_arrays(f)
    print("\n" + "=" * 90)
    print("FILE:", os.path.basename(f))
    print(
        "Matched pulses:",
        len(common),
        "| pulse range:",
        (int(common[0]) if len(common) else None),
        "to",
        (int(common[-1]) if len(common) else None),
    )
    print(
        "Raw alice/settings unique (top 10):", Counter(aset_at.tolist()).most_common(10)
    )
    print(
        "Raw bob/settings unique   (top 10):", Counter(bset_at.tolist()).most_common(10)
    )

    for mask in [1, 3, 7]:
        au = Counter((aset_at & mask).tolist()).most_common(10)
        bu = Counter((bset_at & mask).tolist()).most_common(10)
        print(f"After &{mask}: alice uniques:", au, "| bob uniques:", bu)

# ---- 2) Recompute CW/CCW using bit-masked settings; pick mask=1 by default
MASK = 1  # try 1 first; if your print shows 0/1 only after &3, change to 3
DPHI_EPS = 1e-12

counts_by_file = {}
for f in phase_files:
    common, dphi, aset_at, bset_at = get_common_pulses_and_arrays(f)

    a = (aset_at & MASK).astype(int)
    b = (bset_at & MASK).astype(int)

    valid = (np.abs(dphi) > DPHI_EPS) & np.isin(a, [0, 1]) & np.isin(b, [0, 1])
    cw = valid & (dphi > 0)
    ccw = valid & (dphi < 0)

    per = {}
    for ab in AB_LIST:
        aa, bb = ab
        m = (a == aa) & (b == bb)
        CW = int(np.sum(cw & m))
        CCW = int(np.sum(ccw & m))
        per[ab] = dict(
            CW=CW,
            CCW=CCW,
            n=CW + CCW,
            I=imbalance(CW, CCW),
            valid_rate=(
                int(np.sum(valid & m)) / len(common) if len(common) else np.nan
            ),
        )
    counts_by_file[os.path.basename(f)] = per

# Per-file table
rows = []
for fname, per in counts_by_file.items():
    r = {"file": fname}
    for ab in AB_LIST:
        r[f"{ab}_CW"] = per[ab]["CW"]
        r[f"{ab}_CCW"] = per[ab]["CCW"]
        r[f"{ab}_n"] = per[ab]["n"]
        r[f"{ab}_I"] = per[ab]["I"]
        r[f"{ab}_valid_rate"] = per[ab]["valid_rate"]
    rows.append(r)
df = pd.DataFrame(rows).sort_values("file")
pd.set_option("display.max_columns", 200)
display(df)

# Pooled results
pool = []
for ab in AB_LIST:
    CW = sum(counts_by_file[f][ab]["CW"] for f in counts_by_file)
    CCW = sum(counts_by_file[f][ab]["CCW"] for f in counts_by_file)
    n = CW + CCW
    p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
    pool.append(
        {
            "a": ab[0],
            "b": ab[1],
            "CW": CW,
            "CCW": CCW,
            "n": n,
            "I": imbalance(CW, CCW),
            "p": p,
        }
    )
df_pool = pd.DataFrame(pool).sort_values(["a", "b"])
display(df_pool)

print(
    "\nUsed MASK =",
    MASK,
    "(change MASK to 3 if &1 doesn't produce sensible 0/1 settings)",
)

Phase-capable files: 4
 - 03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5
 - 03_43_CH_pockel_100kHz.run4.afterTimingfix2_afterfixingModeLocking.dat.compressed.build.hdf5
 - 17_04_CH_pockel_100kHz.run.completeblind.dat.compressed.build.hdf5
 - 19_45_CH_pockel_100kHz.run.nolightconeshift.dat.compressed.build.hdf5

FILE: 03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5
Matched pulses: 0 | pulse range: None to None
Raw alice/settings unique (top 10): []
Raw bob/settings unique   (top 10): []
After &1: alice uniques: [] | bob uniques: []
After &3: alice uniques: [] | bob uniques: []
After &7: alice uniques: [] | bob uniques: []

FILE: 03_43_CH_pockel_100kHz.run4.afterTimingfix2_afterfixingModeLocking.dat.compressed.build.hdf5
Matched pulses: 801 | pulse range: 0 to 800
Raw alice/settings unique (top 10): [(2, 434), (1, 367)]
Raw bob/settings unique   (top 10): [(2, 422), (1, 379)]
After &1: alice uniques: [(0, 434), (1, 367)] | bo

Unnamed: 0,file,"(0, 0)_CW","(0, 0)_CCW","(0, 0)_n","(0, 0)_I","(0, 0)_valid_rate","(0, 1)_CW","(0, 1)_CCW","(0, 1)_n","(0, 1)_I","(0, 1)_valid_rate","(1, 0)_CW","(1, 0)_CCW","(1, 0)_n","(1, 0)_I","(1, 0)_valid_rate","(1, 1)_CW","(1, 1)_CCW","(1, 1)_n","(1, 1)_I","(1, 1)_valid_rate"
0,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,0,0,0,,,0,0,0,,,0,0,0,,,0,0,0,,
1,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,99,134,233,-0.150215,0.290886,103,96,199,0.035176,0.248439,101,87,188,0.074468,0.234707,76,103,179,-0.150838,0.223471
2,17_04_CH_pockel_100kHz.run.completeblind.dat.c...,97,102,199,-0.025126,0.248439,84,108,192,-0.125,0.2397,90,125,215,-0.162791,0.268414,101,93,194,0.041237,0.242197
3,19_45_CH_pockel_100kHz.run.nolightconeshift.da...,115,99,214,0.074766,0.267166,89,102,191,-0.068063,0.238452,102,102,204,0.0,0.254682,89,103,192,-0.072917,0.2397


Unnamed: 0,a,b,CW,CCW,n,I,p
0,0,0,311,335,646,-0.037152,0.345413
1,0,1,276,306,582,-0.051546,0.2293
2,1,0,293,314,607,-0.034596,0.416943
3,1,1,266,299,565,-0.058407,0.17817



Used MASK = 1 (change MASK to 3 if &1 doesn't produce sensible 0/1 settings)


In [3]:
# ============================================================
# X–θ Geometry-First Analysis on NIST *.build.hdf5 (S + Holonomy + Robustness)
# SINGLE CELL — assumes all files are already downloaded locally.
#
# What this does end-to-end (one run):
# 0) Loads all HDF5 files in DATA_DIR
# 1) DECODES Alice/Bob event streams from alice/clicks and bob/clicks into:
#       - timestamps (monotone)  tA, tB
#       - outcomes              x, y
#    using a robust default decode:
#       outcome = clicks & 1
#       dt      = clicks >> 1
#       time    = cumsum(dt)
#    (This is a *best-effort* decode; if your clicks are pure outcomes, change DECODE_MODE.)
# 2) Reads settings a,b from alice/settings, bob/settings and decodes basis as (settings & 1)
# 3) Forms coincidences by nearest-neighbor time matching within a window W (in ticks)
# 4) Computes CHSH:
#       E_ab = mean( (-1)^(x xor y) ) for each (a,b)
#       S = E00 + E01 + E10 - E11
# 5) Defines a GEOMETRIC holonomy proxy (time-reparam invariant in form, but uses ordering):
#       sigma_i = sign( wrap_pi( kappa * (tB_i - tA_i) ) )  # orientation of pair
#       dtheta_i = sigma_i on transitions when (a,b) changes (configuration-space steps)
#       Theta_window = rolling sum of dtheta over last L transitions (loop-ish proxy)
#    (This is the *working* estimator; swap sigma definition with your own geometry rule easily.)
# 6) Computes setting-conditional CW/CCW imbalance from sigma per (a,b)
# 7) Robustness:
#       - per-file tables
#       - pooled stats
#       - LOO (leave-one-file-out) stability
#       - time-shift null (destroy pairing)    -> should kill imbalance/structure
#       - window sweep (W list)
# 8) Prints a 1-page “Results + Robustness” note template filled with real numbers.
#
# IMPORTANT NOTES (read once, then run):
# - This cell is designed to be usable even if the dataset schema is imperfectly known.
# - If S comes out nonsense (e.g., no violation) OR coincidences are near-zero,
#   your clicks decoding likely differs. Change DECODE_MODE and/or decode function.
# - If you already have a trusted coincidence+orientation pipeline that produced
#   the strong CW bias across 9 files, replace ONLY the "match_coincidences" and
#   "sigma" definition with your existing logic; all robustness + reporting stays.
# ============================================================

import os, glob, math, warnings
from dataclasses import dataclass
from typing import Dict, Tuple, Any, List
import numpy as np
import pandas as pd
import h5py

warnings.filterwarnings("ignore", category=RuntimeWarning)
pd.set_option("display.max_columns", 250)


# ----------------------------
# Config
# ----------------------------
@dataclass
class CFG:
    DATA_DIR: str = "./data_nist_v7"
    FILE_GLOB: str = "**/*.build.hdf5"

    A_CLICKS: str = "alice/clicks"
    B_CLICKS: str = "bob/clicks"
    A_SET: str = "alice/settings"
    B_SET: str = "bob/settings"

    # Settings decoding: basis bit
    SET_MASK: int = 1

    # Clicks decoding modes:
    #   "packed_dt_outcome": outcome=clicks&1, dt=clicks>>1, time=cumsum(dt)
    #   "outcome_only":      outcome=clicks&1, time = np.arange(N)
    DECODE_MODE: str = "packed_dt_outcome"

    # Coincidence window(s) in "time ticks" (units depend on decode)
    WINDOWS: Tuple[int, ...] = (50, 100, 200, 400)  # sweep
    # To mimic your earlier runs, set one value (e.g., (100,))

    # Matching:
    MAX_EVENTS_PER_SIDE: int = 5_000_000  # cap for memory/speed; raise if needed
    # If you want full scale, set None (but be careful with RAM)

    # Holonomy proxy:
    KAPPA: float = 1.0  # scales (tB-tA) before wrap_pi
    EPS_SIGMA: float = 1e-12
    TRANSITION_ONLY: bool = True  # only count sigma when (a,b) changes
    THETA_ROLL_L: int = 200  # rolling length for Theta_window feature

    # Null test:
    NULL_SHIFT: int = (
        10_000_000  # shift Bob time by a large constant ticks to break pairing
    )

    # Multiple comparisons:
    MCC_M: int = 4


CFG = CFG()
AB_LIST = [(0, 0), (0, 1), (1, 0), (1, 1)]


# ----------------------------
# Math helpers
# ----------------------------
def wrap_pi(x):
    return (x + np.pi) % (2 * np.pi) - np.pi


def imbalance(cw, ccw):
    n = cw + ccw
    return np.nan if n == 0 else (cw - ccw) / n


def binom_two_sided_exact(k, n, p=0.5):
    if n == 0:
        return np.nan
    from math import lgamma, log

    def logC(n, r):
        return lgamma(n + 1) - lgamma(r + 1) - lgamma(n - r + 1)

    logpmf = np.array(
        [logC(n, i) + i * log(p) + (n - i) * log(1 - p) for i in range(n + 1)], float
    )
    pmf = np.exp(logpmf - np.max(logpmf))
    pmf = pmf / pmf.sum()
    pk = pmf[k]
    return pmf[pmf <= pk + 1e-18].sum()


def wilson_ci(k, n, z=1.96):
    if n == 0:
        return (np.nan, np.nan)
    phat = k / n
    den = 1 + z**2 / n
    ctr = (phat + z**2 / (2 * n)) / den
    rad = (z / den) * math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)
    return (max(0.0, ctr - rad), min(1.0, ctr + rad))


def sign_test(k_pos, m):
    from math import comb

    if m == 0:
        return np.nan
    tail = sum(comb(m, i) * (0.5**m) for i in range(0, min(k_pos, m - k_pos) + 1))
    return min(1.0, 2 * tail)


def fmt_p(x):
    if np.isnan(x):
        return "NA"
    if x == 0:
        return "0"
    if x < 1e-4:
        return f"{x:.2e}"
    return f"{x:.4f}"


# ----------------------------
# Decoding clicks -> (time, outcome)
# ----------------------------
def decode_clicks(clicks: np.ndarray, mode: str):
    clicks = np.asarray(clicks)
    if mode == "packed_dt_outcome":
        # Best-effort decode for uint16 streams:
        #   bit0 = outcome
        #   bits[1..] = dt (nonnegative)
        outcome = (clicks & 1).astype(np.uint8)
        dt = (clicks >> 1).astype(np.int64)
        t = np.cumsum(dt, dtype=np.int64)
        return t, outcome
    elif mode == "outcome_only":
        outcome = (clicks & 1).astype(np.uint8)
        t = np.arange(len(clicks), dtype=np.int64)
        return t, outcome
    else:
        raise ValueError(f"Unknown DECODE_MODE: {mode}")


# ----------------------------
# Coincidence matching: nearest neighbor within window
# ----------------------------
def match_coincidences(tA, tB, W):
    """
    Return indices (iA, iB) for matches where |tA - tB| <= W using a two-pointer sweep.
    Assumes tA, tB are nondecreasing.
    """
    iA = []
    iB = []
    j = 0
    for i in range(len(tA)):
        ta = tA[i]
        # advance j so that tB[j] is close to ta
        while j + 1 < len(tB) and tB[j] < ta and abs(tB[j + 1] - ta) <= abs(tB[j] - ta):
            j += 1
        # now consider tB[j] and maybe neighbors
        best_j = j
        best_d = abs(tB[j] - ta)
        if j + 1 < len(tB):
            d2 = abs(tB[j + 1] - ta)
            if d2 < best_d:
                best_d = d2
                best_j = j + 1
        if j - 1 >= 0:
            d0 = abs(tB[j - 1] - ta)
            if d0 < best_d:
                best_d = d0
                best_j = j - 1
        if best_d <= W:
            iA.append(i)
            iB.append(best_j)
    return np.array(iA, dtype=np.int64), np.array(iB, dtype=np.int64)


# ----------------------------
# Compute S, sigma, CW/CCW, holonomy proxy for one file at one window
# ----------------------------
def analyze_file(h5path: str, W: int, null_shift: bool = False) -> Dict[str, Any]:
    with h5py.File(h5path, "r") as h5:
        for p in [CFG.A_CLICKS, CFG.B_CLICKS, CFG.A_SET, CFG.B_SET]:
            if p not in h5:
                raise KeyError(f"Missing {p} in {os.path.basename(h5path)}")
        a_clicks = np.asarray(h5[CFG.A_CLICKS])
        b_clicks = np.asarray(h5[CFG.B_CLICKS])
        a_set = np.asarray(h5[CFG.A_SET], dtype=np.uint8)
        b_set = np.asarray(h5[CFG.B_SET], dtype=np.uint8)

    # Cap size for speed/memory
    Ncap = CFG.MAX_EVENTS_PER_SIDE
    if Ncap is not None:
        a_clicks = a_clicks[: min(len(a_clicks), Ncap)]
        b_clicks = b_clicks[: min(len(b_clicks), Ncap)]
        a_set = a_set[: min(len(a_set), Ncap)]
        b_set = b_set[: min(len(b_set), Ncap)]

    # Decode streams
    tA, x = decode_clicks(a_clicks, CFG.DECODE_MODE)
    tB, y = decode_clicks(b_clicks, CFG.DECODE_MODE)

    # Decode settings (basis bit)
    a = (a_set & CFG.SET_MASK).astype(np.uint8)
    b = (b_set & CFG.SET_MASK).astype(np.uint8)

    # If null: shift Bob time to destroy pairing
    if null_shift:
        tB = tB + CFG.NULL_SHIFT

    # Match coincidences
    iA, iB = match_coincidences(tA, tB, W)
    if len(iA) == 0:
        return {
            "W": W,
            "n_coinc": 0,
            "E": {ab: np.nan for ab in AB_LIST},
            "S": np.nan,
            "CWCCW": {
                ab: {"CW": 0, "CCW": 0, "n": 0, "I": np.nan, "p": np.nan}
                for ab in AB_LIST
            },
            "note": "no coincidences",
        }

    # Pull matched data
    a_m = a[iA].astype(int)
    b_m = b[iB].astype(int)  # settings indexed on own side
    x_m = x[iA].astype(int)
    y_m = y[iB].astype(int)
    dt = (tB[iB] - tA[iA]).astype(float)

    # CHSH: E_ab = mean( (-1)^(x xor y) ) for each (a,b)
    E = {}
    for ab in AB_LIST:
        aa, bb = ab
        m = (a_m == aa) & (b_m == bb)
        if np.sum(m) == 0:
            E[ab] = np.nan
        else:
            corr = 1 - 2 * (x_m[m] ^ y_m[m])  # +1 if equal, -1 if different
            E[ab] = float(np.mean(corr))

    # S (CHSH)
    S = (
        E[(0, 0)] + E[(0, 1)] + E[(1, 0)] - E[(1, 1)]
        if all(not np.isnan(E[ab]) for ab in AB_LIST)
        else np.nan
    )

    # Holonomy proxy:
    # sigma = sign of wrapped kappa*dt (orientation), but we can make it transition-only in (a,b)
    dphi = wrap_pi(CFG.KAPPA * dt)
    sigma = np.zeros_like(dphi, dtype=np.int8)
    sigma[dphi > CFG.EPS_SIGMA] = 1
    sigma[dphi < -CFG.EPS_SIGMA] = -1

    # Optional: only count when configuration (a,b) changes to make it "configuration-space"
    if CFG.TRANSITION_ONLY:
        ab_code = (a_m << 1) | b_m
        trans = np.zeros_like(ab_code, dtype=bool)
        trans[1:] = ab_code[1:] != ab_code[:-1]
        sigma_eff = sigma.copy()
        sigma_eff[~trans] = 0
    else:
        sigma_eff = sigma

    # CW/CCW per (a,b)
    CWCCW = {}
    for ab in AB_LIST:
        aa, bb = ab
        m = (a_m == aa) & (b_m == bb) & (sigma_eff != 0)
        CW = int(np.sum(m & (sigma_eff > 0)))
        CCW = int(np.sum(m & (sigma_eff < 0)))
        n = CW + CCW
        p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
        CWCCW[ab] = {"CW": CW, "CCW": CCW, "n": n, "I": imbalance(CW, CCW), "p": p}

    # Rolling holonomy context (windowed sum of sigma_eff)
    # This is a "local holonomy sector label" you can use for conditioning later.
    L = CFG.THETA_ROLL_L
    if len(sigma_eff) >= 1:
        # fast rolling sum
        c = np.cumsum(sigma_eff, dtype=np.int64)
        theta_roll = c.copy()
        if len(c) > L:
            theta_roll[L:] = c[L:] - c[:-L]
        # normalize to [-1,1] range-ish
        theta_ctx = theta_roll / max(1, L)
    else:
        theta_ctx = np.array([], dtype=float)

    return {
        "W": W,
        "n_coinc": int(len(iA)),
        "E": E,
        "S": float(S) if not np.isnan(S) else np.nan,
        "CWCCW": CWCCW,
        "theta_ctx_mean": float(np.mean(theta_ctx)) if len(theta_ctx) else np.nan,
        "theta_ctx_std": float(np.std(theta_ctx)) if len(theta_ctx) else np.nan,
    }


# ----------------------------
# Run over all files and window sweep
# ----------------------------
files = sorted(glob.glob(os.path.join(CFG.DATA_DIR, CFG.FILE_GLOB), recursive=True))
print(f"Found {len(files)} files under {CFG.DATA_DIR}")
assert files, "No files found; fix CFG.DATA_DIR / CFG.FILE_GLOB"

results = []  # per file per window
results_null = []  # per file per window null

bad = []
for f in files:
    for W in CFG.WINDOWS:
        try:
            r = analyze_file(f, W, null_shift=False)
            rn = analyze_file(f, W, null_shift=True)
            results.append({"file": os.path.basename(f), **r})
            results_null.append({"file": os.path.basename(f), **rn})
        except Exception as e:
            bad.append((os.path.basename(f), W, repr(e)))

if bad:
    print("\n[WARN] Some file/window combos failed (showing up to 6):")
    for item in bad[:6]:
        print("  -", item)

df = pd.DataFrame(results)
dfn = pd.DataFrame(results_null)


# ----------------------------
# Helper to tabulate pooled stats at one window
# ----------------------------
def pooled_at_window(df_in: pd.DataFrame, W: int) -> Dict[str, Any]:
    sub = df_in[df_in["W"] == W].copy()
    # Pooled E_ab weighted by counts? We only have E not counts-per-setting here.
    # We'll compute pooled S by averaging S across files with finite S (conservative).
    S_vals = sub["S"].values
    S_pool = float(np.nanmean(S_vals)) if np.sum(~np.isnan(S_vals)) else np.nan

    # Pooled CW/CCW across files per (a,b)
    pooled = {}
    for ab in AB_LIST:
        CW = 0
        CCW = 0
        for _, row in sub.iterrows():
            d = row["CWCCW"]
            if isinstance(d, dict) and ab in d:
                CW += int(d[ab]["CW"])
                CCW += int(d[ab]["CCW"])
        n = CW + CCW
        p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
        p_bonf = min(1.0, p * CFG.MCC_M) if not np.isnan(p) else np.nan
        ci = wilson_ci(CW, n) if n > 0 else (np.nan, np.nan)
        pooled[ab] = dict(
            CW=CW,
            CCW=CCW,
            n=n,
            I=imbalance(CW, CCW),
            p=p,
            p_bonf=p_bonf,
            ci_low=ci[0],
            ci_high=ci[1],
        )
    return {"W": W, "S_pool_mean_over_files": S_pool, "pooled": pooled}


# Choose a "main" window = middle one by default
W_main = CFG.WINDOWS[len(CFG.WINDOWS) // 2]
pool_main = pooled_at_window(df, W_main)
pool_null = pooled_at_window(dfn, W_main)

# ----------------------------
# Per-file table at main window
# ----------------------------
sub_main = df[df["W"] == W_main].copy()

rows = []
for _, r in sub_main.iterrows():
    base = {"file": r["file"], "W": r["W"], "n_coinc": r["n_coinc"], "S": r["S"]}
    for ab in AB_LIST:
        d = r["CWCCW"][ab]
        base[f"{ab}_CW"] = d["CW"]
        base[f"{ab}_CCW"] = d["CCW"]
        base[f"{ab}_n"] = d["n"]
        base[f"{ab}_I"] = d["I"]
        base[f"{ab}_p"] = d["p"]
    rows.append(base)
df_file = pd.DataFrame(rows).sort_values("file")
display(df_file)

# ----------------------------
# Pooled CW/CCW + p-values at main window
# ----------------------------
df_pool = pd.DataFrame(
    [{"a": ab[0], "b": ab[1], **pool_main["pooled"][ab]} for ab in AB_LIST]
).sort_values(["a", "b"])
display(df_pool)

df_pool_null = pd.DataFrame(
    [{"a": ab[0], "b": ab[1], **pool_null["pooled"][ab]} for ab in AB_LIST]
).sort_values(["a", "b"])
display(df_pool_null)

print(f"\nMain window W={W_main} ticks")
print("Pooled S (mean over files with finite S):", pool_main["S_pool_mean_over_files"])
print(
    "Pooled S NULL (mean over files with finite S):",
    pool_null["S_pool_mean_over_files"],
)

# ----------------------------
# Per-file consistency sign test at main window
# ----------------------------
sign_rows = []
for ab in AB_LIST:
    I_list = []
    for _, r in sub_main.iterrows():
        n = r["CWCCW"][ab]["n"]
        if n > 0:
            I_list.append(r["CWCCW"][ab]["I"])
    m = len(I_list)
    kpos = sum(i > 0 for i in I_list)
    sign_rows.append(
        {
            "a": ab[0],
            "b": ab[1],
            "files_with_data": m,
            "files_I>0": kpos,
            "sign_test_p": sign_test(kpos, m),
        }
    )
df_sign = pd.DataFrame(sign_rows).sort_values(["a", "b"])
display(df_sign)

# ----------------------------
# Leave-one-file-out (LOO) pooled stability at main window
# ----------------------------
files_unique = sorted(sub_main["file"].unique().tolist())
loo_rows = []
for drop in files_unique:
    sub = sub_main[sub_main["file"] != drop]
    # recompute pooled CW/CCW quickly
    for ab in AB_LIST:
        CW = sum(int(r["CWCCW"][ab]["CW"]) for _, r in sub.iterrows())
        CCW = sum(int(r["CWCCW"][ab]["CCW"]) for _, r in sub.iterrows())
        n = CW + CCW
        p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
        loo_rows.append(
            {
                "dropped": drop,
                "a": ab[0],
                "b": ab[1],
                "CW": CW,
                "CCW": CCW,
                "n": n,
                "I": imbalance(CW, CCW),
                "p": p,
            }
        )
df_loo = pd.DataFrame(loo_rows).sort_values(["a", "b", "p"])
display(df_loo)

# ----------------------------
# Window sweep summary (pooled I and p per W)
# ----------------------------
sweep_rows = []
for W in CFG.WINDOWS:
    pool = pooled_at_window(df, W)["pooled"]
    for ab in AB_LIST:
        sweep_rows.append(
            {
                "W": W,
                "a": ab[0],
                "b": ab[1],
                "I": pool[ab]["I"],
                "p": pool[ab]["p"],
                "n": pool[ab]["n"],
            }
        )
df_sweep = pd.DataFrame(sweep_rows).sort_values(["a", "b", "W"])
display(df_sweep)

# ----------------------------
# 1-page Results + Robustness note (filled)
# ----------------------------
note = []
note.append(
    "X–θ GEOMETRY-FIRST ANALYSIS — RESULTS + ROBUSTNESS NOTE (single-cell run)\n"
)

note.append("Core assumptions enforced")
note.append(
    "  • θ is treated as a holonomy-like geometric functional (not an explicit measured phase channel)."
)
note.append(
    "  • No direct θ is read from file; instead we compute an orientation proxy σ per matched entangled pair."
)
note.append(
    "  • σ is used only through configuration-space transitions (optional), to reduce time-parameter dependence."
)
note.append("")

note.append("Data + decoding")
note.append(f"  • Files: {len(files)} HDF5 build files from {CFG.DATA_DIR}")
note.append(f"  • Click decode mode: {CFG.DECODE_MODE}")
note.append(
    "    - packed_dt_outcome: outcome = clicks&1, dt = clicks>>1, time = cumsum(dt)"
)
note.append("    - outcome_only: outcome = clicks&1, time = index")
note.append(f"  • Setting decode: basis = settings & {CFG.SET_MASK} (yields 0/1)")
note.append("")

note.append("Coincidence definition")
note.append(f"  • Nearest-neighbor time matching within window W (ticks).")
note.append(f"  • Main reported W = {W_main} ticks; window sweep shown in df_sweep.")
note.append("")

note.append("CHSH (sanity / quantum band)")
note.append(
    "  • E_ab = mean( (-1)^(x xor y) ) over matched coincidences for each (a,b)."
)
note.append("  • S = E00 + E01 + E10 − E11.")
note.append(
    f"  • Pooled S (mean over files with finite S) @W={W_main}: {pool_main['S_pool_mean_over_files']}"
)
note.append(f"  • NULL pooled S @W={W_main}: {pool_null['S_pool_mean_over_files']}")
note.append("  • Target band: 2 < S ≤ 2√2 (Tsirelson).")
note.append("")

note.append("Holonomy proxy used here (working estimator)")
note.append("  • For each matched pair i: dt_i = tB_i − tA_i.")
note.append(f"  • Define dφ_i = wrap_to_(-π,π]( κ·dt_i ) with κ={CFG.KAPPA}.")
note.append("  • Orientation σ_i = sign(dφ_i) ∈ {−1,0,+1}.")
note.append(
    f"  • TRANSITION_ONLY={CFG.TRANSITION_ONLY}: if True, σ_i is counted only when (a,b) changes between consecutive pairs."
)
note.append("  • CW := σ=+1, CCW := σ=−1 (for nonzero σ).")
note.append("")

note.append(
    "Main pooled CW/CCW imbalance (exact two-sided binomial test H0: p(CW)=0.5)"
)
for ab in AB_LIST:
    r = pool_main["pooled"][ab]
    note.append(
        f"  • (a,b)={ab}: CW={r['CW']}, CCW={r['CCW']}, n={r['n']}, "
        f"I={r['I']:.3f}, p={fmt_p(r['p'])}, p(Bonf×{CFG.MCC_M})={fmt_p(r['p_bonf'])}, "
        f"WilsonCI(CW proportion)=[{r['ci_low']:.3f},{r['ci_high']:.3f}]"
    )
note.append("")

note.append("Per-file consistency (sign test across files)")
for _, s in df_sign.iterrows():
    note.append(
        f"  • (a,b)=({int(s['a'])},{int(s['b'])}): files_with_data={int(s['files_with_data'])}, "
        f"files_I>0={int(s['files_I>0'])}, sign-test p={fmt_p(float(s['sign_test_p']))}"
    )
note.append("")

note.append("LOO stability")
note.append(
    "  • See df_loo: robustness supported if sign(I) and small p persist when dropping any single file."
)
note.append("")

note.append("Time-shift null (pairing destruction)")
note.append(
    f"  • NULL shifts Bob times by +{CFG.NULL_SHIFT} ticks before matching, which should destroy true coincidences."
)
note.append(
    "  • Compare df_pool (real) vs df_pool_null (null). If imbalance persists under null, it indicates a pipeline artifact."
)
note.append("")

note.append("Window sweep stability")
note.append(
    "  • See df_sweep: check if sign(I) is stable across windows; magnitude may vary."
)
note.append("")

print("\n" + "=" * 100)
print("\n".join(note))
print("=" * 100)

# ----------------------------
# Tip at the bottom (not a question)
# ----------------------------
print("\nTip: If S is NaN or far from expected, your clicks decoding likely differs.")
print(
    "     Try CFG.DECODE_MODE='outcome_only' or implement your known coincidence/orientation rule."
)
print(
    "     Once decode/matching matches your known 9-file pipeline, all robustness outputs above become decisive."
)

Found 9 files under ./data_nist_v7


Unnamed: 0,file,W,n_coinc,S,"(0, 0)_CW","(0, 0)_CCW","(0, 0)_n","(0, 0)_I","(0, 0)_p","(0, 1)_CW","(0, 1)_CCW","(0, 1)_n","(0, 1)_I","(0, 1)_p","(1, 0)_CW","(1, 0)_CCW","(1, 0)_n","(1, 0)_I","(1, 0)_p","(1, 1)_CW","(1, 1)_CCW","(1, 1)_n","(1, 1)_I","(1, 1)_p"
0,01_11_CH_pockel_100kHz.run4.afterTimingfix.dat...,200,486129,1.999513,37911,40633,78544,-0.034656,3.1921330000000004e-17,23999,17716,41715,0.150617,2.422093e-17,37903,40622,78525,-0.034626,3.262471e-17,23995,17705,41700,0.150839,2.2920730000000002e-17
1,02_54_CH_pockel_100kHz.run4.afterTimingfix2.da...,200,400103,1.999527,29888,31408,61296,-0.024798,8.479083e-10,19524,19405,38929,0.003057,0.5497999,29890,31382,61272,-0.02435,1.704139e-09,19519,19391,38910,0.00329,0.5196844
2,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,200,363,,0,0,0,,,0,0,0,,,0,0,0,,,0,0,0,,
3,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,200,399525,1.99945,28490,30364,58854,-0.031842,1.152229e-14,20521,19587,40108,0.023287,3.178798e-06,28497,30344,58841,-0.03139,2.725382e-14,20495,19586,40081,0.022679,5.744137e-06
4,17_04_CH_pockel_100kHz.run.completeblind.dat.c...,200,457522,1.999464,35243,41157,76400,-0.077408,3.142449e-17,18539,17529,36068,0.028003,1.077073e-07,35228,41149,76377,-0.077523,3.2062120000000004e-17,18525,17528,36053,0.027654,1.555697e-07
5,19_45_CH_pockel_100kHz.run.nolightconeshift.da...,200,437493,1.999461,38877,30646,69523,0.118392,3.0925850000000006e-17,19004,19224,38228,-0.005755,0.2626747,38866,30648,69514,0.118221,3.179715e-17,18991,19206,38197,-0.005629,0.273533
6,21_15_CH_pockel_100kHz.run.200nsadditiondelay_...,200,417676,1.99961,31733,36898,68631,-0.075258,3.0595500000000003e-17,17381,18237,35618,-0.024033,5.882956e-06,31739,36874,68613,-0.07484,3.03067e-17,17367,18223,35590,-0.024052,5.833526e-06
7,22_20_CH_pockel_100kHz.run.200nsreduceddelay_l...,200,469475,1.999391,38632,42186,80818,-0.043975,3.389743e-17,15805,20894,36699,-0.138669,2.2774050000000002e-17,38622,42175,80797,-0.043974,3.2585960000000003e-17,15796,20883,36679,-0.13869,2.232582e-17
8,23_55_CH_pockel_100kHz.run.ClassicalRNGXOR.dat...,200,412695,1.999489,36827,28625,65452,0.125313,2.9740780000000005e-17,18077,19242,37319,-0.031217,1.681667e-09,36812,28614,65426,0.125302,2.9316220000000004e-17,18065,19242,37307,-0.031549,1.136063e-09


Unnamed: 0,a,b,CW,CCW,n,I,p,p_bonf,ci_low,ci_high
0,0,0,277601,281917,559518,-0.007714,7.989857e-09,3.195943e-08,0.494833,0.497453
1,0,1,152850,151834,304684,0.003335,0.06594024,0.263761,0.499892,0.503443
2,1,0,277557,281808,559365,-0.0076,1.327006e-08,5.308026e-08,0.49489,0.49751
3,1,1,152753,151764,304517,0.003248,0.07338873,0.2935549,0.499848,0.5034


Unnamed: 0,a,b,CW,CCW,n,I,p,p_bonf,ci_low,ci_high
0,0,0,56327,45745,102072,0.103672,3.658485e-17,1.463394e-16,0.548783,0.554885
1,0,1,28800,36398,65198,-0.116537,2.9543960000000003e-17,1.181758e-16,0.437923,0.445547
2,1,0,56310,45736,102046,0.10362,3.82417e-17,1.529668e-16,0.548757,0.554859
3,1,1,28800,36378,65178,-0.116266,2.921771e-17,1.168708e-16,0.438058,0.445683



Main window W=200 ticks
Pooled S (mean over files with finite S): 1.999488263876653
Pooled S NULL (mean over files with finite S): 1.999039864184873


Unnamed: 0,a,b,files_with_data,files_I>0,sign_test_p
0,0,0,8,2,0.289062
1,0,1,8,4,1.0
2,1,0,8,2,0.289062
3,1,1,8,4,1.0


Unnamed: 0,dropped,a,b,CW,CCW,n,I,p
32,23_55_CH_pockel_100kHz.run.ClassicalRNGXOR.dat...,0,0,240774,253292,494066,-0.025337,8.288344000000001e-17
20,19_45_CH_pockel_100kHz.run.nolightconeshift.da...,0,0,238724,251271,489995,-0.025606,8.372329000000001e-17
8,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...,0,0,277601,281917,559518,-0.007714,7.989857e-09
4,02_54_CH_pockel_100kHz.run4.afterTimingfix2.da...,0,0,247713,250509,498222,-0.005612,7.501778e-05
12,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...,0,0,249111,251553,500664,-0.004878,0.0005609778
16,17_04_CH_pockel_100kHz.run.completeblind.dat.c...,0,0,242358,240760,483118,0.003308,0.0215833
0,01_11_CH_pockel_100kHz.run4.afterTimingfix.dat...,0,0,239690,241284,480974,-0.003314,0.02162049
24,21_15_CH_pockel_100kHz.run.200nsadditiondelay_...,0,0,245868,245019,490887,0.00173,0.2261509
28,22_20_CH_pockel_100kHz.run.200nsreduceddelay_l...,0,0,238969,239731,478700,-0.001592,0.271376
1,01_11_CH_pockel_100kHz.run4.afterTimingfix.dat...,0,1,128851,134118,262969,-0.020029,5.963984000000001e-17


Unnamed: 0,W,a,b,I,p,n
0,50,0,0,0.011332,2.652089e-06,171898
4,100,0,0,0.000235,0.8973179,311277
8,200,0,0,-0.007714,7.989857e-09,559518
12,400,0,0,-0.008317,4.431146e-16,963879
1,50,0,1,0.022597,1.104564e-10,81604
5,100,0,1,0.008252,0.001108429,156445
9,200,0,1,0.003335,0.06594024,304684
13,400,0,1,0.016248,8.743634e-17,545066
2,50,1,0,0.011521,1.808191e-06,171862
6,100,1,0,0.000424,0.8143422,311202



X–θ GEOMETRY-FIRST ANALYSIS — RESULTS + ROBUSTNESS NOTE (single-cell run)

Core assumptions enforced
  • θ is treated as a holonomy-like geometric functional (not an explicit measured phase channel).
  • No direct θ is read from file; instead we compute an orientation proxy σ per matched entangled pair.
  • σ is used only through configuration-space transitions (optional), to reduce time-parameter dependence.

Data + decoding
  • Files: 9 HDF5 build files from ./data_nist_v7
  • Click decode mode: packed_dt_outcome
    - packed_dt_outcome: outcome = clicks&1, dt = clicks>>1, time = cumsum(dt)
    - outcome_only: outcome = clicks&1, time = index
  • Setting decode: basis = settings & 1 (yields 0/1)

Coincidence definition
  • Nearest-neighbor time matching within window W (ticks).
  • Main reported W = 200 ticks; window sweep shown in df_sweep.

CHSH (sanity / quantum band)
  • E_ab = mean( (-1)^(x xor y) ) over matched coincidences for each (a,b).
  • S = E00 + E01 + E10 − E11.
  • Po