X–θ test on NIST processed_compressed build.hdf5

We define a candidate hidden phase θ (either from settings-holonomy or from a trial clock), bin trials by θ, compute an Eberhard-style J per bin, and use a drift-preserving permutation test to see if modulation is real or an artifact.


In [1]:
import os, math, json
from dataclasses import dataclass
from typing import Dict, Any, Tuple

import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
import os
import requests
import xml.etree.ElementTree as ET

# Configuration
# Base URL of the public S3 bucket; object keys are appended to it to build download URLs.
BUCKET_URL = "https://nist-belltestdata.s3.amazonaws.com/"
# Key prefix sent with the bucket-listing request to select which objects are listed.
PREFIX = "belldata/processed_compressed/hdf5/2015_09_18/"
# Local directory the downloaded files are written to.
LOCAL_DIR = "./data_nist_v7/"

# Create the target directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(LOCAL_DIR, exist_ok=True)


def get_file_list(timeout=60):
    """Fetch the keys of all ``.hdf5`` objects under PREFIX from the S3 XML listing.

    The bucket listing is paginated by the server, so this follows the
    ``IsTruncated`` flag and requests further pages until the listing is
    complete instead of silently returning only the first page.

    Args:
        timeout: Seconds passed to ``requests.get`` for each listing request,
            so an unresponsive server cannot hang the notebook indefinitely.

    Returns:
        List of object keys (strings) that end with ``.hdf5``, in listing order.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
    """
    # S3 uses an XML namespace; every tag lookup has to be qualified with it.
    ns = {"s3": "http://s3.amazonaws.com/doc/2006-03-01/"}

    files = []
    marker = None
    while True:
        params = {"prefix": PREFIX}
        if marker:
            params["marker"] = marker
        response = requests.get(BUCKET_URL, params=params, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)

        # findtext tolerates a <Contents> element without a <Key> child.
        page_keys = [
            content.findtext("s3:Key", default="", namespaces=ns)
            for content in root.findall("s3:Contents", ns)
        ]
        files.extend(key for key in page_keys if key.endswith(".hdf5"))

        truncated = root.findtext("s3:IsTruncated", default="false", namespaces=ns)
        # Stop when the listing is complete; the empty-page guard prevents an
        # endless loop if the server claims truncation but returns no keys.
        if truncated.strip().lower() != "true" or not page_keys:
            return files
        # NextMarker is only present for some requests; otherwise continue
        # after the last key of this page.
        marker = root.findtext("s3:NextMarker", namespaces=ns) or page_keys[-1]


def download_files(file_keys, timeout=60):
    """Download each object key into LOCAL_DIR, skipping files already present.

    Data is streamed to a temporary ``<name>.part`` file and only renamed to
    its final name once the transfer has finished. An interrupted download
    therefore never leaves a truncated file that a later run would skip as
    "already exists".

    Args:
        file_keys: Iterable of object keys relative to BUCKET_URL.
        timeout: Seconds passed to ``requests.get`` for each download.

    Raises:
        requests.HTTPError: If a download request returns an error status.
    """
    for key in file_keys:
        filename = os.path.basename(key)
        out_path = os.path.join(LOCAL_DIR, filename)
        url = f"{BUCKET_URL}{key}"

        if os.path.exists(out_path):
            print(f"Skipping {filename} (already exists)")
            continue

        print(f"Downloading {filename}...")
        tmp_path = out_path + ".part"
        try:
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
            # Publish the file under its real name only after a complete transfer.
            os.replace(tmp_path, out_path)
        except BaseException:
            # Also covers KeyboardInterrupt: remove the partial file, then re-raise.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise


# Execute: list the remote HDF5 keys, report how many were found, then
# download them into LOCAL_DIR (files that already exist locally are skipped).
hdf5_files = get_file_list()
print(f"Found {len(hdf5_files)} HDF5 files.")
download_files(hdf5_files)

Found 9 HDF5 files.
Skipping 01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5 (already exists)
Skipping 02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5 (already exists)
Skipping 03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5 (already exists)
Skipping 03_43_CH_pockel_100kHz.run4.afterTimingfix2_afterfixingModeLocking.dat.compressed.build.hdf5 (already exists)
Skipping 17_04_CH_pockel_100kHz.run.completeblind.dat.compressed.build.hdf5 (already exists)
Skipping 19_45_CH_pockel_100kHz.run.nolightconeshift.dat.compressed.build.hdf5 (already exists)
Skipping 21_15_CH_pockel_100kHz.run.200nsadditiondelay_lightconeshift.dat.compressed.build.hdf5 (already exists)
Skipping 22_20_CH_pockel_100kHz.run.200nsreduceddelay_lightconeshift.dat.compressed.build.hdf5 (already exists)
Skipping 23_55_CH_pockel_100kHz.run.ClassicalRNGXOR.dat.compressed.build.hdf5 (already exists)


In [3]:
import h5py
import numpy as np
import glob
from tqdm import tqdm


def remap_settings(x):
    """Remap the two most frequent setting values to 0 and 1.

    Among the two most frequent values, the numerically smaller one becomes 0
    and the larger one becomes 1. Assigning labels by value (rather than by
    frequency rank) keeps the labelling identical across files even when the
    two settings occur almost equally often, which matters because per-file
    counts are pooled by label afterwards.

    Args:
        x: NumPy array of raw setting values.

    Returns:
        An int8 array of the same shape holding 0, 1, or -1 (for every value
        that is not one of the two most frequent). If ``x`` contains fewer
        than two distinct values it is returned unchanged.
    """
    vals, cnt = np.unique(x, return_counts=True)
    if len(vals) < 2:
        return x

    # Stable sort on descending count: ties are resolved towards the smaller
    # value, so the selection is deterministic.
    top_two = vals[np.argsort(-cnt, kind="stable")[:2]]
    low, high = top_two.min(), top_two.max()

    out = np.full(x.shape, -1, dtype=np.int8)
    out[x == low] = 0
    out[x == high] = 1
    return out


def aggregate_x_theta_full_run(directory_path):
    """Accumulate CW/CCW coincidence counts per setting pair over all HDF5 files.

    For every ``*.hdf5`` file directly inside ``directory_path`` the click and
    settings arrays of both parties are loaded, clipped to their common
    length, and the settings are passed through ``remap_settings``. Trials
    where both click values are non-zero count as coincidences. For those,
    ``delta = (log2(alice click) - log2(bob click)) mod 16`` is computed
    (log2 truncated to int8); delta in 1..7 is counted as "CW" and delta in
    9..15 as "CCW".

    Args:
        directory_path: Directory that is globbed for ``*.hdf5`` files.

    Returns:
        Dict keyed by (a, b) in {0,1}x{0,1}; each value is a dict with the
        integer totals "CW", "CCW" and "Total_Coin".
    """
    setting_pairs = [(a, b) for a in (0, 1) for b in (0, 1)]
    total_stats = {
        pair: {"CW": 0, "CCW": 0, "Total_Coin": 0} for pair in setting_pairs
    }
    dataset_paths = ("alice/clicks", "bob/clicks", "alice/settings", "bob/settings")

    for file_path in glob.glob(f"{directory_path}/*.hdf5"):
        print(f"Processing: {file_path}")
        with h5py.File(file_path, "r") as h5:
            # Read all four arrays fully into memory.
            raw = [h5[path][:] for path in dataset_paths]

            # The arrays can differ in length; clip everything to the shortest
            # so the boolean masks below line up element by element.
            n_common = min(len(arr) for arr in raw)
            alice_clicks = raw[0][:n_common]
            bob_clicks = raw[1][:n_common]
            alice_sets = remap_settings(raw[2][:n_common])
            bob_sets = remap_settings(raw[3][:n_common])

            # Coincidence = both sides registered a non-zero click value.
            both_clicked = (alice_clicks != 0) & (bob_clicks != 0)
            if not both_clicked.any():
                continue

            # Work on coincidences only to save memory/time.
            alice_slot = np.log2(alice_clicks[both_clicked]).astype(np.int8)
            bob_slot = np.log2(bob_clicks[both_clicked]).astype(np.int8)
            delta = (alice_slot - bob_slot) % 16
            is_cw = (delta >= 1) & (delta <= 7)
            is_ccw = (delta >= 9) & (delta <= 15)

            coin_alice_sets = alice_sets[both_clicked]
            coin_bob_sets = bob_sets[both_clicked]

            # Trials whose remapped setting is not 0/1 match none of the pairs.
            for pair in setting_pairs:
                selected = (coin_alice_sets == pair[0]) & (coin_bob_sets == pair[1])
                bucket = total_stats[pair]
                bucket["CW"] += int(np.count_nonzero(is_cw & selected))
                bucket["CCW"] += int(np.count_nonzero(is_ccw & selected))
                bucket["Total_Coin"] += int(np.count_nonzero(selected))

    return total_stats


# Pool the CW/CCW coincidence counts over every HDF5 file in the download
# directory and print one line per (alice, bob) setting pair.
results = aggregate_x_theta_full_run("./data_nist_v7/")
print("\n--- Final X-Theta Aggregate Results ---")
for k, v in results.items():
    print(f"Setting {k}: {v}")

Processing: ./data_nist_v7\01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5
Processing: ./data_nist_v7\02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5
Processing: ./data_nist_v7\03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5
Processing: ./data_nist_v7\03_43_CH_pockel_100kHz.run4.afterTimingfix2_afterfixingModeLocking.dat.compressed.build.hdf5
Processing: ./data_nist_v7\17_04_CH_pockel_100kHz.run.completeblind.dat.compressed.build.hdf5
Processing: ./data_nist_v7\19_45_CH_pockel_100kHz.run.nolightconeshift.dat.compressed.build.hdf5
Processing: ./data_nist_v7\21_15_CH_pockel_100kHz.run.200nsadditiondelay_lightconeshift.dat.compressed.build.hdf5
Processing: ./data_nist_v7\22_20_CH_pockel_100kHz.run.200nsreduceddelay_lightconeshift.dat.compressed.build.hdf5
Processing: ./data_nist_v7\23_55_CH_pockel_100kHz.run.ClassicalRNGXOR.dat.compressed.build.hdf5

--- Final X-Theta Aggregate Results ---
Setting (0, 0): {'CW': 124

In [None]:
import numpy as np
from math import comb


def binom_two_sided_p(k, n, p=0.5):
    """Exact two-sided binomial p-value for k successes in n trials.

    The p-value is the total probability of all outcomes that are no more
    likely than the observed one. The pmf is evaluated in log space, so large
    n no longer overflows (``comb(n, i) * float`` raises OverflowError once
    the coefficient exceeds the float range). "No more likely" is decided
    with a relative tolerance, because an absolute one is meaningless for
    very small probabilities.

    Args:
        k: Observed number of successes, 0 <= k <= n.
        n: Number of trials.
        p: Success probability under the null hypothesis.

    Returns:
        The p-value as a float in [0, 1].

    Raises:
        ValueError: If k is outside 0..n.
    """
    from math import lgamma, log

    if not 0 <= k <= n:
        raise ValueError(f"k must satisfy 0 <= k <= n, got k={k}, n={n}")

    if p <= 0.0 or p >= 1.0:
        # Degenerate null: all probability mass sits on a single outcome.
        probs = np.zeros(n + 1)
        probs[0 if p <= 0.0 else n] = 1.0
    else:
        log_p, log_q = log(p), log(1.0 - p)
        log_n_fact = lgamma(n + 1)
        log_probs = np.array(
            [
                log_n_fact - lgamma(i + 1) - lgamma(n - i + 1)
                + i * log_p + (n - i) * log_q
                for i in range(n + 1)
            ],
            float,
        )
        probs = np.exp(log_probs)

    pk = probs[k]
    # The relative slack absorbs rounding differences between mirrored outcomes.
    total = probs[probs <= pk * (1.0 + 1e-7)].sum()
    return float(min(1.0, total))


def imbalance(cw, ccw):
    """Return the normalised difference (cw - ccw) / (cw + ccw).

    Returns NaN when both counts sum to zero, since the ratio is undefined.
    """
    total = cw + ccw
    return (cw - ccw) / total if total != 0 else np.nan


def leave_one_out(files, counts):
    """Pool CW/CCW counts over all files except one, for every file in turn.

    Args:
        files: Sequence of file identifiers (keys of ``counts``).
        counts: Mapping file -> (a, b) -> dict with integer "CW" and "CCW".

    Returns:
        Dict keyed by the dropped file; each value maps (a, b) to a dict with
        the pooled "CW", "CCW", the imbalance "I" and the two-sided binomial
        p-value "p" (NaN when no events remain).
    """
    setting_pairs = [(0, 0), (0, 1), (1, 0), (1, 1)]
    out = {}
    for drop in files:
        kept = [f for f in files if f != drop]
        pooled = {}
        for ab in setting_pairs:
            cw_total = sum(counts[f][ab]["CW"] for f in kept)
            ccw_total = sum(counts[f][ab]["CCW"] for f in kept)
            n_events = cw_total + ccw_total
            if n_events > 0:
                p_value = binom_two_sided_p(cw_total, n_events)
            else:
                p_value = np.nan
            pooled[ab] = {
                "CW": cw_total,
                "CCW": ccw_total,
                "I": imbalance(cw_total, ccw_total),
                "p": p_value,
            }
        out[drop] = pooled
    return out

In [None]:
def sign_test_positive_I(files, counts, ab=(0, 1)):
    """Two-sided sign test on the per-file imbalance for one setting pair.

    Counts in how many files the imbalance I = (CW - CCW) / (CW + CCW) is
    strictly positive and tests that count against a fair coin (p = 0.5).
    Files without CW/CCW events (I is NaN) are ignored. Files with I == 0
    stay in the sample and count as "not positive".

    Args:
        files: Iterable of file identifiers (keys of ``counts``).
        counts: Mapping file -> (a, b) -> dict with integer "CW" and "CCW".
        ab: The (a, b) setting pair to test.

    Returns:
        Tuple ``(k, m, p)``: k files with I > 0, m files with data, and the
        two-sided p-value. p is NaN when m == 0 and is capped at 1.0
        (doubling the smaller tail can otherwise exceed 1 when k is close
        to m / 2).
    """
    from math import comb

    vals = []
    for f in files:
        I = imbalance(counts[f][ab]["CW"], counts[f][ab]["CCW"])
        if not np.isnan(I):
            vals.append(I)
    m = len(vals)
    k = sum(v > 0 for v in vals)
    if m == 0:
        # No usable files: a p-value is undefined (previously this returned 2).
        return k, m, np.nan
    # Probability of the smaller tail under p = 0.5, doubled for two sides.
    tail = sum(comb(m, i) * (0.5**m) for i in range(min(k, m - k) + 1))
    return k, m, min(1.0, 2 * tail)

In [6]:
# ============================================================
# X–θ / CW–CCW Robustness Suite (ONE CELL)
# Assumes: all NIST *.build.hdf5 files are already downloaded locally.
#
# What this cell does (end-to-end):
# 1) Discovers all HDF5 files in DATA_DIR
# 2) Autodetects settings arrays (alice/settings, bob/settings)
# 3) Autodetects a "phase-like" per-trial array (float, length ~N) OR lets you set PHASE_PATH
# 4) Computes CW/CCW from wrapped phase increments Δφ on S¹ (CW if Δφ>0, CCW if Δφ<0)
# 5) Builds per-file counts + valid_rate table
# 6) Pooled binomial tests + multiple-comparison correction
# 7) Per-file sign test for consistency
# 8) Leave-One-Out (LOO) stability
# 9) Time-shift null test (destroys correlations)
# 10) Optional “window sweep” hook (if you later wire a coincidence selector)
# 11) Prints a 1-page “Results + Robustness” note (brutally honest template)
#
# IMPORTANT:
# - If autodetection picks the wrong phase dataset, set PHASE_PATH explicitly.
# - Your earlier printout suggests CW/CCW are computed on a subset of coincidences.
#   Here, "valid" = events where we can compute a signed Δφ. You can tighten/loosen VALID_MASK.
# ============================================================

import os, math, glob, textwrap
from dataclasses import dataclass
from typing import Dict, Tuple, Any, Optional, List

import numpy as np
import pandas as pd
import h5py


# ----------------------------
# Config
# ----------------------------
@dataclass
class CFG:
    """Settings for the CW/CCW robustness suite; every field has a default."""

    # Directory that is searched for input files, and the glob pattern used
    # inside it (the pattern is evaluated recursively).
    DATA_DIR: str = "./data_nist_v7"  # change if needed
    FILE_GLOB: str = "**/*.build.hdf5"  # recursive
    # HDF5 dataset paths of the two per-trial settings arrays.
    A_SETTINGS_PATH: str = "alice/settings"
    B_SETTINGS_PATH: str = "bob/settings"

    # If None, we'll autodetect a phase-like float array length ~N.
    # If you KNOW the phase dataset path, set it here (recommended for stability).
    PHASE_PATH: Optional[str] = None

    # Phase increment threshold: ignore tiny Δφ that are numerically ambiguous.
    # Increments with |Δφ| <= DPHI_EPS are excluded from the valid events.
    DPHI_EPS: float = 1e-12

    # Time-shift null:
    # If timetags exist and you later wire coincidence matching, shift those.
    # For now, we destroy correlations by circularly shifting the PHASE array.
    NULL_SHIFT_FRAC: float = 0.37  # fraction of length to roll (irrational-ish)

    # Settings accepted values:
    # trials whose setting on either side is not in this tuple are not valid.
    SETTINGS_ALLOWED: Tuple[int, int] = (0, 1)

    # Multiple comparison correction across 4 (a,b) tests:
    # pooled p-values are multiplied by this factor (capped at 1).
    MCC_M: int = 4


# NOTE(review): this rebinds the name CFG from the dataclass to an instance of
# it. Everything below reads settings as CFG.<FIELD>, and the `cfg: CFG`
# annotations on later functions therefore refer to this instance, not the class.
CFG = CFG()

# The four (a, b) setting pairs that counts are reported for, in report order.
AB_LIST = [(0, 0), (0, 1), (1, 0), (1, 1)]


# ----------------------------
# Helpers: HDF5 discovery
# ----------------------------
def h5_list_datasets(h5: h5py.File) -> List[Tuple[str, Tuple[int, ...], str]]:
    """List every dataset in an open HDF5 file.

    Returns:
        One ``(name, shape, dtype)`` tuple per dataset found by
        ``h5.visititems``; the shape is a tuple and the dtype a string.
        Groups are skipped.
    """
    found = []

    def collect(path, node):
        # visititems also visits groups; only datasets are recorded.
        if not isinstance(node, h5py.Dataset):
            return
        found.append((path, tuple(node.shape), str(node.dtype)))

    h5.visititems(collect)
    return found


def pick_phase_path(
    h5: h5py.File, N: int, exclude_prefixes=("alice/settings", "bob/settings")
) -> Optional[str]:
    """
    Heuristically choose a "phase-like" dataset from an open HDF5 file.

    A dataset is a candidate when its name does not start with one of
    `exclude_prefixes`, it is 1D with a length between max(1000, int(0.7*N))
    and int(1.3*N), and its dtype string contains the letter "f".
    Candidates score points for 'phase' (5), 'phi' (4), 'theta' (3) and
    'angle' (2) in the lower-cased name, plus 1 for living under alice/ or bob/.

    Returns the name of the best candidate (ties are resolved by comparing
    the (score, name, length, dtype) tuples), or None if there is none.
    """
    keyword_bonus = (("phase", 5), ("phi", 4), ("theta", 3), ("angle", 2))
    min_len = max(1000, int(0.7 * N))
    max_len = int(1.3 * N)

    ranked = []
    for name, shape, dtype in h5_list_datasets(h5):
        if any(name.startswith(prefix) for prefix in exclude_prefixes):
            continue
        if len(shape) != 1 or not (min_len <= shape[0] <= max_len):
            continue
        # "float" itself contains "f", so this single test is the same
        # condition as checking for "float" or "f".
        if "f" not in dtype:
            continue

        lowered = name.lower()
        score = sum(bonus for word, bonus in keyword_bonus if word in lowered)
        # mild preference for alice/bob groups
        if lowered.startswith(("alice/", "bob/")):
            score += 1
        ranked.append((score, name, shape[0], dtype))

    if not ranked:
        return None
    return max(ranked)[1]


# ----------------------------
# Circular math
# ----------------------------
def wrap_pi(x):
    """Wrap angle(s) ``x`` (radians, scalar or array) into (-pi, pi].

    The previous form ``(x + pi) % (2*pi) - pi`` produced the interval
    [-pi, pi) instead, i.e. an exact half-turn came out as -pi. That
    contradicted the documented (-pi, pi] convention and flips the sign
    used for classifying such increments.
    """
    return np.pi - (np.pi - x) % (2 * np.pi)


def imbalance(cw, ccw):
    """Return (cw - ccw) / (cw + ccw), or NaN when there are no events."""
    total = cw + ccw
    if total == 0:
        return np.nan
    return (cw - ccw) / total


def binom_two_sided_exact(k, n, p=0.5):
    """Exact two-sided binomial p-value (sum of outcomes with prob <= prob(k)).

    Args:
        k: Observed number of successes, 0 <= k <= n.
        n: Number of trials.
        p: Success probability under the null hypothesis.

    Returns:
        The p-value as a float in [0, 1], or NaN when n == 0.

    Raises:
        ValueError: If k is outside 0..n.
    """
    if n == 0:
        return np.nan
    if not 0 <= k <= n:
        raise ValueError(f"k must satisfy 0 <= k <= n, got k={k}, n={n}")

    from math import lgamma, log

    if p <= 0.0 or p >= 1.0:
        # Degenerate null (log(0) would fail): all mass on one outcome.
        pmf = np.zeros(n + 1)
        pmf[0 if p <= 0.0 else n] = 1.0
    else:
        log_p, log_q = log(p), log(1.0 - p)
        log_n_fact = lgamma(n + 1)
        # pmf in log space so binomial coefficients cannot overflow
        logpmf = np.array(
            [
                log_n_fact - lgamma(i + 1) - lgamma(n - i + 1)
                + i * log_p + (n - i) * log_q
                for i in range(n + 1)
            ],
            float,
        )
        pmf = np.exp(logpmf - np.max(logpmf))
        pmf = pmf / pmf.sum()

    pk = pmf[k]
    # Relative (not absolute) tolerance: an absolute slack such as 1e-18 pulls
    # in every outcome below that level and floors small p-values near 1e-18.
    total = pmf[pmf <= pk * (1.0 + 1e-7)].sum()
    return float(min(1.0, total))


def wilson_ci(k, n, z=1.96):
    """Wilson score interval for the proportion k/n.

    Returns a ``(low, high)`` tuple clipped to [0, 1]; ``(nan, nan)`` if n == 0.
    """
    if n == 0:
        return (np.nan, np.nan)

    p_hat = k / n
    denom = 1 + z**2 / n
    center = (p_hat + z**2 / (2 * n)) / denom
    spread = math.sqrt((p_hat * (1 - p_hat) + z**2 / (4 * n)) / n)
    half_width = (z / denom) * spread

    low = max(0.0, center - half_width)
    high = min(1.0, center + half_width)
    return (low, high)


def sign_test(k_pos, m):
    """Two-sided exact sign test under p=0.5.

    Doubles the binomial probability of the smaller tail and caps the result
    at 1.0. Returns NaN when there are no observations (m == 0).
    """
    from math import comb

    if m == 0:
        return np.nan

    smaller_side = min(k_pos, m - k_pos)
    weight = 0.5**m
    tail = 0
    for i in range(smaller_side + 1):
        tail += comb(m, i) * weight
    return min(1.0, 2 * tail)


# ----------------------------
# Core: compute CW/CCW counts from one file
# ----------------------------
def compute_counts_for_file(
    h5path: str, cfg: CFG
) -> Dict[Tuple[int, int], Dict[str, Any]]:
    """
    Count CW/CCW phase increments per setting pair for one HDF5 file.

    Defines CW/CCW from the sign of wrapped phase increments:
      Δφ_i = wrap_pi(φ_{i+1} - φ_i)
      CW if Δφ_i > 0, CCW if Δφ_i < 0
    Δφ_i is attributed to the settings (a_i, b_i) at index i.

    "Valid events" are indices i where:
      - both settings are in cfg.SETTINGS_ALLOWED
      - |Δφ_i| > cfg.DPHI_EPS

    The settings arrays and the phase array are truncated to their common
    length N. The phase autodetection accepts arrays somewhat shorter than
    the settings arrays, which previously led to a shape mismatch when the
    masks were combined.

    Args:
        h5path: Path of the HDF5 file to read.
        cfg: Configuration (settings paths, PHASE_PATH, DPHI_EPS,
            SETTINGS_ALLOWED).

    Returns:
        Dict keyed by each pair in AB_LIST with the entries "CW", "CCW",
        "Valid", "N_trials", "PhasePath" and "valid_rate" (= Valid / (N-1),
        NaN if N <= 1).

    Raises:
        KeyError: If a settings dataset is missing or no phase dataset is found.
        ValueError: If the phase dataset is not one-dimensional.
    """
    with h5py.File(h5path, "r") as h5:
        if cfg.A_SETTINGS_PATH not in h5 or cfg.B_SETTINGS_PATH not in h5:
            raise KeyError(f"Missing settings paths in {h5path}")

        a = np.asarray(h5[cfg.A_SETTINGS_PATH]).astype(int)
        b = np.asarray(h5[cfg.B_SETTINGS_PATH]).astype(int)
        n_settings = min(len(a), len(b))

        phase_path = cfg.PHASE_PATH or pick_phase_path(h5, n_settings)
        if phase_path is None or phase_path not in h5:
            raise KeyError(
                f"Could not autodetect PHASE dataset in {h5path}. "
                f"Set CFG.PHASE_PATH explicitly."
            )
        phi = np.asarray(h5[phase_path])
        if phi.ndim != 1:
            raise ValueError(f"Phase dataset {phase_path} is not 1D in {h5path}")

    # Align all three arrays: the phase array may be shorter than the settings.
    N = min(n_settings, len(phi))
    a = a[:N]
    b = b[:N]
    phi = phi[:N].astype(float)

    # Compute Δφ on S¹; increment i pairs with the settings at index i.
    dphi = wrap_pi(phi[1:] - phi[:-1])
    a0 = a[:-1]
    b0 = b[:-1]

    allowed = list(cfg.SETTINGS_ALLOWED)
    valid = (
        np.isin(a0, allowed)
        & np.isin(b0, allowed)
        & (np.abs(dphi) > cfg.DPHI_EPS)
    )

    # CW/CCW
    cw_mask = valid & (dphi > 0)
    ccw_mask = valid & (dphi < 0)

    out = {}
    for aa, bb in AB_LIST:
        m = (a0 == aa) & (b0 == bb)
        n_valid = int(np.sum(valid & m))
        out[(aa, bb)] = {
            "CW": int(np.sum(cw_mask & m)),
            "CCW": int(np.sum(ccw_mask & m)),
            "Valid": n_valid,
            "N_trials": int(N),
            "PhasePath": phase_path,
            # N - 1 increments exist for N aligned trials.
            "valid_rate": np.nan if N <= 1 else n_valid / (N - 1),
        }

    return out


# ----------------------------
# Null: destroy correlations (phase roll)
# ----------------------------
def compute_counts_time_shift_null(
    h5path: str, cfg: CFG
) -> Dict[Tuple[int, int], Dict[str, Any]]:
    """
    Null test: keep the settings as they are, but circularly roll φ by a large
    offset before computing the CW/CCW counts, so that phase increments are no
    longer paired with the settings of their own trial. This should collapse
    the CW/CCW imbalance if the effect depends on real alignment.

    The roll offset is int(max(1, (N - 1) * cfg.NULL_SHIFT_FRAC)), where N is
    the common length of the settings and phase arrays. Because φ itself is
    rolled, one increment (across the wrap-around seam) joins the last and
    first original samples.

    Args:
        h5path: Path of the HDF5 file to read.
        cfg: Configuration (settings paths, PHASE_PATH, DPHI_EPS,
            SETTINGS_ALLOWED, NULL_SHIFT_FRAC).

    Returns:
        Dict keyed by each pair in AB_LIST with the entries "CW", "CCW",
        "Valid", "N_trials", "PhasePath", "NullShift" and "valid_rate".

    Raises:
        KeyError: If a settings dataset is missing or no phase dataset is found.
        ValueError: If the phase dataset is not one-dimensional.
    """
    with h5py.File(h5path, "r") as h5:
        if cfg.A_SETTINGS_PATH not in h5 or cfg.B_SETTINGS_PATH not in h5:
            raise KeyError(f"Missing settings paths in {h5path}")

        a = np.asarray(h5[cfg.A_SETTINGS_PATH]).astype(int)
        b = np.asarray(h5[cfg.B_SETTINGS_PATH]).astype(int)
        n_settings = min(len(a), len(b))

        phase_path = cfg.PHASE_PATH or pick_phase_path(h5, n_settings)
        if phase_path is None or phase_path not in h5:
            raise KeyError(
                f"Could not autodetect PHASE dataset in {h5path} for null. Set CFG.PHASE_PATH."
            )
        phi = np.asarray(h5[phase_path])
        if phi.ndim != 1:
            raise ValueError(f"Phase dataset {phase_path} is not 1D in {h5path}")

    # Align all three arrays: the phase array may be shorter than the settings,
    # which would otherwise cause a shape mismatch in the masks below.
    N = min(n_settings, len(phi))
    a = a[:N]
    b = b[:N]
    phi = phi[:N].astype(float)

    shift = int(max(1, (N - 1) * cfg.NULL_SHIFT_FRAC))
    phi_null = np.roll(phi, shift)

    dphi = wrap_pi(phi_null[1:] - phi_null[:-1])
    a0 = a[:-1]
    b0 = b[:-1]

    allowed = list(cfg.SETTINGS_ALLOWED)
    valid = (
        np.isin(a0, allowed)
        & np.isin(b0, allowed)
        & (np.abs(dphi) > cfg.DPHI_EPS)
    )
    cw_mask = valid & (dphi > 0)
    ccw_mask = valid & (dphi < 0)

    out = {}
    for aa, bb in AB_LIST:
        m = (a0 == aa) & (b0 == bb)
        n_valid = int(np.sum(valid & m))
        out[(aa, bb)] = {
            "CW": int(np.sum(cw_mask & m)),
            "CCW": int(np.sum(ccw_mask & m)),
            "Valid": n_valid,
            "N_trials": int(N),
            "PhasePath": phase_path,
            "NullShift": shift,
            "valid_rate": np.nan if N <= 1 else n_valid / (N - 1),
        }

    return out


# ----------------------------
# Discover files
# ----------------------------
# Recursive glob below CFG.DATA_DIR; sorting makes the processing order (and
# therefore the order of all report tables) reproducible.
files = sorted(glob.glob(os.path.join(CFG.DATA_DIR, CFG.FILE_GLOB), recursive=True))
if not files:
    raise FileNotFoundError(
        f"No files found under {CFG.DATA_DIR} with glob {CFG.FILE_GLOB}. Update CFG.DATA_DIR / CFG.FILE_GLOB."
    )

print(f"Found {len(files)} HDF5 files.")
print("First 3:", *files[:3], sep="\n  ")

# ----------------------------
# Compute per-file counts
# ----------------------------
# counts / null_counts: file path -> per-(a,b) count dicts for the real data
# and for the time-shift null. bad collects (file, error repr) for failures.
counts = {}
null_counts = {}
bad = []

for f in files:
    try:
        file_counts = compute_counts_for_file(f, CFG)
        file_null_counts = compute_counts_time_shift_null(f, CFG)
    except Exception as e:
        # Best effort: remember the failure and carry on with the other files.
        bad.append((f, repr(e)))
        continue
    # Store both results only when both succeeded, so every file in `counts`
    # is guaranteed to have a matching entry in `null_counts`.
    counts[f] = file_counts
    null_counts[f] = file_null_counts

if bad:
    print("\n[WARN] Some files failed (showing up to 5):")
    for bf, err in bad[:5]:
        print(" -", bf)
        print("   ", err)
    print(
        "\nFix by setting CFG.PHASE_PATH to the correct dataset path (and/or check file integrity)."
    )

good_files = [f for f in files if f in counts]
if not good_files:
    raise RuntimeError("No files processed successfully. Set CFG.PHASE_PATH correctly.")

# ----------------------------
# Build per-file summary table
# ----------------------------
# One row per successfully processed file; for every (a, b) pair the columns
# "<ab>_CW", "<ab>_CCW", "<ab>_I", "<ab>_valid_rate" and "<ab>_Valid" are added.
rows = []
for f in good_files:
    base = {"file": os.path.basename(f)}
    # keep the detected phase path for auditability
    # (it is stored identically under every pair, so (0, 0) is as good as any)
    base["phase_path"] = counts[f][(0, 0)].get("PhasePath", "")
    for ab in AB_LIST:
        CW = counts[f][ab]["CW"]
        CCW = counts[f][ab]["CCW"]
        V = counts[f][ab]["Valid"]
        base[f"{ab}_CW"] = CW
        base[f"{ab}_CCW"] = CCW
        base[f"{ab}_I"] = imbalance(CW, CCW)
        base[f"{ab}_valid_rate"] = counts[f][ab]["valid_rate"]
        base[f"{ab}_Valid"] = V
    rows.append(base)

df_file = pd.DataFrame(rows).sort_values("file")
# Widen the display limit so none of the per-pair columns are elided.
pd.set_option("display.max_columns", 200)
display(df_file)

# ----------------------------
# Pooled results + stats
# ----------------------------
# Sum the counts over all good files per (a, b) pair and test H0: p(CW) = 0.5.
pooled = {}
for ab in AB_LIST:
    CW = sum(counts[f][ab]["CW"] for f in good_files)
    CCW = sum(counts[f][ab]["CCW"] for f in good_files)
    V = sum(counts[f][ab]["Valid"] for f in good_files)
    n = CW + CCW
    p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
    # Bonferroni-style correction: multiply by the number of tests, cap at 1.
    p_bonf = min(1.0, p * CFG.MCC_M) if not np.isnan(p) else np.nan
    # Wilson interval for the CW proportion CW / n.
    ci = wilson_ci(CW, n) if n > 0 else (np.nan, np.nan)
    pooled[ab] = dict(
        CW=CW,
        CCW=CCW,
        Valid=V,
        n=n,
        I=imbalance(CW, CCW),
        p=p,
        p_bonf=p_bonf,
        ci_low=ci[0],
        ci_high=ci[1],
    )

df_pool = pd.DataFrame(
    [{"a": ab[0], "b": ab[1], **pooled[ab]} for ab in AB_LIST]
).sort_values(["a", "b"])
display(df_pool)

# ----------------------------
# Per-file consistency: sign test (does direction replicate?)
# ----------------------------
# For each (a, b) pair: count the files whose imbalance is strictly positive
# and run a two-sided sign test on that count.
sign_rows = []
for ab in AB_LIST:
    I_list = []
    for f in good_files:
        CW, CCW = counts[f][ab]["CW"], counts[f][ab]["CCW"]
        # Files without any CW/CCW event for this pair carry no sign information.
        if CW + CCW > 0:
            I_list.append(imbalance(CW, CCW))
    m = len(I_list)
    # Files with I == 0 stay in m but do not count as positive.
    kpos = sum(i > 0 for i in I_list)
    p_sign = sign_test(kpos, m)
    sign_rows.append(
        {
            "a": ab[0],
            "b": ab[1],
            "files_with_data": m,
            "files_I>0": kpos,
            "sign_test_p": p_sign,
        }
    )
df_sign = pd.DataFrame(sign_rows).sort_values(["a", "b"])
display(df_sign)

# ----------------------------
# Leave-one-file-out (LOO) stability (pooled)
# ----------------------------
# Re-pool the counts with each file left out in turn, to see whether a single
# file drives the pooled imbalance. One row per (dropped file, setting pair).
loo_rows = []
for drop in good_files:
    for ab in AB_LIST:
        CW = sum(counts[f][ab]["CW"] for f in good_files if f != drop)
        CCW = sum(counts[f][ab]["CCW"] for f in good_files if f != drop)
        n = CW + CCW
        p = binom_two_sided_exact(CW, n) if n > 0 else np.nan
        loo_rows.append(
            {
                "dropped": os.path.basename(drop),
                "a": ab[0],
                "b": ab[1],
                "CW": CW,
                "CCW": CCW,
                "n": n,
                "I": imbalance(CW, CCW),
                "p": p,
            }
        )
# Within each setting pair, rows are ordered by ascending p-value.
df_loo = pd.DataFrame(loo_rows).sort_values(["a", "b", "p"])
display(df_loo)

# ----------------------------
# Time-shift null comparison (pooled)
# ----------------------------
# Same pooling and binomial test as for the real data, applied to the counts
# obtained from the rolled phase array.
# NOTE: this indexes null_counts with every file in good_files, so each such
# file must have a null result as well.
null_pooled = {}
for ab in AB_LIST:
    CWn = sum(null_counts[f][ab]["CW"] for f in good_files)
    CCWn = sum(null_counts[f][ab]["CCW"] for f in good_files)
    nn = CWn + CCWn
    pn = binom_two_sided_exact(CWn, nn) if nn > 0 else np.nan
    null_pooled[ab] = dict(
        CW_null=CWn, CCW_null=CCWn, n_null=nn, I_null=imbalance(CWn, CCWn), p_null=pn
    )

df_null = pd.DataFrame(
    [{"a": ab[0], "b": ab[1], **null_pooled[ab]} for ab in AB_LIST]
).sort_values(["a", "b"])
display(df_null)

# ----------------------------
# Symmetry checks (cheap, essential)
# ----------------------------
# 1) If we flip CW/CCW definition, pooled imbalance must flip sign.
# NOTE(review): I_flipped is computed by swapping the two pooled counts, and
# imbalance(CCW, CW) equals -imbalance(CW, CCW) algebraically. This check
# therefore always passes; it exercises imbalance() but says nothing about the
# data or about how the CW/CCW masks are built.
sym_rows = []
for ab in AB_LIST:
    CW = pooled[ab]["CW"]
    CCW = pooled[ab]["CCW"]
    I = pooled[ab]["I"]
    I_flipped = imbalance(CCW, CW)
    sym_rows.append(
        {
            "a": ab[0],
            "b": ab[1],
            "I": I,
            "I_after_flip_definition": I_flipped,
            "check": "should be ~ -I",
        }
    )
df_sym = pd.DataFrame(sym_rows).sort_values(["a", "b"])
display(df_sym)


# ----------------------------
# 1-page Results + Robustness note (auto-filled template)
# ----------------------------
def fmt_p(x):
    """Format a p-value for the text report.

    NaN becomes "NA", an exact zero "0", values below 1e-4 use scientific
    notation with two decimals, everything else four fixed decimals.
    """
    if np.isnan(x):
        return "NA"
    if x == 0:
        return "0"
    return f"{x:.2e}" if x < 1e-4 else f"{x:.4f}"


# Assemble the plain-text report line by line. Static explanations are
# appended as literals; the numeric lines are filled from `pooled`, `df_sign`
# and `null_pooled` computed above. The finished note is printed between two
# rulers at the end.
note_lines = []
note_lines.append(
    "X–θ CW/CCW ORIENTATION ASYMMETRY — RESULTS + ROBUSTNESS (AUTO NOTE)\n"
)
note_lines.append("Definition of CW/CCW")
note_lines.append(
    "  • Let φ_i be a per-trial phase-like variable on S¹ (dataset: PHASE_PATH, autodetected if not specified)."
)
note_lines.append("  • Define Δφ_i = wrap_to_(-π,π](φ_{i+1} − φ_i).")
note_lines.append("  • CW event: Δφ_i > 0 ; CCW event: Δφ_i < 0.")
note_lines.append("  • Δφ_i is attributed to the settings (a_i, b_i).")
note_lines.append("")
note_lines.append("Definition of valid events subset + valid_rate")
note_lines.append("  • An event i is valid if (a_i,b_i) ∈ {0,1}×{0,1} and |Δφ_i| > ε.")
note_lines.append(f"  • ε = {CFG.DPHI_EPS:g}.")
note_lines.append("  • valid_rate is reported per (a,b) and per file as Valid / (N−1).")
note_lines.append("")
note_lines.append("Main pooled result (exact two-sided binomial test, H0: p(CW)=0.5)")
# One line per setting pair with the pooled counts, imbalance, raw and
# corrected p-value, and the Wilson interval of the CW proportion.
for ab in AB_LIST:
    r = pooled[ab]
    note_lines.append(
        f"  • (a,b)={ab}: CW={r['CW']}, CCW={r['CCW']}, n={r['n']}, "
        f"I={r['I']:.3f}, p={fmt_p(r['p'])}, p(Bonf×{CFG.MCC_M})={fmt_p(r['p_bonf'])}, "
        f"WilsonCI(CW proportion)=[{r['ci_low']:.3f},{r['ci_high']:.3f}]"
    )
note_lines.append("")
note_lines.append("Per-file consistency")
for ab in AB_LIST:
    # df_sign holds exactly one row per (a, b) pair; pick it out.
    s = df_sign[(df_sign["a"] == ab[0]) & (df_sign["b"] == ab[1])].iloc[0]
    note_lines.append(
        f"  • (a,b)={ab}: files with data={int(s['files_with_data'])}, "
        f"files with I>0={int(s['files_I>0'])}, sign-test p={fmt_p(s['sign_test_p'])}"
    )
note_lines.append("")
note_lines.append("LOO stability (leave-one-file-out pooled)")
note_lines.append(
    "  • Inspect df_loo: the effect is considered robust if (i) sign(I) is stable and (ii) p remains small across drops."
)
note_lines.append("")
note_lines.append("Time-shift null")
note_lines.append(
    "  • Null constructed by rolling φ by a large offset to destroy cross-trial alignment while preserving marginals."
)
for ab in AB_LIST:
    r = null_pooled[ab]
    note_lines.append(
        f"  • (a,b)={ab}: CW_null={r['CW_null']}, CCW_null={r['CCW_null']}, "
        f"n_null={r['n_null']}, I_null={r['I_null']:.3f}, p_null={fmt_p(r['p_null'])}"
    )
note_lines.append("")
note_lines.append("Symmetry checks")
note_lines.append(
    "  • CW/CCW definition flip should invert I (see df_sym). If not, a sign convention bug exists."
)
note_lines.append("")
note_lines.append("Window sweep stability")
note_lines.append(
    "  • Not executed in this cell unless you wire a coincidence-window selector."
)
note_lines.append(
    "  • Recommended: sweep your coincidence window / alignment tolerance and re-run this analysis; check sign stability vs window."
)
note_lines.append("")
note_lines.append("Notes / auditability")
note_lines.append(
    "  • If PHASE_PATH autodetection is wrong, set CFG.PHASE_PATH explicitly and re-run."
)
note_lines.append(
    "  • If valid_rate differs strongly by setting, report it and investigate selection bias."
)

print("\n" + "=" * 78)
print("\n".join(note_lines))
print("=" * 78)

Found 9 HDF5 files.
First 3:
  ./data_nist_v7\01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5
  ./data_nist_v7\02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5
  ./data_nist_v7\03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5

[WARN] Some files failed (showing up to 5):
 - ./data_nist_v7\01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5
    KeyError('Could not autodetect PHASE dataset in ./data_nist_v7\\01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5. Set CFG.PHASE_PATH explicitly.')
 - ./data_nist_v7\02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5
    KeyError('Could not autodetect PHASE dataset in ./data_nist_v7\\02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5. Set CFG.PHASE_PATH explicitly.')
 - ./data_nist_v7\03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5
    KeyError('Could not autodetect PHASE d

RuntimeError: No files processed successfully. Set CFG.PHASE_PATH correctly.

In [7]:
import os, glob, re
import numpy as np
import pandas as pd
import h5py

# Where to look for the downloaded files, and which ones to inspect.
DATA_DIR = "./data_nist_v7"
FILE_GLOB = "**/*.build.hdf5"

# Recursive glob; sorted so the per-file output appears in a stable order.
FILES = sorted(glob.glob(os.path.join(DATA_DIR, FILE_GLOB), recursive=True))
print(f"Found {len(FILES)} files")
if not FILES:
    # Raise instead of assert: assert statements are removed under `python -O`,
    # which would let the cell continue with nothing to inspect.
    raise FileNotFoundError("No HDF5 files found. Fix DATA_DIR / FILE_GLOB.")

# Substrings that mark a dataset path as possibly phase-related. The order
# matters: score_path gives keywords earlier in this list a higher score.
KEYWORDS = [
    "theta",
    "phase",
    "phi",
    "angle",
    "turn",
    "holonomy",
    "winding",
    "clock",
    "orient",
    "orientation",
    "dphi",
]
# Path prefixes of interest.
# NOTE(review): not referenced in the part of the cell visible here; the
# must_show filter further down spells its conditions out explicitly.
MUST_SHOW_PREFIX = ("alice/", "bob/", "timetags/", "coinc", "analysis/", "derived/")


def walk_datasets(h5):
    """Return a DataFrame describing every dataset in an open HDF5 file.

    Args:
        h5: Open ``h5py.File`` (any object with a compatible ``visititems``).

    Returns:
        DataFrame with one row per dataset and the columns "path", "ndim",
        "shape", "n0" (length of the first axis, None for scalar datasets)
        and "dtype" (as a string). The columns are present even when the file
        contains no datasets, so callers can always index them.
    """
    columns = ["path", "ndim", "shape", "n0", "dtype"]
    rows = []

    def visitor(name, obj):
        # visititems also visits groups; only datasets are recorded.
        if isinstance(obj, h5py.Dataset):
            shape = tuple(obj.shape)
            dtype = str(obj.dtype)
            ndim = len(shape)
            n0 = shape[0] if ndim >= 1 else None
            rows.append(
                {"path": name, "ndim": ndim, "shape": shape, "n0": n0, "dtype": dtype}
            )

    h5.visititems(visitor)
    # Explicit columns: an empty `rows` list would otherwise yield a frame
    # without any columns and break later lookups such as df["path"].
    return pd.DataFrame(rows, columns=columns)


def score_path(p):
    """Return a heuristic relevance score for the dataset path ``p``.

    Every entry of ``KEYWORDS`` that occurs in the lower-cased path adds
    ``10 - min(index, 8)`` points, so entries listed earlier weigh more.
    A path starting with ``alice/``, ``bob/``, ``analysis/`` or ``derived/``
    gets 2 extra points. Only the path text is examined; the dataset's dtype
    plays no part in the score.
    """
    lowered = p.lower()

    keyword_points = sum(
        10 - min(rank, 8) for rank, word in enumerate(KEYWORDS) if word in lowered
    )

    # Small bonus for living under one of the expected top-level groups.
    favoured_roots = ("alice/", "bob/", "analysis/", "derived/")
    bonus = 2 if lowered.startswith(favoured_roots) else 0

    return keyword_points + bonus


# Accumulators read by the summary code after the loop:
#   all_rows  - the annotated dataset table of every file
#   cand_rows - the top phase/theta candidates of each file that has any
all_rows, cand_rows = [], []

for f in FILES:
    fname = os.path.basename(f)
    show_cols = ["path", "shape", "dtype", "score"]

    # Only dataset metadata is collected, so the file is closed straight away.
    with h5py.File(f, "r") as h5:
        df = walk_datasets(h5)

    # Annotate each dataset row with its file, a lower-cased path, a
    # float-like flag derived from the dtype string, and its keyword score.
    df["file"] = fname
    df["path_l"] = df["path"].str.lower()
    df["is_float"] = df["dtype"].str.contains("float|f", case=False, regex=True)
    df["score"] = df["path"].map(score_path)

    # Rows worth listing: any keyword hit, plus the settings/clicks streams,
    # everything under analysis/ or derived/, and timetag/coincidence paths.
    must_show = df["path_l"].str.contains("|".join(KEYWORDS))
    for prefix in (
        "alice/settings",
        "bob/settings",
        "alice/clicks",
        "bob/clicks",
        "analysis/",
        "derived/",
    ):
        must_show = must_show | df["path_l"].str.startswith(prefix)
    for fragment in ("timetag", "coinc"):
        must_show = must_show | df["path_l"].str.contains(fragment)

    # Restrict the listing to 1D datasets; best score, float-like, longest first.
    is_1d = df["ndim"] == 1
    view = df[is_1d & must_show].sort_values(
        by=["score", "is_float", "n0"], ascending=False
    )

    banner = "=" * 90
    print("\n" + banner)
    print("FILE:", fname)
    print(banner)
    if len(view):
        display(view[show_cols].head(60))
    else:
        print(
            "No 1D datasets matched the filter. (File may store only group-of-groups or 2D structures.)"
        )

    # A phase/theta candidate is a 1D float-like dataset with a keyword hit.
    cands = df[is_1d & df["is_float"] & (df["score"] > 0)].sort_values(
        by=["score", "n0"], ascending=False
    )
    if len(cands):
        top = cands.head(10)[show_cols].assign(file=fname)
        cand_rows.append(top)
        print("\nSuggested PHASE_PATH candidates (top 10):")
        display(top)

    all_rows.append(df)

# Stack the per-file tables into one frame with a fresh 0..N-1 index.
df_all = pd.concat(all_rows, ignore_index=True)

rule = "#" * 90
print("\n" + rule)
print("CROSS-FILE SUMMARY (helpful paths that repeat across files)")
print(rule)

# One row per 1D dataset path: how many files contain it, its best score,
# whether it is float-like in any file, and its largest first-axis length.
per_path = df_all[df_all["ndim"] == 1].groupby("path").agg(
    files=("file", "nunique"),
    max_score=("score", "max"),
    any_float=("is_float", "max"),
    max_n0=("n0", "max"),
)
common = per_path.reset_index()

# Keep paths with a positive score, plus the raw event streams by name.
worth_listing = (common["max_score"] > 0) | (
    common["path"].str.lower().str.contains("settings|clicks|timetag|coinc")
)
common = common[worth_listing].sort_values(
    by=["files", "max_score", "any_float", "max_n0"], ascending=False
)

display(common.head(80))

# Final report: either every per-file candidate table merged into one, or a
# note that no file offered a 1D float dataset matching the keywords.
if not cand_rows:
    print("\nNo obvious θ/phase datasets were found as 1D float arrays.")
    print(
        "That means: θ/phase is likely NOT stored directly, and must be DERIVED from clicks/timetags/coincidence structures."
    )
else:
    df_cand = pd.concat(cand_rows, ignore_index=True)
    rule = "#" * 90
    print("\n" + rule)
    print("ALL CANDIDATE θ/phase datasets found (across all files)")
    print(rule)
    # Highest score first; ties are ordered by file name.
    ranked = df_cand.sort_values(["score", "file"], ascending=[False, True])
    display(ranked)

Found 9 files

FILE: 01_11_CH_pockel_100kHz.run4.afterTimingfix.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
1,alice/clicks,"(337818639,)",uint16,2
2,alice/settings,"(337818639,)",uint8,2
4,bob/clicks,"(337818639,)",uint16,2
5,bob/settings,"(337818639,)",uint8,2



FILE: 02_54_CH_pockel_100kHz.run4.afterTimingfix2.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
2,alice/settings,"(213469615,)",uint8,2
6,bob/settings,"(213469608,)",uint8,2
1,alice/clicks,"(203681462,)",uint16,2
5,bob/clicks,"(203681462,)",uint16,2



FILE: 03_31_CH_pockel_100kHz.run4.afterTimingfix2_training.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
8,bob/phase,"(15,)",float64,11
3,alice/phase,"(11,)",float64,11
4,alice/settings,"(28526907,)",uint8,2
9,bob/settings,"(28526894,)",uint8,2
1,alice/clicks,"(363,)",uint16,2
6,bob/clicks,"(363,)",uint16,2



Suggested PHASE_PATH candidates (top 10):


Unnamed: 0,path,shape,dtype,score,file
8,bob/phase,"(15,)",float64,11,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...
3,alice/phase,"(11,)",float64,11,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...



FILE: 03_43_CH_pockel_100kHz.run4.afterTimingfix2_afterfixingModeLocking.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
3,alice/phase,"(5084060,)",float64,11
8,bob/phase,"(4339158,)",float64,11
1,alice/clicks,"(107109596,)",uint16,2
4,alice/settings,"(107109596,)",uint8,2
6,bob/clicks,"(107109596,)",uint16,2
9,bob/settings,"(107109596,)",uint8,2



Suggested PHASE_PATH candidates (top 10):


Unnamed: 0,path,shape,dtype,score,file
3,alice/phase,"(5084060,)",float64,11,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...
8,bob/phase,"(4339158,)",float64,11,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...



FILE: 17_04_CH_pockel_100kHz.run.completeblind.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
3,alice/phase,"(17656721,)",float64,11
9,bob/phase,"(15191573,)",float64,11
1,alice/clicks,"(356464574,)",uint16,2
4,alice/settings,"(356464574,)",uint8,2
7,bob/clicks,"(356464574,)",uint16,2
10,bob/settings,"(356464574,)",uint8,2



Suggested PHASE_PATH candidates (top 10):


Unnamed: 0,path,shape,dtype,score,file
3,alice/phase,"(17656721,)",float64,11,17_04_CH_pockel_100kHz.run.completeblind.dat.c...
9,bob/phase,"(15191573,)",float64,11,17_04_CH_pockel_100kHz.run.completeblind.dat.c...



FILE: 19_45_CH_pockel_100kHz.run.nolightconeshift.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
3,alice/phase,"(9083993,)",float64,11
8,bob/phase,"(7467584,)",float64,11
1,alice/clicks,"(182659855,)",uint16,2
4,alice/settings,"(182659855,)",uint8,2
6,bob/clicks,"(182659855,)",uint16,2
9,bob/settings,"(182659855,)",uint8,2



Suggested PHASE_PATH candidates (top 10):


Unnamed: 0,path,shape,dtype,score,file
3,alice/phase,"(9083993,)",float64,11,19_45_CH_pockel_100kHz.run.nolightconeshift.da...
8,bob/phase,"(7467584,)",float64,11,19_45_CH_pockel_100kHz.run.nolightconeshift.da...



FILE: 21_15_CH_pockel_100kHz.run.200nsadditiondelay_lightconeshift.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
0,alice/clicks,"(178703041,)",uint16,2
1,alice/settings,"(178703041,)",uint8,2
2,bob/clicks,"(178703041,)",uint16,2
3,bob/settings,"(178703041,)",uint8,2



FILE: 22_20_CH_pockel_100kHz.run.200nsreduceddelay_lightconeshift.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
1,alice/clicks,"(179001561,)",uint16,2
2,alice/settings,"(179001561,)",uint8,2
3,bob/clicks,"(179001561,)",uint16,2
4,bob/settings,"(179001561,)",uint8,2



FILE: 23_55_CH_pockel_100kHz.run.ClassicalRNGXOR.dat.compressed.build.hdf5


Unnamed: 0,path,shape,dtype,score
1,alice/clicks,"(178801331,)",uint16,2
2,alice/settings,"(178801331,)",uint8,2
4,bob/clicks,"(178801331,)",uint16,2
5,bob/settings,"(178801331,)",uint8,2



##########################################################################################
CROSS-FILE SUMMARY (helpful paths that repeat across files)
##########################################################################################


Unnamed: 0,path,files,max_score,any_float,max_n0
0,alice/clicks,9,2,False,356464574.0
3,alice/settings,9,2,False,356464574.0
5,bob/clicks,9,2,False,356464574.0
8,bob/settings,9,2,False,356464574.0
2,alice/phase,4,11,True,17656721.0
7,bob/phase,4,11,True,15191573.0
1,alice/laserPulseNumber,4,2,False,17656721.0
6,bob/laserPulseNumber,4,2,False,15191573.0
4,alice/syncNumber,2,2,False,17656721.0
9,bob/syncNumber,2,2,False,15191573.0



##########################################################################################
ALL CANDIDATE θ/phase datasets found (across all files)
##########################################################################################


Unnamed: 0,path,shape,dtype,score,file
0,bob/phase,"(15,)",float64,11,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...
1,alice/phase,"(11,)",float64,11,03_31_CH_pockel_100kHz.run4.afterTimingfix2_tr...
2,alice/phase,"(5084060,)",float64,11,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...
3,bob/phase,"(4339158,)",float64,11,03_43_CH_pockel_100kHz.run4.afterTimingfix2_af...
4,alice/phase,"(17656721,)",float64,11,17_04_CH_pockel_100kHz.run.completeblind.dat.c...
5,bob/phase,"(15191573,)",float64,11,17_04_CH_pockel_100kHz.run.completeblind.dat.c...
6,alice/phase,"(9083993,)",float64,11,19_45_CH_pockel_100kHz.run.nolightconeshift.da...
7,bob/phase,"(7467584,)",float64,11,19_45_CH_pockel_100kHz.run.nolightconeshift.da...
