# Compare Methods (1/2/3)

This notebook compares privacy, utility, and distortion across Method 1 (SNP Laplace DP), Method 2 (Haploblock DP), and Method 3 (GRR LDP) for chr2_5Mb and chr10_1Mb using a common epsilon grid.


In [1]:
import json
from pathlib import Path
import glob
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# PROJECT_ROOT detection
# -------------------------
def find_project_root(start=None) -> Path:
    """Locate the project root by walking up from a starting directory.

    The starting directory (``start`` or, when falsy, the current working
    directory) is resolved first. It and each of its ancestors are then
    checked from nearest to farthest, and the first directory containing an
    entry named ``data``, ``.git`` or ``requirements.txt`` is returned.

    If no directory qualifies, the resolved starting directory is returned.
    """
    markers = ("data", ".git", "requirements.txt")
    origin = Path(start or Path.cwd()).resolve()
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return origin

# Project root; every input and output path below is derived from it.
PROJECT_ROOT = find_project_root()
print("PROJECT_ROOT =", PROJECT_ROOT)

# Output folders for the comparison figures and tables (created on demand).
FIG_DIR = PROJECT_ROOT / "results" / "figures" / "comparisons"
TAB_DIR = PROJECT_ROOT / "results" / "tables" / "comparisons"
FIG_DIR.mkdir(parents=True, exist_ok=True)
TAB_DIR.mkdir(parents=True, exist_ok=True)

# Regions compared in this notebook; loaded runs are filtered to these names.
regions = ["chr2_5Mb", "chr10_1Mb"]
# Common epsilon grid for the three methods.
eps_grid = [0.03, 0.1, 0.3, 1.0, 3.0, 10.0]

# Method key -> human-readable label used in plot legends and tables.
methods = {
    "method1": "Method 1 (SNP Laplace DP)",
    "method2": "Method 2 (Haploblock DP)",
    "method3": "Method 3 (GRR LDP)",
}

case_pop_json = PROJECT_ROOT / "data" / "processed" / "hapmap" / "cohorts" / "hapmap_case_pop.json"
if not case_pop_json.exists():
    raise FileNotFoundError(f"Missing case_pop JSON: {case_pop_json}")
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
case_info = json.loads(case_pop_json.read_text(encoding="utf-8"))
CASE_POP = case_info["case_pop"]
print("CASE_POP =", CASE_POP)

# p-value threshold at or below which a SNP counts as significant.
PVALUE_CUTOFF = 1e-3
# Allele frequencies are clipped to [P_CLIP, 1 - P_CLIP] before taking logs.
P_CLIP = 1e-6


PROJECT_ROOT = /Users/erkmenerken/Desktop/proje430
CASE_POP = MKK


In [2]:
# -------------------------
# Output helpers
# -------------------------
def with_timestamp(path: Path) -> Path:
    """Return a path that does not exist yet, derived from ``path``.

    If ``path`` is free it is returned unchanged. Otherwise a
    ``_YYYYmmdd_HHMMSS`` timestamp is inserted before the suffix. Should that
    name be taken as well (for example two saves within the same second), a
    numeric counter (``_1``, ``_2``, ...) is appended after the timestamp
    until an unused name is found.

    Only the name is computed; no file is created.
    """
    if not path.exists():
        return path
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    candidate = path.with_name(f"{path.stem}_{ts}{path.suffix}")
    counter = 1
    # Second-level timestamps can collide; never hand back an existing file.
    while candidate.exists():
        candidate = path.with_name(f"{path.stem}_{ts}_{counter}{path.suffix}")
        counter += 1
    return candidate


def save_csv_no_overwrite(df: pd.DataFrame, path: Path) -> Path:
    """Write ``df`` as CSV (without the index) and return the path used.

    The destination is chosen by ``with_timestamp(path)``, so the returned
    path may differ from ``path``.
    """
    destination = with_timestamp(path)
    df.to_csv(destination, index=False)
    return destination


def df_to_markdown_simple(df: pd.DataFrame) -> str:
    """Render ``df`` as a pipe-delimited Markdown table.

    The result has a header line, a ``---`` separator line and one line per
    row, joined with newlines (no trailing newline). The index is not
    included. Column labels and cell values are converted with ``str``.

    Rows are read with ``itertuples`` so each cell keeps its column's dtype;
    row-wise ``iterrows`` would upcast integers to floats in numeric frames
    (``1`` rendered as ``1.0``). Converting the labels with ``str`` also
    allows non-string column names.
    """
    labels = [str(col) for col in df.columns]
    lines = [
        "| " + " | ".join(labels) + " |",
        "| " + " | ".join(["---"] * len(labels)) + " |",
    ]
    lines.extend(
        "| " + " | ".join(str(value) for value in record) + " |"
        for record in df.itertuples(index=False, name=None)
    )
    return "\n".join(lines)


def save_md_no_overwrite(text: str, path: Path) -> Path:
    """Write ``text`` to a Markdown file and return the path used.

    The destination is chosen by ``with_timestamp(path)``, so the returned
    path may differ from ``path``. The file is written as UTF-8 rather than
    the platform's locale encoding, so non-ASCII characters in the table are
    stored the same way on every machine.
    """
    out = with_timestamp(path)
    out.write_text(text, encoding="utf-8")
    return out


SyntaxError: EOL while scanning string literal (409694631.py, line 22)

In [None]:
# -------------------------
# Load per-seed CSVs if available
# -------------------------
def load_method_per_seed(method: str) -> "pd.DataFrame | None":
    """Load the per-seed results CSV for ``method`` if one exists.

    Two file names are tried under ``results/tables/<method>/``, in order:
    ``<method>_per_seed_<CASE_POP>.csv`` and ``<method>_per_seed.csv``. The
    first existing file is read and a ``_source_csv`` column holding its path
    is added.

    Returns:
        The loaded frame, or ``None`` when neither file exists (callers use
        ``None`` to trigger the NPZ fallback).
    """
    table_dir = PROJECT_ROOT / "results" / "tables" / method
    candidates = (
        table_dir / f"{method}_per_seed_{CASE_POP}.csv",
        table_dir / f"{method}_per_seed.csv",
    )
    csv_path = next((p for p in candidates if p.exists()), None)
    if csv_path is None:
        return None
    df = pd.read_csv(csv_path)
    df["_source_csv"] = str(csv_path)
    return df

def normalize_per_seed(method: str, df: pd.DataFrame) -> pd.DataFrame:
    """Convert a method's per-seed frame to the common comparison schema.

    The input must contain ``region``, ``eps``, ``seed``, ``jaccard``,
    ``maf_mae`` and an LR-power column (``lr_power_case_above_threshold`` for
    ``method2``, ``lr_power`` otherwise). The metric columns are renamed to
    ``lr_power``, ``utility_jaccard`` and ``distortion_mae``, and ``method`` /
    ``method_label`` columns are added.

    Returns:
        A new frame with exactly the columns ``method``, ``method_label``,
        ``region``, ``eps``, ``seed``, ``lr_power``, ``utility_jaccard`` and
        ``distortion_mae``. The input frame is not modified.

    Raises:
        ValueError: if a required column is missing.
    """
    df = df.copy()
    if "region" not in df.columns:
        raise ValueError(f"Missing region column in {method} per-seed CSV")

    lr_col = "lr_power_case_above_threshold" if method == "method2" else "lr_power"
    # eps and seed are checked too, so a malformed CSV fails with the same
    # kind of message instead of a bare KeyError at the final selection.
    for required in (lr_col, "jaccard", "maf_mae", "eps", "seed"):
        if required not in df.columns:
            raise ValueError(f"Missing {required} in {method} per-seed CSV")

    renames = {
        lr_col: "lr_power",
        "jaccard": "utility_jaccard",
        "maf_mae": "distortion_mae",
    }
    # A column already carrying a target name would produce duplicate column
    # labels after the rename; drop it so the renamed source column wins.
    stale = [new for old, new in renames.items() if new != old and new in df.columns]
    df = df.drop(columns=stale).rename(columns=renames)

    df["method"] = method
    df["method_label"] = methods[method]
    return df[["method", "method_label", "region", "eps", "seed", "lr_power", "utility_jaccard", "distortion_mae"]]

# Load every method's per-seed CSV; remember the methods that have none so a
# later cell can fall back to the NPZ outputs.
per_seed_frames = []
missing_methods = []
for method in methods:
    df = load_method_per_seed(method)
    if df is not None:
        per_seed_frames.append(normalize_per_seed(method, df))
    else:
        missing_methods.append(method)

if missing_methods:
    print("[WARN] Missing per-seed CSV for:", ", ".join(missing_methods))

if per_seed_frames:
    df_runs = pd.concat(per_seed_frames, ignore_index=True)
else:
    df_runs = pd.DataFrame()

if df_runs.empty:
    print("No per-seed CSVs loaded.")
else:
    # Keep only the regions under comparison.
    df_runs = df_runs[df_runs["region"].isin(regions)].reset_index(drop=True)
    print("Loaded per-seed rows:", len(df_runs))


In [None]:
# -------------------------
# Fallback: discover NPZ runs (if per-seed CSV missing)
# -------------------------
def parse_eps_from_dir(name: str) -> float:
    """Parse the epsilon value from an ``eps_<value>`` directory name.

    Every ``p`` in the part after the prefix is read as a decimal point, so
    ``eps_0p03`` gives ``0.03``.

    Raises:
        ValueError: if the name lacks the ``eps_`` prefix or the remainder
            is not a valid float.
    """
    prefix = "eps_"
    if name[: len(prefix)] != prefix:
        raise ValueError(f"Invalid eps dir: {name}")
    encoded = name[len(prefix):]
    return float(encoded.replace("p", "."))

def parse_seed_from_dir(name: str) -> int:
    """Parse the integer seed from a ``seed_<n>`` directory name.

    Raises:
        ValueError: if the name lacks the ``seed_`` prefix or the remainder
            is not a valid integer.
    """
    prefix = "seed_"
    if name[: len(prefix)] != prefix:
        raise ValueError(f"Invalid seed dir: {name}")
    digits = name[len(prefix):]
    return int(digits)

def discover_runs(method: str, region: str):
    """Find the NPZ outputs of ``method`` for ``region``.

    Files are searched at ``data/derived/<method>/eps_*/seed_*/*.npz`` under
    the project root. A file is kept when its name contains the region key
    and, for the known methods, one of the method's file-name tags. Epsilon
    and seed are parsed from the two enclosing directory names; files whose
    directories do not parse are skipped.

    Returns:
        A list of ``{"eps": float, "seed": int, "npz_path": Path}`` dicts in
        sorted path order (empty when the method directory does not exist).
    """
    base = PROJECT_ROOT / "data" / "derived" / method
    if not base.exists():
        return []
    pattern = str(base / "eps_*" / "seed_*" / "*.npz")

    # NOTE(review): substring matching means "chr2" also matches file names
    # containing "chr20".."chr22"; harmless for the two regions used here.
    region_key = "chr2" if region.startswith("chr2") else "chr10"

    # File-name tags that identify each method's released-frequency files.
    if method in ("method1", "method2"):
        required_tags = ("case_maf_noisy",)
    elif method == "method3":
        required_tags = ("case_maf_ldp", "grr", "method3")
    else:
        required_tags = None

    out = []
    # glob order is filesystem dependent; sort for reproducible output.
    for p in sorted(Path(match) for match in glob.glob(pattern)):
        name = p.name
        if region_key not in name:
            continue
        if required_tags is not None and not any(tag in name for tag in required_tags):
            continue

        try:
            eps = parse_eps_from_dir(p.parent.parent.name)
            seed = parse_seed_from_dir(p.parent.name)
        except ValueError:
            # Only the parsers' own failure mode is tolerated; anything else
            # should surface instead of being silently skipped.
            continue

        out.append({"eps": eps, "seed": seed, "npz_path": p})
    return out

def load_region_npz(path: Path) -> dict:
    """Load every array stored in an ``.npz`` archive into a plain dict.

    The archive is opened in a ``with`` block so its file handle is closed
    once all arrays have been read.

    Returns:
        A dict mapping each array name in the archive to its array.
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code while
    # unpickling object arrays; only load archives from trusted sources.
    with np.load(path, allow_pickle=True) as z:
        return {k: z[k] for k in z.files}

def _pick_genotype_matrix(region: dict) -> np.ndarray:
    for key in ["G", "G_sub", "genotypes", "X"]:
        if key in region:
            G = region[key]
            if isinstance(G, np.ndarray) and G.ndim == 2:
                return G
    for v in region.values():
        if isinstance(v, np.ndarray) and v.ndim == 2:
            return v
    raise ValueError("No genotype matrix found in region NPZ")

def _ensure_snps_by_individuals(G: np.ndarray) -> np.ndarray:
    return G.T if G.shape[0] < G.shape[1] else G

def compute_counts_and_maf(G: np.ndarray, idx: list):
    """Compute per-row call counts, allele counts and allele frequencies.

    Only the columns of ``G`` listed in ``idx`` are used. Negative entries
    are treated as missing and contribute neither to the call count nor to
    the allele count.

    Returns:
        A tuple ``(called, mac, maf)``: ``called`` is the int32 number of
        non-missing entries per row, ``mac`` is the float64 sum of the
        non-missing entries per row, and ``maf`` is ``mac / (2 * called)``
        with the denominator floored at 2 so rows with no calls give 0.
    """
    genos = G[:, idx]
    is_called = genos >= 0
    n_called = is_called.sum(axis=1).astype(np.int32)
    allele_count = np.where(is_called, genos, 0).sum(axis=1).astype(np.float64)
    # Floor the call count at 1 to avoid dividing by zero.
    freq = allele_count / (2.0 * np.maximum(n_called, 1))
    return n_called, allele_count, freq

def lr_scores(G_snps_x_ind: np.ndarray, cols: list, p_case: np.ndarray, p_ctrl: np.ndarray) -> np.ndarray:
    """Compute one log-likelihood-ratio score per selected column.

    Both frequency vectors are clipped to ``[P_CLIP, 1 - P_CLIP]`` and turned
    into genotype probabilities ``(1-p)^2``, ``2p(1-p)`` and ``p^2``. For each
    column in ``cols``, the score is the sum over non-missing rows (entries
    ``>= 0``) of ``log P_case(g) - log P_ctrl(g)``. Genotype 0 uses the first
    probability, 1 the second, and any other value the third. A column with
    no called entries scores 0.0.

    Returns:
        A float array with one score per entry of ``cols``.
    """
    def _genotype_probs(freq):
        freq = np.clip(freq, P_CLIP, 1.0 - P_CLIP)
        return (1.0 - freq) ** 2, 2.0 * freq * (1.0 - freq), freq ** 2

    def _log_lik(geno, called, probs):
        hom_ref, het, hom_alt = probs
        picked = np.where(
            geno == 0,
            hom_ref[called],
            np.where(geno == 1, het[called], hom_alt[called]),
        )
        return np.log(picked)

    case_probs = _genotype_probs(p_case)
    ctrl_probs = _genotype_probs(p_ctrl)

    selected = G_snps_x_ind[:, cols]
    n_cols = selected.shape[1]
    result = np.empty(n_cols, dtype=float)
    for j in range(n_cols):
        column = selected[:, j]
        called = column >= 0
        if not called.any():
            result[j] = 0.0
            continue
        geno = column[called].astype(int)
        diff = _log_lik(geno, called, case_probs) - _log_lik(geno, called, ctrl_probs)
        result[j] = float(np.sum(diff))
    return result

def chisq_pvalues_case_vs_control(mac_case, called_case, mac_ctrl, called_ctrl):
    """Compute a chi-square p-value per SNP from case/control allele counts.

    For each SNP a 2x2 table ``[[case_alt, case_ref], [ctrl_alt, ctrl_ref]]``
    is built, where ``alt`` is the given allele count and ``ref`` is
    ``2 * called - alt``, and passed to ``scipy.stats.chi2_contingency``.
    Tables it rejects with ``ValueError`` (for example a zero expected
    frequency) get a p-value of 1.0.

    Args:
        mac_case, called_case, mac_ctrl, called_ctrl: equal-length 1-D
            sequences; mismatched lengths raise ``ValueError`` when the
            tables are stacked.

    Returns:
        A float array with one p-value per SNP.
    """
    from scipy.stats import chi2_contingency

    case_alt = np.asarray(mac_case, dtype=float)
    case_ref = 2.0 * np.asarray(called_case, dtype=float) - case_alt
    ctrl_alt = np.asarray(mac_ctrl, dtype=float)
    ctrl_ref = 2.0 * np.asarray(called_ctrl, dtype=float) - ctrl_alt

    # Shape (n_snps, 2, 2): tables[i] is the contingency table of SNP i.
    tables = np.stack(
        [
            np.stack([case_alt, case_ref], axis=-1),
            np.stack([ctrl_alt, ctrl_ref], axis=-1),
        ],
        axis=-2,
    )

    pvals = []
    for table in tables:
        try:
            _, p, _, _ = chi2_contingency(table)
        except ValueError:
            # Degenerate table: treat the SNP as not significant.
            p = 1.0
        pvals.append(p)
    return np.array(pvals, dtype=float)

def overlap_metrics(sig_true: np.ndarray, sig_noisy: np.ndarray):
    """Return the Jaccard index of two boolean significance masks.

    The index is the number of positions set in both masks divided by the
    number set in at least one. It is 0.0 when neither mask has any position
    set.
    """
    n_union = np.count_nonzero(np.logical_or(sig_true, sig_noisy))
    if n_union == 0:
        return 0.0
    n_both = np.count_nonzero(np.logical_and(sig_true, sig_noisy))
    return float(n_both / n_union)

def compute_metrics_from_npz(method: str, region: str, npz_path: Path):
    """Recompute privacy, utility and distortion for one released-MAF NPZ.

    The released frequencies are read from ``released_maf`` (or
    ``noisy_maf``) in ``npz_path``. The case and control genotype files are
    looked up in ``case_info["regions"][region]``. If the NPZ has no
    train/test indices, the case individuals are shuffled with the NPZ's
    ``seed`` (0 when absent) and split 70/30.

    Both NPZ archives opened here are read inside ``with`` blocks so their
    file handles are closed.

    Args:
        method: method key; not used in the computation.
        region: key into ``case_info["regions"]``.
        npz_path: path of the run's NPZ file.

    Returns:
        A tuple ``(lr_power, jaccard, distortion)``. ``lr_power`` is the
        fraction of training (member) LR scores at or above the 95th
        percentile of the test (non-member) scores. ``jaccard`` is the
        overlap of SNPs significant at ``PVALUE_CUTOFF`` using true versus
        released case counts. ``distortion`` is the mean absolute difference
        between released and true frequencies.
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code while
    # unpickling object arrays; only load archives from trusted sources.
    with np.load(npz_path, allow_pickle=True) as z:
        p_case = z["released_maf"] if "released_maf" in z else z["noisy_maf"]
        true_maf = z["true_maf"] if "true_maf" in z else None
        case_train_idx = z["case_train_idx"].astype(int).tolist() if "case_train_idx" in z else None
        case_test_idx = z["case_test_idx"].astype(int).tolist() if "case_test_idx" in z else None
        needs_split = case_train_idx is None or case_test_idx is None
        # The seed is only read when the split has to be recreated.
        split_seed = int(z["seed"]) if needs_split and "seed" in z else 0

    region_info = case_info["regions"][region]
    case_region = load_region_npz(PROJECT_ROOT / region_info["case_region"])
    ceu_region = load_region_npz(PROJECT_ROOT / region_info["ceu_region"])
    with np.load(PROJECT_ROOT / region_info["ceu_control_npz"], allow_pickle=True) as ctrl_npz:
        ceu_control_idx = ctrl_npz["indices"].astype(int).tolist()

    G_case = _ensure_snps_by_individuals(_pick_genotype_matrix(case_region))
    G_ceu = _ensure_snps_by_individuals(_pick_genotype_matrix(ceu_region))

    if needs_split:
        # fallback: deterministic split by seed if possible
        rng = np.random.default_rng(split_seed)
        idx_all = list(range(G_case.shape[1]))
        rng.shuffle(idx_all)
        n_train = max(1, int(round(0.7 * len(idx_all))))
        case_train_idx = idx_all[:n_train]
        case_test_idx = idx_all[n_train:]

    # true maf
    called_case, mac_case, maf_case = compute_counts_and_maf(G_case, case_train_idx)
    if true_maf is None:
        true_maf = maf_case
    # control
    called_ctrl, mac_ctrl, maf_ctrl = compute_counts_and_maf(G_ceu, ceu_control_idx)

    # distortion
    distortion = float(np.mean(np.abs(p_case - true_maf)))

    # privacy: members are the training individuals, non-members the test ones
    scores_member = lr_scores(G_case, case_train_idx, p_case=p_case, p_ctrl=maf_ctrl)
    scores_nonmember = lr_scores(G_case, case_test_idx, p_case=p_case, p_ctrl=maf_ctrl)
    thresh = float(np.quantile(scores_nonmember, 0.95))
    lr_power = float(np.mean(scores_member >= thresh))

    # utility: turn released frequencies back into counts on the true call totals
    mac_case_noisy = np.clip(np.rint(p_case * (2.0 * called_case)), 0.0, 2.0 * called_case)
    p_true = chisq_pvalues_case_vs_control(mac_case, called_case, mac_ctrl, called_ctrl)
    p_noisy = chisq_pvalues_case_vs_control(mac_case_noisy, called_case, mac_ctrl, called_ctrl)
    sig_true = p_true <= PVALUE_CUTOFF
    sig_noisy = p_noisy <= PVALUE_CUTOFF
    jacc = overlap_metrics(sig_true, sig_noisy)

    return lr_power, jacc, distortion

# Recompute metrics from NPZ outputs for every method without a per-seed CSV.
fallback_rows = []
for method in missing_methods:
    if method == "method2":
        print("[WARN] method2 missing per-seed CSV; skipping NPZ fallback to avoid mismatched cohorts.")
        continue
    for region in regions:
        runs = discover_runs(method, region)
        if not runs:
            print(f"[WARN] No NPZ runs found for {method} {region}")
            continue
        for run in runs:
            lr_power, jacc, distortion = compute_metrics_from_npz(method, region, run["npz_path"])
            fallback_rows.append(
                dict(
                    method=method,
                    method_label=methods[method],
                    region=region,
                    eps=run["eps"],
                    seed=run["seed"],
                    lr_power=lr_power,
                    utility_jaccard=jacc,
                    distortion_mae=distortion,
                )
            )

if fallback_rows:
    df_fallback = pd.DataFrame(fallback_rows)
    df_runs = pd.concat([df_runs, df_fallback], ignore_index=True)
    print("Added fallback NPZ rows:", len(df_fallback))


In [None]:
# -------------------------
# Aggregate + save tables
# -------------------------
if df_runs.empty:
    raise RuntimeError("No runs available to compare. Check per-seed CSVs or NPZ outputs.")

# Restrict to the compared regions and make eps numeric for grouping.
df_runs = df_runs.loc[df_runs["region"].isin(regions)].copy()
df_runs["eps"] = df_runs["eps"].astype(float)

# Mean and std over seeds for each (method, region, eps) combination.
agg = df_runs.groupby(["method", "method_label", "region", "eps"])[
    ["lr_power", "utility_jaccard", "distortion_mae"]
]
df_agg = agg.agg(["mean", "std"]).reset_index()
# Flatten the two-level columns: group keys keep their name, metric columns
# become "<metric>_<stat>".
df_agg.columns = [
    top if stat == "" else f"{top}_{stat}"
    for top, stat in df_agg.columns.to_flat_index()
]

out_all = save_csv_no_overwrite(df_runs, TAB_DIR / "all_runs.csv")
out_agg = save_csv_no_overwrite(df_agg, TAB_DIR / "agg_mean_std.csv")
for saved_path in (out_all, out_agg):
    print("Saved:", saved_path.relative_to(PROJECT_ROOT))


In [None]:
# -------------------------
# Figures per region
# -------------------------
def plot_metric(region: str, metric: str, ylabel: str, title_prefix: str, filename_prefix: str):
    """Plot the seed-averaged ``metric`` against epsilon for one region.

    One line per method is drawn from the ``<metric>_mean`` column of
    ``df_agg`` on a log-scaled x axis. The figure is saved as
    ``<filename_prefix>_<region>.png`` in ``FIG_DIR`` (timestamped when that
    file already exists) and the saved path is printed. If ``df_agg`` has no
    rows for ``region``, a warning is printed and nothing is drawn.
    """
    region_rows = df_agg[df_agg["region"] == region].copy()
    if region_rows.empty:
        print(f"[WARN] No data for {region}")
        return

    mean_col = f"{metric}_mean"
    fig, ax = plt.subplots(figsize=(6, 4))
    for method, label in methods.items():
        curve = region_rows[region_rows["method"] == method].sort_values("eps")
        if not curve.empty:
            ax.plot(curve["eps"], curve[mean_col], marker="o", label=label)

    ax.set_xscale("log")
    ax.set_xlabel("epsilon (log scale)")
    ax.set_ylabel(ylabel)
    ax.set_title(f"{title_prefix} - {region}")
    ax.legend()
    fig.tight_layout()

    out = with_timestamp(FIG_DIR / f"{filename_prefix}_{region}.png")
    fig.savefig(out, dpi=200)
    plt.close(fig)
    print("Saved:", out.relative_to(PROJECT_ROOT))

# (metric, y-axis label, title prefix, file-name prefix), in plotting order.
_PLOT_SPECS = (
    (
        "lr_power",
        "LR power (members above 95% nonmember threshold) down",
        "Privacy vs eps (LR power)",
        "privacy",
    ),
    (
        "utility_jaccard",
        "Jaccard(sig_true, sig_noisy) up",
        "Utility vs eps (GWAS overlap)",
        "utility",
    ),
    (
        "distortion_mae",
        "MAF MAE(|p_true - p_noisy|) down",
        "Distortion vs eps (MAF MAE)",
        "distortion",
    ),
)

for region in regions:
    for _metric, _ylabel, _title, _fname in _PLOT_SPECS:
        plot_metric(
            region,
            metric=_metric,
            ylabel=_ylabel,
            title_prefix=_title,
            filename_prefix=_fname,
        )


In [None]:
# -------------------------
# Compact tables for report
# -------------------------
def fmt_mean_std(mean_val, std_val):
    """Format a mean/std pair as compact text for the report tables.

    Returns ``"NA"`` when the mean is NaN. Otherwise the mean is written with
    four significant digits; if the std is not NaN it is added as
    ``+/-<std>`` with two significant digits.
    """
    if np.isnan(mean_val):
        return "NA"
    text = f"{mean_val:.4g}"
    if not np.isnan(std_val):
        text += f"+/-{std_val:.2g}"
    return text

# One compact mean+/-std table per region, restricted to three epsilon values.
target_eps = [0.1, 1.0, 10.0]
for region in regions:
    in_region = df_agg["region"] == region
    if not in_region.any():
        continue
    sub = df_agg[in_region & df_agg["eps"].isin(target_eps)]
    if sub.empty:
        print(f"[WARN] No target eps present for {region}")
        continue

    rows = []
    for method, label in methods.items():
        msub = sub[sub["method"] == method]
        # An absent method has no eps values, so the inner loop does nothing.
        for eps in sorted(msub["eps"].unique()):
            row = msub[msub["eps"] == eps].iloc[0]
            rows.append(
                dict(
                    method=label,
                    eps=eps,
                    lr_power=fmt_mean_std(row["lr_power_mean"], row["lr_power_std"]),
                    utility=fmt_mean_std(row["utility_jaccard_mean"], row["utility_jaccard_std"]),
                    distortion=fmt_mean_std(row["distortion_mae_mean"], row["distortion_mae_std"]),
                )
            )

    out_df = pd.DataFrame(rows)
    table_stem = f"table_{region}_eps_0p1_1_10"
    out_csv = save_csv_no_overwrite(out_df, TAB_DIR / f"{table_stem}.csv")
    out_md = save_md_no_overwrite(df_to_markdown_simple(out_df), TAB_DIR / f"{table_stem}.md")
    print("Saved:", out_csv.relative_to(PROJECT_ROOT))
    print("Saved:", out_md.relative_to(PROJECT_ROOT))


In [None]:
# -------------------------
# Scoreboard at eps=1.0 (optional)
# -------------------------
# Collect the eps=1.0 means of every method, region by region.
rows = []
for region in regions:
    at_eps_one = (df_agg["region"] == region) & (df_agg["eps"] == 1.0)
    sub = df_agg[at_eps_one].copy()
    if sub.empty:
        continue
    # Warn when fewer distinct methods than expected have an eps=1.0 row.
    n_present = len(set(sub["method"]))
    if n_present < len(methods):
        print(f"[WARN] eps=1.0 missing some methods for {region}")
    for _, r in sub.iterrows():
        rows.append(
            dict(
                region=region,
                method=r["method_label"],
                lr_power_mean=r["lr_power_mean"],
                utility_mean=r["utility_jaccard_mean"],
                distortion_mean=r["distortion_mae_mean"],
            )
        )

if rows:
    df_score = pd.DataFrame(rows)
    out_score = save_csv_no_overwrite(df_score, TAB_DIR / "scoreboard_eps_1p0.csv")
    print("Saved:", out_score.relative_to(PROJECT_ROOT))


In [None]:
# Summarize where the notebook wrote its outputs, relative to the project root.
print("Outputs written to:")
for _heading, _directory in ((" Figures:", FIG_DIR), (" Tables:", TAB_DIR)):
    print(_heading, _directory.relative_to(PROJECT_ROOT))
