In [1]:
from pathlib import Path

# project_root = 430_project/
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name != "430_project":

    for parent in Path.cwd().parents:
        if parent.name == "430_project":
            PROJECT_ROOT = parent
            break

DATA_RAW = PROJECT_ROOT / "data" / "raw"
HAPMAP_RAW = DATA_RAW / "hapmap"

GENO_DIR = HAPMAP_RAW / "genotypes"
PHASE_DIR = HAPMAP_RAW / "phasing" / "HapMap3_r2" / "CEU" / "UNRELATED"
PHASE_META_DIR = HAPMAP_RAW / "phasing" / "HapMap3_r2"

for d in [GENO_DIR, PHASE_DIR, PHASE_META_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Genotype dir :", GENO_DIR)
print("Phasing dir  :", PHASE_DIR)
print("Phasing meta :", PHASE_META_DIR)

Project root: /Users/erkmenerken/Desktop/430_project
Genotype dir : /Users/erkmenerken/Desktop/430_project/data/raw/hapmap/genotypes
Phasing dir  : /Users/erkmenerken/Desktop/430_project/data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED
Phasing meta : /Users/erkmenerken/Desktop/430_project/data/raw/hapmap/phasing/HapMap3_r2


In [2]:
# Sources used:
# - Genotypes (CEU, non-redundant, b36, forward): chr2 + chr10
# - Phased haplotypes (HapMap3 r2, CEU, UNRELATED): chr2 + chr10
# - Small meta file in HapMap3 r2 directory (.info)

MANIFEST = [
    {
        "name": "CEU genotypes chr2 (unphased)",
        "url": "https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/latest_phaseII+III_ncbi_b36/forward/non-redundant/genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz",
        "dst": GENO_DIR / "genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz",
    },
    {
        "name": "CEU genotypes chr10 (unphased)",
        "url": "https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/latest_phaseII+III_ncbi_b36/forward/non-redundant/genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz",
        "dst": GENO_DIR / "genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz",
    },
    {
        "name": "CEU phased haplotypes chr2 (UNRELATED)",
        "url": "https://ftp.ncbi.nlm.nih.gov/hapmap/phasing/2009-02_phaseIII/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz",
        "dst": PHASE_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz",
    },
    {
        "name": "CEU phased haplotypes chr10 (UNRELATED)",
        "url": "https://ftp.ncbi.nlm.nih.gov/hapmap/phasing/2009-02_phaseIII/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz",
        "dst": PHASE_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz",
    },
    {
        "name": "HapMap3 r2 SNP meta (.info)",
        "url": "https://ftp.ncbi.nlm.nih.gov/hapmap/phasing/2009-02_phaseIII/HapMap3_r2/hapmap3_r2_b36_fwd.consensus.qc.poly.info",
        "dst": PHASE_META_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.info",
    },
]

print(f" Manifest entries: {len(MANIFEST)}")
for x in MANIFEST:
    print("-", x["name"], "->", x["dst"].relative_to(PROJECT_ROOT))

 Manifest entries: 5
- CEU genotypes chr2 (unphased) -> data/raw/hapmap/genotypes/genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz
- CEU genotypes chr10 (unphased) -> data/raw/hapmap/genotypes/genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz
- CEU phased haplotypes chr2 (UNRELATED) -> data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz
- CEU phased haplotypes chr10 (UNRELATED) -> data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz
- HapMap3 r2 SNP meta (.info) -> data/raw/hapmap/phasing/HapMap3_r2/hapmap3_r2_b36_fwd.consensus.qc.poly.info


In [3]:
import urllib.request
from tqdm import tqdm
import os

def download_file(url: str, dst: Path, overwrite: bool = False, chunk_size: int = 1024 * 1024) -> None:
    dst.parent.mkdir(parents=True, exist_ok=True)

    if dst.exists() and not overwrite:
        size_bytes = dst.stat().st_size
        if size_bytes > 0:
            print(f"  Already exists, skipping: {dst.name} ({size_bytes/1e6:.2f} MB)")
            return
        print(f"  Existing file is empty, re-downloading: {dst.name}")

    tmp = dst.with_suffix(dst.suffix + ".part")

    print(f"  Downloading: {url}")
    print(f" Saving to  : {dst}")

    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req) as resp:
        total = resp.headers.get("Content-Length")
        total = int(total) if total is not None else None

        with open(tmp, "wb") as f, tqdm(
            total=total, unit="B", unit_scale=True, unit_divisor=1024, desc=dst.name
        ) as pbar:
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                f.write(chunk)
                pbar.update(len(chunk))

    os.replace(tmp, dst)  # atomic move
    print(f" Done: {dst.name} ({dst.stat().st_size/1e6:.2f} MB)\n")

In [4]:
print(" Starting HapMap downloads...\n")

for item in MANIFEST:
    print(f"=== {item['name']} ===")
    download_file(item["url"], item["dst"], overwrite=False)

print(" All requested HapMap files are present in data/raw/hapmap/.")

 Starting HapMap downloads...

=== CEU genotypes chr2 (unphased) ===
  Already exists, skipping: genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz (21.10 MB)
=== CEU genotypes chr10 (unphased) ===
  Already exists, skipping: genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz (13.63 MB)
=== CEU phased haplotypes chr2 (UNRELATED) ===
  Already exists, skipping: hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz (1.80 MB)
=== CEU phased haplotypes chr10 (UNRELATED) ===
  Already exists, skipping: hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz (1.13 MB)
=== HapMap3 r2 SNP meta (.info) ===
  Already exists, skipping: hapmap3_r2_b36_fwd.consensus.qc.poly.info (0.03 MB)
 All requested HapMap files are present in data/raw/hapmap/.


In [5]:
def human_mb(n_bytes: int) -> str:
    return f"{n_bytes/1e6:.2f} MB"

print(" Downloaded files:")
for item in MANIFEST:
    p = item["dst"]
    if p.exists():
        print(p.relative_to(PROJECT_ROOT), "-", human_mb(p.stat().st_size))
    else:
        print("MISSING:", p.relative_to(PROJECT_ROOT))

 Downloaded files:
data/raw/hapmap/genotypes/genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz - 21.10 MB
data/raw/hapmap/genotypes/genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz - 13.63 MB
data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz - 1.80 MB
data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz - 1.13 MB
data/raw/hapmap/phasing/HapMap3_r2/hapmap3_r2_b36_fwd.consensus.qc.poly.info - 0.03 MB


In [6]:
from pathlib import Path

def find_project_root(name="430_project") -> Path:
    cwd = Path.cwd()
    if cwd.name == name:
        return cwd
    for p in [cwd] + list(cwd.parents):
        if p.name == name:
            return p
    raise RuntimeError(f"Could not find project root folder named '{name}' from {cwd}")

PROJECT_ROOT = find_project_root("430_project")

RAW_HAPMAP = PROJECT_ROOT / "data" / "raw" / "hapmap"
GENO_DIR   = RAW_HAPMAP / "genotypes"
PHASE_DIR  = RAW_HAPMAP / "phasing" / "HapMap3_r2" / "CEU" / "UNRELATED"

PROC_DIR   = PROJECT_ROOT / "data" / "processed" / "hapmap"
REGION_DIR = PROC_DIR / "regions"
BLOCK_DIR  = PROC_DIR / "blocks"
COHORT_DIR = PROC_DIR / "cohorts"

for d in [PROC_DIR, REGION_DIR, BLOCK_DIR, COHORT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(" Project root:", PROJECT_ROOT)
print(" Raw HapMap  :", RAW_HAPMAP)
print(" Genotypes   :", GENO_DIR)
print(" Phasing     :", PHASE_DIR)
print(" Processed   :", PROC_DIR)

 Project root: /Users/erkmenerken/Desktop/430_project
 Raw HapMap  : /Users/erkmenerken/Desktop/430_project/data/raw/hapmap
 Genotypes   : /Users/erkmenerken/Desktop/430_project/data/raw/hapmap/genotypes
 Phasing     : /Users/erkmenerken/Desktop/430_project/data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED
 Processed   : /Users/erkmenerken/Desktop/430_project/data/processed/hapmap


In [7]:
import gzip

geno_chr2  = GENO_DIR / "genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz"
geno_chr10 = GENO_DIR / "genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz"
phase_chr2  = PHASE_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz"
phase_chr10 = PHASE_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz"

def peek_gz(path, n_lines=2, maxchars=220):
    print(f"\nðŸ”Ž Peeking: {path.name}")
    with gzip.open(path, "rt", encoding="utf-8", errors="replace") as f:
        for i in range(n_lines):
            line = f.readline()
            if not line:
                break
            print(f"  Line {i+1}: {line[:maxchars].rstrip()}")

for p in [geno_chr2, geno_chr10, phase_chr2, phase_chr10]:
    if p.exists():
        peek_gz(p, n_lines=2)
    else:
        print(" Missing:", p)

print("\n If the header looks like: rs# alleles chrom pos ... QCcode NA06984 ... then weâ€™re good.")


ðŸ”Ž Peeking: genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz
  Line 1: rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA06984 NA06985 NA06986 NA06989 NA06991 NA06993 NA06994 NA06995 NA06997 NA07000 NA07014 NA07019 NA07022 NA07029 NA07031 NA07034 NA07037 NA
  Line 2: rs10171150 A/G chr2 2091 + ncbi_b36 mcgill-gqic urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1 urn:LSID:mcgill-gqic.hapmap.org:Assay:810448:1 urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1 QC+ NN GG NN NN GG GG

ðŸ”Ž Peeking: genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz
  Line 1: rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA06984 NA06985 NA06986 NA06989 NA06991 NA06993 NA06994 NA06995 NA06997 NA07000 NA07014 NA07019 NA07022 NA07029 NA07031 NA07034 NA07037 NA
  Line 2: rs11511647 C/T chr10 62765 + ncbi_b36 sanger urn:lsid:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1 urn:lsid:sanger.hapmap.org:Assay:4310385:1 urn:lsid:dcc.hapmap.org:Panel:CEPH-30-

In [8]:
import pandas as pd
import numpy as np

def load_positions_rsids(geno_path):
    print(f" Reading rs# + pos from {geno_path.name} (whitespace-separated)...")
    df = pd.read_csv(
        geno_path,
        sep=r"\s+",
        engine="python",
        compression="gzip",
        usecols=["rs#", "pos"],
        dtype={"rs#": str, "pos": int},
    )
    df = df.dropna().sort_values("pos").reset_index(drop=True)
    print(f" Loaded {len(df)} SNP positions.")
    return df

def choose_window_by_bp(df_pos, window_bp, target_snps):
    pos = df_pos["pos"].to_numpy(np.int64)
    n = len(pos)
    ends = np.searchsorted(pos, pos + window_bp, side="right")
    counts = ends - np.arange(n)

    diff = np.abs(counts - target_snps)
    best_i = int(np.argmin(diff))

    start = int(pos[best_i])
    end = int(start + window_bp)
    count = int(counts[best_i])
    end_idx = int(ends[best_i] - 1)
    end_pos_actual = int(pos[end_idx]) if end_idx >= best_i else start

    return {
        "start_bp": start,
        "end_bp": end,
        "snps_in_window": count,
        "end_pos_actual": end_pos_actual,
        "total_snps_chr": int(n),
        "min_pos": int(pos[0]),
        "max_pos": int(pos[-1]),
    }

df2  = load_positions_rsids(geno_chr2)
df10 = load_positions_rsids(geno_chr10)

chr2_plan  = choose_window_by_bp(df2,  window_bp=5_000_000, target_snps=311)
chr10_plan = choose_window_by_bp(df10, window_bp=1_000_000, target_snps=610)

print("\n Proposal-style windows selected:")
print("chr2 (5Mb, ~311 SNPs):", chr2_plan)
print("chr10 (1Mb, ~610 SNPs):", chr10_plan)

 Reading rs# + pos from genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz (whitespace-separated)...
 Loaded 329831 SNP positions.
 Reading rs# + pos from genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz (whitespace-separated)...
 Loaded 211711 SNP positions.

 Proposal-style windows selected:
chr2 (5Mb, ~311 SNPs): {'start_bp': 242050505, 'end_bp': 247050505, 'snps_in_window': 311, 'end_pos_actual': 242742878, 'total_snps_chr': 329831, 'min_pos': 2091, 'max_pos': 242742878}
chr10 (1Mb, ~610 SNPs): {'start_bp': 38322261, 'end_bp': 39322261, 'snps_in_window': 610, 'end_pos_actual': 39194226, 'total_snps_chr': 211711, 'min_pos': 62765, 'max_pos': 135373179}


In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm

MISSING_TOKENS = {"", "NN", "NA", "N", "00", "--", "??"}

def _find_sample_columns(columns):
    cols = list(columns)
    if "QCcode" in cols:
        qc_idx = cols.index("QCcode")
        return cols[:qc_idx + 1], cols[qc_idx + 1:]
    # Fallback (rarely needed for HapMap)
    return cols[:11], cols[11:]

def genotype_row_to_minor_counts(geno_strs, allele_a, allele_b):
    """
    Convert HapMap genotype strings (e.g., 'AA', 'AG', 'GG', 'NN') to
    minor-allele counts in {0,1,2} with -1 for missing.

    Minor allele is determined per SNP by observed allele counts in the row.
    """
    g = np.asarray(geno_strs, dtype=object)
    g = np.array([x.strip() if isinstance(x, str) else "" for x in g], dtype=object)

    missing = np.zeros(len(g), dtype=bool)
    for t in MISSING_TOKENS:
        missing |= (g == t)

    # valid genotype strings are length 2 (e.g. "AG")
    valid = (~missing) & np.array([len(x) == 2 for x in g], dtype=bool)

    # count allele occurrences among valid entries
    a_count = 0
    b_count = 0
    for x in g[valid]:
        a_count += (x[0] == allele_a) + (x[1] == allele_a)
        b_count += (x[0] == allele_b) + (x[1] == allele_b)

    # choose minor allele (ties deterministic)
    if a_count < b_count:
        minor, major = allele_a, allele_b
    elif b_count < a_count:
        minor, major = allele_b, allele_a
    else:
        minor, major = allele_b, allele_a

    out = np.full(len(g), -1, dtype=np.int8)
    for i, x in enumerate(g):
        if (not isinstance(x, str)) or (x in MISSING_TOKENS) or len(x) != 2:
            continue
        out[i] = np.int8((x[0] == minor) + (x[1] == minor))

    return out, minor, major

def parse_genotypes_window(geno_path, start_bp, end_bp, chunksize=2000):
    """
    Parses HapMap whitespace-separated genotype .txt.gz file and extracts SNPs
    within [start_bp, end_bp]. Returns:
      G (M x N): int8 minor allele counts, missing=-1
      sample_ids (M,)
      snp_ids (N,)
      positions (N,)
      minor_alleles (N,)
      major_alleles (N,)
    """
    print(f"\n Parsing genotypes from {geno_path.name}")
    print(f"   Window: [{start_bp}, {end_bp}] bp (inclusive)")

    # Read header row only (detect columns + sample IDs)
    header = pd.read_csv(
        geno_path,
        sep=r"\s+",
        engine="python",
        compression="gzip",
        nrows=1,
        dtype=str
    )
    _, sample_cols = _find_sample_columns(header.columns)
    sample_ids = np.array(sample_cols, dtype=object)
    print(f" Individuals detected: {len(sample_ids)}")

    G_cols, snp_ids, positions, minor_alleles, major_alleles = [], [], [], [], []

    reader = pd.read_csv(
        geno_path,
        sep=r"\s+",
        engine="python",
        compression="gzip",
        dtype=str,
        chunksize=chunksize,
    )

    kept = 0
    for chunk in tqdm(reader, desc=f"Reading {geno_path.name}"):
        # Filter by window using numeric positions
        pos_int = pd.to_numeric(chunk["pos"], errors="coerce")
        mask = (pos_int >= start_bp) & (pos_int <= end_bp)
        chunk = chunk.loc[mask]
        if chunk.empty:
            continue

        for _, row in chunk.iterrows():
            rsid = row.get("rs#", None)
            alleles = row.get("alleles", None)
            pos = row.get("pos", None)

            if rsid is None or alleles is None or pos is None or "/" not in alleles:
                continue

            a, b = [x.strip() for x in alleles.split("/", 1)]
            if len(a) != 1 or len(b) != 1:
                continue

            geno_strs = row[sample_cols].values
            counts, minor, major = genotype_row_to_minor_counts(geno_strs, a, b)

            G_cols.append(counts)
            snp_ids.append(rsid)
            positions.append(int(pos))
            minor_alleles.append(minor)
            major_alleles.append(major)
            kept += 1

    if kept == 0:
        raise RuntimeError("No SNPs were kept. Check start/end window values.")

    G = np.stack(G_cols, axis=1)  # (M, N)
    positions = np.array(positions, dtype=np.int32)

    # Sort by position (just in case)
    order = np.argsort(positions)
    G = G[:, order]
    positions = positions[order]
    snp_ids = np.array(snp_ids, dtype=object)[order]
    minor_alleles = np.array(minor_alleles, dtype=object)[order]
    major_alleles = np.array(major_alleles, dtype=object)[order]

    print(f" Kept SNPs: {G.shape[1]} | Individuals: {G.shape[0]}")
    print(f" Missing rate: {float(np.mean(G == -1)):.4f}")
    print(f" Kept position range: {int(positions.min())} .. {int(positions.max())}")

    return G, sample_ids, snp_ids, positions, minor_alleles, major_alleles

In [10]:
import json, time

def save_region(region_name, chrom, plan, G, sample_ids, snp_ids, positions, minor, major):
    out_npz = REGION_DIR / f"{region_name}.npz"
    out_meta = REGION_DIR / f"{region_name}.meta.json"

    np.savez_compressed(
        out_npz,
        G=G.astype(np.int8),
        sample_ids=sample_ids,
        snp_ids=snp_ids,
        positions=positions,
        minor_alleles=minor,
        major_alleles=major,
        chrom=str(chrom),
    )

    meta = {
        "region_name": region_name,
        "chrom": str(chrom),
        "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "window_start_bp": int(plan["start_bp"]),
        "window_end_bp": int(plan["end_bp"]),
        "snps_in_window": int(G.shape[1]),
        "individuals": int(G.shape[0]),
        "missing_rate": float(np.mean(G == -1)),
        "note": "G is minor-allele count in {0,1,2}, missing=-1.",
    }
    out_meta.write_text(json.dumps(meta, indent=2))

    print(f" Saved: {out_npz.relative_to(PROJECT_ROOT)}")
    print(f" Meta : {out_meta.relative_to(PROJECT_ROOT)}\n")

# chr2
G2, ids2, rs2, pos2, min2, maj2 = parse_genotypes_window(geno_chr2, chr2_plan["start_bp"], chr2_plan["end_bp"])
save_region("CEU_chr2_5Mb", 2, chr2_plan, G2, ids2, rs2, pos2, min2, maj2)

# chr10
G10, ids10, rs10, pos10, min10, maj10 = parse_genotypes_window(geno_chr10, chr10_plan["start_bp"], chr10_plan["end_bp"])
save_region("CEU_chr10_1Mb", 10, chr10_plan, G10, ids10, rs10, pos10, min10, maj10)

print(" Region extraction complete.")
print(" Check: data/processed/hapmap/regions/")


 Parsing genotypes from genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz
   Window: [242050505, 247050505] bp (inclusive)
 Individuals detected: 174


Reading genotypes_chr2_CEU_r27_nr.b36_fwd.txt.gz: 165it [00:24,  6.62it/s]


 Kept SNPs: 311 | Individuals: 174
 Missing rate: 0.2843
 Kept position range: 242050505 .. 242742878
 Saved: data/processed/hapmap/regions/CEU_chr2_5Mb.npz
 Meta : data/processed/hapmap/regions/CEU_chr2_5Mb.meta.json


 Parsing genotypes from genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz
   Window: [38322261, 39322261] bp (inclusive)
 Individuals detected: 174


Reading genotypes_chr10_CEU_r27_nr.b36_fwd.txt.gz: 106it [00:15,  6.69it/s]

 Kept SNPs: 610 | Individuals: 174
 Missing rate: 0.3864
 Kept position range: 38322261 .. 39194226
 Saved: data/processed/hapmap/regions/CEU_chr10_1Mb.npz
 Meta : data/processed/hapmap/regions/CEU_chr10_1Mb.meta.json

 Region extraction complete.
 Check: data/processed/hapmap/regions/





In [11]:
def make_split(sample_ids, test_frac=0.2, seed=0):
    rng = np.random.default_rng(seed)
    n = len(sample_ids)
    idx = np.arange(n)
    rng.shuffle(idx)
    n_test = int(round(test_frac * n))
    test_idx = np.sort(idx[:n_test])
    control_idx = np.sort(idx[n_test:])
    return control_idx, test_idx

control_idx, test_idx = make_split(ids2, test_frac=0.2, seed=0)

cohorts = {
    "notes": {
        "case_source": "PGP (TBD)",
        "control_test_source": "HapMap CEU",
        "split_seed": 0,
        "test_frac": 0.2
    },
    "case": {"sample_ids": [], "indices_in_ceu_matrix": []},
    "control": {
        "sample_ids": [str(ids2[i]) for i in control_idx],
        "indices_in_ceu_matrix": control_idx.tolist()
    },
    "test": {
        "sample_ids": [str(ids2[i]) for i in test_idx],
        "indices_in_ceu_matrix": test_idx.tolist()
    }
}

out_path = COHORT_DIR / "ceu_control_test_split.json"
out_path.write_text(json.dumps(cohorts, indent=2))

print(" Created CEU control/test split")
print(f"   control n={len(control_idx)} | test n={len(test_idx)}")
print(" Saved:", out_path.relative_to(PROJECT_ROOT))

 Created CEU control/test split
   control n=139 | test n=35
 Saved: data/processed/hapmap/cohorts/ceu_control_test_split.json


In [12]:
def maf_from_G(G_sub):
    M, N = G_sub.shape
    maf = np.zeros(N, dtype=float)
    for j in range(N):
        col = G_sub[:, j]
        mask = col >= 0
        if mask.sum() == 0:
            maf[j] = np.nan
        else:
            maf[j] = col[mask].mean() / 2.0
    return maf

maf_ctrl_chr2 = maf_from_G(G2[control_idx])
maf_test_chr2 = maf_from_G(G2[test_idx])

maf_ctrl_chr10 = maf_from_G(G10[control_idx])
maf_test_chr10 = maf_from_G(G10[test_idx])

freq_out = PROC_DIR / "ceu_maf_reference.npz"
np.savez_compressed(
    freq_out,
    chr2_control_maf=maf_ctrl_chr2,
    chr2_test_maf=maf_test_chr2,
    chr2_snp_ids=rs2,
    chr2_positions=pos2,
    chr10_control_maf=maf_ctrl_chr10,
    chr10_test_maf=maf_test_chr10,
    chr10_snp_ids=rs10,
    chr10_positions=pos10,
)

print(" Computed and saved CEU MAF references (control + test)")
print(" Saved:", freq_out.relative_to(PROJECT_ROOT))
print("   chr2: mean control MAF =", float(np.nanmean(maf_ctrl_chr2)))
print("   chr10: mean control MAF =", float(np.nanmean(maf_ctrl_chr10)))

 Computed and saved CEU MAF references (control + test)
 Saved: data/processed/hapmap/ceu_maf_reference.npz
   chr2: mean control MAF = 0.18893962444832998
   chr10: mean control MAF = 0.16131570012366628


In [14]:
from pathlib import Path
import json

def find_project_root(name="430_project") -> Path:
    cwd = Path.cwd()
    if cwd.name == name:
        return cwd
    for p in [cwd] + list(cwd.parents):
        if p.name == name:
            return p
    raise RuntimeError(f"Could not find project root folder named '{name}' from {cwd}")

PROJECT_ROOT = find_project_root("430_project")

RAW_HAPMAP = PROJECT_ROOT / "data" / "raw" / "hapmap"
PHASE_DIR  = RAW_HAPMAP / "phasing" / "HapMap3_r2" / "CEU" / "UNRELATED"

PROC_DIR      = PROJECT_ROOT / "data" / "processed" / "hapmap"
REGION_DIR    = PROC_DIR / "regions"
COHORT_DIR    = PROC_DIR / "cohorts"
HAP_OUT_DIR   = PROC_DIR / "haplotypes"        # new
BLOCK_OUT_DIR = PROC_DIR / "blocks"            # new (if not already)

for d in [HAP_OUT_DIR, BLOCK_OUT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

REGION_CHR2  = REGION_DIR / "CEU_chr2_5Mb.npz"
REGION_CHR10 = REGION_DIR / "CEU_chr10_1Mb.npz"
COHORTS_JSON = COHORT_DIR / "ceu_control_test_split.json"

PHASE_CHR2  = PHASE_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz"
PHASE_CHR10 = PHASE_DIR / "hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz"

print(" Project root:", PROJECT_ROOT)
print(" Raw phasing dir:", PHASE_DIR)
print(" Regions:", REGION_DIR)
print(" Cohorts:", COHORT_DIR)
print(" Output haplotypes:", HAP_OUT_DIR)
print(" Output blocks:", BLOCK_OUT_DIR)

for p in [REGION_CHR2, REGION_CHR10, COHORTS_JSON, PHASE_CHR2, PHASE_CHR10]:
    print("Exists" if p.exists() else " Missing", "-", p.relative_to(PROJECT_ROOT))

 Project root: /Users/erkmenerken/Desktop/430_project
 Raw phasing dir: /Users/erkmenerken/Desktop/430_project/data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED
 Regions: /Users/erkmenerken/Desktop/430_project/data/processed/hapmap/regions
 Cohorts: /Users/erkmenerken/Desktop/430_project/data/processed/hapmap/cohorts
 Output haplotypes: /Users/erkmenerken/Desktop/430_project/data/processed/hapmap/haplotypes
 Output blocks: /Users/erkmenerken/Desktop/430_project/data/processed/hapmap/blocks
Exists - data/processed/hapmap/regions/CEU_chr2_5Mb.npz
Exists - data/processed/hapmap/regions/CEU_chr10_1Mb.npz
Exists - data/processed/hapmap/cohorts/ceu_control_test_split.json
Exists - data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz
Exists - data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz


In [15]:
import numpy as np

def load_region_npz(path: Path):
    z = np.load(path, allow_pickle=True)
    out = {k: z[k] for k in z.files}
    # make sure types are friendly
    out["snp_ids"] = out["snp_ids"].astype(object)
    out["sample_ids"] = out["sample_ids"].astype(object)
    out["minor_alleles"] = out["minor_alleles"].astype(object)
    out["major_alleles"] = out["major_alleles"].astype(object)
    return out

r2 = load_region_npz(REGION_CHR2)
r10 = load_region_npz(REGION_CHR10)

with open(COHORTS_JSON, "r") as f:
    cohorts = json.load(f)

print(" Loaded regions:")
print("  chr2 :", r2["G"].shape, "| SNPs:", len(r2["snp_ids"]), "| individuals:", len(r2["sample_ids"]))
print("  chr10:", r10["G"].shape, "| SNPs:", len(r10["snp_ids"]), "| individuals:", len(r10["sample_ids"]))

control_idx = np.array(cohorts["control"]["indices_in_ceu_matrix"], dtype=int)
test_idx    = np.array(cohorts["test"]["indices_in_ceu_matrix"], dtype=int)

print("\n Loaded CEU cohorts:")
print("  control n =", len(control_idx))
print("  test    n =", len(test_idx))
print("  case placeholder size =", len(cohorts["case"]["sample_ids"]))

 Loaded regions:
  chr2 : (174, 311) | SNPs: 311 | individuals: 174
  chr10: (174, 610) | SNPs: 610 | individuals: 174

 Loaded CEU cohorts:
  control n = 139
  test    n = 35
  case placeholder size = 0


In [26]:
import numpy as np
import json

def adjacent_r2_from_G(G):
    """
    Compute r^2 between adjacent SNPs using correlation on dosage {0,1,2},
    ignoring missing=-1. Returns length (N-1) array.
    """
    X = G.astype(float)
    M, N = X.shape
    out = np.zeros(N - 1, dtype=float)

    for j in range(N - 1):
        x = X[:, j]
        y = X[:, j + 1]
        mask = (x >= 0) & (y >= 0)
        if mask.sum() < 10:
            out[j] = 0.0
            continue

        xv = x[mask] - x[mask].mean()
        yv = y[mask] - y[mask].mean()
        denom = np.sqrt((xv * xv).sum() * (yv * yv).sum())
        if denom == 0:
            out[j] = 0.0
            continue

        r = float((xv * yv).sum() / denom)
        out[j] = r * r
    return out

def build_blocks_from_adjacent_r2(r2, threshold=0.8, min_snps=5, max_snps=80):
    """
    Create blocks by cutting when adjacent r^2 < threshold.
    Then:
      - merge blocks smaller than min_snps into previous
      - split blocks larger than max_snps
    Returns list of (start_idx, end_idx) inclusive, in SNP-index space.
    """
    N = len(r2) + 1
    cuts = [0]
    for j, v in enumerate(r2):
        if v < threshold:
            cuts.append(j + 1)
    cuts.append(N)

    blocks = [(cuts[i], cuts[i + 1] - 1) for i in range(len(cuts) - 1)]

    # merge tiny blocks into previous
    merged = []
    for s, e in blocks:
        if not merged:
            merged.append((s, e))
        else:
            if (e - s + 1) < min_snps:
                ps, pe = merged[-1]
                merged[-1] = (ps, e)
            else:
                merged.append((s, e))

    # split huge blocks
    final = []
    for s, e in merged:
        while (e - s + 1) > max_snps:
            final.append((s, s + max_snps - 1))
            s = s + max_snps
        final.append((s, e))

    return final

def save_blocks_json_phased_compatible(region_name, pc, blocks, params):
    """
    Saves blocks JSON using the phased-compatible SNP list (pc['snp_ids_sub'], pc['positions_sub']).
    """
    out = BLOCK_OUT_DIR / f"{region_name}.blocks.phased_compatible.json"

    snp_ids = pc["snp_ids_sub"]
    positions = pc["positions_sub"]

    payload = {
        "region_name": region_name,
        "phased_compatible": True,
        "block_params": params,
        "num_snps": int(len(snp_ids)),
        "num_blocks": int(len(blocks)),
        "blocks": [
            {
                "block_id": int(i),
                "start_snp_index": int(s),
                "end_snp_index": int(e),
                "num_snps": int(e - s + 1),
                "start_pos": int(positions[s]),
                "end_pos": int(positions[e]),
                "snp_ids": [str(x) for x in snp_ids[s:e+1]],
            }
            for i, (s, e) in enumerate(blocks)
        ],
    }

    out.write_text(json.dumps(payload, indent=2))
    print(f" Saved blocks â†’ {out.relative_to(PROJECT_ROOT)}")
    print(f" Blocks saved for {region_name}: {len(blocks)} blocks\n")
    return out


BLOCK_PARAMS = {"threshold": 0.8, "min_snps": 5, "max_snps": 80}

# --- chr2 (PHASED-COMPATIBLE) ---
print("\n Building blocks for chr2 (phased-compatible SNP subset)")
pc2_adj = adjacent_r2_from_G(pc2["G_sub"])
blocks_chr2 = build_blocks_from_adjacent_r2(pc2_adj, **BLOCK_PARAMS)
print(f" chr2 phased-compatible SNPs: {pc2['G_sub'].shape[1]}")
print(f" chr2 inferred blocks: {len(blocks_chr2)}")
blocks_chr2_path = save_blocks_json_phased_compatible("CEU_chr2_5Mb", pc2, blocks_chr2, BLOCK_PARAMS)

# --- chr10 (PHASED-COMPATIBLE) ---
print("\n Building blocks for chr10 (phased-compatible SNP subset)")
pc10_adj = adjacent_r2_from_G(pc10["G_sub"])
blocks_chr10 = build_blocks_from_adjacent_r2(pc10_adj, **BLOCK_PARAMS)
print(f" chr10 phased-compatible SNPs: {pc10['G_sub'].shape[1]}")
print(f" chr10 inferred blocks: {len(blocks_chr10)}")
blocks_chr10_path = save_blocks_json_phased_compatible("CEU_chr10_1Mb", pc10, blocks_chr10, BLOCK_PARAMS)


 Building blocks for chr2 (phased-compatible SNP subset)
 chr2 phased-compatible SNPs: 118
 chr2 inferred blocks: 3
 Saved blocks â†’ data/processed/hapmap/blocks/CEU_chr2_5Mb.blocks.phased_compatible.json
 Blocks saved for CEU_chr2_5Mb: 3 blocks


 Building blocks for chr10 (phased-compatible SNP subset)
 chr10 phased-compatible SNPs: 129
 chr10 inferred blocks: 2
 Saved blocks â†’ data/processed/hapmap/blocks/CEU_chr10_1Mb.blocks.phased_compatible.json
 Blocks saved for CEU_chr10_1Mb: 2 blocks



In [None]:
import pandas as pd
import numpy as np

def load_phased_df(phase_gz_path):
    print(f"\n Loading phased file: {phase_gz_path.name}")
    df = pd.read_csv(
        phase_gz_path,
        sep=r"\s+",
        engine="python",
        compression="gzip",
        dtype=str,
    )
    print(f" Phased rows loaded: {len(df)} | columns: {len(df.columns)}")
    if "rsID" not in df.columns or "position_b36" not in df.columns:
        raise ValueError("Unexpected phased file columns. Expected rsID and position_b36.")
    return df

def phased_individuals_from_columns(df):
    hap_cols = [c for c in df.columns if c.endswith("_A") or c.endswith("_B")]
    if len(hap_cols) == 0:
        raise ValueError("No _A/_B haplotype columns found in phased file.")
    individuals = sorted(set(c[:-2] for c in hap_cols))  # strip _A/_B
    # keep only those that have both columns
    individuals = [i for i in individuals if f"{i}_A" in df.columns and f"{i}_B" in df.columns]
    return individuals

def make_phased_compatible_region(region, phased_df, region_name):
    """
    region: dict loaded from your region npz (r2 or r10)
    phased_df: full phased dataframe for that chromosome
    Returns a dict with phased-compatible data:
      - G_sub (individuals x snps)
      - sample_ids_sub
      - snp_ids_sub
      - positions_sub
      - phased alleles A/B matrices (snps x individuals)
      - individuals list (phased)
      - idx maps
    """
    region_sample_ids = np.array([str(x) for x in region["sample_ids"]], dtype=object)
    region_snp_ids    = np.array([str(x) for x in region["snp_ids"]], dtype=object)
    region_positions  = np.array(region["positions"], dtype=int)

    # Individuals available in phased
    phased_indivs_all = phased_individuals_from_columns(phased_df)
    region_indiv_set = set(region_sample_ids.tolist())
    phased_indivs = [i for i in phased_indivs_all if i in region_indiv_set]

    print(f"\n [{region_name}] Individuals:")
    print(f"  phased individuals total: {len(phased_indivs_all)}")
    print(f"  phased âˆ© region:          {len(phased_indivs)}")

    if len(phased_indivs) == 0:
        raise RuntimeError(f"[{region_name}] No overlapping individuals between region and phased file.")

    # SNP intersection by rsID
    phased_rsids_set = set(phased_df["rsID"].astype(str).tolist())
    keep_snp_mask = np.array([rs in phased_rsids_set for rs in region_snp_ids], dtype=bool)
    snp_ids_sub = region_snp_ids[keep_snp_mask]
    positions_sub = region_positions[keep_snp_mask]

    print(f"\n [{region_name}] SNPs:")
    print(f"  region SNPs: {len(region_snp_ids)}")
    print(f"  matched in phased: {len(snp_ids_sub)}")

    if len(snp_ids_sub) == 0:
        raise RuntimeError(f"[{region_name}] No SNPs from region found in phased file.")

    # Subset region genotype matrix to phased-compatible individuals + SNPs
    indiv_idx_in_region = np.array([np.where(region_sample_ids == i)[0][0] for i in phased_indivs], dtype=int)
    snp_idx_in_region = np.where(keep_snp_mask)[0]

    G_sub = region["G"][indiv_idx_in_region][:, snp_idx_in_region]   # (n_indiv, n_snps)
    sample_ids_sub = region_sample_ids[indiv_idx_in_region]

    print(f" [{region_name}] Subset region G to phased-compatible shape: {G_sub.shape}")

    # Now subset phased dataframe rows to our SNP list, and reorder to match region order
    phased_sub = phased_df[phased_df["rsID"].isin(set(snp_ids_sub.tolist()))].copy()
    # reorder rows to match snp_ids_sub order
    order_map = {rsid: i for i, rsid in enumerate(snp_ids_sub.tolist())}
    phased_sub["__order"] = phased_sub["rsID"].map(order_map)
    phased_sub = phased_sub.sort_values("__order").drop(columns="__order")

    # Columns for A/B
    A_cols = [f"{i}_A" for i in phased_indivs]
    B_cols = [f"{i}_B" for i in phased_indivs]

    alleles_A = phased_sub[A_cols].to_numpy(dtype=object)  # (n_snps, n_indiv)
    alleles_B = phased_sub[B_cols].to_numpy(dtype=object)

    # sanity prints
    print(f" [{region_name}] Phased allele matrices:")
    print(f"   A: {alleles_A.shape} | B: {alleles_B.shape}")
    print(f" [{region_name}] Confirm SNP order alignment: {len(phased_sub)} rows")

    # best-effort position check
    phased_pos = phased_sub["position_b36"].astype(int).to_numpy()
    if len(phased_pos) == len(positions_sub) and np.all(phased_pos == positions_sub):
        print(f" [{region_name}] Positions match between phased and region-subset.")
    else:
        print(f" [{region_name}] Position mismatch (rsID alignment is still OK).")
        print("   first 5 phased positions:", phased_pos[:5])
        print("   first 5 region positions:", positions_sub[:5])

    return {
        "region_name": region_name,
        "G_sub": G_sub,
        "sample_ids_sub": sample_ids_sub,
        "snp_ids_sub": snp_ids_sub,
        "positions_sub": positions_sub,
        "alleles_A": alleles_A,
        "alleles_B": alleles_B,
        "phased_individuals": phased_indivs,
        "indiv_idx_in_region": indiv_idx_in_region,
        "snp_idx_in_region": snp_idx_in_region,
    }

# Load phased dataframes
df_phase_chr2 = load_phased_df(PHASE_CHR2)
df_phase_chr10 = load_phased_df(PHASE_CHR10)

# Build phased-compatible subsets
pc2 = make_phased_compatible_region(r2, df_phase_chr2, "chr2")
pc10 = make_phased_compatible_region(r10, df_phase_chr10, "chr10")

print("\n Done. You now have phased-compatible region subsets:")
print("  chr2: G_sub", pc2["G_sub"].shape, "| phased indiv:", len(pc2["phased_individuals"]), "| SNPs:", len(pc2["snp_ids_sub"]))
print("  chr10: G_sub", pc10["G_sub"].shape, "| phased indiv:", len(pc10["phased_individuals"]), "| SNPs:", len(pc10["snp_ids_sub"]))


 Loading phased file: hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz
 Phased rows loaded: 116430 | columns: 36

 Loading phased file: hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz
 Phased rows loaded: 73832 | columns: 36

ðŸ”— [chr2] Individuals:
  phased individuals total: 17
  phased âˆ© region:          17

ðŸ”— [chr2] SNPs:
  region SNPs: 311
  matched in phased: 118
 [chr2] Subset region G to phased-compatible shape: (17, 118)
 [chr2] Phased allele matrices:
   A: (118, 17) | B: (118, 17)
 [chr2] Confirm SNP order alignment: 118 rows
 [chr2] Positions match between phased and region-subset.

ðŸ”— [chr10] Individuals:
  phased individuals total: 17
  phased âˆ© region:          17

ðŸ”— [chr10] SNPs:
  region SNPs: 610
  matched in phased: 129
 [chr10] Subset region G to phased-compatible shape: (17, 129)
 [chr10] Phased allele matrices:
   A: (129, 17) | B: (129, 17)
 [chr10] Confirm SNP order alignment: 129 rows
 [chr10] Positions match between phased

In [29]:
from collections import Counter
import json
import numpy as np

def hap_strings_for_block(allele_matrix, start, end):
    """
    allele_matrix: (N_snps, N_individuals), single-letter alleles (strings)
    returns list length N_individuals where each is a haplotype string for [start:end]
    """
    block = allele_matrix[start:end+1, :]  # (block_len, n_ind)
    return ["".join(block[:, j].tolist()) for j in range(block.shape[1])]

def build_haplotype_histograms_from_pc(region_name, blocks, pc, cohorts, top_k=50):
    """
    pc is the phased-compatible dict from Cell 4:
      pc["alleles_A"], pc["alleles_B"] : (n_snps, n_indiv)
      pc["sample_ids_sub"] : region sample IDs for the phased subset (length n_indiv)
      pc["snp_ids_sub"], pc["positions_sub"]
    cohorts is your ceu_control_test_split.json already loaded.

    We count haplotypes ONLY over CONTROL individuals, but restricted to those
    that exist in the phased subset.
    """
    print(f"\n Building haplotype histograms for {region_name} (PHASED-COMPATIBLE subset)")

    # These are the phased-subset individual IDs (in the same order as columns in alleles_A/B)
    phased_region_ids = [str(x) for x in pc["sample_ids_sub"]]

    # Control individuals in the FULL CEU region (174 people)
    control_ids_full = set(str(x) for x in cohorts["control"]["sample_ids"])

    # Determine which phased-subset columns are in control
    control_cols = [j for j, sid in enumerate(phased_region_ids) if sid in control_ids_full]

    print(f" Total phased-compatible individuals: {len(phased_region_ids)}")
    print(f" Control individuals available in phased subset: {len(control_cols)}")
    if len(control_cols) == 0:
        raise RuntimeError(
            f"No CONTROL individuals overlap with phased subset for {region_name}. "
            "This can happen if your control split contains mostly people not in the UNRELATED phased set."
        )

    alleles_A = pc["alleles_A"]
    alleles_B = pc["alleles_B"]

    block_payload = []
    for block_id, (s, e) in enumerate(blocks):
        hA_all = hap_strings_for_block(alleles_A, s, e)
        hB_all = hap_strings_for_block(alleles_B, s, e)

        ctr = Counter()
        for col in control_cols:
            ctr[hA_all[col]] += 1
            ctr[hB_all[col]] += 1

        total = int(sum(ctr.values()))
        top = ctr.most_common(top_k)
        top_haps = [h for h, _ in top]
        top_counts = [int(c) for _, c in top]
        other_count = int(total - sum(top_counts))

        block_payload.append({
            "block_id": int(block_id),
            "start_snp_index": int(s),
            "end_snp_index": int(e),
            "num_snps": int(e - s + 1),
            "total_haplotypes_counted": total,  # should be 2 * (#control_cols)
            "top_k": int(top_k),
            "top_haplotypes": top_haps,
            "top_counts": top_counts,
            "other_count": other_count,
        })

        if block_id % 10 == 0:
            top1 = top_counts[0] if top_counts else 0
            print(f"   block {block_id}: SNPs {s}-{e} | unique={len(ctr)} | top1={top1}")

    out = HAP_OUT_DIR / f"{region_name}.control_haplotypes.phased_compatible.json"
    out.write_text(json.dumps({
        "region_name": region_name,
        "note": (
            "This file is built on the intersection of (region SNPs âˆ© phased SNPs) and "
            "(region individuals âˆ© phased UNRELATED CEU individuals)."
        ),
        "counts_from": "CONTROL cohort only, restricted to phased-compatible individuals",
        "phased_compatible": {
            "num_individuals_total": int(len(phased_region_ids)),
            "num_control_individuals_used": int(len(control_cols)),
            "num_snps_total": int(len(pc["snp_ids_sub"])),
        },
        "blocks": block_payload
    }, indent=2))

    print(f"\n Saved haplotype histograms â†’ {out.relative_to(PROJECT_ROOT)}")
    return out



def assert_blocks_fit_pc(blocks, pc, region_name):
    n_snps = len(pc["snp_ids_sub"])
    max_end = max(e for s, e in blocks)
    if max_end >= n_snps:
        raise RuntimeError(
            f"Blocks for {region_name} do not fit phased-compatible SNP count. "
            f"max_end={max_end}, but phased-compatible n_snps={n_snps}. "
            "Rebuild blocks using pc['G_sub']."
        )

# Ensure blocks fit phased-compatible SNP counts
assert_blocks_fit_pc(blocks_chr2, pc2, "chr2")
assert_blocks_fit_pc(blocks_chr10, pc10, "chr10")

# Build histograms
hap_chr2_out = build_haplotype_histograms_from_pc("CEU_chr2_5Mb", blocks_chr2, pc2, cohorts, top_k=50)
hap_chr10_out = build_haplotype_histograms_from_pc("CEU_chr10_1Mb", blocks_chr10, pc10, cohorts, top_k=50)

print("\n Done building phased-compatible control haplotype histograms.")
print("chr2 â†’", hap_chr2_out.relative_to(PROJECT_ROOT))
print("chr10 â†’", hap_chr10_out.relative_to(PROJECT_ROOT))


 Building haplotype histograms for CEU_chr2_5Mb (PHASED-COMPATIBLE subset)
 Total phased-compatible individuals: 17
 Control individuals available in phased subset: 16
   block 0: SNPs 0-79 | unique=31 | top1=2

 Saved haplotype histograms â†’ data/processed/hapmap/haplotypes/CEU_chr2_5Mb.control_haplotypes.phased_compatible.json

 Building haplotype histograms for CEU_chr10_1Mb (PHASED-COMPATIBLE subset)
 Total phased-compatible individuals: 17
 Control individuals available in phased subset: 16
   block 0: SNPs 0-79 | unique=13 | top1=9

 Saved haplotype histograms â†’ data/processed/hapmap/haplotypes/CEU_chr10_1Mb.control_haplotypes.phased_compatible.json

 Done building phased-compatible control haplotype histograms.
chr2 â†’ data/processed/hapmap/haplotypes/CEU_chr2_5Mb.control_haplotypes.phased_compatible.json
chr10 â†’ data/processed/hapmap/haplotypes/CEU_chr10_1Mb.control_haplotypes.phased_compatible.json


In [30]:
print("\n SUMMARY OF WHAT WE NOW HAVE")

print("\nRaw downloads (existing):")
print(" -", (PHASE_CHR2).relative_to(PROJECT_ROOT))
print(" -", (PHASE_CHR10).relative_to(PROJECT_ROOT))

print("\nProcessed regions (existing):")
print(" -", (REGION_CHR2).relative_to(PROJECT_ROOT))
print(" -", (REGION_CHR10).relative_to(PROJECT_ROOT))

print("\nNew outputs created now:")
print(" - Blocks chr2:", blocks_chr2_path.relative_to(PROJECT_ROOT))
print(" - Blocks chr10:", blocks_chr10_path.relative_to(PROJECT_ROOT))
print(" - Hap hist chr2:", hap_chr2_out.relative_to(PROJECT_ROOT))
print(" - Hap hist chr10:", hap_chr10_out.relative_to(PROJECT_ROOT))


 SUMMARY OF WHAT WE NOW HAVE

Raw downloads (existing):
 - data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr2_ceu.unr.phased.gz
 - data/raw/hapmap/phasing/HapMap3_r2/CEU/UNRELATED/hapmap3_r2_b36_fwd.consensus.qc.poly.chr10_ceu.unr.phased.gz

Processed regions (existing):
 - data/processed/hapmap/regions/CEU_chr2_5Mb.npz
 - data/processed/hapmap/regions/CEU_chr10_1Mb.npz

New outputs created now:
 - Blocks chr2: data/processed/hapmap/blocks/CEU_chr2_5Mb.blocks.phased_compatible.json
 - Blocks chr10: data/processed/hapmap/blocks/CEU_chr10_1Mb.blocks.phased_compatible.json
 - Hap hist chr2: data/processed/hapmap/haplotypes/CEU_chr2_5Mb.control_haplotypes.phased_compatible.json
 - Hap hist chr10: data/processed/hapmap/haplotypes/CEU_chr10_1Mb.control_haplotypes.phased_compatible.json


In [31]:
import json
from pathlib import Path
import numpy as np


HAP_OUT_DIR = PROJECT_ROOT / "data" / "processed" / "hapmap" / "haplotypes"
HAP_OUT_DIR.mkdir(parents=True, exist_ok=True)

required_vars = ["pc2", "pc10", "blocks_chr2", "blocks_chr10", "cohorts", "PROJECT_ROOT"]
missing = [v for v in required_vars if v not in globals()]
if missing:
    raise RuntimeError(f"Missing variables from earlier cells: {missing}")

def _check_pc(pc, name):
    A = pc["alleles_A"]; B = pc["alleles_B"]
    assert A.shape == B.shape, f"{name}: A/B shape mismatch"
    n_snps, n_ind = A.shape
    print(f"{name}: phased-compatible SNPs={n_snps}, individuals={n_ind}")
    print(f"   sample_ids_sub={len(pc['sample_ids_sub'])}, snp_ids_sub={len(pc['snp_ids_sub'])}")
    # Blocks must fit SNP count
    max_end = max(e for s, e in (blocks_chr2 if name=='chr2' else blocks_chr10))
    if max_end >= n_snps:
        raise RuntimeError(f"{name}: blocks don't fit SNP count (max_end={max_end}, n_snps={n_snps}). Rebuild Cell 3 on pc['G_sub'].")

_check_pc(pc2, "chr2")
_check_pc(pc10, "chr10")

print("\n Haplotypes output dir:", HAP_OUT_DIR.relative_to(PROJECT_ROOT))

chr2: phased-compatible SNPs=118, individuals=17
   sample_ids_sub=17, snp_ids_sub=118
chr10: phased-compatible SNPs=129, individuals=17
   sample_ids_sub=17, snp_ids_sub=129

 Haplotypes output dir: data/processed/hapmap/haplotypes


In [32]:
from collections import Counter
import time

def hap_strings_for_block(allele_matrix, start, end):
    """
    allele_matrix: (N_snps, N_individuals) of single-letter alleles (strings)
    returns list length N_individuals where each element is the haplotype string over SNPs [start:end]
    """
    block = allele_matrix[start:end+1, :]  
    return ["".join(block[:, j].tolist()) for j in range(block.shape[1])]

def compute_control_cols_in_pc(pc, cohorts):
    """
    pc has sample_ids_sub = individuals in phased-compatible subset (same order as allele columns).
    cohorts['control']['sample_ids'] are the control IDs from full CEU (unphased split).
    Return list of column indices in pc corresponding to CONTROL individuals.
    """
    pc_ids = [str(x) for x in pc["sample_ids_sub"]]
    control_set = set(str(x) for x in cohorts["control"]["sample_ids"])
    control_cols = [j for j, sid in enumerate(pc_ids) if sid in control_set]
    return control_cols

def save_control_haplotype_histograms(region_name, pc, blocks, cohorts, top_k=50):
    """
    Writes:
      data/processed/hapmap/haplotypes/{region_name}.control_haplotypes.phased_compatible.json

    Counts are over haplotypes => 2 per person (A and B haplotypes).
    Uses CONTROL only (public reference), restricted to phased-compatible individuals.
    """
    A = pc["alleles_A"]
    B = pc["alleles_B"]
    snp_ids = [str(x) for x in pc["snp_ids_sub"]]
    positions = [int(x) for x in pc["positions_sub"]]
    pc_ids = [str(x) for x in pc["sample_ids_sub"]]

    control_cols = compute_control_cols_in_pc(pc, cohorts)
    if len(control_cols) == 0:
        raise RuntimeError(
            f"{region_name}: No CONTROL individuals overlap with phased-compatible subset. "
            "This can happen depending on your split + UNRELATED set."
        )

    print(f"\n [{region_name}] CONTROL haplotype histograms (phased-compatible)")
    print(f" phased-compatible individuals total: {len(pc_ids)}")
    print(f" control individuals used: {len(control_cols)} (=> {2*len(control_cols)} haplotypes)")
    print(f" SNPs used: {A.shape[0]}")

    block_payload = []
    for block_id, (s, e) in enumerate(blocks):

        hA_all = hap_strings_for_block(A, s, e)
        hB_all = hap_strings_for_block(B, s, e)

        ctr = Counter()
        for col in control_cols:
            ctr[hA_all[col]] += 1
            ctr[hB_all[col]] += 1

        total = int(sum(ctr.values()))
        top = ctr.most_common(top_k)
        top_haps = [h for h, _ in top]
        top_counts = [int(c) for _, c in top]
        other_count = int(total - sum(top_counts))

        block_payload.append({
            "block_id": int(block_id),
            "start_snp_index": int(s),
            "end_snp_index": int(e),
            "num_snps": int(e - s + 1),
            "start_pos": int(positions[s]),
            "end_pos": int(positions[e]),
            "total_haplotypes_counted": total,  
            "top_k": int(top_k),
            "top_haplotypes": top_haps,
            "top_counts": top_counts,
            "other_count": other_count,
        })

        if block_id % 10 == 0:
            print(f"   block {block_id}: SNPs {s}-{e} | unique={len(ctr)} | top1={top_counts[0] if top_counts else 0}")

    out_path = HAP_OUT_DIR / f"{region_name}.control_haplotypes.phased_compatible.json"
    payload = {
        "region_name": region_name,
        "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "source": "HapMap3 r2 phased (CEU UNRELATED) + CEU control split from unphased genotypes",
        "counts_from": "CONTROL cohort only (public reference), restricted to phased-compatible subset",
        "phased_compatible": {
            "num_snps": int(A.shape[0]),
            "num_individuals_total": int(A.shape[1]),
            "num_control_individuals_used": int(len(control_cols)),
        },
        "note": "Counts are over haplotypes (2 per person: A and B).",
        "blocks": block_payload,
    }
    out_path.write_text(json.dumps(payload, indent=2))
    print(f"\n Saved â†’ {out_path.relative_to(PROJECT_ROOT)}")
    return out_path

In [33]:

hap_chr2_out = save_control_haplotype_histograms(
    region_name="CEU_chr2_5Mb",
    pc=pc2,
    blocks=blocks_chr2,
    cohorts=cohorts,
    top_k=50
)

hap_chr10_out = save_control_haplotype_histograms(
    region_name="CEU_chr10_1Mb",
    pc=pc10,
    blocks=blocks_chr10,
    cohorts=cohorts,
    top_k=50
)

print("\n Preprocessing milestone complete for Method 2.")
print("chr2:", hap_chr2_out.relative_to(PROJECT_ROOT))
print("chr10:", hap_chr10_out.relative_to(PROJECT_ROOT))


 [CEU_chr2_5Mb] CONTROL haplotype histograms (phased-compatible)
 phased-compatible individuals total: 17
 control individuals used: 16 (=> 32 haplotypes)
 SNPs used: 118
   block 0: SNPs 0-79 | unique=31 | top1=2

 Saved â†’ data/processed/hapmap/haplotypes/CEU_chr2_5Mb.control_haplotypes.phased_compatible.json

 [CEU_chr10_1Mb] CONTROL haplotype histograms (phased-compatible)
 phased-compatible individuals total: 17
 control individuals used: 16 (=> 32 haplotypes)
 SNPs used: 129
   block 0: SNPs 0-79 | unique=13 | top1=9

 Saved â†’ data/processed/hapmap/haplotypes/CEU_chr10_1Mb.control_haplotypes.phased_compatible.json

 Preprocessing milestone complete for Method 2.
chr2: data/processed/hapmap/haplotypes/CEU_chr2_5Mb.control_haplotypes.phased_compatible.json
chr10: data/processed/hapmap/haplotypes/CEU_chr10_1Mb.control_haplotypes.phased_compatible.json
