# 🧠 BRCA2 South-Asian Database Bias Project
### Phase 1 — Download BRCA2 allele frequencies from gnomAD (via GraphQL API)
This Colab notebook automates the setup and running of the GraphQL query script for gnomAD BRCA2 allele frequencies.

In [None]:
# ==============================================================
# 1️⃣ Mount Google Drive
# ==============================================================
from google.colab import drive
drive.mount('/content/drive')

# --------------------------------------------------------------
# 2️⃣ Set up working directory
# --------------------------------------------------------------
project_dir = "/content/drive/MyDrive/BRCA-DataBias"
%cd $project_dir

# --------------------------------------------------------------
# 3️⃣ Install required Python packages
# --------------------------------------------------------------
!pip install -q pandas numpy requests tqdm

# --------------------------------------------------------------
# 4️⃣ Create the query_gnomad_graphql.py script (rewritten on every run)
# --------------------------------------------------------------
import os

os.makedirs("scripts", exist_ok=True)

# Source of the standalone download script. It is kept as a string so the
# notebook can regenerate scripts/query_gnomad_graphql.py in the working
# directory. Inner docstrings and the query use ''' because this literal
# is delimited by triple double quotes.
script_code = """import os
import time

import pandas as pd
import requests

# Download per-population allele frequencies for the region below from the
# gnomAD GraphQL API and save them to data/processed/gnomad_brca2_af.csv.

API = "https://gnomad.broadinstitute.org/api"

# Region scanned on chromosome 13 (GRCh38, per the query below).
CHR = "13"
GENE_START = 32315086
GENE_END   = 32400268
# The region is fetched in consecutive windows of this many bases.
WINDOW = 5000

# Output columns, fixed up front so an empty result still yields a valid CSV.
COLUMNS = [
    "variant_id", "chrom", "pos", "ref", "alt", "consequence",
    "sas_af", "eur_af", "afr_af", "eas_af", "amr_af",
]

QUERY = '''
query Region($chrom: String!, $start: Int!, $stop: Int!) {
  region(chrom: $chrom, start: $start, stop: $stop, reference_genome: GRCh38) {
    variants(dataset: gnomad_r4) {
      variant_id
      pos
      ref
      alt
      consequence
      exome { populations { id ac an } }
      genome { populations { id ac an } }
    }
  }
}
'''

def graphql(q, v, retries=3):
    '''POST GraphQL query q with variables v and return the decoded JSON reply.

    Network and HTTP failures are retried with a growing pause and the last
    failure is re-raised. A reply carrying a GraphQL "errors" entry raises
    RuntimeError so that a failed window is never mistaken for an empty one.
    '''
    for attempt in range(1, retries + 1):
        try:
            r = requests.post(API, json={"query": q, "variables": v}, timeout=60)
            r.raise_for_status()
            break
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(2 ** attempt)
    payload = r.json()
    if payload.get("errors"):
        raise RuntimeError(f"gnomAD API error for {v}: {payload['errors']}")
    return payload

rows = []
s = GENE_START
while s <= GENE_END:
    e = min(s + WINDOW - 1, GENE_END)
    data = graphql(QUERY, {"chrom": CHR, "start": s, "stop": e})
    # "data" can be present but null, so fall back before the nested lookup.
    region = (data.get("data") or {}).get("region") or {}
    for v in region.get("variants") or []:
        pops = {}
        for layer in ("exome", "genome"):
            layer_data = v.get(layer) or {}
            for p in (layer_data.get("populations") or []):
                ac, an = p.get("ac") or 0, p.get("an") or 0
                af = (ac / an) if an else 0.0
                pid = p.get("id")
                # Keep the larger of the exome and genome frequencies.
                # NOTE(review): this is not the pooled (exome + genome)
                # ac / an; confirm the larger value is what the analysis wants.
                pops[pid] = max(pops.get(pid, 0), af)
        rows.append({
            "variant_id": v["variant_id"],
            "chrom": CHR, "pos": v["pos"], "ref": v["ref"], "alt": v["alt"],
            "consequence": v.get("consequence"),
            "sas_af": pops.get("sas", 0.0),
            "eur_af": pops.get("nfe", 0.0) or pops.get("eur", 0.0),
            "afr_af": pops.get("afr", 0.0),
            "eas_af": pops.get("eas", 0.0),
            "amr_af": pops.get("amr", 0.0)
        })
    s = e + 1
    # Short pause between windows to go easy on the API.
    time.sleep(0.25)

os.makedirs("data/processed", exist_ok=True)
out_csv = "data/processed/gnomad_brca2_af.csv"
# Explicit columns keep drop_duplicates from failing when no rows were found.
pd.DataFrame(rows, columns=COLUMNS).drop_duplicates("variant_id").to_csv(out_csv, index=False)
print("✅ Saved:", out_csv)
"""

# Python reads source files as UTF-8, and the script contains a non-ASCII
# character, so write it as UTF-8 regardless of the platform default.
with open("scripts/query_gnomad_graphql.py", "w", encoding="utf-8") as f:
    f.write(script_code)

# --------------------------------------------------------------
# 5️⃣ Run the script
# --------------------------------------------------------------
!python scripts/query_gnomad_graphql.py

# --------------------------------------------------------------
# 6️⃣ Verify the output file
# --------------------------------------------------------------
import pandas as pd
out_path = os.path.join(project_dir, "data/processed/gnomad_brca2_af.csv")
df = pd.read_csv(out_path)
print("✅ File created at:", out_path)
df.head()
