
# 🧬 BRCA2 gnomAD Fetch (v4 API) - Final Fixed Version

This notebook runs the corrected **`query_gnomad_graphql_updated.py`** script, which now uses the **gnomAD v4 GraphQL API** to fetch BRCA2 variants including population-specific allele frequencies.

---

### ✅ It will:
1. Mount Google Drive  
2. Create (if missing) and switch to the project folder `/content/drive/MyDrive/BRCA2-database-bias`  
3. Run the GraphQL query script  
4. Save the results to `data/processed/gnomad_brca2_af.csv` (inside the project folder)  
5. Verify and preview the results


In [None]:
# 1️⃣ Mount Google Drive and prepare the project workspace
import os

from google.colab import drive

# Expose the user's Drive under /content/drive.
drive.mount('/content/drive')

# Project root on Drive: created on first use, then made the working directory
# so that relative paths used afterwards resolve inside the project.
base_dir = "/content/drive/MyDrive/BRCA2-database-bias"
os.makedirs(base_dir, exist_ok=True)
os.chdir(base_dir)

print("✅ Working in:", os.getcwd())

# Make sure the sub-folders used by this notebook exist.
folders = ["data/processed", "scripts"]
for subfolder in folders:
    os.makedirs(os.path.join(base_dir, subfolder), exist_ok=True)


In [None]:
# 2️⃣ Run updated GraphQL fetch script
#
# The script runs as a child process and its exit status is checked, so the
# success message is printed only when the script really finished without error
# (a shell `|| echo ...` fallback would hide the failure from the notebook).
import subprocess
import sys

# Relative path: resolved against the current working directory.
fetch_script = "scripts/query_gnomad_graphql_updated.py"

print("🚀 Running updated gnomAD GraphQL v4 script...")

# Use the same interpreter as the notebook kernel. "-u" disables output
# buffering in the child so progress lines show up as they are produced, and
# stderr is merged into stdout so tracebacks appear in order with other output.
with subprocess.Popen(
    [sys.executable, "-u", fetch_script],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    encoding="utf-8",
    errors="replace",
) as fetch_proc:
    for output_line in fetch_proc.stdout:
        print(output_line, end="")

# Leaving the `with` block waits for the child, so the return code is final here.
fetch_returncode = fetch_proc.returncode

if fetch_returncode == 0:
    print("✅ Script completed.")
else:
    print("⚠️ Script failed — check API or folder paths.")
    print(f"   Exit code: {fetch_returncode}")


In [None]:
# 3️⃣ Check that gnomad_brca2_af.csv exists and preview its contents
import pandas as pd

output_path = os.path.join(base_dir, "data/processed/gnomad_brca2_af.csv")

if not os.path.exists(output_path):
    print("❌ Output file not found! Please check if the API query ran successfully.")
else:
    df = pd.read_csv(output_path)
    n_rows, n_cols = df.shape
    print(f"✅ File loaded successfully: {n_rows:,} rows, {n_cols} columns\n")
    print("=== Column names ===")
    print(df.columns.tolist())

    # Population frequency columns are recognised by their "AF_" prefix.
    pop_cols = [name for name in df.columns if name.startswith("AF_")]
    if not pop_cols:
        print("⚠️ No population-level AF columns found.")
    else:
        print(f"✅ Detected {len(pop_cols)} population frequency columns:")
        print(pop_cols)

    # Show the first ten rows as a rendered table.
    print("\n=== Sample Rows ===")
    display(df.head(10))


In [None]:
# 4️⃣ Summary statistics and missing-value check for the loaded dataset
if 'df' not in locals():
    # Nothing to summarise: no DataFrame named `df` exists in this session.
    print("⚠️ Dataset not loaded.")
else:
    print("\n=== Dataset Summary ===")
    summary = df.describe(include='all')
    print(summary)

    # Count missing values per column and report only the columns that have any.
    na_counts = df.isna().sum()
    has_missing = na_counts > 0
    print("\n=== Missing Values by Column ===")
    print(na_counts[has_missing])
