
# 🧬 BRCA2 Merge Notebook (Final, Stable Version)

This notebook merges your **ClinVar** and **gnomAD** BRCA2 variant data into a unified dataset.

---

### ✅ Workflow
1. Mount Google Drive  
2. Load ClinVar (streamed for memory safety)  
3. Load gnomAD (flexible column detection)  
4. Merge datasets (outer join) on chromosome, position, reference allele, and alternate allele  
5. Save combined output  
6. View summary + sample rows

---

### 📘 Input Files
All paths below are relative to the project folder `/content/drive/MyDrive/BRCA2-database-bias`.

| File | Path | Source |
|------|------|---------|
| ClinVar | `data/raw/variant_summary.txt.gz` | NCBI ClinVar |
| gnomAD | `data/processed/gnomad_brca2_af.csv` | GraphQL API (v4) |

**Output:** `data/processed/brca2_merged_variants.csv`


In [None]:
# 1️⃣ Mount Google Drive and set up environment
from google.colab import drive
import os
import pandas as pd
import numpy as np

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# The project folder on Drive becomes the working directory, so the
# relative paths used for data and results resolve against it.
base_dir = "/content/drive/MyDrive/BRCA2-database-bias"
os.makedirs(base_dir, exist_ok=True)
os.chdir(base_dir)
print("✅ Working in:", os.getcwd())

# Create the input/output subfolders up front (no-op when they already exist).
folders = ["data/raw", "data/processed", "results"]
for subfolder in folders:
    target_dir = os.path.join(base_dir, subfolder)
    os.makedirs(target_dir, exist_ok=True)


In [None]:
# 2️⃣ Verify ClinVar file
clinvar_path = os.path.join(base_dir, "data/raw/variant_summary.txt.gz")

# Fail fast with an actionable message when the ClinVar file is absent.
if not os.path.exists(clinvar_path):
    raise FileNotFoundError("❌ ClinVar file not found! Upload to data/raw/variant_summary.txt.gz")

# Report the on-disk size (bytes -> MiB) as a quick sanity check.
size_mb = os.path.getsize(clinvar_path) / 2**20
print(f"✅ ClinVar file found ({size_mb:.2f} MB)")


In [None]:
# 3️⃣ Stream-load only BRCA2 variants from ClinVar (memory safe)
#
# Reads the gzipped ClinVar summary in chunks, keeps BRCA2 rows for a single
# genome build, and reduces them to the merge keys (chrom, pos, ref, alt)
# plus the clinical significance. Result: the `clinvar` DataFrame.
import pandas as pd

# Rows per chunk; the file is never held in memory as a whole.
CLINVAR_CHUNK_ROWS = 100_000

# Keep one genome build only, so the merge keys are not a mix of coordinate
# systems (the ClinVar README documents an `Assembly` column for this file).
# NOTE(review): GRCh38 is chosen to match gnomAD v4 — confirm against the
# gnomAD export that is actually merged.
CLINVAR_ASSEMBLY = 'GRCh38'

# Only these columns are parsed; everything else in the file is skipped.
clinvar_source_cols = {
    'GeneSymbol', 'Assembly', 'Chromosome', 'Start',
    'ReferenceAllele', 'AlternateAllele', 'ClinicalSignificance',
}

chunks = pd.read_csv(
    clinvar_path,
    sep='\t',
    compression='gzip',
    chunksize=CLINVAR_CHUNK_ROWS,
    # Callable form: a column missing from the file is simply not selected.
    usecols=lambda name: name in clinvar_source_cols,
    # Force text so a purely numeric chunk cannot yield 13 / 13.0 instead of '13'.
    dtype={'Chromosome': str},
    low_memory=False,
)
subset = []
rows_seen = 0

for chunk_no, chunk in enumerate(chunks):
    rows_seen += len(chunk)
    # astype(str) keeps the .str accessor valid even for an all-NaN chunk.
    keep = chunk['GeneSymbol'].astype(str).str.upper() == 'BRCA2'
    if 'Assembly' in chunk.columns:
        keep &= chunk['Assembly'] == CLINVAR_ASSEMBLY
    subset.append(chunk[keep])
    if chunk_no % 10 == 0:
        # Report rows actually read so far (including the current chunk).
        print(f"Processed {rows_seen:,} rows...")

if not subset:
    # pd.concat([]) would raise a much less helpful ValueError.
    raise ValueError("❌ ClinVar file contained no data rows.")

clinvar = pd.concat(subset, ignore_index=True)
print(f"✅ Extracted {clinvar.shape[0]:,} BRCA2 rows from ClinVar.")

# Clean and select columns
rename_map = {
    'Chromosome': 'chrom',
    'Start': 'pos',
    'ReferenceAllele': 'ref',
    'AlternateAllele': 'alt',
    'ClinicalSignificance': 'clin_significance'
}
clinvar = clinvar.rename(columns=rename_map)
clinvar = (
    clinvar[['chrom', 'pos', 'ref', 'alt', 'clin_significance']]
    .dropna()
    # Same de-duplication the gnomAD side applies before merging.
    .drop_duplicates()
)

clinvar['chrom'] = clinvar['chrom'].astype(str)
clinvar['pos'] = clinvar['pos'].astype(int)

# The literal string 'na' is not parsed as missing by pandas, so such rows
# survive dropna() but can never match a gnomAD ref/alt allele.
# NOTE(review): if this count is large, the VCF-style columns of
# variant_summary (PositionVCF / ReferenceAlleleVCF / AlternateAlleleVCF)
# may be the better merge keys — confirm against the ClinVar README.
na_allele_rows = int(((clinvar['ref'] == 'na') | (clinvar['alt'] == 'na')).sum())
if na_allele_rows:
    print(f"⚠️ {na_allele_rows:,} ClinVar rows have 'na' as ref/alt and cannot match gnomAD alleles.")

print("✅ ClinVar cleaned columns:", clinvar.columns.tolist())


In [None]:
# 4️⃣ Load gnomAD allele frequency data
#
# Loads the gnomAD CSV, maps its coordinate columns onto the canonical
# merge keys (chrom, pos, ref, alt), detects the frequency columns, and
# normalises key dtypes. Results: the `gnomad` DataFrame and `freq_cols`.
gnomad_path = os.path.join(base_dir, "data/processed/gnomad_brca2_af.csv")
if not os.path.exists(gnomad_path):
    raise FileNotFoundError("❌ gnomad_brca2_af.csv not found. Run BRCA2_Colab_gnomAD_Fetch.ipynb first.")

gnomad = pd.read_csv(gnomad_path)
print(f"✅ gnomAD loaded: {gnomad.shape[0]:,} rows, {gnomad.shape[1]} columns")

# Flexible rename
rename_map = {
    'Chromosome': 'chrom', 'chrom': 'chrom',
    'Position': 'pos', 'Pos': 'pos', 'position': 'pos',
    'Ref': 'ref', 'Reference': 'ref',
    'Alt': 'alt', 'Alternate': 'alt'
}
key_cols = ['chrom', 'pos', 'ref', 'alt']

# Match aliases case-insensitively (so e.g. 'POS' or 'CHROM' also work) and
# never map two source columns onto the same canonical name: duplicate
# column labels would break the column selection and the merge.
alias_lookup = {alias.lower(): canonical for alias, canonical in rename_map.items()}
taken_names = set(gnomad.columns)
applied_renames = {}
for col in gnomad.columns:
    canonical = alias_lookup.get(str(col).lower())
    if canonical is None or canonical == col or canonical in taken_names:
        continue
    applied_renames[col] = canonical
    taken_names.add(canonical)
gnomad = gnomad.rename(columns=applied_renames)

# Fail with an actionable message instead of an opaque KeyError further down.
missing_keys = [c for c in key_cols if c not in gnomad.columns]
if missing_keys:
    raise KeyError(
        f"❌ gnomAD file is missing required column(s) {missing_keys}; "
        f"found columns: {gnomad.columns.tolist()}"
    )

# Detect frequency columns: anything mentioning 'allele', plus 'af',
# 'af_*' and '*_af' in any letter case. Key columns are never included.
# NOTE(review): the 'af' patterns assume such columns hold allele
# frequencies — verify against the fetch notebook's output schema.
freq_cols = [
    c for c in gnomad.columns
    if c not in key_cols
    and (
        'allele' in c.lower()
        or c.lower() == 'af'
        or c.lower().startswith('af_')
        or c.lower().endswith('_af')
    )
]
if not freq_cols:
    print("⚠️ No frequency columns detected — the merged dataset will carry no gnomAD frequencies.")
print(f"✅ Detected {len(freq_cols)} frequency columns:", freq_cols)

keep_cols = key_cols + freq_cols
# Rows without a complete key cannot be merged (and NaN breaks the int cast).
gnomad = gnomad[keep_cols].dropna(subset=key_cols)

# Normalise chromosome labels to the bare form ('chr13' / 13 / 13.0 -> '13')
# so they compare equal to labels that carry no 'chr' prefix.
chrom_labels = (
    gnomad['chrom'].astype(str).str.strip()
    .str.replace(r'^chr', '', regex=True)
    .str.replace(r'\.0$', '', regex=True)
)
gnomad = gnomad.assign(
    chrom=chrom_labels,
    pos=gnomad['pos'].astype(int),
).drop_duplicates()


In [None]:
# 5️⃣ Merge ClinVar + gnomAD
# Outer join on the full variant key; indicator=True adds a `_merge` column
# recording whether each row came from ClinVar, gnomAD, or both.
join_keys = ['chrom', 'pos', 'ref', 'alt']
merged = clinvar.merge(gnomad, how='outer', on=join_keys, indicator=True)
n_rows, n_cols = merged.shape
print(f"✅ Merged dataset: {n_rows:,} rows, {n_cols} columns")

# Fill missing frequency columns with 0 (only those present after the merge).
zero_fill = {col: 0 for col in freq_cols if col in merged.columns}
if zero_fill:
    merged = merged.fillna(value=zero_fill)

# Persist the combined table under data/processed.
output_path = os.path.join(base_dir, "data/processed/brca2_merged_variants.csv")
merged.to_csv(output_path, index=False)
print(f"💾 Saved merged dataset to: {output_path}")


In [None]:
# 6️⃣ Summary and preview
# Prints headline counts for the merged table and shows a reproducible
# random sample of rows.
print("\n=== Dataset Summary ===")
print("Total variants:", len(merged))
print("Unique chromosomes:", merged['chrom'].nunique())
print("Missing clinical significance:", merged['clin_significance'].isna().sum())
if freq_cols:
    print("Missing frequency entries:", merged[freq_cols].isna().sum().sum())

# Any frequency column that was zero-filled before this point no longer
# shows up as missing above, so report the source overlap from the merge
# indicator instead (present when the merge used indicator=True).
if '_merge' in merged.columns:
    print("Variants in ClinVar only:", int((merged['_merge'] == 'left_only').sum()))
    print("Variants in gnomAD only:", int((merged['_merge'] == 'right_only').sum()))
    print("Variants in both sources:", int((merged['_merge'] == 'both').sum()))

print("\n=== Sample Rows ===")
# DataFrame.sample raises ValueError when asked for more rows than exist,
# so cap the sample size at the table length.
sample_size = min(10, len(merged))
if sample_size:
    display(merged.sample(sample_size, random_state=42))
else:
    print("(merged dataset is empty)")
