# Phase 1 — Clean BRCA2 Data Integration (ClinVar GRCh38 + gnomAD)
This notebook rebuilds the BRCA2 merged dataset from scratch.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
BASE = '/content/drive/MyDrive/BRCA2-database-bias'
RAW = f'{BASE}/data/raw'
PROC = f'{BASE}/data/processed'
os.makedirs(RAW, exist_ok=True)
os.makedirs(PROC, exist_ok=True)
print('Paths ready.')

Paths ready.


## Install tabix + pysam

In [3]:
!apt-get update -y
!apt-get install -y tabix
!which tabix
!pip install -q pysam
import pysam

0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Waiting for header                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [2 InRelease 3,632 0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connected to r2u.s0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connected to r2u.s                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://cli.github.com/packages sta

## Download ClinVar GRCh38 VCF

In [4]:
import os

vcf = f'{RAW}/clinvar.vcf.gz'
tbi = f'{RAW}/clinvar.vcf.gz.tbi'

if not os.path.exists(vcf):
    !wget -q -O "$vcf" https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
if not os.path.exists(tbi):
    !wget -q -O "$tbi" https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi
print('ClinVar downloaded.')

ClinVar downloaded.


## Extract BRCA2 region

In [5]:
brca2_vcf = f'{RAW}/clinvar_brca2.vcf'
if os.path.exists(brca2_vcf):
    os.remove(brca2_vcf)
!tabix -h "$vcf" 13:32315000-32401000 > "$brca2_vcf"
print('Extracted BRCA2 region.')

Extracted BRCA2 region.


## bgzip + tabix index

In [6]:
bgz = f'{RAW}/clinvar_brca2.vcf.gz'
!bgzip -c "$brca2_vcf" > "$bgz"
!tabix -p vcf "$bgz"
print('Indexed BRCA2 VCF.')

Indexed BRCA2 VCF.


## Parse ClinVar BRCA2

In [7]:
import pysam, pandas as pd
parsed = f'{RAW}/clinvar_brca2_parsed.csv'
records=[]
vcf_in=pysam.VariantFile(bgz)
for rec in vcf_in.fetch():
    if rec.ref is None or rec.alts is None:
        continue
    csig=str(rec.info.get('CLNSIG'))
    review=str(rec.info.get('CLNREVSTAT'))
    for alt in rec.alts:
        if alt in [None,'.','']:
            continue
        records.append({
            'Chromosome': rec.chrom.replace('chr',''),
            'Start': rec.pos,
            'ReferenceAllele': rec.ref,
            'AlternateAllele': alt,
            'ClinicalSignificance': csig,
            'ReviewStatus': review
        })
df = pd.DataFrame(records)
df.to_csv(parsed,index=False)
print('Parsed',len(df),'variants.')

Parsed 20614 variants.


## Load gnomAD AF + Merge

In [8]:
gd = pd.read_csv(f'{PROC}/gnomad_brca2_af.csv')
cv = pd.read_csv(parsed)

cv['variant_key'] = cv['Chromosome'].astype(str)+'-'+cv['Start'].astype(str)+'-'+cv['ReferenceAllele']+'-'+cv['AlternateAllele']
gd['variant_key'] = gd['chrom'].astype(str)+'-'+gd['pos'].astype(str)+'-'+gd['ref']+'-'+gd['alt']

merged = cv.merge(gd,on='variant_key',how='left')
print('Merged:',len(merged))

Merged: 20614


## Clean AF columns + Add Derived Features

In [9]:
import math

for c in ['sas_af','eur_af','afr_af','eas_af','amr_af']:
    merged[c]=pd.to_numeric(merged[c],errors='coerce').fillna(0).clip(lower=0)

tiny=1e-12
merged['log10_sas_af']=(merged['sas_af'].replace(0,tiny)).apply(lambda x: math.log10(x))
merged['log10_eur_af']=(merged['eur_af'].replace(0,tiny)).apply(lambda x: math.log10(x))
merged['sas_eur_ratio']=(merged['sas_af'].replace(0,tiny))/(merged['eur_af'].replace(0,tiny))

out=f'{PROC}/brca2_merged.csv'
merged.to_csv(out,index=False)
print('Saved:',out)
for c in ['sas_af','eur_af','afr_af','eas_af','amr_af']:
    print(c,merged[c].notna().sum())

Saved: /content/drive/MyDrive/BRCA2-database-bias/data/processed/brca2_merged.csv
sas_af 20614
eur_af 20614
afr_af 20614
eas_af 20614
amr_af 20614
