In [2]:
# Read only the first five data rows of the tab-separated summary file;
# that is enough to expose the column names without parsing the entire file.
df_preview = pd.read_csv("variant_summary.txt", sep="\t", nrows=5)
list(df_preview.columns)

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'PositionVCF',
 'ReferenceAlleleVCF',
 'AlternateAlleleVCF',
 'SomaticClinicalImpact',
 'SomaticClinicalImpactLastEvaluated',
 'ReviewStatusClinicalImpact',
 'Oncogenicity',
 'OncogenicityLastEvaluated',
 'ReviewStatusOncogenicity',
 'SCVsForAggregateGermlineClassification',
 'SCVsForAggregateSomaticClinicalImpact',
 'SCVsForAggregateOncogenicityClassification']

In [3]:
# Subset of variant_summary.txt columns to keep when the table is loaded.
usecols = ['GeneSymbol', 'Type', 'ClinicalSignificance',
           'ReviewStatus', 'Name', 'Assembly']

In [4]:
# Step 1: Read the six columns of interest from the ClinVar summary table.
usecols = ['GeneSymbol', 'Type', 'ClinicalSignificance',
           'ReviewStatus', 'Name', 'Assembly']
df_all = pd.read_csv("variant_summary.txt", sep='\t', usecols=usecols, low_memory=False)

# Step 2: Keep rows that satisfy all three conditions:
#   - gene symbol is exactly BRCA1 or BRCA2
#   - clinical significance is exactly "Pathogenic" or "Benign"
#     (combined or qualified labels do not match and are excluded)
#   - coordinates are reported on the GRCh38 assembly
# .copy() detaches the result from df_all so later column assignments are safe.
df_brca_large = df_all.loc[
    lambda table: table['GeneSymbol'].isin(['BRCA1', 'BRCA2'])
    & table['ClinicalSignificance'].isin(['Pathogenic', 'Benign'])
    & table['Assembly'].eq('GRCh38')
].copy()

# Step 3: Show the first rows of the filtered frame.
df_brca_large.head()

Unnamed: 0,Type,Name,GeneSymbol,ClinicalSignificance,Assembly,ReviewStatus
17017,Deletion,NM_000059.4(BRCA2):c.7004_7007+2del,BRCA2,Pathogenic,GRCh38,"criteria provided, multiple submitters, no con..."
17019,Deletion,NM_000059.4(BRCA2):c.6275_6276del (p.Leu2092fs),BRCA2,Pathogenic,GRCh38,reviewed by expert panel
17021,Deletion,NM_000059.4(BRCA2):c.6591_6592del (p.Glu2198fs),BRCA2,Pathogenic,GRCh38,reviewed by expert panel
17023,Microsatellite,NM_000059.4(BRCA2):c.5722_5723del (p.Leu1908fs),BRCA2,Pathogenic,GRCh38,reviewed by expert panel
17025,Deletion,NM_000059.3(BRCA2):c.2808_2811del (p.Ala938Profs),BRCA2,Pathogenic,GRCh38,reviewed by expert panel


In [5]:
# Steps 4-5: Derive two integer-coded columns and discard unmapped rows.
#   label: 1 = Pathogenic, 0 = Benign   (target for ML)
#   gene:  0 = BRCA1,      1 = BRCA2
# Any value missing from a mapping becomes NaN; the dropna() removes such
# rows as a safeguard.
df_brca_large = (
    df_brca_large
    .assign(
        label=df_brca_large['ClinicalSignificance'].map({'Pathogenic': 1, 'Benign': 0}),
        gene=df_brca_large['GeneSymbol'].map({'BRCA1': 0, 'BRCA2': 1}),
    )
    .dropna(subset=['label', 'gene'])
)

# Preview the source columns next to their encoded counterparts.
df_brca_large.head()[['Name', 'GeneSymbol', 'ClinicalSignificance', 'label', 'gene']]

Unnamed: 0,Name,GeneSymbol,ClinicalSignificance,label,gene
17017,NM_000059.4(BRCA2):c.7004_7007+2del,BRCA2,Pathogenic,1,1
17019,NM_000059.4(BRCA2):c.6275_6276del (p.Leu2092fs),BRCA2,Pathogenic,1,1
17021,NM_000059.4(BRCA2):c.6591_6592del (p.Glu2198fs),BRCA2,Pathogenic,1,1
17023,NM_000059.4(BRCA2):c.5722_5723del (p.Leu1908fs),BRCA2,Pathogenic,1,1
17025,NM_000059.3(BRCA2):c.2808_2811del (p.Ala938Profs),BRCA2,Pathogenic,1,1


In [6]:
# Persist the labelled BRCA table to CSV; index=False omits the row index
# (the original row positions from the full file) from the output.
df_brca_large.to_csv(path_or_buf="brca_clinvar_large.csv", index=False)

In [7]:
# Colab-only: google.colab is available only inside a Google Colab runtime.
# Hands the exported CSV to Colab's download helper so it can be saved locally.
from google.colab import files
files.download("brca_clinvar_large.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# Colab-only: mount Google Drive at /content/drive in the runtime's filesystem.
# NOTE(review): presumably this triggers an interactive authorization prompt — confirm before using in unattended runs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
