# ClinVar

The original data was downloaded from the ClinVar [`variant_summary.txt.gz`](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz) file (FTP link) on 2/14/24.

In [1]:
import pandas as pd
import re 
import table_cleaning_functions as tcf

# Preliminary Wrangling

In [2]:
# Read the raw ClinVar variant summary (tab-delimited) into a DataFrame, then
# show its shape and first rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, at the cost of more memory while parsing.
# NOTE(review): the saved output shows a warning pointing at this read_csv
# call, presumably pandas' mixed-type DtypeWarning -- confirm it is gone after
# this change.
clinvar_raw = pd.read_csv(
    '../home/data/raw_data/GlyGen/clinvar_raw.txt',
    sep = '\t',
    low_memory = False,
)
print(clinvar_raw.shape)
clinvar_raw.head()

  clinvar_raw = pd.read_csv('../home/data/raw_data/GlyGen/clinvar_raw.txt', sep = '\t')


(4807886, 40)


Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity
0,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,0,-,397704705,...,2,4820844,GGAT,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-
1,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,0,-,397704705,...,2,4781213,GGAT,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-
2,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,0,"Jun 29, 2010",397704709,...,3,4827360,GCTGCTGGACCTGCC,G,-,-,-,-,-,-
3,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,0,"Jun 29, 2010",397704709,...,3,4787729,GCTGCTGGACCTGCC,G,-,-,-,-,-,-
4,15043,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,4,85342440,G,A,-,-,-,-,-,-


In [3]:
# Count the missing values in each column of the raw table.
clinvar_raw.isna().sum()

#AlleleID                             0
Type                                  0
Name                                  0
GeneID                                0
GeneSymbol                            0
HGNC_ID                               0
ClinicalSignificance                  0
ClinSigSimple                         0
LastEvaluated                         0
RS# (dbSNP)                           0
nsv/esv (dbVar)                       0
RCVaccession                          0
PhenotypeIDS                          0
PhenotypeList                         0
Origin                                0
OriginSimple                          0
Assembly                              0
ChromosomeAccession                   0
Chromosome                            0
Start                                 0
Stop                                  0
ReferenceAllele                       0
AlternateAllele                       0
Cytogenetic                           0
ReviewStatus                          0


Drop unnecessary columns.

In [4]:
# Display the column labels of the raw table.
clinvar_raw.columns

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories', 'VariationID',
       'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF',
       'SomaticClinicalImpact', 'SomaticClinicalImpactLastEvaluated',
       'ReviewStatusClinicalImpact', 'Oncogenicity',
       'OncogenicityLastEvaluated', 'ReviewStatusOncogenicity'],
      dtype='object')

In [5]:
# Materialise the column labels as a plain Python list and display it.
cols = clinvar_raw.columns.tolist()
cols

['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'PositionVCF',
 'ReferenceAlleleVCF',
 'AlternateAlleleVCF',
 'SomaticClinicalImpact',
 'SomaticClinicalImpactLastEvaluated',
 'ReviewStatusClinicalImpact',
 'Oncogenicity',
 'OncogenicityLastEvaluated',
 'ReviewStatusOncogenicity']

In [7]:
# Keep only the four columns this notebook works with; every other label in
# `cols` is collected into `drop_cols` and removed from the table.
keep_cols = {'GeneSymbol', 'RS# (dbSNP)', 'PhenotypeList', 'VariationID'}
drop_cols = [label for label in cols if label not in keep_cols]
clinvar_raw = clinvar_raw.drop(columns = drop_cols)
clinvar_raw.head()

Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID
0,AP5Z1,397704705,Hereditary spastic paraplegia 48,2
1,AP5Z1,397704705,Hereditary spastic paraplegia 48,2
2,AP5Z1,397704709,Hereditary spastic paraplegia 48,3
3,AP5Z1,397704709,Hereditary spastic paraplegia 48,3
4,ZNF592,150829393,Galloway-Mowat syndrome 1,4


We are only interested in cancer-related rows for now, so filter on cancer-related conditions.

In [8]:
# Substrings used to recognise a cancer-related phenotype.
cancer_related = [
    'cancer', 'carcinoma', 'leukemia', 'tumor', 'malignancy',
    'glioblastoma', 'melanoma', 'lymphoma', 'sarcoma',
]

# Join the terms into a single alternation pattern and keep every row whose
# PhenotypeList matches any of them, ignoring case; missing values are treated
# as non-matches.
cancer_pattern = '|'.join(cancer_related)
is_cancer_row = clinvar_raw['PhenotypeList'].str.contains(cancer_pattern, case = False, na = False)
clinvar_raw = clinvar_raw[is_cancer_row]
print(clinvar_raw.shape)
clinvar_raw.head()

(422117, 4)


Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID
12,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
13,HFE,1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
206,TMEM127,121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
207,TMEM127,121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
528,KLHDC8B,387906223,Classic Hodgkin lymphoma,273


In [9]:
# Count the missing values per column after the cancer filter.
clinvar_raw.isna().sum()

GeneSymbol       0
RS# (dbSNP)      0
PhenotypeList    0
VariationID      0
dtype: int64

Re-format rs ID column.

In [10]:
# Turn each dbSNP number into an 'rs'-prefixed string ID and display the column.
# Any 'rs' prefix already present is stripped first, so running this cell more
# than once still yields 'rs<number>' rather than 'rsrs<number>'.
rs_numbers = clinvar_raw['RS# (dbSNP)'].astype(str).str.removeprefix('rs')
clinvar_raw['RS# (dbSNP)'] = 'rs' + rs_numbers
clinvar_raw['RS# (dbSNP)']

12           rs1800562
13           rs1800562
206        rs121908830
207        rs121908830
528        rs387906223
              ...     
4807791           rs-1
4807792           rs-1
4807793           rs-1
4807794           rs-1
4807795           rs-1
Name: RS# (dbSNP), Length: 422117, dtype: object

In [12]:
# Show the rows whose rs ID contains '-1'.
# NOTE(review): '-1' is presumably ClinVar's marker for a missing dbSNP ID -- confirm.
has_minus_one = clinvar_raw['RS# (dbSNP)'].str.contains('-1')
clinvar_raw[has_minus_one]

Unnamed: 0,GeneSymbol,RS# (dbSNP),PhenotypeList,VariationID
673,NF1,rs-1,Juvenile myelomonocytic leukemia,351
1583,APC,rs-1,Desmoid tumor caused by somatic mutation,838
2999,RSPO1,rs-1,Palmoplantar keratoderma-XX sex reversal-predi...,1604
3000,RSPO1,rs-1,Palmoplantar keratoderma-XX sex reversal-predi...,1604
6007,CDC73,rs-1,Hyperparathyroidism 2 with jaw tumors,3269
...,...,...,...,...
4807791,TP53,rs-1,"Prostate cancer, hereditary, 1",2687727
4807792,TP53,rs-1,"Prostate cancer, hereditary, 1",2687728
4807793,TP53,rs-1,"Prostate cancer, hereditary, 1",2687728
4807794,TP53,rs-1,"Prostate cancer, hereditary, 1",2687729


In [13]:
# Discard every row whose rs ID contains '-1', then report the new shape and
# how often each remaining rs ID occurs.
has_minus_one = clinvar_raw['RS# (dbSNP)'].str.contains('-1')
clinvar_raw = clinvar_raw[~has_minus_one]
print(clinvar_raw.shape)
clinvar_raw['RS# (dbSNP)'].value_counts()

(314386, 4)


RS# (dbSNP)
rs11309117      30
rs144131869     26
rs5901000       24
rs763704682     22
rs34003473      14
                ..
rs2072259508     1
rs2137537139     1
rs1281724812     1
rs2137567485     1
rs45445694       1
Name: count, Length: 129423, dtype: int64

Rename columns. 

In [14]:
# Rename the columns with an explicit old-name -> new-name mapping, so each
# label is matched by name rather than by position.
column_names = {
    'GeneSymbol': 'Gene',
    'RS# (dbSNP)': 'rs_id',
    'PhenotypeList': 'disease',
    'VariationID': 'variation_id',
}
clinvar_raw = clinvar_raw.rename(columns = column_names)
# Keep `cols` holding the current column labels, as it did before this change.
cols = list(clinvar_raw.columns)
clinvar_raw.head()

Unnamed: 0,Gene,rs_id,disease,variation_id
12,HFE,rs1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
13,HFE,rs1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
206,TMEM127,rs121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
207,TMEM127,rs121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
528,KLHDC8B,rs387906223,Classic Hodgkin lymphoma,273


Normalize condition values (lower-case them and strip surrounding whitespace) before isolating them.

In [24]:
# Lower-case the disease strings, then trim leading and trailing whitespace.
disease_lower = clinvar_raw['disease'].str.lower()
clinvar_raw['disease'] = disease_lower.str.strip()
clinvar_raw.head()

Unnamed: 0,Gene,rs_id,disease,variation_id
12,HFE,rs1800562,hemochromatosis type 1|hereditary cancer-predi...,9
13,HFE,rs1800562,hemochromatosis type 1|hereditary cancer-predi...,9
206,TMEM127,rs121908830,"pheochromocytoma, susceptibility to|pheochromo...",108
207,TMEM127,rs121908830,"pheochromocytoma, susceptibility to|pheochromo...",108
528,KLHDC8B,rs387906223,classic hodgkin lymphoma,273


Isolate condition data for cleaning. 

In [25]:
# Collect the distinct disease strings into their own Series.
conditions = clinvar_raw['disease'].drop_duplicates()
# The export below is left disabled.
# NOTE(review): its target path is the same file that a later cell reads back
# as the cleaned conditions, so re-enabling it would presumably overwrite any
# manual edits -- confirm before uncommenting.
# conditions.to_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_conditions.tsv', sep = '\t', index = False)

In [26]:
# Number of distinct disease strings that were isolated.
conditions.shape

(17504,)

Save processed data and isolate conditions to clean manually. 

In [17]:
# Write the processed table to disk as a tab-separated file without the index.
# NOTE(review): this cell's execution count is lower than that of the
# lower-casing cell, and the reloaded file's preview shows capitalised disease
# names, so the file on disk was presumably written before normalisation ran.
# Re-run the notebook top to bottom to refresh it.
processed_path = '../home/data/processed_data/GlyGen/clinvar/clinvar_processed.tsv'
clinvar_raw.to_csv(processed_path, sep = '\t', index = False)

# Secondary Wrangling

In [18]:
# Load the processed, tab-separated ClinVar table back from disk and preview it.
processed_path = '../home/data/processed_data/GlyGen/clinvar/clinvar_processed.tsv'
clinvar_processed = pd.read_csv(processed_path, sep = '\t')
clinvar_processed.head()

Unnamed: 0,Gene,rs_id,disease,variation_id
0,HFE,rs1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
1,HFE,rs1800562,Hemochromatosis type 1|Hereditary cancer-predi...,9
2,TMEM127,rs121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
3,TMEM127,rs121908830,"Pheochromocytoma, susceptibility to|Pheochromo...",108
4,KLHDC8B,rs387906223,Classic Hodgkin lymphoma,273


In [19]:
# Count the missing values per column in the reloaded table.
clinvar_processed.isna().sum()

Gene            0
rs_id           0
disease         0
variation_id    0
dtype: int64

In [None]:
# Read the cleaned condition data from the tab-separated clinvar_conditions.tsv file.
# NOTE(review): presumably this file was cleaned by hand outside the notebook -- confirm.
conditions_clean = pd.read_csv('../home/data/processed_data/GlyGen/clinvar/clinvar_conditions.tsv', sep = '\t')