In [457]:
import pandas as pd
import numpy as np
import table_cleaning_functions as tcf 

Open the processed ClinVar TSV

In [458]:
# Load the processed ClinVar table (tab-separated) and preview it.
# Column notes: rs_id is the same as SNPS; variation_id is the value used
# later in this notebook to build the evidence source.
clinvar_processed_path = '../home/data/processed_data/GlyGen/clinvar/clinvar_processed.tsv'
clinvar_processed = pd.read_csv(clinvar_processed_path, sep='\t')
print(clinvar_processed.shape)
clinvar_processed.head()

(451062, 4)


Unnamed: 0,gene,rs_id,variation_id,disease
0,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
1,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
2,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
3,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
4,KLHDC8B,rs387906223,273,classic hodgkin lymphoma


Load clean conditions

In [459]:
# Read the three curated lookup tables; all of them are tab-separated.
#   clinvar_conditions         - used below via its 'disease', 'condition' and 'doid' columns
#   clinvar_drop_conditions    - used below via its 'condition' column (labels to discard)
#   clinvar_explode_conditions - used below via its 'raw_disease' and 'list_diseases' columns
tsv_options = {'sep': '\t'}
clinvar_conditions = pd.read_csv(
    '../home/data/processed_data/GlyGen/clinvar/clinvar_conditions.tsv', **tsv_options
)
clinvar_drop_conditions = pd.read_csv(
    '../home/data/processed_data/GlyGen/clinvar/clinvar_drop_conditions.tsv', **tsv_options
)
clinvar_explode_conditions = pd.read_csv(
    '../home/data/processed_data/GlyGen/clinvar/clinvar_explode_conditions.tsv', **tsv_options
)

In [460]:
# Discard every row whose 'disease' label appears in the curated drop list.
drop_conditions = set(clinvar_drop_conditions['condition'])
is_dropped = clinvar_processed['disease'].isin(drop_conditions)
clinvar_processed = clinvar_processed[~is_dropped]

In [461]:
# Rows whose 'disease' label is listed as a 'raw_disease' in the explode table,
# i.e. labels that stand for more than one condition.
explode_labels = set(clinvar_explode_conditions['raw_disease'])
in_explode_table = clinvar_processed['disease'].isin(explode_labels)
clinvar_explode_condition_rows = clinvar_processed[in_explode_table]
print(clinvar_explode_condition_rows.shape)
clinvar_explode_condition_rows.head()

(4228, 4)


Unnamed: 0,gene,rs_id,variation_id,disease
330,ATM,rs587776547,3019,breast and/or ovarian cancer
335,ATM,rs587776547,3019,breast and/or ovarian cancer
338,ATM,rs774925473,3021,breast and/or ovarian cancer
342,ATM,rs774925473,3021,breast and/or ovarian cancer
353,ATM,rs28904921,3023,breast and/or ovarian cancer


In [462]:
# Rows whose 'disease' label has an entry in the single-condition lookup table.
single_labels = set(clinvar_conditions['disease'])
in_conditions_table = clinvar_processed['disease'].isin(single_labels)
clinvar_single_condition_rows = clinvar_processed[in_conditions_table]
print(clinvar_single_condition_rows.shape)
clinvar_single_condition_rows.head()

(445544, 4)


Unnamed: 0,gene,rs_id,variation_id,disease
0,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
1,HFE,rs1800562,9,hereditary cancer-predisposing syndrome
2,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
3,TMEM127,rs121908830,108,hereditary cancer-predisposing syndrome
4,KLHDC8B,rs387906223,273,classic hodgkin lymphoma


In [463]:
# Sanity check: the explode-condition subset and the single-condition subset
# together must account for every row left after the drop filter, so their
# row counts have to add up to the size of the filtered table.
n_processed = len(clinvar_processed)
n_explode = len(clinvar_explode_condition_rows)
n_single = len(clinvar_single_condition_rows)

# Report the three counts on failure so the mismatch can be diagnosed directly.
assert n_processed == n_explode + n_single, (
    f'row counts do not add up: {n_processed} processed rows vs '
    f'{n_explode} explode-condition + {n_single} single-condition rows'
)

Map condition to disease with clinvar_single_condition_rows using clinvar_conditions

In [464]:
# Attach the lookup columns from clinvar_conditions to each single-condition
# row with a left join on 'disease'.
clinvar_single_condition_list_merged = clinvar_single_condition_rows.merge(
    clinvar_conditions, how='left', on='disease'
)

# A left join emits one output row per matching lookup row, so a 'disease'
# value that occurs more than once in clinvar_conditions would silently
# duplicate variants. Fail loudly if the row count changed.
assert len(clinvar_single_condition_list_merged) == len(clinvar_single_condition_rows), (
    'merge changed the row count; check clinvar_conditions for duplicate disease values'
)

# Show a random sample of the merged frame
clinvar_single_condition_list_merged.sample(5)


Unnamed: 0,gene,rs_id,variation_id,disease,condition,doid
201168,SDHC,rs1553261768,480864,hereditary cancer-predisposing syndrome,cancer,:162
351669,KIT,rs1720996900,946104,hereditary cancer-predisposing syndrome,cancer,:162
330441,NF1,rs2066927450,834162,hereditary cancer-predisposing syndrome,cancer,:162
321511,BRCA2,rs1555280966,825144,hereditary breast ovarian cancer syndrome,hereditary breast ovarian cancer syndrome,:5683
323003,ALK,rs368744524,850828,hereditary cancer-predisposing syndrome,cancer,:162


Ensure that the nulls introduced by the mapping step come only from 'disease' values whose 'condition' is null in clinvar_conditions

In [465]:
# 'disease' labels whose 'condition' is null in the lookup table
null_conditions_diseases = clinvar_conditions.loc[clinvar_conditions['condition'].isnull(), 'disease']

# Number of such labels
print(len(null_conditions_diseases))

# Number of merged rows whose 'disease' is one of those labels
null_conditions_diseases_counts = clinvar_single_condition_list_merged['disease'].isin(null_conditions_diseases).sum()

# Show that count next to the per-column null counts of the merged frame
print(r'nulls in clinvar_conditions condition column causing nulls in clinvar_merged:',null_conditions_diseases_counts)
print(clinvar_single_condition_list_merged.isnull().sum())

# Enforce the comparison instead of relying on reading the two printed numbers:
# every null 'condition' in the merged frame must trace back to a lookup entry
# whose 'condition' is null.
merged_null_conditions = clinvar_single_condition_list_merged['condition'].isnull().sum()
assert null_conditions_diseases_counts == merged_null_conditions, (
    f'{merged_null_conditions} null conditions after the merge, but only '
    f'{null_conditions_diseases_counts} rows map to a null condition in clinvar_conditions'
)

110
nulls in clinvar_conditions condition column causing nulls in clinvar_merged: 35923
gene                0
rs_id               0
variation_id        0
disease             0
condition       35923
doid                0
dtype: int64


In [466]:
# Normalise every id in 'doid' to the canonical 'DOID:NNN' form.
#
# The previous version tested for a lowercase 'doid:' prefix but wrote an
# uppercase 'DOID:' one, so an id that already carried a lowercase prefix was
# left lowercase, and an id without any colon was turned into NaN by
# .str.split(':').str[1]. Stripping an optional, case-insensitive 'doid' plus
# the colon from the start of the string handles ':162', 'doid:162' and
# 'DOID:162' the same way and leaves a bare '162' intact. Missing values stay
# missing.
doid_suffix = clinvar_single_condition_list_merged['doid'].str.replace(
    r'(?i)^(?:doid)?:', '', regex=True
)
clinvar_single_condition_list_merged['doid'] = 'DOID:' + doid_suffix

# Show a random sample to eyeball the result
clinvar_single_condition_list_merged.sample(5)



Unnamed: 0,gene,rs_id,variation_id,disease,condition,doid
418819,EXT1,rs1394508840,1371761,chondrosarcoma,chondrosarcoma,DOID:3371
237583,RAD50,rs1554100863,527363,hereditary cancer-predisposing syndrome,cancer,DOID:162
291319,PALB2,rs1430544638,742730,hereditary cancer-predisposing syndrome,cancer,DOID:162
157278,ATM,rs1060501655,407650,hereditary cancer-predisposing syndrome,cancer,DOID:162
341613,PDGFRA,rs182602738,903420,gastrointestinal stromal tumor,gastrointestinal stromal tumor,DOID:9253


In [467]:
# Remove the raw 'disease' label from the merged frame. errors='ignore' turns
# the drop into a no-op when the column is already gone, so the cell can be
# re-run safely.
clinvar_single_condition_list_merged = clinvar_single_condition_list_merged.drop(
    columns=['disease'], errors='ignore'
)

# Random sample of the resulting frame
clinvar_single_condition_list_merged.sample(5)


Unnamed: 0,gene,rs_id,variation_id,condition,doid
323979,PDGFRA,rs1724329978,856050,gastrointestinal stromal tumor,DOID:9253
162183,BLM,rs761288442,405277,cancer,DOID:162
157070,ATM,rs1060501526,407452,cancer,DOID:162
153538,PDGFRA,rs762230704,414505,gastrointestinal stromal tumor,DOID:9253
221692,POLD1,rs767560532,484391,cancer,DOID:162


Explode condition merging

In [468]:
# Quick look: five random rows from the explode-condition subset
clinvar_explode_condition_rows.sample(n=5)

Unnamed: 0,gene,rs_id,variation_id,disease
17102,BRCA2,rs28897715,51326,breast and/or ovarian cancer
149484,ATR,rs150964938,382221,familial cutaneous telangiectasia and orophary...
41860,MSH2,rs41295182,91039,breast and/or ovarian cancer
103776,BRIP1,rs876661246,234832,breast and/or ovarian cancer
414680,ATR,rs1441471018,1441560,familial cutaneous telangiectasia and orophary...


In [469]:
# Quick look: five random rows from the explode lookup table
clinvar_explode_conditions.sample(n=5)

Unnamed: 0,raw_disease,list_diseases
8,increased risk to develop myelodysplastic synd...,"myelodysplastic syndrome:0050908,acute myeloid..."
3,prostate cancer/brain cancer susceptibility,"prostate cancer:10283,brain cancer:1319"
2,"tumor predisposition syndrome 4, breast/prosta...","breast cancer:1612,prostate cancer:10283,color..."
4,colorectal / endometrial cancer,"colorectal cancer:9256,endometrial cancer:1380"
6,barrett esophagus/esophageal adenocarcinoma,"Barrett's esophagus:9206,esophagus adenocarcin..."


In [470]:
# Left-join the explode-condition rows to the explode lookup table
# ('disease' on the left matches 'raw_disease' on the right), then remove both
# label columns so that only 'list_diseases' carries the condition information.
clinvar_explode_condition_list_merged = (
    clinvar_explode_condition_rows
    .merge(clinvar_explode_conditions, how='left', left_on='disease', right_on='raw_disease')
    .drop(columns=['disease', 'raw_disease'])
)
clinvar_explode_condition_list_merged.sample(5)

Unnamed: 0,gene,rs_id,variation_id,list_diseases
1846,ATM,rs79701258,136441,"breast cancer:1612,ovarian cancer:2394"
1528,ATM,rs112775908,127450,"breast cancer:1612,ovarian cancer:2394"
3611,ATR,rs750056135,901183,"[TODO]:[TODO],[TODO]:[TODO]"
2600,FH,rs1553341148,214412,"leiomyomatosis:5138,renal cell cancer:[TODO]"
2313,MSH2,rs63750398,187518,"breast cancer:1612,ovarian cancer:2394"


In [471]:
# Per-column count of missing values after the explode merge
clinvar_explode_condition_list_merged.isna().sum()

gene             0
rs_id            0
variation_id     0
list_diseases    0
dtype: int64

In [472]:
# Split each comma-separated 'list_diseases' string into a list, then give
# every list element a row of its own (the other columns are repeated).
clinvar_explode_condition_list_merged = (
    clinvar_explode_condition_list_merged
    .assign(list_diseases=lambda frame: frame['list_diseases'].str.split(','))
    .explode('list_diseases')
)
print(clinvar_explode_condition_list_merged.shape)
clinvar_explode_condition_list_merged.head()

(8462, 4)


Unnamed: 0,gene,rs_id,variation_id,list_diseases
0,ATM,rs587776547,3019,breast cancer:1612
0,ATM,rs587776547,3019,ovarian cancer:2394
1,ATM,rs587776547,3019,breast cancer:1612
1,ATM,rs587776547,3019,ovarian cancer:2394
2,ATM,rs774925473,3021,breast cancer:1612


Split list_diseases into a condition column and a doid column

In [473]:
# Each 'list_diseases' entry has the form 'name:id'. Split on the first colon
# only, giving the condition name (part 0) and the bare id (part 1).
disease_parts = clinvar_explode_condition_list_merged['list_diseases'].str.split(':', n=1, expand=True)

# Store the name as 'condition' and the id, prefixed with 'DOID:', as 'doid';
# the combined 'list_diseases' column is no longer needed afterwards.
# NOTE(review): the lookup table contains '[TODO]' placeholders, which pass
# through here unchanged (e.g. as 'DOID:[TODO]') — confirm whether these rows
# should be resolved or excluded before the final export.
clinvar_explode_condition_list_merged = (
    clinvar_explode_condition_list_merged
    .assign(condition=disease_parts[0], doid='DOID:' + disease_parts[1])
    .drop(columns=['list_diseases'])
)

# First rows of the reshaped frame
clinvar_explode_condition_list_merged.head()


Unnamed: 0,gene,rs_id,variation_id,condition,doid
0,ATM,rs587776547,3019,breast cancer,DOID:1612
0,ATM,rs587776547,3019,ovarian cancer,DOID:2394
1,ATM,rs587776547,3019,breast cancer,DOID:1612
1,ATM,rs587776547,3019,ovarian cancer,DOID:2394
2,ATM,rs774925473,3021,breast cancer,DOID:1612


Concatenate the cleaned single-condition rows and the exploded-condition rows

In [474]:
# Stack the single-condition rows and the exploded rows into one frame.
# Row labels are carried over from both inputs as they are.
frames_to_combine = [clinvar_single_condition_list_merged, clinvar_explode_condition_list_merged]
clinvar_final_processed = pd.concat(frames_to_combine)
clinvar_final_processed.sample(n=10)

Unnamed: 0,gene,rs_id,variation_id,condition,doid
258571,NF2,rs1255367068,567430,cancer,DOID:162
7219,BRCA1,rs80358087,37619,cancer,DOID:162
108410,BRCA2,rs876660812,234042,cancer,DOID:162
332096,SMARCA4,rs2087817276,834779,cancer,DOID:162
215029,DICER1,rs1555368569,483429,cancer,DOID:162
74338,RAD50,rs786201804,184939,cancer,DOID:162
322616,SDHB,rs1315623287,863389,gastrointestinal stromal tumor,DOID:9253
232215,BRCA2,rs140782158,495504,cancer,DOID:162
383315,MSH6,rs1669064182,1094278,cancer,DOID:162
3287,CHEK2,rs1298667185,479534,breast cancer,DOID:1612


Save the concatenated single-condition and exploded-condition DataFrame

In [475]:
# Write the combined frame as a tab-separated file, without the row index
final_processed_path = '../home/data/processed_data/GlyGen/clinvar/clinvar_final_processed.tsv'
clinvar_final_processed.to_csv(final_processed_path, sep='\t', index=False)

Reload the concatenated single-condition and exploded-condition DataFrame

In [476]:
# Read the combined table back from disk; 'doid' is forced to string dtype
final_processed_path = '../home/data/processed_data/GlyGen/clinvar/clinvar_final_processed.tsv'
clinvar_final_processed = pd.read_csv(final_processed_path, sep='\t', dtype={'doid': str})

In [477]:
# Start from an empty frame whose columns are the header names exported by
# table_cleaning_functions (tcf.TSV_HEADERS)
output_columns = tcf.TSV_HEADERS
df = pd.DataFrame(columns=output_columns)
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag


In [478]:
# Fill the output frame column by column from the combined ClinVar table.
# The first assignment is a full-length Series, which gives the (initially
# empty) frame its rows; the constant columns that follow are broadcast
# across those rows.
rs_id_text = clinvar_final_processed['rs_id'].astype(str)
variation_id_text = clinvar_final_processed['variation_id'].astype(str).str.strip()

df['assessed_biomarker_entity'] = clinvar_final_processed['gene']
df['biomarker'] = 'presence of ' + rs_id_text + ' mutation in ' + df['assessed_biomarker_entity']
df['assessed_biomarker_entity_id'] = 'dbSNP:' + rs_id_text
df['assessed_entity_type'] = 'gene'
df['condition'] = clinvar_final_processed['condition'].str.strip()
# NOTE(review): astype(str) renders a missing doid as the literal text 'nan' —
# confirm that no doid is missing at this point.
df['condition_id'] = clinvar_final_processed['doid'].astype(str).str.strip()
df['best_biomarker_role'] = 'risk'
df['evidence_source'] = 'CLINVAR:' + variation_id_text
df['tag'] = 'biomarker;assessed_biomarker_entity_id;assessed_biomarker_entity;condition'

print(df.shape)
df.head()

(454006, 16)


Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,exposure_agent,exposure_agent_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,,presence of rs1800562 mutation in HFE,HFE,dbSNP:rs1800562,gene,cancer,DOID:162,,,risk,,,,CLINVAR:9,,biomarker;assessed_biomarker_entity_id;assesse...
1,,presence of rs1800562 mutation in HFE,HFE,dbSNP:rs1800562,gene,cancer,DOID:162,,,risk,,,,CLINVAR:9,,biomarker;assessed_biomarker_entity_id;assesse...
2,,presence of rs121908830 mutation in TMEM127,TMEM127,dbSNP:rs121908830,gene,cancer,DOID:162,,,risk,,,,CLINVAR:108,,biomarker;assessed_biomarker_entity_id;assesse...
3,,presence of rs121908830 mutation in TMEM127,TMEM127,dbSNP:rs121908830,gene,cancer,DOID:162,,,risk,,,,CLINVAR:108,,biomarker;assessed_biomarker_entity_id;assesse...
4,,presence of rs387906223 mutation in KLHDC8B,KLHDC8B,dbSNP:rs387906223,gene,Hodgkin's lymphoma,DOID:8567,,,risk,,,,CLINVAR:273,,biomarker;assessed_biomarker_entity_id;assesse...


In [None]:
# Keep only the first occurrence of each row that is identical in every column
is_repeat = df.duplicated()
df = df[~is_repeat]
df.shape

In [None]:
# Temporary ids: number the distinct 'biomarker' values, so rows that share a
# biomarker string receive the same integer
biomarker_group_ids = df.groupby('biomarker').ngroup()
df['biomarker_id'] = biomarker_group_ids

# Show the frame with the temporary biomarker ids
print(df)

# Write the cleaned table as a tab-separated file, without the row index
cleaned_output_path = '../home/data/cleaned_data/GlyGen/clinvar.tsv'
df.to_csv(cleaned_output_path, sep='\t', index=False)