In [7]:
# imports 
import pandas as pd 
import numpy as np
from data_extraction import DataExtractionHelper

# Instantiate the project's extraction helper; the cells below call its methods.
helper = DataExtractionHelper()
# Lift pandas' cap on displayed columns so wide frames are not truncated.
pd.options.display.max_columns = None

In [8]:
# Location of the MarkerDB export and the column labels to apply to it
# (the file is read with header = None, so labels are supplied here).
MARKERDB_PATH = '../data/markerdb_targets.csv'
MARKERDB_COLUMNS = ['id', 'gene', 'change', 'disease', 'pathogenic_status']

# Load the raw MarkerDB targets through the helper's reader.
markerdb_raw = helper.read_in_data(
    MARKERDB_PATH,
    'csv',
    header = None,
    names = MARKERDB_COLUMNS,
)
markerdb_raw

Unnamed: 0,id,gene,change,disease,pathogenic_status
0,14104,UGT1A8,rs11892031,Bladder Cancer,Pathogenic
1,14105,UGT1A8,rs8102137,Bladder Cancer,Pathogenic
2,14106,UGT1A8,rs1014971,Bladder Cancer,Pathogenic
3,14107,UGT1A8,rs1495741,Bladder Cancer,Pathogenic
4,14108,UGT1A8,rs710521,Bladder Cancer,Pathogenic
...,...,...,...,...,...
9106,16457,TP53,Tyr220Cys ( rs121912666),Lung Cancer,Pathogenic
9107,17120,TP53,Val173Met ( rs876660754),Lung Cancer,Pathogenic
9108,17312,TP53,Arg248Leu ( rs11540652),Lung Cancer,Pathogenic
9109,17600,TP53,Cys135Gly ( rs1057519975),Lung Cancer,Likely pathogenic


In [10]:
# Pull the numeric part of the first "rs<digits>" token out of the free-text
# 'change' column (e.g. "Tyr220Cys ( rs121912666)" -> "121912666").
# expand = False returns a Series; rows with no match become NaN.
rs_digits = markerdb_raw['change'].str.extract(r'rs(\d+)', expand = False)
markerdb_raw['rs_id'] = rs_digits
markerdb_raw

Unnamed: 0,id,gene,change,disease,pathogenic_status,rs_id
0,14104,UGT1A8,rs11892031,Bladder Cancer,Pathogenic,11892031
1,14105,UGT1A8,rs8102137,Bladder Cancer,Pathogenic,8102137
2,14106,UGT1A8,rs1014971,Bladder Cancer,Pathogenic,1014971
3,14107,UGT1A8,rs1495741,Bladder Cancer,Pathogenic,1495741
4,14108,UGT1A8,rs710521,Bladder Cancer,Pathogenic,710521
...,...,...,...,...,...,...
9106,16457,TP53,Tyr220Cys ( rs121912666),Lung Cancer,Pathogenic,121912666
9107,17120,TP53,Val173Met ( rs876660754),Lung Cancer,Pathogenic,876660754
9108,17312,TP53,Arg248Leu ( rs11540652),Lung Cancer,Pathogenic,11540652
9109,17600,TP53,Cys135Gly ( rs1057519975),Lung Cancer,Likely pathogenic,1057519975


In [13]:
# Fill the helper's biomarker frame from the MarkerDB data, one column at a time.
# NOTE(review): the helper's methods are defined outside this notebook; the
# comments below describe only what each call passes in. Statement order is
# kept as-is in case later calls depend on earlier ones.

# gene column: gene symbols straight from the raw data
gene_series = markerdb_raw['gene']
helper.populate_col(source = gene_series, target_col = 'gene')

# rs id column: digits extracted from 'change', requested as nullable Int64
rs_id_series = markerdb_raw['rs_id']
helper.populate_col(source = rs_id_series, target_col = 'rs_id', dtype = 'Int64')

# main_x_ref column: cross-reference built with the 'dbSNP:rs' prefix
helper.map_x_ref(prefix = 'dbSNP:rs')

# notes column: the same constant 'GRCh38' for every row of the biomarker frame
notes_series = pd.Series('GRCh38', index = helper.biomarker_df.index)
helper.populate_col(source = notes_series, target_col = 'notes')

# disease column: lower-cased disease names
disease_series = markerdb_raw['disease'].str.lower()
helper.populate_col(source = disease_series, target_col = 'disease')

# biomarker status column
helper.set_biomarker_status(val = 'presence of')

# best biomarker type column
helper.set_best_biomarker_type(val = 'risk_biomarker')

# assessed entity type column
# This is now the only call: an earlier call in this cell set
# 'gene or protein' and was overwritten by this one, so it was removed.
helper.set_assessed_entity_type(val = 'gene')

helper.biomarker_df

Unnamed: 0,biomarker_id,main_x_ref,assessed_biomarker_entity,biomarker_status,best_biomarker_type,specimen_type,loinc_code,condition_name,assessed_entity_type,evidence_source,notes,rs_id,gene,disease,uniprot,name,system,doid,mutation,variation
0,,dbSNP:rs11892031,rs11892031 mutation in udp-glucuronosyltransfe...,presence of,risk_biomarker,,,bladder cancer (DOID:11054.0),gene,,GRCh38,11892031,UGT1A8,bladder cancer,A9JQZ4,UDP-glucuronosyltransferase 1A8,,11054.0,,
1,,dbSNP:rs8102137,rs8102137 mutation in udp-glucuronosyltransfer...,presence of,risk_biomarker,,,bladder cancer (DOID:11054.0),gene,,GRCh38,8102137,UGT1A8,bladder cancer,A9JQZ4,UDP-glucuronosyltransferase 1A8,,11054.0,,
2,,dbSNP:rs1014971,rs1014971 mutation in udp-glucuronosyltransfer...,presence of,risk_biomarker,,,bladder cancer (DOID:11054.0),gene,,GRCh38,1014971,UGT1A8,bladder cancer,A9JQZ4,UDP-glucuronosyltransferase 1A8,,11054.0,,
3,,dbSNP:rs1495741,rs1495741 mutation in udp-glucuronosyltransfer...,presence of,risk_biomarker,,,bladder cancer (DOID:11054.0),gene,,GRCh38,1495741,UGT1A8,bladder cancer,A9JQZ4,UDP-glucuronosyltransferase 1A8,,11054.0,,
4,,dbSNP:rs710521,rs710521 mutation in udp-glucuronosyltransfera...,presence of,risk_biomarker,,,bladder cancer (DOID:11054.0),gene,,GRCh38,710521,UGT1A8,bladder cancer,A9JQZ4,UDP-glucuronosyltransferase 1A8,,11054.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9106,,dbSNP:rs121912666,rs121912666 mutation in tp53 (TP53),presence of,risk_biomarker,blood (UN:178),21739-8,lung cancer (DOID:1324.0),gene,,GRCh38,121912666,TP53,lung cancer,U6BKV7,TP53,Bld/Tiss,1324.0,,
9107,,dbSNP:rs876660754,rs876660754 mutation in tp53 (TP53),presence of,risk_biomarker,blood (UN:178),21739-8,lung cancer (DOID:1324.0),gene,,GRCh38,876660754,TP53,lung cancer,U6BKV7,TP53,Bld/Tiss,1324.0,,
9108,,dbSNP:rs11540652,rs11540652 mutation in tp53 (TP53),presence of,risk_biomarker,blood (UN:178),21739-8,lung cancer (DOID:1324.0),gene,,GRCh38,11540652,TP53,lung cancer,U6BKV7,TP53,Bld/Tiss,1324.0,,
9109,,dbSNP:rs1057519975,rs1057519975 mutation in tp53 (TP53),presence of,risk_biomarker,blood (UN:178),21739-8,lung cancer (DOID:1324.0),gene,,GRCh38,1057519975,TP53,lung cancer,U6BKV7,TP53,Bld/Tiss,1324.0,,
