# OncoMX Biomarker Data Exploration

In [1]:
import pandas as pd
import table_cleaning_functions as tcf

In [2]:
# Load the raw OncoMX biomarker table into a DataFrame and preview it.
# read_table parses tab-separated files by default, so no separator is passed.
df = pd.read_table('../data/raw_data/tables/oncomx.tsv')
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,307,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,protein,prostate cancer (DOID:10283),DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,protein,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,protein,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,protein,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,protein,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [3]:
# count missing values per column (isna is the canonical name for isnull)
df.isna().sum()

biomarker_id                      0
biomarker                         0
assessed_biomarker_entity         0
assessed_biomarker_entity_id      0
assessed_entity_type              0
condition                         0
condition_id                      0
best_biomarker_role               0
specimen                          0
specimen_id                       0
loinc_code                      363
evidence_source                   0
evidence                          0
tag                               0
dtype: int64

In [4]:
# (rows, columns) of the table as loaded, before any cleaning cell has run
df.shape

(845, 14)

In [5]:
# Apply the same parenthetical clean-up to both name columns, in order.
# Judging by the recorded preview, this drops the trailing "(...)" suffix,
# e.g. "prostate cancer (DOID:10283)" -> "prostate cancer".
for column in ('condition', 'assessed_biomarker_entity'):
    df[column] = tcf.clean_parantheticals(df[column])
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,307,increased IL6 level,Interleukin-6,UPKB:P05231,protein,prostate cancer,DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [6]:
# Rewrite the identifier prefixes in both ID-bearing columns, in order.
# In the recorded preview "UN:" becomes "UBERON:" and "PMID:" becomes "PubMed:";
# the mapping itself is defined by tcf.map_prefixes.
for id_column in ['specimen_id', 'evidence_source']:
    df[id_column] = tcf.map_prefixes(df[id_column])
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,307,increased IL6 level,Interleukin-6,UPKB:P05231,protein,prostate cancer,DOID:10283,prognostic,blood,UBERON:0000178,26881-3,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [7]:
# Strip whitespace from the free-text evidence column; Series.pipe hands the
# column to tcf.strip_values and the result replaces the original values.
df['evidence'] = df['evidence'].pipe(tcf.strip_values)
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,assessed_entity_type,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence,tag
0,307,increased IL6 level,Interleukin-6,UPKB:P05231,protein,prostate cancer,DOID:10283,prognostic,blood,UBERON:0000178,26881-3,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32428990,The decrease of IL-6 was closely related to tr...,biomarker; assessed_biomarker_entity; best_bio...
3,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, an...",biomarker; assessed_biomarker_entity; best_bio...
4,297,increased IL6 level,Interleukin-6,UPKB:P05231,protein,COVID-19,DOID:0080600,monitoring,blood,UBERON:0000178,26881-3,PubMed:32438331,"Clinical biomarkers for chronic inflammation, ...",biomarker; assessed_biomarker_entity; best_bio...


In [8]:
# Write the cleaned table to the results directory as a tab-separated file.
# index=False keeps the pandas row index out of the output.
df.to_csv('../data/results/tables/oncomx.tsv', sep='\t', index=False)

In [54]:
# OLD
# Create a new column 'tmp_id' holding a group number for each distinct
# combination of the core fields (biomarker, entity id, entity, condition).
# NOTE(review): this cell looks stale. Its execution count (54) is out of
# sequence with the cells above, and the recorded output shows a column layout
# (tmp_id first, an oncomx_id column, un-cleaned values) that this code applied
# to the current frame would not produce. When the notebook is run top to
# bottom it also adds 'tmp_id' to df only after the TSV has been written, so the
# column does not reach the saved file. TODO: confirm nothing later depends on
# df['tmp_id'], then remove the cell.
df['tmp_id'] = df.groupby(['biomarker', 'assessed_biomarker_entity_id', 'assessed_biomarker_entity', 'condition']).ngroup()
df.head()

Unnamed: 0,tmp_id,oncomx_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence
0,307,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,prostate cancer (DOID:10283),DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...
1,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...
3,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."
