# OncoMX Biomarker Data Exploration

In [13]:
import pandas as pd
import table_cleaning_functions as tcf

In [14]:
# load the raw OncoMX biomarker table (tab-separated) and preview its first rows
raw_table_path = '../data/raw_data/tables/oncomx.tsv'
df = pd.read_csv(raw_table_path, sep='\t')
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_ID,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_ID,loinc_code,condition,condition_ID,evidence_source,evidence,tags
0,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,prognostic,blood,UN:0000178,26881-3,prostate cancer,DOID:10283,PMID:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [15]:
# count the missing values in every column
df.isna().sum()

biomarker_id                      0
biomarker                         1
assessed_biomarker_entity_ID     32
assessed_biomarker_entity         0
assessed_entity_type              0
best_biomarker_role               0
specimen                          0
specimen_ID                       0
loinc_code                      435
condition                         0
condition_ID                      0
evidence_source                   0
evidence                          0
tags                              0
dtype: int64

In [16]:
# normalise every column header to lowercase, then preview the frame
df.columns = [header.lower() for header in df.columns]
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,prognostic,blood,UN:0000178,26881-3,prostate cancer,DOID:10283,PMID:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,A0001,increased IL6 level,UPKB:P05231,Interleukin-6 (IL6),protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [17]:
# (rows, columns) of the table before any rows are removed
df.shape

(956, 14)

In [18]:
# keep only rows that have both an assessed_biomarker_entity_id and a biomarker,
# then re-count the nulls that remain in each column
required_columns = ['assessed_biomarker_entity_id', 'biomarker']
df = df.dropna(subset=required_columns)
df.isnull().sum()

biomarker_id                      0
biomarker                         0
assessed_biomarker_entity_id      0
assessed_biomarker_entity         0
assessed_entity_type              0
best_biomarker_role               0
specimen                          0
specimen_id                       0
loinc_code                      412
condition                         0
condition_id                      0
evidence_source                   0
evidence                          0
tags                              0
dtype: int64

In [19]:
# (rows, columns) after dropping rows with a null biomarker or assessed_biomarker_entity_id
df.shape

(923, 14)

In [20]:
# discard rows whose assessed_biomarker_entity_id contains HTML anchor markup ('<a href')
has_html_link = df['assessed_biomarker_entity_id'].str.contains('<a href')
df = df[~has_html_link]
df.shape

(846, 14)

In [21]:
# keep only rows whose assessed_biomarker_entity_id contains a ':'
# (e.g. 'UPKB:P05231'); everything else is dropped.
# regex=False matches the colon as a literal substring.
# .copy() makes the filtered frame independent of the pre-filter frame: the
# following cells assign columns on df, which would otherwise trigger pandas'
# SettingWithCopyWarning.
has_prefix = df['assessed_biomarker_entity_id'].str.contains(':', regex=False)
df = df[has_prefix].copy()
df.shape

(845, 14)

In [23]:
# run the parenthetical cleaner over the condition and assessed_biomarker_entity columns
for text_column in ('condition', 'assessed_biomarker_entity'):
    df[text_column] = tcf.clean_parantheticals(df[text_column])
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UN:0000178,26881-3,prostate cancer,DOID:10283,PMID:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UN:0000178,26881-3,COVID-19,DOID:0080600,PMID:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [24]:
# apply the prefix mapping to the specimen_id and evidence_source columns
for id_column in ('specimen_id', 'evidence_source'):
    df[id_column] = tcf.map_prefixes(df[id_column])
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UBERON:0000178,26881-3,prostate cancer,DOID:10283,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32428990,The decrease of IL-6 was closely related to t...,biomarker; assessed_biomarker_entity; best_bio...
3,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a...",biomarker; assessed_biomarker_entity; best_bio...
4,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32438331,"Clinical biomarkers for chronic inflammation,...",biomarker; assessed_biomarker_entity; best_bio...


In [25]:
# strip whitespace from the evidence text via tcf.strip_values
evidence_text = df['evidence']
df['evidence'] = tcf.strip_values(evidence_text)
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UBERON:0000178,26881-3,prostate cancer,DOID:10283,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32428990,The decrease of IL-6 was closely related to tr...,biomarker; assessed_biomarker_entity; best_bio...
3,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, an...",biomarker; assessed_biomarker_entity; best_bio...
4,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32438331,"Clinical biomarkers for chronic inflammation, ...",biomarker; assessed_biomarker_entity; best_bio...


In [26]:
# apply tcf.strip_values to every column in turn
for column_name in df.columns.tolist():
    df[column_name] = tcf.strip_values(df[column_name])
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UBERON:0000178,26881-3,prostate cancer,DOID:10283,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32428990,The decrease of IL-6 was closely related to tr...,biomarker; assessed_biomarker_entity; best_bio...
3,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, an...",biomarker; assessed_biomarker_entity; best_bio...
4,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32438331,"Clinical biomarkers for chronic inflammation, ...",biomarker; assessed_biomarker_entity; best_bio...


In [27]:
# number each distinct combination of the core columns and store that group
# number in a new leading 'tmp_id' column
core_columns = ['biomarker', 'assessed_biomarker_entity_id', 'assessed_biomarker_entity', 'condition']
group_numbers = df.groupby(core_columns).ngroup()
df.insert(0, 'tmp_id', group_numbers)
df.head()

Unnamed: 0,tmp_id,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,307,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UBERON:0000178,26881-3,prostate cancer,DOID:10283,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32428990,The decrease of IL-6 was closely related to tr...,biomarker; assessed_biomarker_entity; best_bio...
3,297,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, an...",biomarker; assessed_biomarker_entity; best_bio...
4,297,A0001,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32438331,"Clinical biomarkers for chronic inflammation, ...",biomarker; assessed_biomarker_entity; best_bio...


In [28]:
# replace the original biomarker_id with the group-based tmp_id:
# drop the old column, then give tmp_id its name
df = df.drop(columns=['biomarker_id']).rename(columns={'tmp_id': 'biomarker_id'})
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tags
0,307,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UBERON:0000178,26881-3,prostate cancer,DOID:10283,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32428990,The decrease of IL-6 was closely related to tr...,biomarker; assessed_biomarker_entity; best_bio...
3,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, an...",biomarker; assessed_biomarker_entity; best_bio...
4,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32438331,"Clinical biomarkers for chronic inflammation, ...",biomarker; assessed_biomarker_entity; best_bio...


In [29]:
# number of rows sharing each biomarker_id, most frequent first
df.value_counts('biomarker_id')

biomarker_id
203    27
297    15
59     15
224    13
326     8
       ..
533     1
534     1
535     1
536     1
537     1
Name: count, Length: 549, dtype: int64

In [31]:
# use the singular column name 'tag' in place of 'tags'
df = df.rename(columns={'tags': 'tag'})
df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity_id,assessed_biomarker_entity,assessed_entity_type,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence,tag
0,307,increased IL6 level,UPKB:P05231,Interleukin-6,protein,prognostic,blood,UBERON:0000178,26881-3,prostate cancer,DOID:10283,PubMed:10914713,Univariate analysis of all patients demonstrat...,biomarker; assessed_biomarker_entity; best_bio...
1,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32259560,Serial measurement of circulating IL-6 levels ...,biomarker; assessed_biomarker_entity; best_bio...
2,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32428990,The decrease of IL-6 was closely related to tr...,biomarker; assessed_biomarker_entity; best_bio...
3,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, an...",biomarker; assessed_biomarker_entity; best_bio...
4,297,increased IL6 level,UPKB:P05231,Interleukin-6,protein,monitoring,blood,UBERON:0000178,26881-3,COVID-19,DOID:0080600,PubMed:32438331,"Clinical biomarkers for chronic inflammation, ...",biomarker; assessed_biomarker_entity; best_bio...


In [None]:
# remove every space character from the tag strings
tags_without_spaces = df['tag'].str.replace(' ', '')
df['tag'] = tags_without_spaces

In [30]:
# save the cleaned table as a tab-separated file, leaving out the index
# NOTE(review): the recorded execution counts show this cell ran as In [30],
# before the tags -> tag rename (In [31]), and the tag space-removal cell has no
# execution count; rerun the notebook top to bottom before relying on the file.
output_path = '../data/results/tables/oncomx.tsv'
df.to_csv(output_path, sep='\t', index=False)