# OncoMX Biomarker Data Cleaning

## Issues / TODOs

- Missing `tags` 
- Missing `assessed_entity_type`

In [43]:
import pandas as pd

In [44]:
# Load the raw OncoMX table (tab-separated) into a DataFrame and preview it.
oncomx_raw_path = '../home/raw_data/oncomx.tsv'
raw_df = pd.read_csv(oncomx_raw_path, sep='\t')
raw_df.head()

Unnamed: 0,biomarker_id,assessed_biomarker_entity_ID,assessed_biomarker_entity,biomarker,best_biomarker_role,specimen,specimen_ID,loinc_code,condition,condition_ID,evidence_source,evidence
0,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,prognostic,blood,UN:0000178,26881-3,prostate cancer (DOID:10283),DOID:10283,PMID:10914713,Univariate analysis of all patients demonstrat...
1,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32428990,The decrease of IL-6 was closely related to t...
3,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."


In [45]:
# Normalise the three '*_ID' headers of the raw file to lower-case '*_id';
# every other column keeps its original name.
#
# The rename is done by name rather than by assigning a positional list to
# `raw_df.columns`: a positional assignment silently attaches the wrong labels
# if this cell is re-run after the columns have been reordered, whereas a
# name-based rename is idempotent and independent of column order.
id_column_renames = {
    'assessed_biomarker_entity_ID': 'assessed_biomarker_entity_id',
    'specimen_ID': 'specimen_id',
    'condition_ID': 'condition_id',
}
raw_df = raw_df.rename(columns=id_column_renames)
raw_df.head()

Unnamed: 0,biomarker_id,assessed_biomarker_entity_id,assessed_biomarker_entity,biomarker,best_biomarker_role,specimen,specimen_id,loinc_code,condition,condition_id,evidence_source,evidence
0,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,prognostic,blood,UN:0000178,26881-3,prostate cancer (DOID:10283),DOID:10283,PMID:10914713,Univariate analysis of all patients demonstrat...
1,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32428990,The decrease of IL-6 was closely related to t...
3,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,A0001,UPKB:P05231,Interleukin-6 (IL6),increased IL6 level,monitoring,blood,UN:0000178,26881-3,COVID-19 (DOID:0080600),DOID:0080600,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."


In [46]:
# Rearrange the columns: biomarker fields first, then the assessed entity,
# the condition, the role, the specimen/LOINC fields and finally the evidence.
reordered_cols = [
    'biomarker_id',
    'biomarker',
    'assessed_biomarker_entity',
    'assessed_biomarker_entity_id',
    'condition',
    'condition_id',
    'best_biomarker_role',
    'specimen',
    'specimen_id',
    'loinc_code',
    'evidence_source',
    'evidence',
]
raw_df = raw_df[reordered_cols]
raw_df.head()

Unnamed: 0,biomarker_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence
0,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,prostate cancer (DOID:10283),DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...
1,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...
3,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."


In [47]:
# Relabel the 'biomarker_id' column as 'oncomx_id'; all other columns are untouched.
raw_df = raw_df.rename({'biomarker_id': 'oncomx_id'}, axis='columns')
raw_df.head()

Unnamed: 0,oncomx_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence
0,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,prostate cancer (DOID:10283),DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...
1,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...
3,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."


In [49]:
# Count the missing values in each column.
raw_df.isna().sum()

tmp_id                            0
oncomx_id                         0
biomarker                         1
assessed_biomarker_entity         0
assessed_biomarker_entity_id     32
condition                         0
condition_id                      0
best_biomarker_role               0
specimen                          0
specimen_id                       0
loinc_code                      435
evidence_source                   0
evidence                          0
dtype: int64

In [50]:
# Drop every row that is missing either the assessed entity id or the
# biomarker description.
# The result is reassigned instead of using `inplace=True`, so the step does
# not mutate a frame that another name may still reference.
required_cols = ['assessed_biomarker_entity_id', 'biomarker']
raw_df = raw_df.dropna(subset=required_cols)
# Re-count the nulls per column to confirm both required columns are complete.
raw_df.isnull().sum()

tmp_id                            0
oncomx_id                         0
biomarker                         0
assessed_biomarker_entity         0
assessed_biomarker_entity_id      0
condition                         0
condition_id                      0
best_biomarker_role               0
specimen                          0
specimen_id                       0
loinc_code                      412
evidence_source                   0
evidence                          0
dtype: int64

In [51]:
raw_df.shape

(923, 13)

In [52]:
# Remove the rows whose assessed_biomarker_entity_id contains the text '<a href',
# then report the remaining (rows, columns).
has_href = raw_df['assessed_biomarker_entity_id'].str.contains('<a href')
raw_df = raw_df[~has_href]
raw_df.shape

(846, 13)

In [53]:
# Keep only the rows whose assessed_biomarker_entity_id contains a ':',
# then report the remaining (rows, columns).
has_colon = raw_df['assessed_biomarker_entity_id'].str.contains(':')
raw_df = raw_df[has_colon]
raw_df.shape

(845, 13)

In [54]:
# Give every distinct combination of the core fields a temporary integer id
# ('tmp_id'); rows that share the same combination receive the same id.
core_fields = ['biomarker', 'assessed_biomarker_entity_id', 'assessed_biomarker_entity', 'condition']

# raw_df is the result of boolean filtering, and adding a column to such a
# frame can trigger pandas' SettingWithCopyWarning, so work on an explicit copy.
# Dropping any pre-existing 'tmp_id' first keeps the cell safe to re-run.
raw_df = raw_df.drop(columns=['tmp_id'], errors='ignore').copy()

# Insert the id as the first column, which is where the outputs recorded in
# this notebook show it (plain assignment would append it as the last column).
raw_df.insert(0, 'tmp_id', raw_df.groupby(core_fields).ngroup())
raw_df.head()

Unnamed: 0,tmp_id,oncomx_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence
0,307,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,prostate cancer (DOID:10283),DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...
1,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...
3,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,297,A0001,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."


In [58]:
# Number of rows that fall under each tmp_id, most frequent first.
tmp_id_counts = raw_df.value_counts(subset='tmp_id')
tmp_id_counts

tmp_id
203    27
59     15
297    15
224    13
326     8
       ..
183     1
181     1
180     1
179     1
197     1
Name: count, Length: 550, dtype: int64

In [63]:
# drop oncomx_id column
cleaned_df = raw_df.drop(columns=['oncomx_id'])
cleaned_df

Unnamed: 0,tmp_id,biomarker,assessed_biomarker_entity,assessed_biomarker_entity_id,condition,condition_id,best_biomarker_role,specimen,specimen_id,loinc_code,evidence_source,evidence
0,307,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,prostate cancer (DOID:10283),DOID:10283,prognostic,blood,UN:0000178,26881-3,PMID:10914713,Univariate analysis of all patients demonstrat...
1,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32259560,Serial measurement of circulating IL-6 levels ...
2,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32428990,The decrease of IL-6 was closely related to t...
3,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32677844,"Elevated levels of IL-6, D-dimer, CRP, LDH, a..."
4,297,increased IL6 level,Interleukin-6 (IL6),UPKB:P05231,COVID-19 (DOID:0080600),DOID:0080600,monitoring,blood,UN:0000178,26881-3,PMID:32438331,"Clinical biomarkers for chronic inflammation,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
943,379,increased N-glycan level,Man7 high mannose N-glycan,GTC:G36059IK,breast cancer (DOID:1612),DOID:1612,diagnostic,tissue,UN:0000479,,PMID:30889355,In comparing the N-glycan profiles of healthy ...
944,382,increased N-glycan level,Man8 high mannose N-glycan,GTC:G66676MI,breast cancer (DOID:1612),DOID:1612,diagnostic,tissue,UN:0000479,,PMID:30889355,In comparing the N-glycan profiles of healthy ...
947,376,increased N-glycan level,Neu5Gc,GTC:G26366JF,breast cancer (DOID:1612),DOID:1612,monitoring,blood,UN:0000178,,PMID:35346112,Analysis of sera from breast cancer cases reve...
954,117,hypermethylated MIR-9-1 gene,miRNA‑9-1 gene (MIR‑9-1),MRB:MI0000466,ovarian cancer (DOID:2394),DOID:2394,prognostic,ovary,UN:0000992,,PMID:29313235,"...five microRNA genes (MIR-9-1, MIR-9-3, MIR-..."


In [None]:
# Save the cleaned table as a tab-separated file, omitting the DataFrame index.
cleaned_output_path = '../home/results/tables/oncomx.tsv'
cleaned_df.to_csv(cleaned_output_path, sep='\t', index=False)