In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger

In [4]:
import os
import sys
sys.path.append('..')

In [5]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import run_experiment, _select_significant_entries, _compute_prec_rec_f1
from pals.common import save_obj, DATABASE_PIMP_KEGG, SIGNIFICANT_THRESHOLD
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS

2019-12-03 00:09:46.226 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# HAT Data Analysis

This notebook is used to generate resampled data and run the different methods for comparison in the manuscript.

### Load data

Assume token is stored in environmental variable *PIMP_API_TOKEN*

In [6]:
token = get_pimp_API_token_from_env()

In [7]:
analysis_id_plasma = 636
int_df_plasma, annotation_df_plasma, experimental_design_plasma = download_from_pimp(token, PIMP_HOST, analysis_id_plasma, 'kegg')

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=636 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=636 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=636 <Response [200]>


### Create Data Sources

In [8]:
database_name = DATABASE_PIMP_KEGG
ds_plasma = DataSource(int_df_plasma, annotation_df_plasma, experimental_design_plasma, database_name)

2019-12-03 00:10:27.639 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-03 00:10:27.661 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-03 00:10:27.669 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-03 00:10:28.050 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


### Compare ORA vs PALS

Set up some parameters

In [9]:
significant_column = 'PiMP_KEGG Stage1/Control comb_p'
pals_plage_weight = 5
pals_hg_weight = 1
pals_num_resamples = 1000
pals_resample = True if pals_num_resamples > 0 else False
n_sample = 4
case = 'Stage1'
control = 'Control'
N = 20
threshold = SIGNIFICANT_THRESHOLD

Run PALS and ORA on the full data

In [10]:
pals = PALS(ds_plasma, plage_weight=pals_plage_weight, hg_weight=pals_hg_weight, num_resamples=pals_num_resamples)
full_df_ora = pals.get_ora_df()
full_df_pals = pals.get_pathway_df(resample=pals_resample)
full_df_ora.sort_values(significant_column, inplace=True)
full_df_pals.sort_values(significant_column, inplace=True)

2019-12-03 00:10:28.201 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-03 00:10:28.207 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:391 - Setting the zero intensity values in the dataframe
2019-12-03 00:10:47.767 | DEBUG    | pals.pathway_analysis:get_ora_df:115 - Correcting for multiple t-tests
2019-12-03 00:10:47.773 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:477 - Calculating dataset formula coverage
2019-12-03 00:10:47.782 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:391 - Setting the zero intensity values in the dataframe
2019-12-03 00:10:47.870 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:372 - Scaling the data across the sample: zero mean and unit variance
2019-12-03 00:10:47.936 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:211 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2019-12-03 00:10:47.937 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:212 - Va

## Try resampling on the columns (samples)

Now try with a resampled data source. Here we will randomly resample the columns from the original full data.

In [11]:
ds_plasma_resampled = ds_plasma.resample(n_sample, case=case, control=control, axis=1)
pals = PALS(ds_plasma_resampled, plage_weight=pals_plage_weight, hg_weight=pals_hg_weight, num_resamples=pals_num_resamples)
partial_df_ora = pals.get_ora_df()
partial_df_pals = pals.get_pathway_df(resample=pals_resample)
partial_df_ora.sort_values(significant_column, inplace=True)
partial_df_pals.sort_values(significant_column, inplace=True)

2019-12-03 00:10:53.848 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-03 00:10:53.900 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-03 00:10:53.904 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-03 00:10:54.250 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts
2019-12-03 00:10:54.260 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-03 00:10:54.261 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:391 - Setting the zero intensity values in the dataframe
2019-12-03 00:10:58.957 | DEBUG    | pals.pathway_analysis:get_ora_df:115 - Correcting for multiple t-tests
2019-12-03 00:10:58.961 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:477 - Calculating dataset formula coverage
2019-12-03 00:10:58.966 | DEBUG    | pals.pathway_analysis:_change_zero_peak

In [12]:
ora_full = _select_significant_entries(full_df_ora, significant_column, N, threshold)
ora_partial = _select_significant_entries(partial_df_ora, significant_column, N, threshold)
_compute_prec_rec_f1(ora_full, ora_partial)

2019-12-03 00:11:00.886 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:136 - TP_items = {'map00650', 'map00930', 'map02010', 'map00960', 'map00770', 'map00330', 'map00260', 'map00290', 'map04974', 'map00760', 'map00310', 'map00250', 'ingenza00006', 'map00460', 'map04978', 'ingenza00007', 'map00643', 'map00970', 'map00472'}
2019-12-03 00:11:00.887 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:137 - FP_items = {'map00966'}
2019-12-03 00:11:00.887 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:138 - FN_items = {'map00340'}


(19, 1, 1, 0.95, 0.95, 0.9500000000000001)

In [13]:
ora_full

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
ingenza00006,IG-Amino-acid Biosynthesis 1,0.0
map00472,D-Arginine and D-ornithine metabolism,0.0
map00970,Aminoacyl-tRNA biosynthesis,0.0
ingenza00007,IG-Amino-acid Biosynthesis 2,0.0
map00460,Cyanoamino acid metabolism,1.059807e-33
map00330,Arginine and proline metabolism,2.509547e-28
map00260,"Glycine, serine and threonine metabolism",1.21932e-27
map04974,Protein digestion and absorption,4.6967690000000005e-27
map02010,ABC transporters,1.460861e-21
map04978,Mineral absorption,9.023089e-15


In [14]:
ora_partial

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
ingenza00006,IG-Amino-acid Biosynthesis 1,0.0
map00472,D-Arginine and D-ornithine metabolism,0.0
map00970,Aminoacyl-tRNA biosynthesis,0.0
ingenza00007,IG-Amino-acid Biosynthesis 2,0.0
map00460,Cyanoamino acid metabolism,2.8826129999999998e-36
map00330,Arginine and proline metabolism,4.578341e-30
map00260,"Glycine, serine and threonine metabolism",3.072843e-26
map04974,Protein digestion and absorption,4.036865e-24
map02010,ABC transporters,6.067161e-18
map00960,"Tropane, piperidine and pyridine alkaloid bios...",1.428803e-17


In [15]:
pals_full = _select_significant_entries(full_df_pals, significant_column, N, threshold)
pals_partial = _select_significant_entries(partial_df_pals, significant_column, N, threshold)
_compute_prec_rec_f1(pals_full, pals_partial)

2019-12-03 00:11:01.094 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:136 - TP_items = {'map04961', 'map07226'}
2019-12-03 00:11:01.095 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:137 - FP_items = {'map04672', 'map04920', 'map04745', 'map05223', 'map00830', 'map03320', 'map05222'}
2019-12-03 00:11:01.096 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:138 - FN_items = {'map00660', 'map00626', 'map00061', 'map00590', 'map05215', 'map00930', 'map00981', 'map00281', 'map00622', 'map00982', 'map00902', 'map04913', 'map05111', 'map00623', 'map05211', 'map00903', 'map00592', 'map00440'}


(2, 7, 18, 0.2222222222222222, 0.1, 0.13793103448275865)

In [16]:
pals_full

Unnamed: 0,pw_name,p_value
map00902,Monoterpenoid biosynthesis,0.0
map07226,"Progesterone, androgen and estrogen receptor a...",0.0
map00903,Limonene and pinene degradation,0.0
map04961,Endocrine and other factor-regulated calcium r...,0.0
map00930,Caprolactam degradation,8.073398e-17
map00981,Insect hormone biosynthesis,6.004333e-13
map05215,Prostate cancer,2.135918e-11
map04913,Ovarian steroidogenesis,3.234348e-11
map00622,Xylene degradation,1.127336e-10
map00626,Naphthalene degradation,6.190542e-09


In [17]:
pals_partial

Unnamed: 0,pw_name,p_value
map04672,Intestinal immune network for IgA production,2.2e-05
map05222,Small cell lung cancer,2.2e-05
map04920,Adipocytokine signaling pathway,2.9e-05
map05223,Non-small cell lung cancer,3.9e-05
map03320,PPAR signaling pathway,4.4e-05
map00830,Retinol metabolism,8.9e-05
map04745,Phototransduction - fly,0.002833
map07226,"Progesterone, androgen and estrogen receptor a...",0.035532
map04961,Endocrine and other factor-regulated calcium r...,0.040765


## Try resampling on the rows (peaks)

In [18]:
n_sample = 10000
ds_plasma_resampled = ds_plasma.resample(n_sample, case=None, control=None, axis=0)

2019-12-03 00:11:01.307 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-03 00:11:01.327 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-03 00:11:01.332 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-03 00:11:01.562 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [19]:
pals = PALS(ds_plasma_resampled, plage_weight=1, hg_weight=1, num_resamples=pals_num_resamples)
partial_df_ora = pals.get_ora_df()
partial_df_pals = pals.get_pathway_df(resample=pals_resample)
partial_df_ora.sort_values(significant_column, inplace=True)
partial_df_pals.sort_values(significant_column, inplace=True)

2019-12-03 00:11:01.634 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-03 00:11:01.637 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:391 - Setting the zero intensity values in the dataframe
2019-12-03 00:11:14.836 | DEBUG    | pals.pathway_analysis:get_ora_df:115 - Correcting for multiple t-tests
2019-12-03 00:11:14.843 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:477 - Calculating dataset formula coverage
2019-12-03 00:11:14.850 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:391 - Setting the zero intensity values in the dataframe
2019-12-03 00:11:14.906 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:372 - Scaling the data across the sample: zero mean and unit variance
2019-12-03 00:11:14.945 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:211 - Mean values of the rows in the DF is [ 0.  0. -0. ...  0.  0.  0.]
2019-12-03 00:11:14.946 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:212 - Va

In [20]:
ora_full = _select_significant_entries(full_df_ora, significant_column, N, threshold)
ora_partial = _select_significant_entries(partial_df_ora, significant_column, N, threshold)
_compute_prec_rec_f1(ora_full, ora_partial)

2019-12-03 00:11:20.434 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:136 - TP_items = {'map00340', 'map00250', 'map00650', 'ingenza00006', 'map00460', 'map00760', 'map04978', 'map00330', 'map00930', 'map00260', 'ingenza00007', 'map00643', 'map02010', 'map00290', 'map00970', 'map04974', 'map00960'}
2019-12-03 00:11:20.435 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:137 - FP_items = {'map00240', 'map00410', 'map00280'}
2019-12-03 00:11:20.437 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:138 - FN_items = {'map00472', 'map00310', 'map00770'}


(17, 3, 3, 0.85, 0.85, 0.85)

In [21]:
ora_full

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
ingenza00006,IG-Amino-acid Biosynthesis 1,0.0
map00472,D-Arginine and D-ornithine metabolism,0.0
map00970,Aminoacyl-tRNA biosynthesis,0.0
ingenza00007,IG-Amino-acid Biosynthesis 2,0.0
map00460,Cyanoamino acid metabolism,1.059807e-33
map00330,Arginine and proline metabolism,2.509547e-28
map00260,"Glycine, serine and threonine metabolism",1.21932e-27
map04974,Protein digestion and absorption,4.6967690000000005e-27
map02010,ABC transporters,1.460861e-21
map04978,Mineral absorption,9.023089e-15


In [22]:
ora_partial

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
map00970,Aminoacyl-tRNA biosynthesis,0.0
ingenza00006,IG-Amino-acid Biosynthesis 1,0.0
ingenza00007,IG-Amino-acid Biosynthesis 2,0.0
map00460,Cyanoamino acid metabolism,7.538159e-35
map04974,Protein digestion and absorption,1.7416509999999998e-26
map00260,"Glycine, serine and threonine metabolism",1.636676e-25
map00330,Arginine and proline metabolism,1.0978670000000002e-23
map02010,ABC transporters,2.89543e-19
map04978,Mineral absorption,2.093528e-16
map00930,Caprolactam degradation,5.178264e-14


In [23]:
pals_full = _select_significant_entries(full_df_pals, significant_column, N, threshold)
pals_partial = _select_significant_entries(partial_df_pals, significant_column, N, threshold)
_compute_prec_rec_f1(pals_full, pals_partial)

2019-12-03 00:11:20.646 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:136 - TP_items = {'map00660', 'map00626', 'map00061', 'map00590', 'map00930', 'map00981', 'map00982', 'map00622', 'map00902', 'map00623', 'map05211', 'map00903', 'map00592'}
2019-12-03 00:11:20.647 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:137 - FP_items = {'map00250', 'map00020', 'map04626', 'map00330', 'map00643', 'map00642', 'ingenza00009'}
2019-12-03 00:11:20.647 | DEBUG    | pals.evaluation:_compute_prec_rec_f1:138 - FN_items = {'map05215', 'map00281', 'map04961', 'map04913', 'map05111', 'map07226', 'map00440'}


(13, 7, 7, 0.65, 0.65, 0.65)

In [24]:
pals_full

Unnamed: 0,pw_name,p_value
map00902,Monoterpenoid biosynthesis,0.0
map07226,"Progesterone, androgen and estrogen receptor a...",0.0
map00903,Limonene and pinene degradation,0.0
map04961,Endocrine and other factor-regulated calcium r...,0.0
map00930,Caprolactam degradation,8.073398e-17
map00981,Insect hormone biosynthesis,6.004333e-13
map05215,Prostate cancer,2.135918e-11
map04913,Ovarian steroidogenesis,3.234348e-11
map00622,Xylene degradation,1.127336e-10
map00626,Naphthalene degradation,6.190542e-09


In [25]:
pals_partial

Unnamed: 0,pw_name,p_value
map00930,Caprolactam degradation,2.117352e-14
map00903,Limonene and pinene degradation,8.541206e-14
map00622,Xylene degradation,5.903745e-09
map00642,Ethylbenzene degradation,5.233283e-08
map00902,Monoterpenoid biosynthesis,1.80783e-07
map00643,Styrene degradation,1.248197e-06
map00592,alpha-Linolenic acid metabolism,4.382908e-06
map00660,C5-Branched dibasic acid metabolism,1.712405e-05
map00623,Toluene degradation,2.172389e-05
map00330,Arginine and proline metabolism,0.0004046997
