In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger

In [4]:
import os
import sys
sys.path.append('..')

In [5]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import run_experiment, _select_significant_entries, _compute_prec_rec_f1
from pals.common import save_obj, DATABASE_PIMP_KEGG, SIGNIFICANT_THRESHOLD
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS

2019-12-14 02:58:52.063 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# HAT Data Analysis

This notebook is used to generate resampled data and run the different methods for comparison in the manuscript.

### Load data

Assume token is stored in environmental variable *PIMP_API_TOKEN*

In [6]:
token = get_pimp_API_token_from_env()

In [7]:
analysis_id_plasma = 636
int_df_plasma, annotation_df_plasma, experimental_design_plasma = download_from_pimp(token, PIMP_HOST, analysis_id_plasma, 'kegg')

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=636 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=636 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=636 <Response [200]>


### Create Data Sources

In [34]:
database_name = DATABASE_PIMP_KEGG
ds_plasma = DataSource(int_df_plasma, annotation_df_plasma, experimental_design_plasma, database_name)

2019-12-14 03:04:00.865 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-14 03:04:00.892 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-14 03:04:00.903 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-14 03:04:01.306 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


### Compare ORA vs PALS

Set up some parameters

In [46]:
significant_column = 'PiMP_KEGG Stage1/Control comb_p'
pals_plage_weight = 1
pals_hg_weight = 0
pals_num_resamples = 1000
pals_resample = True if pals_num_resamples > 0 else False
n_sample = 4
case = 'Stage1'
control = 'Control'
N = 20
threshold = SIGNIFICANT_THRESHOLD

Run PALS and ORA on the full data

In [47]:
pals = PALS(ds_plasma, plage_weight=pals_plage_weight, hg_weight=pals_hg_weight, num_resamples=pals_num_resamples)
full_df_ora = pals.get_ora_df()
full_df_pals = pals.get_pathway_df(resample=pals_resample)
full_df_ora.sort_values(significant_column, inplace=True)
full_df_pals.sort_values(significant_column, inplace=True)

2019-12-15 00:21:03.863 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-15 00:21:03.871 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:408 - Setting the zero intensity values in the dataframe
2019-12-15 00:21:19.285 | DEBUG    | pals.pathway_analysis:get_ora_df:132 - Correcting for multiple t-tests
2019-12-15 00:21:19.292 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:494 - Calculating dataset formula coverage
2019-12-15 00:21:19.302 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:408 - Setting the zero intensity values in the dataframe
2019-12-15 00:21:19.386 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:389 - Scaling the data across the sample: zero mean and unit variance
2019-12-15 00:21:19.440 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:228 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2019-12-15 00:21:19.441 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:229 - Va

In [48]:
full_df_ora

Unnamed: 0_level_0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
map00930,Caprolactam degradation,0.020308,2.629191e-07,1.609888e-05,0.209176,0.000054,0.001105,19,12,63.16
map00622,Xylene degradation,0.037897,2.634549e-05,3.445397e-07,0.281996,0.002714,0.000071,24,10,41.67
map00642,Ethylbenzene degradation,0.361732,1.388231e-04,2.024196e-06,0.856515,0.007149,0.000208,14,7,50.00
map00460,Cyanoamino acid metabolism,0.007631,1.323420e-04,3.541736e-02,0.120925,0.007149,0.455999,40,23,57.50
map00061,Fatty acid biosynthesis,0.274202,4.188068e-04,3.087144e-03,0.724174,0.017255,0.057814,10,5,50.00
...,...,...,...,...,...,...,...,...,...,...
map01051,Biosynthesis of ansamycins,1.000000,7.069843e-01,1.000000e+00,1.000000,1.000000,1.000000,30,2,6.67
map00473,D-Alanine metabolism,0.091559,1.000000e+00,1.000000e+00,0.460028,1.000000,1.000000,3,2,66.67
map00600,Sphingolipid metabolism,1.000000,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,10,4,40.00
map00450,Selenocompound metabolism,1.000000,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,21,1,4.76


In [49]:
full_df_pals

Unnamed: 0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p
map00902,Monoterpenoid biosynthesis,0.106336,0.000000e+00,1.823120e-06,19,5,26.32,0.015851,2.05,10.79,0.106336,0.000000e+00,1.823120e-06
map00903,Limonene and pinene degradation,0.050299,0.000000e+00,3.532938e-10,17,5,29.41,0.009142,1.84,10.82,0.050299,0.000000e+00,3.532938e-10
map04961,Endocrine and other factor-regulated calcium r...,0.000000,0.000000e+00,1.000000e+00,7,1,14.29,0.211225,0.76,10.86,0.000000,0.000000e+00,1.000000e+00
map07226,"Progesterone, androgen and estrogen receptor a...",0.000000,0.000000e+00,1.000000e+00,5,1,20.00,0.130425,0.54,10.80,0.000000,0.000000e+00,1.000000e+00
map00626,Naphthalene degradation,0.932438,9.139592e-15,2.013188e-05,43,5,11.63,0.338317,4.65,10.81,0.932438,9.139592e-15,2.013188e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map03320,PPAR signaling pathway,1.000000,1.000000e+00,1.000000e+00,4,1,25.00,0.093455,0.43,10.75,1.000000,1.000000e+00,1.000000e+00
ingenza00005,Valine Degradation,1.000000,1.000000e+00,1.000000e+00,4,2,50.00,0.010601,0.43,10.75,1.000000,1.000000e+00,1.000000e+00
map00650,Butanoate metabolism,1.000000,1.000000e+00,1.000000e+00,30,10,33.33,0.000229,3.24,10.80,1.000000,1.000000e+00,1.000000e+00
map05222,Small cell lung cancer,1.000000,1.000000e+00,1.000000e+00,1,1,100.00,0.011656,0.11,11.00,1.000000,1.000000e+00,1.000000e+00


## Try resampling on the columns (samples)

Now try with a resampled data source. Here we will randomly resample the columns from the original full data.

In [50]:
ds_plasma_resampled = ds_plasma.resample(n_sample, case=case, control=control, axis=1)
pals = PALS(ds_plasma_resampled, plage_weight=pals_plage_weight, hg_weight=pals_hg_weight, num_resamples=pals_num_resamples)
partial_df_ora = pals.get_ora_df()
partial_df_pals = pals.get_pathway_df(resample=pals_resample)
partial_df_ora.sort_values(significant_column, inplace=True)
partial_df_pals.sort_values(significant_column, inplace=True)

2019-12-15 00:23:09.100 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-15 00:23:09.126 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-15 00:23:09.136 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-15 00:23:09.492 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts
2019-12-15 00:23:09.502 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-15 00:23:09.504 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:408 - Setting the zero intensity values in the dataframe
2019-12-15 00:23:13.322 | DEBUG    | pals.pathway_analysis:get_ora_df:132 - Correcting for multiple t-tests
2019-12-15 00:23:13.326 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:494 - Calculating dataset formula coverage
2019-12-15 00:23:13.329 | DEBUG    | pals.pathway_analysis:_change_zero_peak

In [51]:
ora_full = _select_significant_entries(full_df_ora, significant_column, N, threshold)
ora_partial = _select_significant_entries(partial_df_ora, significant_column, N, threshold)
_compute_prec_rec_f1(ora_full, ora_partial)

(7, 7, 2, 0.5, 0.7777777777777778, 0.6086956521739131)

In [52]:
ora_full

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
map00930,Caprolactam degradation,5.4e-05
map00622,Xylene degradation,0.002714
map00642,Ethylbenzene degradation,0.007149
map00460,Cyanoamino acid metabolism,0.007149
map00061,Fatty acid biosynthesis,0.017255
map00643,Styrene degradation,0.017939
ingenza00007,IG-Amino-acid Biosynthesis 2,0.029064
map00360,Phenylalanine metabolism,0.029064
map00350,Tyrosine metabolism,0.029064


In [53]:
ora_partial

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
map00642,Ethylbenzene degradation,0.019272
map04974,Protein digestion and absorption,0.021651
map00061,Fatty acid biosynthesis,0.021651
map00460,Cyanoamino acid metabolism,0.021651
map00350,Tyrosine metabolism,0.021651
map00643,Styrene degradation,0.021651
map00360,Phenylalanine metabolism,0.021651
map00970,Aminoacyl-tRNA biosynthesis,0.031074
map00622,Xylene degradation,0.033882
map00592,alpha-Linolenic acid metabolism,0.037017


In [54]:
pals_full = _select_significant_entries(full_df_pals, significant_column, N, threshold)
pals_partial = _select_significant_entries(partial_df_pals, significant_column, N, threshold)
_compute_prec_rec_f1(pals_full, pals_partial)

(5, 1, 15, 0.8333333333333334, 0.25, 0.3846153846153846)

In [55]:
pals_full

Unnamed: 0,pw_name,p_value
map00902,Monoterpenoid biosynthesis,0.0
map00903,Limonene and pinene degradation,0.0
map04961,Endocrine and other factor-regulated calcium r...,0.0
map07226,"Progesterone, androgen and estrogen receptor a...",0.0
map00626,Naphthalene degradation,9.139592e-15
map00622,Xylene degradation,5.543e-14
map00930,Caprolactam degradation,1.311906e-13
map00981,Insect hormone biosynthesis,5.079867e-13
map04913,Ovarian steroidogenesis,1.620474e-11
map05215,Prostate cancer,3.321734e-11


In [56]:
pals_partial

Unnamed: 0,pw_name,p_value
map07226,"Progesterone, androgen and estrogen receptor a...",0.000235
map04961,Endocrine and other factor-regulated calcium r...,0.000235
map00930,Caprolactam degradation,0.015241
map00660,C5-Branched dibasic acid metabolism,0.026283
map00982,Drug metabolism - cytochrome P450,0.027962
map00380,Tryptophan metabolism,0.032979


## Try resampling on the rows (peaks)

In [22]:
n_sample = 10000
ds_plasma_resampled = ds_plasma.resample(n_sample, case=None, control=None, axis=0)

2019-12-14 03:02:50.197 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-14 03:02:50.217 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-14 03:02:50.224 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-14 03:02:50.460 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [23]:
pals = PALS(ds_plasma_resampled, plage_weight=1, hg_weight=1, num_resamples=pals_num_resamples)
partial_df_ora = pals.get_ora_df()
partial_df_pals = pals.get_pathway_df(resample=pals_resample)
partial_df_ora.sort_values(significant_column, inplace=True)
partial_df_pals.sort_values(significant_column, inplace=True)

2019-12-14 03:02:53.855 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-14 03:02:53.858 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:416 - Setting the zero intensity values in the dataframe
2019-12-14 03:03:03.363 | DEBUG    | pals.pathway_analysis:get_ora_df:132 - Correcting for multiple t-tests
2019-12-14 03:03:03.370 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:502 - Calculating dataset formula coverage
2019-12-14 03:03:03.377 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:416 - Setting the zero intensity values in the dataframe
2019-12-14 03:03:03.442 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:397 - Scaling the data across the sample: zero mean and unit variance
2019-12-14 03:03:03.482 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:228 - Mean values of the rows in the DF is [ 0. -0.  0. ...  0.  0. -0.]
2019-12-14 03:03:03.483 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:229 - Va

In [24]:
ora_full = _select_significant_entries(full_df_ora, significant_column, N, threshold)
ora_partial = _select_significant_entries(partial_df_ora, significant_column, N, threshold)
_compute_prec_rec_f1(ora_full, ora_partial)

(7, 2, 2, 0.7777777777777778, 0.7777777777777778, 0.7777777777777778)

In [25]:
ora_full

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
map00930,Caprolactam degradation,5.4e-05
map00622,Xylene degradation,0.002714
map00642,Ethylbenzene degradation,0.007149
map00460,Cyanoamino acid metabolism,0.007149
map00061,Fatty acid biosynthesis,0.017255
map00643,Styrene degradation,0.017939
ingenza00007,IG-Amino-acid Biosynthesis 2,0.029064
map00360,Phenylalanine metabolism,0.029064
map00350,Tyrosine metabolism,0.029064


In [26]:
ora_partial

Unnamed: 0_level_0,pw_name,p_value
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1
map00930,Caprolactam degradation,0.002045
map00622,Xylene degradation,0.003005
map00642,Ethylbenzene degradation,0.003005
map00643,Styrene degradation,0.004757
map00460,Cyanoamino acid metabolism,0.004757
ingenza00007,IG-Amino-acid Biosynthesis 2,0.014557
map00350,Tyrosine metabolism,0.020615
map00290,"Valine, leucine and isoleucine biosynthesis",0.029312
map00330,Arginine and proline metabolism,0.040058


In [27]:
pals_full = _select_significant_entries(full_df_pals, significant_column, N, threshold)
pals_partial = _select_significant_entries(partial_df_pals, significant_column, N, threshold)
_compute_prec_rec_f1(pals_full, pals_partial)

(13, 7, 7, 0.65, 0.65, 0.65)

In [28]:
pals_full

Unnamed: 0,pw_name,p_value
map07226,"Progesterone, androgen and estrogen receptor a...",0.0
map04961,Endocrine and other factor-regulated calcium r...,0.0
map00903,Limonene and pinene degradation,2.802537e-32
map00902,Monoterpenoid biosynthesis,1.103771e-12
map00930,Caprolactam degradation,5.088431e-12
map00981,Insect hormone biosynthesis,5.025565e-09
map05215,Prostate cancer,1.751666e-08
map00622,Xylene degradation,2.980199e-08
map04913,Ovarian steroidogenesis,3.163928e-08
map00626,Naphthalene degradation,1.152693e-06


In [29]:
pals_partial

Unnamed: 0,pw_name,p_value
map00902,Monoterpenoid biosynthesis,0.0
map00626,Naphthalene degradation,0.0
map00930,Caprolactam degradation,0.0
map00622,Xylene degradation,0.0
map00623,Toluene degradation,0.0
map00982,Drug metabolism - cytochrome P450,0.0
map00903,Limonene and pinene degradation,0.0
map00981,Insect hormone biosynthesis,4.622703e-13
map05211,Renal cell carcinoma,1.283182e-08
map00592,alpha-Linolenic acid metabolism,8.211588e-06
