In [1]:
# Load IPython's autoreload extension and use mode 2, so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Enable inline matplotlib figures; this magic also populates the interactive
# namespace with numpy and matplotlib names (see the message printed below).
# NOTE(review): no cell visible in this notebook uses those injected names. If
# that holds for the whole notebook, `%matplotlib inline` plus explicit imports
# would avoid the namespace pollution — confirm before changing.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger

In [4]:
import os
import sys
# Add the parent directory to the module search path so the `pals` imports in
# the next cell resolve. This is relative to the current working directory —
# presumably the notebook is launched from a folder directly under the
# repository root; TODO confirm.
sys.path.append('..')

In [5]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import run_experiment
from pals.common import save_obj, DATABASE_PIMP_KEGG
from pals.feature_extraction import DataSource

2019-12-02 16:01:50.635 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# HAT Data Analysis

This notebook is used to generate resampled data and run the different methods for comparison in the manuscript.

### Load data

This assumes the PiMP API token is stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Fetch the PiMP API token via the helper instead of hard-coding it here.
# The download_from_pimp calls in the following cells take it as first argument.
token = get_pimp_API_token_from_env()

In [7]:
# PiMP analysis id for the plasma data set.
analysis_id_plasma = 636

# One download yields three objects: intensities, peak annotations and the
# experimental design (matching the three export URLs logged below).
(int_df_plasma,
 annotation_df_plasma,
 experimental_design_plasma) = download_from_pimp(
    token, PIMP_HOST, analysis_id_plasma, 'kegg')

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=636 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=636 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=636 <Response [200]>


In [8]:
# PiMP analysis id for the CSF data set.
analysis_id_csf = 635

# One download yields three objects: intensities, peak annotations and the
# experimental design (matching the three export URLs logged below).
(int_df_csf,
 annotation_df_csf,
 experimental_design_csf) = download_from_pimp(
    token, PIMP_HOST, analysis_id_csf, 'kegg')

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=635 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=635 <Response [200]>
http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=635 <Response [200]>


### Create Data Sources

In [9]:
# Both data sources are built against the same pathway database.
database_name = DATABASE_PIMP_KEGG

# Plasma: intensities + annotations + experimental design -> DataSource.
ds_plasma = DataSource(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
    database_name,
)

# CSF: same construction with the CSF downloads.
ds_csf = DataSource(
    int_df_csf,
    annotation_df_csf,
    experimental_design_csf,
    database_name,
)

2019-12-02 16:02:59.642 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-02 16:02:59.664 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-02 16:02:59.671 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-02 16:03:00.025 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts
2019-12-02 16:03:00.031 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-02 16:03:00.052 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-02 16:03:00.058 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-02 16:03:00.358 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


### Try running ORA vs PALS

In [10]:
# Build the analysis object for the plasma data source; the next two cells call
# get_ora_df() and get_pathway_df() on it.
# NOTE(review): this import sits outside the import cells at the top of the
# notebook, and the instance name `pals` is the same as the package name. That
# works as long as no cell does a bare `import pals`, but a more specific name
# would be clearer — confirm before renaming, since later cells use `pals`.
from pals.pathway_analysis import PALS
pals = PALS(ds_plasma)

In [11]:
# ORA results for the plasma data, displayed in ascending order of the
# Stage1/Control combined p-value column.
significant_column = 'PiMP_KEGG Stage1/Control comb_p'
df = pals.get_ora_df()
df.sort_values(by=significant_column)

2019-12-02 16:03:00.504 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-02 16:03:00.509 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:390 - Setting the zero intensity values in the dataframe
2019-12-02 16:03:20.685 | DEBUG    | pals.pathway_analysis:get_ora_df:114 - Correcting for multiple t-tests
2019-12-02 16:03:20.692 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:476 - Calculating dataset formula coverage


Unnamed: 0_level_0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ingenza00006,IG-Amino-acid Biosynthesis 1,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3,2,66.67
map00472,D-Arginine and D-ornithine metabolism,9.177617e-07,0.000000e+00,0.000000e+00,5.729058e-06,0.000000e+00,0.000000e+00,5,3,60.00
map00970,Aminoacyl-tRNA biosynthesis,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,23,18,78.26
ingenza00007,IG-Amino-acid Biosynthesis 2,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,6,5,83.33
map00460,Cyanoamino acid metabolism,4.517159e-38,2.572348e-35,1.443653e-36,2.326337e-36,1.059807e-33,4.956543e-35,40,23,57.50
...,...,...,...,...,...,...,...,...,...,...
map00943,Isoflavonoid biosynthesis,8.424669e-01,8.066341e-01,8.269592e-01,8.634238e-01,8.226070e-01,8.475303e-01,31,1,3.23
map00904,Diterpenoid biosynthesis,9.191784e-01,8.940585e-01,9.827607e-01,9.327624e-01,9.072712e-01,9.972842e-01,39,1,2.56
map00522,"Biosynthesis of 12-, 14- and 16-membered macro...",9.929083e-01,9.883184e-01,9.989255e-01,9.977517e-01,9.980078e-01,9.996857e-01,66,1,1.52
map01057,Biosynthesis of type II polyketide products,9.997831e-01,9.995114e-01,9.996857e-01,9.997831e-01,9.995114e-01,9.996857e-01,102,1,0.98


In [12]:
# PALS pathway results for the plasma data, displayed in ascending order of the
# Stage1/Control combined p-value column. Note that this re-binds `df`.
significant_column = 'PiMP_KEGG Stage1/Control comb_p'
df = pals.get_pathway_df()
df.sort_values(by=significant_column)

2019-12-02 16:03:20.792 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:390 - Setting the zero intensity values in the dataframe
2019-12-02 16:03:20.870 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:371 - Scaling the data across the sample: zero mean and unit variance
2019-12-02 16:03:20.930 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:210 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2019-12-02 16:03:20.931 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:211 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-12-02 16:03:22.987 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:222 - Calculating plage p-values with resampling
2019-12-02 16:03:22.988 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:226 - Comparison Stage2/Stage1
2019-12-02 16:03:22.989 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:234 - Resampling 0/1000
2019-12-02 16:03:23.102 | DEBUG    | pals.pat

Unnamed: 0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p
map05215,Prostate cancer,3.584472e-06,0.0,1.000000e+00,9,2,22.22,8.451166e-02,0.97,10.78,1.497429e-06,0.0,1.000000e+00
map00981,Insect hormone biosynthesis,3.821212e-01,0.0,1.586243e-03,21,3,14.29,2.083303e-01,2.27,10.81,3.251370e-01,0.0,1.134699e-03
map04961,Endocrine and other factor-regulated calcium r...,1.160762e-18,0.0,1.000000e+00,7,1,14.29,2.112254e-01,0.76,10.86,1.295493e-18,0.0,1.000000e+00
map00930,Caprolactam degradation,9.999999e-01,0.0,4.000148e-07,19,12,63.16,8.358982e-09,2.05,10.79,9.999687e-01,0.0,1.372289e-09
map00903,Limonene and pinene degradation,6.356628e-02,0.0,1.809224e-05,17,5,29.41,9.142314e-03,1.84,10.82,2.507522e-02,0.0,3.193363e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map00760,Nicotinate and nicotinamide metabolism,1.000000e+00,1.0,1.000000e+00,40,15,37.50,1.973128e-06,4.32,10.80,1.000000e+00,1.0,1.000000e+00
map00020,Citrate cycle (TCA cycle),1.000000e+00,1.0,9.998650e-01,15,3,20.00,8.565295e-02,1.62,10.80,9.999998e-01,1.0,9.995224e-01
map04745,Phototransduction - fly,1.000000e+00,1.0,1.000000e+00,6,2,33.33,3.145508e-02,0.65,10.83,1.000000e+00,1.0,9.999994e-01
map04672,Intestinal immune network for IgA production,1.000000e+00,1.0,1.000000e+00,1,1,100.00,1.165566e-02,0.11,11.00,1.000000e+00,1.0,1.000000e+00


### Run PALS Experiments

In [13]:
# Settings shared by both experiment runs below.
n_samples = [4, 8, 12]   # sample sizes handed to run_experiment
n_iter = 5               # iteration count handed to run_experiment
# Collected outputs, keyed by experiment name. Re-running this cell empties it.
results = dict()

#### Disable debug logging

In [14]:
# Replace the current loguru handlers with a single stderr sink at INFO level,
# so the DEBUG lines seen in the earlier cells stop flooding the output.
# logger.add() is the cell's last expression, so its return value (the `1`
# shown below — presumably the new handler's id) is displayed.
logger.remove()
logger.add(sys.stderr, level='INFO')

1

#### Run on Plasma samples

In [15]:
# Parameters for the plasma experiment: Stage1 versus Control.
experiment_name = 'plasma'
data_source = ds_plasma
case, control = 'Stage1', 'Control'
# Column name must stay in step with the case/control pair chosen above.
significant_column = 'PiMP_KEGG Stage1/Control comb_p'

In [16]:
# Run the experiment with the parameters set in the previous cell and file the
# outcome under the experiment's name.
res = run_experiment(
    experiment_name,
    data_source,
    case,
    control,
    n_samples,
    significant_column,
    n_iter,
)
results[experiment_name] = res

2019-12-02 16:03:32.252 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=0 PALS experiment=plasma case=Stage1 control=Control
2019-12-02 16:03:39.095 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=1 PALS experiment=plasma case=Stage1 control=Control
2019-12-02 16:03:45.851 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=2 PALS experiment=plasma case=Stage1 control=Control
2019-12-02 16:03:52.263 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=3 PALS experiment=plasma case=Stage1 control=Control
2019-12-02 16:03:58.897 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=4 PALS experiment=plasma case=Stage1 control=Control
2019-12-02 16:04:05.652 | INFO     | pals.evaluation:run_experiment:30 - n_sample=8 iter=0 PALS experiment=plasma case=Stage1 control=Control
2019-12-02 16:04:13.279 | INFO     | pals.evaluation:run_experiment:30 - n_sample=8 iter=1 PALS experiment=plasma case=Stage1 control=Control
2019-1

#### Run on CSF samples

In [17]:
# Parameters for the CSF experiment: Stage_2 versus Control.
experiment_name = 'csf'
data_source = ds_csf
case, control = 'Stage_2', 'Control'
# Column name must stay in step with the case/control pair chosen above.
significant_column = 'PiMP_KEGG Stage_2/Control comb_p'

In [18]:
# Run the experiment with the parameters set in the previous cell and file the
# outcome under the experiment's name.
res = run_experiment(
    experiment_name,
    data_source,
    case,
    control,
    n_samples,
    significant_column,
    n_iter,
)
results[experiment_name] = res

2019-12-02 16:05:25.909 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=0 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:30.870 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=1 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:35.447 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=2 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:40.079 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=3 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:45.012 | INFO     | pals.evaluation:run_experiment:30 - n_sample=4 iter=4 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:49.785 | INFO     | pals.evaluation:run_experiment:30 - n_sample=8 iter=0 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:54.895 | INFO     | pals.evaluation:run_experiment:30 - n_sample=8 iter=1 PALS experiment=csf case=Stage_2 control=Control
2019-12-02 16:05:59.

#### Save Results

In [19]:
# Persist the collected experiment results. The path is relative to the
# current working directory.
save_obj(
    results,
    os.path.join('test_data', 'HAT', 'HAT_results.p'),
)

Saving <class 'dict'> to test_data\HAT\HAT_results.p
