In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the local `pals` package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (wildcard-style), which hides where names come from. Explicit
# `import numpy as np`, `import matplotlib.pyplot as plt` and `%matplotlib inline`
# would be clearer — TODO confirm no cell relies on the bare pylab names first.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger
import seaborn as sns
import pandas as pd

In [4]:
import os
import sys
# Add the parent of the current working directory to the import path so the
# `pals` imports below resolve (presumably the notebook lives one level below
# the repository root — TODO confirm). The relative path depends on the
# directory the kernel was started in.
sys.path.append('..')

In [5]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import run_experiment, _select_significant_entries, _compute_prec_rec_f1
from pals.common import save_obj, set_log_level_debug, set_log_level_info, set_log_level_warning, DATABASE_PIMP_KEGG, SIGNIFICANT_THRESHOLD
from pals.feature_extraction import DataSource
from pals.PALS import PALS
from pals.ORA import ORA
from pals.GSEA import GSEA

2020-01-07 13:24:12.244 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# HAT Data Analysis

### Load data

This notebook assumes that the PiMP API token is stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Obtain the PiMP API token through the project helper rather than hard-coding
# it in the notebook; `token` is passed to `download_from_pimp` further down.
token = get_pimp_API_token_from_env()

In [7]:
# Identifier of the PiMP analysis to fetch.
analysis_id_plasma = 636

# The helper returns three objects, unpacked here as the intensity table, the
# annotation table and the experimental design. Its DEBUG log shows that it
# first tries a local temp file named after the analysis id.
(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
) = download_from_pimp(token, PIMP_HOST, analysis_id_plasma, 'kegg')

2020-01-07 13:24:12.417 | DEBUG    | pals.pimp_tools:download_from_pimp:119 - Trying to load data from temp file: C:\Users\joewa\AppData\Local\Temp\pimp_analysis_636.p


### Create Data Sources

In [8]:
# Pathway database used for the mapping; the column names in the result tables
# below are prefixed with this database's name.
database_name = DATABASE_PIMP_KEGG

# Bundle the three downloaded objects with the chosen database into the
# DataSource that every method below takes as input.
ds_plasma = DataSource(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
    database_name,
)

2020-01-07 13:24:12.541 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-01-07 13:24:12.541 | DEBUG    | pals.feature_extraction:get_database:105 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-07 13:24:12.564 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-01-07 13:24:12.569 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-01-07 13:24:12.904 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts


### Run the different methods

In [9]:
# Comparison of interest and the result column used to rank and filter pathways.
case = 'Stage1'
control = 'Control'
significant_column = 'PiMP_KEGG Stage1/Control comb_p'

# PALS settings, forwarded to the PALS constructor as plage_weight, hg_weight
# and num_resamples.
pals_plage_weight = 5
pals_hg_weight = 1
pals_num_resamples = 1000
# Resampling is requested only when at least one resample is configured.
pals_resample = pals_num_resamples > 0

# NOTE(review): N is not used in the cells visible here — TODO confirm its purpose.
N = 20
# Cut-off passed to _select_significant_entries when filtering results.
threshold = SIGNIFICANT_THRESHOLD

#### Run PALS

In [10]:
# Run PALS on the plasma data source with the weights and resample count from
# the parameter cell.
# NOTE(review): the p-values come from resampling and no random seed is set in
# the cells visible here, so reruns may differ slightly — TODO confirm whether
# `pals` offers a way to seed this.
pals = PALS(
    ds_plasma,
    plage_weight=pals_plage_weight,
    hg_weight=pals_hg_weight,
    num_resamples=pals_num_resamples,
)

# Sort by the chosen comparison's combined p-value (ascending). A chained,
# non-mutating sort is used instead of `inplace=True`, so the frame returned by
# `get_pathway_df` is never modified behind the method's back and the cell
# stays idempotent when re-run.
pals_df = pals.get_pathway_df(resample=pals_resample).sort_values(significant_column)

2020-01-07 13:24:13.064 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-07 13:24:13.150 | DEBUG    | pals.feature_extraction:standardize_intensity_df:261 - Scaling the data across the sample: zero mean and unit variance
2020-01-07 13:24:13.213 | DEBUG    | pals.PALS:get_plage_activity_df:75 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2020-01-07 13:24:13.214 | DEBUG    | pals.PALS:get_plage_activity_df:76 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2020-01-07 13:24:15.322 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:87 - Calculating plage p-values with resampling
2020-01-07 13:24:15.323 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:91 - Comparison Stage2/Stage1
2020-01-07 13:24:15.323 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:99 - Resampling 0/1000
2020-01-07 13:24:15.427 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:99 - Resampling 100/1000
2020-01-0

In [11]:
# Display the PALS pathway table, ordered by ascending `significant_column`.
pals_df

Unnamed: 0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p
map04961,Endocrine and other factor-regulated calcium r...,0.000000,0.000000e+00,1.000000e+00,7,1,14.29,5.514965e-01,0.76,10.86,0.000000,0.000000e+00,1.000000e+00
map07226,"Progesterone, androgen and estrogen receptor a...",0.000000,0.000000e+00,1.000000e+00,5,1,20.00,4.359032e-01,0.54,10.80,0.000000,0.000000e+00,1.000000e+00
map00903,Limonene and pinene degradation,0.059113,3.640822e-10,1.985833e-06,17,5,29.41,2.960171e-02,1.84,10.82,0.028590,7.264882e-11,4.958223e-07
map00622,Xylene degradation,0.867021,2.191707e-08,2.104320e-05,24,10,41.67,9.271170e-05,2.59,10.79,0.639723,5.255528e-10,1.020707e-06
map00930,Caprolactam degradation,0.999999,7.410467e-07,7.565146e-09,19,12,63.16,5.218894e-08,2.05,10.79,0.999864,4.124591e-09,2.150882e-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map03320,PPAR signaling pathway,1.000000,1.000000e+00,1.000000e+00,4,1,25.00,3.674136e-01,0.43,10.75,1.000000,1.000000e+00,1.000000e+00
map00627,Aminobenzoate degradation,1.000000,1.000000e+00,1.000000e+00,65,11,16.92,8.580683e-02,7.03,10.82,1.000000,1.000000e+00,1.000000e+00
map00785,Lipoic acid metabolism,1.000000,1.000000e+00,1.000000e+00,2,1,50.00,2.045795e-01,0.22,11.00,1.000000,1.000000e+00,1.000000e+00
map00960,"Tropane, piperidine and pyridine alkaloid bios...",0.770239,1.000000e+00,8.421661e-01,51,17,33.33,1.161925e-05,5.51,10.80,0.458367,1.000000e+00,5.612332e-01


In [12]:
# PALS pathways selected on `significant_column` using `threshold`; the output
# reports the selected column as `p_value`.
# NOTE(review): `_select_significant_entries` is a private helper of
# pals.evaluation, and the meaning of the trailing `None` argument is not
# visible here — TODO confirm.
_select_significant_entries(pals_df, significant_column, threshold, None)

Unnamed: 0,pw_name,p_value,sf,unq_pw_F,tot_ds_F,F_coverage
map04961,Endocrine and other factor-regulated calcium r...,0.0,0.5514965,7,1,14.29
map07226,"Progesterone, androgen and estrogen receptor a...",0.0,0.4359032,5,1,20.0
map00903,Limonene and pinene degradation,7.264882e-11,0.02960171,17,5,29.41
map00622,Xylene degradation,5.255528e-10,9.27117e-05,24,10,41.67
map00930,Caprolactam degradation,4.124591e-09,5.218894e-08,19,12,63.16
map00902,Monoterpenoid biosynthesis,2.055552e-08,0.04650501,19,5,26.32
map00626,Naphthalene degradation,2.840158e-08,0.5036634,43,5,11.63
map00623,Toluene degradation,2.005946e-07,0.4102787,30,4,13.33
map00981,Insect hormone biosynthesis,1.140786e-06,0.4002601,21,3,14.29
map00982,Drug metabolism - cytochrome P450,1.393897e-06,0.5302196,63,7,11.11


#### Run ORA

In [13]:
# Run ORA on the same plasma data source, using the method's default settings.
ora = ORA(ds_plasma)

# Sort by the chosen comparison's combined p-value (ascending). A chained,
# non-mutating sort is used instead of `inplace=True`, so the frame returned by
# `get_pathway_df` is left untouched and the cell stays idempotent when re-run.
ora_df = ora.get_pathway_df().sort_values(significant_column)

2020-01-07 13:24:19.394 | DEBUG    | pals.ORA:get_pathway_df:31 - Calculating ORA
2020-01-07 13:24:19.395 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-07 13:24:35.022 | DEBUG    | pals.ORA:get_pathway_df:91 - Correcting for multiple t-tests
2020-01-07 13:24:35.029 | DEBUG    | pals.feature_extraction:_calculate_coverage_df:301 - Calculating dataset formula coverage


In [14]:
# Display the ORA pathway table, ordered by ascending `significant_column`.
ora_df

Unnamed: 0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p,unq_pw_F,tot_ds_F,F_coverage
map00930,Caprolactam degradation,0.020308,2.629191e-07,1.609888e-05,0.209176,0.000054,0.001105,19,12,63.16
map00622,Xylene degradation,0.037897,2.634549e-05,3.445397e-07,0.281996,0.002714,0.000071,24,10,41.67
map00642,Ethylbenzene degradation,0.361732,1.388231e-04,2.024196e-06,0.856515,0.007149,0.000208,14,7,50.00
map00460,Cyanoamino acid metabolism,0.007631,1.323420e-04,3.541736e-02,0.120925,0.007149,0.455999,40,23,57.50
map00061,Fatty acid biosynthesis,0.274202,4.188068e-04,3.087144e-03,0.724174,0.017255,0.057814,10,5,50.00
...,...,...,...,...,...,...,...,...,...,...
map00365,Furfural degradation,1.000000,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,12,2,16.67
map00627,Aminobenzoate degradation,0.614098,7.409747e-01,3.362318e-01,1.000000,1.000000,1.000000,65,11,16.92
map05140,Leishmaniasis,1.000000,1.000000e+00,1.201963e-01,1.000000,1.000000,0.825348,4,1,25.00
map04713,Circadian entrainment,1.000000,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,8,1,12.50


In [15]:
# ORA pathways selected on `significant_column` using `threshold`; the output
# reports the selected column as `p_value`.
# NOTE(review): `_select_significant_entries` is a private helper of
# pals.evaluation, and the meaning of the trailing `None` argument is not
# visible here — TODO confirm.
_select_significant_entries(ora_df, significant_column, threshold, None)

Unnamed: 0,pw_name,p_value,unq_pw_F,tot_ds_F,F_coverage
map00930,Caprolactam degradation,5.4e-05,19,12,63.16
map00622,Xylene degradation,0.002714,24,10,41.67
map00642,Ethylbenzene degradation,0.007149,14,7,50.0
map00460,Cyanoamino acid metabolism,0.007149,40,23,57.5
map00061,Fatty acid biosynthesis,0.017255,10,5,50.0
map00643,Styrene degradation,0.017939,18,9,50.0
map00360,Phenylalanine metabolism,0.029064,55,16,29.09
ingenza00007,IG-Amino-acid Biosynthesis 2,0.029064,6,5,83.33
map00350,Tyrosine metabolism,0.029064,53,11,20.75


#### Run GSEA 

In [16]:
# Run GSEA on the same plasma data source. `random_sets=100` and `pbar=True`
# are passed straight to the constructor; the captured log shows one 100-step
# progress bar per comparison, each taking well over a minute.
# NOTE(review): no random seed is set in the cells visible here, so reruns may
# differ slightly — TODO confirm whether `pals` offers a way to seed this.
gsea = GSEA(ds_plasma, random_sets=100, pbar=True)

# Sort by the chosen comparison's combined p-value (ascending). A chained,
# non-mutating sort is used instead of `inplace=True`, so the frame returned by
# `get_pathway_df` is left untouched and the cell stays idempotent when re-run.
gsea_df = gsea.get_pathway_df().sort_values(significant_column)

2020-01-07 13:24:35.263 | DEBUG    | pals.GSEA:get_pathway_df:28 - Calculating GSEA
2020-01-07 13:24:35.265 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-07 13:24:35.369 | DEBUG    | pals.GSEA:get_pathway_df:55 - Running comparison case=Stage2 control=Stage1
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:46<00:00,  1.06s/it]
2020-01-07 13:26:22.548 | DEBUG    | pals.GSEA:get_pathway_df:55 - Running comparison case=Stage1 control=Control
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:45<00:00,  1.05s/it]
2020-01-07 13:28:08.817 | DEBUG    | pals.GSEA:get_pathway_df:55 - Running comparison case=Stage2 control=Control
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 

In [17]:
# Display the GSEA pathway table, ordered by ascending `significant_column`.
gsea_df

Unnamed: 0,pw_name,Stage2/Stage1 ES_score,Stage2/Stage1 p-value,PiMP_KEGG Stage2/Stage1 comb_p,Stage1/Control ES_score,Stage1/Control p-value,PiMP_KEGG Stage1/Control comb_p,Stage2/Control ES_score,Stage2/Control p-value,PiMP_KEGG Stage2/Control comb_p,unq_pw_F,tot_ds_F,F_coverage
map00903,Limonene and pinene degradation,-1.473959,0.085106,0.085106,-2.075759,0.000000,0.000000,-1.405121,0.137255,0.137255,17,5,29.41
map00930,Caprolactam degradation,-1.563411,0.096154,0.096154,-2.094391,0.000000,0.000000,-1.493230,0.080000,0.080000,19,12,63.16
map00940,Phenylpropanoid biosynthesis,-1.436027,0.145455,0.145455,1.772935,0.000000,0.000000,-1.558353,0.035088,0.035088,50,12,24.00
map00642,Ethylbenzene degradation,-1.094570,0.318182,0.318182,-1.665460,0.000000,0.000000,-1.035182,0.460000,0.460000,14,7,50.00
map07226,"Progesterone, androgen and estrogen receptor a...",1.331980,0.000000,0.000000,-1.275313,0.000000,0.000000,1.314259,0.069767,0.069767,5,1,20.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ingenza00002,"1,2-Propanediol",0.826378,0.716981,0.716981,-0.524094,0.948276,0.948276,0.748704,0.830189,0.830189,4,3,75.00
map00785,Lipoic acid metabolism,-0.664858,0.877551,0.877551,0.610267,0.978261,0.978261,-0.650110,0.936170,0.936170,2,1,50.00
map00280,"Valine, leucine and isoleucine degradation",-0.969021,0.562500,0.562500,0.504715,0.980769,0.980769,-1.054278,0.416667,0.416667,24,6,25.00
map00760,Nicotinate and nicotinamide metabolism,-0.816514,0.717391,0.717391,-0.565150,0.983607,0.983607,-0.808606,0.787234,0.787234,40,15,37.50


In [18]:
# GSEA pathways selected on `significant_column` using `threshold`; the output
# reports the selected column as `p_value`.
# NOTE(review): `_select_significant_entries` is a private helper of
# pals.evaluation, and the meaning of the trailing `None` argument is not
# visible here — TODO confirm.
_select_significant_entries(gsea_df, significant_column, threshold, None)

Unnamed: 0,pw_name,p_value,unq_pw_F,tot_ds_F,F_coverage
map00903,Limonene and pinene degradation,0.0,17,5,29.41
map04961,Endocrine and other factor-regulated calcium r...,0.0,7,1,14.29
map00440,Phosphonate and phosphinate metabolism,0.0,44,3,6.82
map05200,Pathways in cancer,0.0,15,4,26.67
map00592,alpha-Linolenic acid metabolism,0.0,25,6,24.0
map00622,Xylene degradation,0.0,24,10,41.67
map00981,Insect hormone biosynthesis,0.0,21,3,14.29
map07226,"Progesterone, androgen and estrogen receptor a...",0.0,5,1,20.0
map00642,Ethylbenzene degradation,0.0,14,7,50.0
map00940,Phenylpropanoid biosynthesis,0.0,50,12,24.0
