In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE(review): %pylab star-imports numpy and matplotlib into the notebook
# namespace (see the "Populating the interactive namespace" output). Consider
# `%matplotlib inline` with explicit `import numpy as np` /
# `import matplotlib.pyplot as plt` once it is confirmed that no cell relies
# on the implicitly imported names.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger
import seaborn as sns
import pandas as pd

In [4]:
import os
import sys

# Put the parent directory on the import path (presumably so the local `pals`
# package imported in the next cell resolves -- confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '..' entry to sys.path each time.
if '..' not in sys.path:
    sys.path.append('..')

In [5]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import _select_significant_entries, _compute_prec_rec_f1
from pals.common import save_obj, set_log_level_debug, set_log_level_info, set_log_level_warning, DATABASE_PIMP_KEGG, SIGNIFICANT_THRESHOLD
from pals.feature_extraction import DataSource
from pals.common import *
from pals.PALS import PALS
from pals.ORA import ORA
from pals.GSEA import GSEA

2020-01-31 12:02:32.578 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# HAT Data Analysis

### Load data

This assumes the PiMP API token is stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Fetch the PiMP API token through the project helper; per the note above it
# is expected in the PIMP_API_TOKEN environment variable.
token = get_pimp_API_token_from_env()

In [7]:
# PiMP analysis id of the plasma analysis used throughout this notebook.
analysis_id_plasma = 636

# download_from_pimp hands back three objects, unpacked in order: the
# intensity frame, the annotation frame and the experimental design.
# The logged output shows it first tries a cached temp file for this analysis.
(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
) = download_from_pimp(token, PIMP_HOST, analysis_id_plasma, 'kegg')

2020-01-31 12:02:32.953 | DEBUG    | pals.pimp_tools:download_from_pimp:119 - Trying to load data from temp file: C:\Users\joewa\AppData\Local\Temp\pimp_analysis_636.p


### Create Data Sources

In [8]:
# Pathway database used for the pathway mapping of the plasma data.
database_name = DATABASE_PIMP_KEGG

# Bundle the three downloaded plasma objects with the chosen database into
# the DataSource that every method below takes as input.
ds_plasma = DataSource(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
    database_name,
)

2020-01-31 12:02:33.168 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-01-31 12:02:33.170 | DEBUG    | pals.feature_extraction:get_database:105 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-31 12:02:33.268 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-01-31 12:02:33.277 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-01-31 12:02:34.294 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts


### Run the different methods

In [9]:
# Column used to sort and select pathways in every result table below.
# It embeds the comparison name, so keep it in sync with `case`/`control`.
significant_column = 'PiMP_KEGG Stage1/Control comb_p'

# Weights passed to PALS as plage_weight / hg_weight.
pals_plage_weight = 5
pals_hg_weight = 1

# Number of resamples passed to PALS and GSEA; PALS resampling is switched on
# only when this is positive.
num_resamples = 1000
pals_resample = num_resamples > 0

# The two experimental groups being compared.
case = 'Stage1'
control = 'Control'

# Fourth argument of _select_significant_entries; presumably a top-N limit
# that None disables -- confirm against pals.evaluation.
N = None
# Significance cutoff passed to _select_significant_entries for the PALS and
# ORA results; the value is the SIGNIFICANT_THRESHOLD constant from pals.common.
threshold = SIGNIFICANT_THRESHOLD

#### Run PALS

In [10]:
# Set up PALS on the plasma data source; the weights, resample count and the
# case/control groups all come from the configuration cell.
pals = PALS(
    ds_plasma,
    plage_weight=pals_plage_weight,
    hg_weight=pals_hg_weight,
    num_resamples=num_resamples,
    case=case,
    control=control,
)

# Build the pathway table and order it by the significance column.
# sort_values returns a new frame here instead of sorting in place, so the
# frame handed back by get_pathway_df is never mutated.
pals_df = pals.get_pathway_df(resample=pals_resample).sort_values(significant_column)

2020-01-31 12:02:34.752 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:283 - Setting the zero intensity values in the dataframe
2020-01-31 12:02:34.956 | DEBUG    | pals.feature_extraction:standardize_intensity_df:264 - Scaling the data across the sample: zero mean and unit variance
2020-01-31 12:02:35.104 | DEBUG    | pals.PALS:get_plage_activity_df:79 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2020-01-31 12:02:35.106 | DEBUG    | pals.PALS:get_plage_activity_df:80 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2020-01-31 12:02:39.596 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:91 - Calculating plage p-values with resampling
2020-01-31 12:02:39.598 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:98 - Comparison Stage1/Control
2020-01-31 12:02:39.599 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:106 - Resampling 0/1000
2020-01-31 12:02:39.853 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:106 - Resampling 100/1000
2020-0

In [11]:
# Display the PALS pathway table (already ordered by `significant_column`).
pals_df

Unnamed: 0,pw_name,Stage1/Control p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG Stage1/Control comb_p
map07226,"Progesterone, androgen and estrogen receptor a...",0.000000e+00,5,1,20.00,4.359032e-01,0.54,10.80,0.000000e+00
map04961,Endocrine and other factor-regulated calcium r...,0.000000e+00,7,1,14.29,5.514965e-01,0.76,10.86,0.000000e+00
map00903,Limonene and pinene degradation,1.153790e-08,17,5,29.41,2.960171e-02,1.84,10.82,2.476997e-09
map00622,Xylene degradation,5.560178e-07,24,10,41.67,9.271170e-05,2.59,10.79,1.802196e-08
map00930,Caprolactam degradation,3.018462e-06,19,12,63.16,5.218894e-08,2.05,10.79,2.122471e-08
...,...,...,...,...,...,...,...,...,...
map00460,Cyanoamino acid metabolism,1.000000e+00,40,23,57.50,4.468022e-13,4.32,10.80,1.000000e+00
map00941,Flavonoid biosynthesis,1.000000e+00,38,3,7.89,7.957853e-01,4.11,10.82,1.000000e+00
map00950,Isoquinoline alkaloid biosynthesis,1.000000e+00,64,6,9.38,7.060864e-01,6.92,10.81,1.000000e+00
map03320,PPAR signaling pathway,1.000000e+00,4,1,25.00,3.674136e-01,0.43,10.75,1.000000e+00


In [12]:
# Select the PALS entries to report using `threshold` on `significant_column`.
# NOTE(review): presumably keeps rows whose value is at or below the threshold,
# capped at N rows when N is set -- confirm in pals.evaluation.
_select_significant_entries(pals_df, significant_column, threshold, N)

Unnamed: 0,pw_name,p_value,sf,unq_pw_F,tot_ds_F,F_coverage
map07226,"Progesterone, androgen and estrogen receptor a...",0.0,0.4359032,5,1,20.0
map04961,Endocrine and other factor-regulated calcium r...,0.0,0.5514965,7,1,14.29
map00903,Limonene and pinene degradation,2.476997e-09,0.02960171,17,5,29.41
map00622,Xylene degradation,1.802196e-08,9.27117e-05,24,10,41.67
map00930,Caprolactam degradation,2.122471e-08,5.218894e-08,19,12,63.16
map00902,Monoterpenoid biosynthesis,1.49615e-07,0.04650501,19,5,26.32
map00626,Naphthalene degradation,7.335063e-07,0.5036634,43,5,11.63
map00623,Toluene degradation,2.292016e-06,0.4102787,30,4,13.33
map00981,Insect hormone biosynthesis,4.274722e-06,0.4002601,21,3,14.29
map05215,Prostate cancer,5.594201e-06,0.2532519,9,2,22.22


#### Run ORA

In [13]:
# Set up ORA on the same plasma data source and case/control comparison.
ora = ORA(ds_plasma, case=case, control=control)

# Build the pathway table and order it by the significance column.
# sort_values returns a new frame here instead of sorting in place, so the
# frame handed back by get_pathway_df is never mutated.
ora_df = ora.get_pathway_df().sort_values(significant_column)

2020-01-31 12:02:43.213 | DEBUG    | pals.ORA:get_pathway_df:34 - Calculating ORA
2020-01-31 12:02:43.215 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:283 - Setting the zero intensity values in the dataframe
2020-01-31 12:02:55.254 | DEBUG    | pals.ORA:get_pathway_df:97 - Correcting for multiple t-tests
2020-01-31 12:02:55.260 | DEBUG    | pals.feature_extraction:_calculate_coverage_df:304 - Calculating dataset formula coverage


In [14]:
# Display the ORA pathway table (already ordered by `significant_column`).
ora_df

Unnamed: 0,pw_name,Stage1/Control p-value,PiMP_KEGG Stage1/Control comb_p,unq_pw_F,tot_ds_F,F_coverage
map00930,Caprolactam degradation,2.629191e-07,0.000054,19,12,63.16
map00622,Xylene degradation,2.634549e-05,0.002714,24,10,41.67
map00642,Ethylbenzene degradation,1.388231e-04,0.007149,14,7,50.00
map00460,Cyanoamino acid metabolism,1.323420e-04,0.007149,40,23,57.50
map00061,Fatty acid biosynthesis,4.188068e-04,0.017255,10,5,50.00
...,...,...,...,...,...,...
map04672,Intestinal immune network for IgA production,1.000000e+00,1.000000,1,1,100.00
map00232,Caffeine metabolism,1.000000e+00,1.000000,15,7,46.67
map00365,Furfural degradation,1.000000e+00,1.000000,12,2,16.67
map04724,Glutamatergic synapse,1.000000e+00,1.000000,7,2,28.57


In [15]:
# Select the ORA entries to report using `threshold` on `significant_column`.
# NOTE(review): presumably keeps rows whose value is at or below the threshold,
# capped at N rows when N is set -- confirm in pals.evaluation.
_select_significant_entries(ora_df, significant_column, threshold, N)

Unnamed: 0,pw_name,p_value,unq_pw_F,tot_ds_F,F_coverage
map00930,Caprolactam degradation,5.4e-05,19,12,63.16
map00622,Xylene degradation,0.002714,24,10,41.67
map00642,Ethylbenzene degradation,0.007149,14,7,50.0
map00460,Cyanoamino acid metabolism,0.007149,40,23,57.5
map00061,Fatty acid biosynthesis,0.017255,10,5,50.0
map00643,Styrene degradation,0.017939,18,9,50.0
map00360,Phenylalanine metabolism,0.029064,55,16,29.09
map00350,Tyrosine metabolism,0.029064,53,11,20.75
ingenza00007,IG-Amino-acid Biosynthesis 2,0.029064,6,5,83.33


#### Run GSEA

In [16]:
# Set up GSEA on the same plasma data source and case/control comparison.
# NOTE(review): GSEA_RANKING_SNR is not imported by name in this notebook; it
# presumably arrives through `from pals.common import *` -- confirm and
# consider importing it explicitly.
gsea = GSEA(
    ds_plasma,
    num_resamples=num_resamples,
    method=GSEA_RANKING_SNR,
    case=case,
    control=control,
)

# Build the pathway table and order it by the significance column.
# sort_values returns a new frame here instead of sorting in place, so the
# frame handed back by get_pathway_df is never mutated.
gsea_df = gsea.get_pathway_df().sort_values(significant_column)

2020-01-31 12:02:55.691 | DEBUG    | pals.GSEA:get_pathway_df:51 - Calculating GSEA
2020-01-31 12:02:55.692 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:283 - Setting the zero intensity values in the dataframe
2020-01-31 12:02:55.875 | DEBUG    | pals.GSEA:get_pathway_df:80 - Running comparison case=Stage1 control=Control
2020-01-31 12:03:10.758 | DEBUG    | pals.feature_extraction:_calculate_coverage_df:304 - Calculating dataset formula coverage


In [17]:
# Display the GSEA pathway table (already ordered by `significant_column`).
gsea_df

Unnamed: 0,pw_name,Stage1/Control p-value,PiMP_KEGG Stage1/Control comb_p,Stage1/Control ES_score,unq_pw_F,tot_ds_F,F_coverage
map00622,Xylene degradation,0.010000,0.016569,0.660550,24,10,41.67
map05200,Pathways in cancer,0.000000,0.020711,0.781554,15,4,26.67
map00903,Limonene and pinene degradation,0.010204,0.033138,0.671105,17,5,29.41
map00140,Steroid hormone biosynthesis,0.034884,0.047636,0.704274,45,6,13.33
map04961,Endocrine and other factor-regulated calcium r...,0.000000,0.236108,0.999363,7,1,14.29
...,...,...,...,...,...,...,...
map00943,Isoflavonoid biosynthesis,0.840336,1.000000,0.646272,31,1,3.23
map04726,Serotonergic synapse,0.950413,1.000000,0.233454,20,3,15.00
map00940,Phenylpropanoid biosynthesis,0.019231,1.000000,-0.505356,50,12,24.00
map05132,Salmonella infection,0.908397,1.000000,0.558955,2,1,50.00


In [18]:
# GSEA uses its own, looser cutoff instead of the shared `threshold` applied
# to PALS and ORA. It is named here so the difference is visible.
# NOTE(review): 0.25 is presumably the customary GSEA FDR cutoff -- confirm
# that the deviation from `threshold` is intentional.
gsea_threshold = 0.25
_select_significant_entries(gsea_df, significant_column, gsea_threshold, N)

Unnamed: 0,pw_name,p_value,unq_pw_F,tot_ds_F,F_coverage
map00622,Xylene degradation,0.016569,24,10,41.67
map05200,Pathways in cancer,0.020711,15,4,26.67
map00903,Limonene and pinene degradation,0.033138,17,5,29.41
map00140,Steroid hormone biosynthesis,0.047636,45,6,13.33
map04961,Endocrine and other factor-regulated calcium r...,0.236108,7,1,14.29
map07226,"Progesterone, androgen and estrogen receptor a...",0.236108,5,1,20.0
map00981,Insect hormone biosynthesis,0.249916,21,3,14.29
