In [1]:
# Load IPython's autoreload extension and use mode 2, which re-imports
# modules before each cell executes so edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Render figures inline. %pylab also injects numpy and matplotlib names into
# the interactive namespace (see the message printed by this cell).
# NOTE(review): explicit `import numpy as np` / `import matplotlib.pyplot as plt`
# together with `%matplotlib inline` is the usual replacement — confirm that
# no cell relies on the injected names before switching.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger
import seaborn as sns
import pandas as pd

In [4]:
import os
import sys
# Append the parent directory to the module search path — presumably so the
# project's `pals` package can be imported when the notebook is run from its
# own folder (verify against the repository layout).
sys.path.append('..')

In [6]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import _select_significant_entries, _compute_prec_rec_f1
from pals.common import save_obj, set_log_level_debug, set_log_level_info, set_log_level_warning, DATABASE_PIMP_KEGG, SIGNIFICANT_THRESHOLD
from pals.feature_extraction import DataSource
from pals.PALS import PALS
from pals.ORA import ORA
from pals.GSEA import GSEA
from pals.GSEApy import GSEApy

# HAT Data Analysis

### Load data

Assumes the PiMP API token is stored in the environment variable *PIMP_API_TOKEN*.

In [7]:
# Obtain the PiMP API token through the project helper, so no credential is
# written into the notebook itself.
token = get_pimp_API_token_from_env()

In [8]:
# PiMP analysis to fetch for the plasma samples.
analysis_id_plasma = 636

# download_from_pimp hands back three objects, unpacked here in order.
# The log output of this cell shows the helper first trying a pickled
# temp-file copy of the analysis.
(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
) = download_from_pimp(token, PIMP_HOST, analysis_id_plasma, 'kegg')

2020-01-30 13:37:03.995 | DEBUG    | pals.pimp_tools:download_from_pimp:119 - Trying to load data from temp file: C:\Users\joewa\AppData\Local\Temp\pimp_analysis_636.p


### Create Data Sources

In [9]:
# Pathway database used for the feature-to-pathway mapping.
database_name = DATABASE_PIMP_KEGG

# Bundle the three downloaded plasma objects with the chosen database into a
# single DataSource that every method below consumes.
ds_plasma = DataSource(
    int_df_plasma,
    annotation_df_plasma,
    experimental_design_plasma,
    database_name,
)

2020-01-30 13:37:07.462 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-01-30 13:37:07.463 | DEBUG    | pals.feature_extraction:get_database:105 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-30 13:37:07.484 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-01-30 13:37:07.489 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-01-30 13:37:07.816 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts


### Run the different methods

In [10]:
# --- Comparison under study -------------------------------------------------
case = 'Stage1'
control = 'Control'
# Result column used for sorting and significance filtering in every method
# below; it must name the same case/control comparison as above.
significant_column = 'PiMP_KEGG Stage1/Control comb_p'
threshold = SIGNIFICANT_THRESHOLD

# --- PALS settings ----------------------------------------------------------
pals_plage_weight = 5
pals_hg_weight = 1
pals_num_resamples = 1000
# Resampling is switched on whenever a positive number of resamples is requested.
pals_resample = pals_num_resamples > 0

# NOTE(review): N is not referenced by any cell visible here — confirm its use.
N = 20

#### Run PALS

In [11]:
# Run PALS on the plasma data source with the weights and resample count
# chosen in the configuration cell.
pals = PALS(
    ds_plasma,
    plage_weight=pals_plage_weight,
    hg_weight=pals_hg_weight,
    num_resamples=pals_num_resamples,
)

# Sort ascending by the comparison column so the smallest p-values come first.
# The non-mutating sort_values is used instead of inplace=True, so the frame
# handed back by get_pathway_df is left untouched.
pals_df = pals.get_pathway_df(resample=pals_resample).sort_values(significant_column)

2020-01-30 13:37:11.049 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:283 - Setting the zero intensity values in the dataframe
2020-01-30 13:37:11.134 | DEBUG    | pals.feature_extraction:standardize_intensity_df:264 - Scaling the data across the sample: zero mean and unit variance
2020-01-30 13:37:11.190 | DEBUG    | pals.PALS:get_plage_activity_df:79 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2020-01-30 13:37:11.190 | DEBUG    | pals.PALS:get_plage_activity_df:80 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2020-01-30 13:37:13.196 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:91 - Calculating plage p-values with resampling
2020-01-30 13:37:13.197 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:98 - Comparison Stage2/Stage1
2020-01-30 13:37:13.197 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:106 - Resampling 0/1000
2020-01-30 13:37:13.297 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:106 - Resampling 100/1000
2020-01

In [12]:
# Display the PALS pathway table (already sorted by significant_column).
pals_df

Unnamed: 0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p
map00902,Monoterpenoid biosynthesis,1.130117e-01,0.0,1.100152e-06,19,5,26.32,4.650501e-02,2.05,10.79,6.468664e-02,0.0,3.320337e-07
map07226,"Progesterone, androgen and estrogen receptor a...",2.808243e-22,0.0,1.000000e+00,5,1,20.00,4.359032e-01,0.54,10.80,1.261265e-21,0.0,1.000000e+00
map04961,Endocrine and other factor-regulated calcium r...,2.808243e-22,0.0,1.000000e+00,7,1,14.29,5.514965e-01,0.76,10.86,2.175169e-21,0.0,1.000000e+00
map00903,Limonene and pinene degradation,5.478326e-02,0.0,1.300117e-12,17,5,29.41,2.960171e-02,1.84,10.82,2.624585e-02,0.0,2.381234e-13
map00930,Caprolactam degradation,9.999998e-01,0.0,0.000000e+00,19,12,63.16,5.218894e-08,2.05,10.79,9.999564e-01,0.0,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map04976,Bile secretion,9.263869e-01,1.0,9.999859e-01,89,12,13.48,2.496155e-01,9.62,10.81,9.012552e-01,1.0,9.999646e-01
map00280,"Valine, leucine and isoleucine degradation",1.000000e+00,1.0,1.000000e+00,24,6,25.00,3.806430e-02,2.59,10.79,1.000000e+00,1.0,1.000000e+00
map00780,Biotin metabolism,9.999990e-01,1.0,9.998064e-01,15,4,26.67,6.995551e-02,1.62,10.80,9.999938e-01,1.0,9.992893e-01
map00365,Furfural degradation,1.000000e+00,1.0,1.000000e+00,12,2,16.67,3.782977e-01,1.30,10.83,1.000000e+00,1.0,1.000000e+00


In [13]:
# Show the PALS pathways selected on `significant_column` against `threshold`
# — presumably p-value <= threshold; confirm in pals.evaluation.
# The meaning of the final None argument is not visible from this notebook.
_select_significant_entries(pals_df, significant_column, threshold, None)

Unnamed: 0,pw_name,p_value,sf,unq_pw_F,tot_ds_F,F_coverage
map00902,Monoterpenoid biosynthesis,0.0,0.04650501,19,5,26.32
map07226,"Progesterone, androgen and estrogen receptor a...",0.0,0.4359032,5,1,20.0
map04961,Endocrine and other factor-regulated calcium r...,0.0,0.5514965,7,1,14.29
map00903,Limonene and pinene degradation,0.0,0.02960171,17,5,29.41
map00930,Caprolactam degradation,0.0,5.218894e-08,19,12,63.16
map00981,Insect hormone biosynthesis,0.0,0.4002601,21,3,14.29
map04913,Ovarian steroidogenesis,7.886635e-18,0.5630889,17,2,11.76
map05215,Prostate cancer,5.337436e-17,0.2532519,9,2,22.22
map00622,Xylene degradation,2.034185e-10,9.27117e-05,24,10,41.67
map00626,Naphthalene degradation,1.182499e-08,0.5036634,43,5,11.63


#### Run ORA

In [14]:
# Run ORA on the same plasma data source for comparison with PALS.
ora = ORA(ds_plasma)

# Sort ascending by the comparison column so the smallest p-values come first.
# The non-mutating sort_values is used instead of inplace=True, so the frame
# handed back by get_pathway_df is left untouched.
ora_df = ora.get_pathway_df().sort_values(significant_column)

2020-01-30 13:37:19.405 | DEBUG    | pals.ORA:get_pathway_df:34 - Calculating ORA
2020-01-30 13:37:19.406 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:283 - Setting the zero intensity values in the dataframe
2020-01-30 13:37:34.538 | DEBUG    | pals.ORA:get_pathway_df:97 - Correcting for multiple t-tests
2020-01-30 13:37:34.546 | DEBUG    | pals.feature_extraction:_calculate_coverage_df:304 - Calculating dataset formula coverage


In [15]:
# Display the ORA pathway table (already sorted by significant_column).
ora_df

Unnamed: 0,pw_name,Stage2/Stage1 p-value,Stage1/Control p-value,Stage2/Control p-value,PiMP_KEGG Stage2/Stage1 comb_p,PiMP_KEGG Stage1/Control comb_p,PiMP_KEGG Stage2/Control comb_p,unq_pw_F,tot_ds_F,F_coverage
map00930,Caprolactam degradation,0.020308,2.629191e-07,1.609888e-05,0.209176,0.000054,0.001105,19,12,63.16
map00622,Xylene degradation,0.037897,2.634549e-05,3.445397e-07,0.281996,0.002714,0.000071,24,10,41.67
map00460,Cyanoamino acid metabolism,0.007631,1.323420e-04,3.541736e-02,0.120925,0.007149,0.455999,40,23,57.50
map00642,Ethylbenzene degradation,0.361732,1.388231e-04,2.024196e-06,0.856515,0.007149,0.000208,14,7,50.00
map00061,Fatty acid biosynthesis,0.274202,4.188068e-04,3.087144e-03,0.724174,0.017255,0.057814,10,5,50.00
...,...,...,...,...,...,...,...,...,...,...
map02060,Phosphotransferase system (PTS),0.552178,6.401330e-01,5.521778e-01,0.972210,1.000000,1.000000,25,3,12.00
map00472,D-Arginine and D-ornithine metabolism,0.147941,1.000000e+00,1.000000e+00,0.555773,1.000000,1.000000,5,3,60.00
map00904,Diterpenoid biosynthesis,1.000000,1.000000e+00,7.153208e-01,1.000000,1.000000,1.000000,39,1,2.56
map00365,Furfural degradation,1.000000,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,12,2,16.67


In [16]:
# Show the ORA pathways selected on `significant_column` against `threshold`
# — presumably p-value <= threshold; confirm in pals.evaluation.
# The meaning of the final None argument is not visible from this notebook.
_select_significant_entries(ora_df, significant_column, threshold, None)

Unnamed: 0,pw_name,p_value,unq_pw_F,tot_ds_F,F_coverage
map00930,Caprolactam degradation,5.4e-05,19,12,63.16
map00622,Xylene degradation,0.002714,24,10,41.67
map00460,Cyanoamino acid metabolism,0.007149,40,23,57.5
map00642,Ethylbenzene degradation,0.007149,14,7,50.0
map00061,Fatty acid biosynthesis,0.017255,10,5,50.0
map00643,Styrene degradation,0.017939,18,9,50.0
ingenza00007,IG-Amino-acid Biosynthesis 2,0.029064,6,5,83.33
map00350,Tyrosine metabolism,0.029064,53,11,20.75
map00360,Phenylalanine metabolism,0.029064,55,16,29.09


#### Run GSEA 

In [None]:
# Run the GSEA implementation with 1000 random sets and a progress bar.
# NOTE(review): this cell has no execution count in the saved notebook, i.e.
# it was not run in the recorded session.
gsea = GSEA(ds_plasma, random_sets=1000, pbar=True)

# Sort ascending by the comparison column so the smallest p-values come first.
# The non-mutating sort_values is used instead of inplace=True, so the frame
# handed back by get_pathway_df is left untouched.
gsea_df = gsea.get_pathway_df().sort_values(significant_column)

In [None]:
# Display the GSEA pathway table (already sorted by significant_column).
gsea_df

In [None]:
# Show the GSEA pathways selected on `significant_column` against `threshold`
# — presumably p-value <= threshold; confirm in pals.evaluation.
# The meaning of the final None argument is not visible from this notebook.
_select_significant_entries(gsea_df, significant_column, threshold, None)

#### Run GSEApy

In [44]:
# Run the GSEApy-backed implementation with 100000 random sets and a progress bar.
# NOTE(review): this rebinds `gsea` and `gsea_df`, the same names assigned in
# the "Run GSEA" section, so any earlier GSEA result is overwritten from here on.
gsea = GSEApy(ds_plasma, random_sets=100000, pbar=True)

# Sort ascending by the comparison column so the smallest p-values come first.
# The non-mutating sort_values is used instead of inplace=True, so the frame
# handed back by get_pathway_df is left untouched.
gsea_df = gsea.get_pathway_df().sort_values(significant_column)

2020-01-30 14:01:44.086 | DEBUG    | pals.GSEApy:get_pathway_df:36 - Calculating GSEA
2020-01-30 14:01:44.087 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:283 - Setting the zero intensity values in the dataframe
2020-01-30 14:01:44.193 | DEBUG    | pals.GSEApy:get_pathway_df:70 - Running comparison case=Stage2 control=Stage1
2020-01-30 14:01:44,194 Parsing data files for GSEA.............................
2020-01-30 14:01:44,247 0000 gene_sets have been filtered out when max_size=1000 and min_size=1
2020-01-30 14:01:44,248 0206 gene_sets used for further statistical testing.....
2020-01-30 14:01:44,249 Start to run GSEA...Might take a while..................
2020-01-30 14:01:49,347 Start to generate GSEApy reports and figures............
2020-01-30 14:01:49,375 Congratulations. GSEApy ran successfully.................

2020-01-30 14:01:49.379 | DEBUG    | pals.GSEApy:get_pathway_df:70 - Running comparison case=Stage1 control=Control
2020-01-30 14:01:49,381 Parsing data fil

In [45]:
# Display the GSEApy pathway table (already sorted by significant_column).
gsea_df

Unnamed: 0,pw_name,Stage2/Stage1 p-value,PiMP_KEGG Stage2/Stage1 comb_p,Stage2/Stage1 ES_score,Stage1/Control p-value,PiMP_KEGG Stage1/Control comb_p,Stage1/Control ES_score,Stage2/Control p-value,PiMP_KEGG Stage2/Control comb_p,Stage2/Control ES_score,unq_pw_F,tot_ds_F,F_coverage
map00902,Monoterpenoid biosynthesis,0.807692,0.955225,-0.370843,0.000000,0.000000,0.813284,0.798077,0.960332,-0.370843,19,5,26.32
map00903,Limonene and pinene degradation,0.461538,0.944106,-0.546529,0.000000,0.005569,0.850344,0.360000,1.000000,-0.546529,17,5,29.41
map00195,Photosynthesis,0.126214,1.000000,0.831826,0.098765,0.043626,0.860000,0.109890,0.869412,0.831826,10,2,20.00
map00140,Steroid hormone biosynthesis,0.640000,1.000000,-0.371704,0.009346,0.049427,0.778952,0.693069,0.996190,-0.371704,45,6,13.33
map05200,Pathways in cancer,0.491071,0.820281,0.510908,0.031250,0.088644,0.781559,0.508333,0.917480,0.510908,15,4,26.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map00290,"Valine, leucine and isoleucine biosynthesis",0.469565,1.000000,-0.451599,0.913978,1.000000,-0.296155,0.495327,0.995080,-0.451599,17,9,52.94
map00600,Sphingolipid metabolism,0.847059,0.944017,-0.372072,0.961538,1.000000,-0.301807,0.900000,0.968626,-0.372072,10,4,40.00
map04745,Phototransduction - fly,0.317308,0.731582,0.750589,0.933962,1.000000,-0.416364,0.309735,0.818028,0.750589,6,2,33.33
map00280,"Valine, leucine and isoleucine degradation",0.267857,0.999927,-0.503335,1.000000,1.000000,-0.193309,0.305556,1.000000,-0.503335,24,6,25.00


In [46]:
# Show the GSEApy pathways selected on `significant_column` against `threshold`
# — presumably p-value <= threshold; confirm in pals.evaluation.
# The meaning of the final None argument is not visible from this notebook.
_select_significant_entries(gsea_df, significant_column, threshold, None)

Unnamed: 0,pw_name,p_value,unq_pw_F,tot_ds_F,F_coverage
map00902,Monoterpenoid biosynthesis,0.0,19,5,26.32
map00903,Limonene and pinene degradation,0.005569,17,5,29.41
map00195,Photosynthesis,0.043626,10,2,20.0
map00140,Steroid hormone biosynthesis,0.049427,45,6,13.33
