In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

# Add the parent directory to the import search path.
# NOTE(review): presumably so the `pals` package imported below resolves when
# the notebook is run from its own folder — confirm the repository layout.
sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_annotation_df, get_experimental_design
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *

2019-11-29 17:05:23.898 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Beer Analysis

### Load data

Generate a token by logging in to PiMP

In [5]:
# Optional alternative: request a token with explicit PiMP credentials.
# NOTE(review): get_authentication_token is not explicitly imported above (it may
# only be available through the star import) — confirm before uncommenting.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*

In [6]:
# Fetch the PiMP API token through the project helper; it is passed to every
# PiMP request made further down.
token = get_pimp_API_token_from_env()

In [7]:
analysis_id = 1321  # PiMP analysis id of the example beer analysis; passed to every get_* call below

In [8]:
# Load the MS1 intensity table from a local pickle cache, falling back to a
# PiMP download (and writing the cache) when the file is not there yet.
# NOTE: unpickling can execute arbitrary code — only load cache files that
# were produced by this notebook.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'beer', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: fetch the intensities from PiMP.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # to_pickle does not create missing parent folders, so make sure the cache
    # directory exists; otherwise a fresh checkout fails right after the download.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Load the peak annotation table from a local pickle cache, falling back to a
# PiMP download (and writing the cache) when the file is not there yet.
# NOTE: unpickling can execute arbitrary code — only load cache files that
# were produced by this notebook.
annotation_df_filename = os.path.join(os.getcwd(), 'test_data', 'beer', 'annotation_df.p')
try:
    annotation_df = pd.read_pickle(annotation_df_filename)
except FileNotFoundError:
    # Cache miss: fetch the annotations from PiMP.
    annotation_df = get_annotation_df(token, PIMP_HOST, analysis_id)
    # to_pickle does not create missing parent folders, so make sure the cache
    # directory exists; otherwise a fresh checkout fails right after the download.
    os.makedirs(os.path.dirname(annotation_df_filename), exist_ok=True)
    annotation_df.to_pickle(annotation_df_filename)

annotation_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [10]:
# Load the experimental design (comparisons and sample groups) from a local
# pickle cache, falling back to a PiMP download when the file is not there yet.
# NOTE: pickle.load can execute arbitrary code — only load cache files that
# were produced by this notebook.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'beer', 'experimental_design.p')
try:
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: fetch the design from PiMP.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # open(..., 'wb') does not create missing parent folders, so make sure the
    # cache directory exists before writing.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

### PALS analysis using KEGG database exported from PiMP

In [11]:
# Bundle the intensities, annotations and experimental design into a DataSource
# that uses the PiMP-exported KEGG pathway database.
ds = DataSource(int_df, annotation_df, experimental_design, DATABASE_PIMP_KEGG)

2019-11-29 17:05:24.901 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-11-29 17:05:24.924 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-11-29 17:05:24.929 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-11-29 17:05:25.879 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [12]:
# Run PALS on the data source and collect the per-pathway result table.
# NOTE(review): min_replace presumably sets the value substituted for zero
# intensities, and plage_weight / hg_weight presumably weight the two p-value
# components when they are combined — confirm against pals.pathway_analysis.
pals = PALS(ds, min_replace=5000, plage_weight=5, hg_weight=1)
pathway_df = pals.get_pathway_df()

2019-11-29 17:05:25.949 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:323 - Setting the zero intensity values in the dataframe
2019-11-29 17:05:25.976 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:304 - Scaling the data across the sample: zero mean and unit variance
2019-11-29 17:05:25.981 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:143 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-29 17:05:25.982 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:144 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-29 17:05:26.688 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:155 - Calculating plage p-values with resampling
2019-11-29 17:05:26.689 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:159 - Comparison beer1/beer2
2019-11-29 17:05:26.690 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:167 - Resampling 0/1000
2019-11-29 17:05:26.781 | DEBUG    | pals.pathw

In [13]:
# Rank pathways by the combined p-value of the beer1/beer2 comparison (smallest first).
# Rebind the name rather than sorting in place, so the frame object returned by
# PALS is left untouched.
pathway_df = pathway_df.sort_values('PiMP_KEGG beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG beer1/beer2 comb_p,PiMP_KEGG beer3/beer4 comb_p
map00380,Tryptophan metabolism,0.024702,0.051431,64,34,53.12,2.024112e-08,14.26,22.28,0.001335,0.003727
map00330,Arginine and proline metabolism,0.073835,0.032060,79,50,63.29,7.494293e-16,17.61,22.29,0.001422,0.000363
map00460,Cyanoamino acid metabolism,0.030866,0.042776,40,25,62.50,1.448042e-08,8.92,22.30,0.001751,0.002769
map00300,Lysine biosynthesis,0.035455,0.033144,27,20,74.07,3.670262e-09,6.02,22.30,0.001836,0.001668
map00400,"Phenylalanine, tyrosine and tryptophan biosynt...",0.048327,0.032574,30,22,73.33,9.098577e-10,6.69,22.30,0.002490,0.001405
...,...,...,...,...,...,...,...,...,...,...,...
map00942,Anthocyanin biosynthesis,1.000000,0.983773,50,1,2.00,9.999636e-01,11.14,22.28,1.000000,0.997981
map00232,Caffeine metabolism,1.000000,0.420969,15,4,26.67,2.744668e-01,3.34,22.27,1.000000,0.377108
map00254,Aflatoxin biosynthesis,1.000000,1.000000,24,3,12.50,8.418537e-01,5.35,22.29,1.000000,1.000000
map05032,Morphine addiction,1.000000,0.071224,8,3,37.50,1.185365e-01,1.78,22.25,1.000000,0.047447


### PALS analysis using KEGG database exported from Reactome

In [14]:
# Build the DataSource against the Reactome-exported KEGG compound database,
# limited to Homo sapiens metabolic pathways.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
)

2019-11-29 17:05:29.595 | DEBUG    | pals.feature_extraction:__init__:64 - Loading ..\pals\data\reactome\metabolic_pathways\COMPOUND\Homo sapiens.json.zip
2019-11-29 17:05:29.612 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-11-29 17:05:29.613 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-11-29 17:05:30.640 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [15]:
# Run PALS on the Reactome-export data source and collect the per-pathway result table.
# NOTE(review): min_replace presumably sets the value substituted for zero
# intensities, and plage_weight / hg_weight presumably weight the two p-value
# components when they are combined — confirm against pals.pathway_analysis.
pals = PALS(ds, min_replace=5000, plage_weight=5, hg_weight=1)
pathway_df = pals.get_pathway_df()

2019-11-29 17:05:30.710 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:323 - Setting the zero intensity values in the dataframe
2019-11-29 17:05:30.737 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:304 - Scaling the data across the sample: zero mean and unit variance
2019-11-29 17:05:30.743 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:143 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-29 17:05:30.744 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:144 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-29 17:05:31.203 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:155 - Calculating plage p-values with resampling
2019-11-29 17:05:31.204 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:159 - Comparison beer1/beer2
2019-11-29 17:05:31.204 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:167 - Resampling 0/1000
2019-11-29 17:05:31.295 | DEBUG    | pals.pathw

In [16]:
# Rank pathways by the combined p-value of the beer1/beer2 comparison (smallest first).
# Rebind the name rather than sorting in place, so the frame object returned by
# PALS is left untouched.
pathway_df = pathway_df.sort_values('COMPOUND beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-71240,Tryptophan catabolism,0.017452,0.049908,27,14,51.85,0.020106,9.12,33.78,0.006740,0.021893
R-HSA-2024096,HS-GAG degradation,0.012120,0.070162,5,1,20.00,0.659417,1.69,33.80,0.016629,0.086069
R-HSA-163685,Integration of energy metabolism,0.033284,1.000000,1,1,100.00,0.113509,0.34,34.00,0.020882,1.000000
R-HSA-351143,Agmatine biosynthesis,0.024487,0.104779,5,2,40.00,0.327631,1.69,33.80,0.021788,0.093756
R-HSA-71182,Phenylalanine and tyrosine catabolism,0.052569,0.119476,24,13,54.17,0.015294,8.11,33.79,0.022055,0.057188
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-209931,Serotonin and melatonin biosynthesis,1.000000,0.217924,14,4,28.57,0.613379,4.73,33.79,1.000000,0.239602
R-HSA-141334,PAOs oxidise polyamines to amines,1.000000,0.917541,7,1,14.29,0.815076,2.36,33.71,1.000000,0.937929
R-HSA-71384,Ethanol oxidation,1.000000,0.084607,13,1,7.69,0.976586,4.39,33.77,1.000000,0.168984
R-HSA-1855183,"Synthesis of IP2, IP, and Ins in the cytosol",1.000000,0.057384,11,3,27.27,0.622386,3.72,33.82,1.000000,0.068725


### PALS analysis of compounds by connecting to Reactome

In [17]:
# Build the DataSource for Homo sapiens metabolic pathways with
# reactome_query=True, i.e. pathway data is requested from Reactome directly.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
    reactome_query=True,
)

2019-11-29 17:05:33.840 | DEBUG    | pals.feature_extraction:__init__:46 - Retrieving data for Homo sapiens from Reactome COMPOUND metabolic_pathway_only=True
2019-11-29 17:05:34.898 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-11-29 17:05:34.899 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-11-29 17:05:35.869 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [18]:
# Run PALS on the Reactome-query data source and collect the per-pathway result table.
# NOTE(review): min_replace presumably sets the value substituted for zero
# intensities, and plage_weight / hg_weight presumably weight the two p-value
# components when they are combined — confirm against pals.pathway_analysis.
pals = PALS(ds, min_replace=5000, plage_weight=5, hg_weight=1)
pathway_df = pals.get_pathway_df()

2019-11-29 17:05:35.936 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:323 - Setting the zero intensity values in the dataframe
2019-11-29 17:05:35.960 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:304 - Scaling the data across the sample: zero mean and unit variance
2019-11-29 17:05:35.966 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:143 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-29 17:05:35.967 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:144 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-29 17:05:36.365 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:155 - Calculating plage p-values with resampling
2019-11-29 17:05:36.366 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:159 - Comparison beer1/beer2
2019-11-29 17:05:36.367 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:167 - Resampling 0/1000
2019-11-29 17:05:36.459 | DEBUG    | pals.pathw

In [19]:
# Rank pathways by the combined p-value of the beer1/beer2 comparison (smallest first).
# Rebind the name rather than sorting in place, so the frame object returned by
# PALS is left untouched.
pathway_df = pathway_df.sort_values('COMPOUND beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-71240,Tryptophan catabolism,0.019701,0.050734,27,14,51.85,0.020106,9.12,33.78,0.007711,0.022305
R-HSA-2024096,HS-GAG degradation,0.014193,0.066488,5,1,20.00,0.659417,1.69,33.80,0.019283,0.081853
R-HSA-163685,Integration of energy metabolism,0.037035,1.000000,1,1,100.00,0.113509,0.34,34.00,0.023383,1.000000
R-HSA-71182,Phenylalanine and tyrosine catabolism,0.057817,0.114914,24,13,54.17,0.015294,8.11,33.79,0.024598,0.054626
R-HSA-351143,Agmatine biosynthesis,0.027829,0.100407,5,2,40.00,0.327631,1.69,33.80,0.024759,0.089806
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-209931,Serotonin and melatonin biosynthesis,1.000000,0.211325,14,4,28.57,0.613379,4.73,33.79,1.000000,0.232771
R-HSA-141334,PAOs oxidise polyamines to amines,1.000000,0.920488,7,1,14.29,0.815076,2.36,33.71,1.000000,0.940250
R-HSA-71384,Ethanol oxidation,1.000000,0.080594,13,1,7.69,0.976586,4.39,33.77,1.000000,0.162551
R-HSA-1855183,"Synthesis of IP2, IP, and Ins in the cytosol",1.000000,0.054077,11,3,27.27,0.622386,3.72,33.82,1.000000,0.064986


### ORA Analysis

In [25]:
# Rebuild the DataSource on the PiMP-exported KEGG database for the ORA run below.
ds = DataSource(int_df, annotation_df, experimental_design, DATABASE_PIMP_KEGG)

2019-11-29 17:06:21.019 | DEBUG    | pals.feature_extraction:__init__:40 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-11-29 17:06:21.041 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-11-29 17:06:21.046 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-11-29 17:06:21.991 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [26]:
# Run the ORA variant of the analysis (get_ora_df) and collect its result table.
# NOTE(review): min_replace presumably sets the value substituted for zero
# intensities — confirm against pals.pathway_analysis.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_ora_df()

2019-11-29 17:06:22.813 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:322 - Setting the zero intensity values in the dataframe
2019-11-29 17:06:23.676 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:408 - Calculating dataset formula coverage


In [27]:
# Rank pathways by the beer1/beer2 comb_p column of the ORA result (smallest first).
# Rebind the name rather than sorting in place, so the frame object returned by
# PALS is left untouched.
pathway_df = pathway_df.sort_values('PiMP_KEGG beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,PiMP_KEGG beer1/beer2 comb_p,beer3/beer4 p-value,PiMP_KEGG beer3/beer4 comb_p,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
map07110,Benzoic acid family,0.000000,0.000000,0.000000e+00,0.000000e+00,2,2,100.00
map02030,Bacterial chemotaxis,0.000000,0.000000,0.000000e+00,0.000000e+00,5,5,100.00
map00365,Furfural degradation,0.000000,0.000000,1.436336e-07,1.436336e-07,12,9,75.00
map00400,"Phenylalanine, tyrosine and tryptophan biosynt...",0.000000,0.000000,3.460140e-21,3.460140e-21,30,22,73.33
map00473,D-Alanine metabolism,0.000000,0.000000,0.000000e+00,0.000000e+00,3,3,100.00
...,...,...,...,...,...,...,...,...
map00906,Carotenoid biosynthesis,0.998952,0.998952,9.999471e-01,9.999471e-01,66,8,12.12
map00943,Isoflavonoid biosynthesis,0.999702,0.999702,9.997023e-01,9.997023e-01,31,4,12.90
map00522,"Biosynthesis of 12-, 14- and 16-membered macro...",0.999947,0.999947,9.999471e-01,9.999471e-01,66,3,4.55
map00942,Anthocyanin biosynthesis,0.999964,0.999964,9.999636e-01,9.999636e-01,50,1,2.00
