In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_annotation_df, get_experimental_design
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *

2019-11-25 13:55:31.852 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Beer Analysis

### Load data

Generate a token by logging in to PiMP (uncomment the cell below and fill in your credentials).

In [5]:
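# NOTE: `get_authentication_token` is not imported in this notebook; import it
# (presumably from pals.pimp_tools -- verify) before uncommenting this cell.
# Avoid committing a real password here; prefer getpass.getpass() or an environment variable.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)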

Alternatively, assume the token is stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Obtain the PiMP API token through the helper imported from pals.pimp_tools.
# Per this notebook's text, the token is expected in the PIMP_API_TOKEN environment variable.
token = get_pimp_API_token_from_env()

In [7]:
# Identifier of the analysis to fetch; it is passed to the get_* helpers
# (intensities, annotations, experimental design) in the cells below.
analysis_id = 1321 # example beer analysis

In [8]:
# Load the MS1 intensity table, using a local pickle file as a cache.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'beer', 'int_df.p')
try:
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: fetch the intensities for `analysis_id` from PIMP_HOST.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists before writing; otherwise the write
    # fails after the fetch has already been done.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

# Preview the first rows only (last expression => rich display).
int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Load the annotation table, using a local pickle file as a cache.
annotation_df_filename = os.path.join(os.getcwd(), 'test_data', 'beer', 'annotation_df.p')
try:
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    annotation_df = pd.read_pickle(annotation_df_filename)
except FileNotFoundError:
    # Cache miss: fetch the annotations for `analysis_id` from PIMP_HOST.
    annotation_df = get_annotation_df(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists before writing; otherwise the write
    # fails after the fetch has already been done.
    os.makedirs(os.path.dirname(annotation_df_filename), exist_ok=True)
    annotation_df.to_pickle(annotation_df_filename)

# Preview the first rows only (last expression => rich display).
annotation_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [10]:
# Load the experimental design (groups and comparisons), using a local pickle file as a cache.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'beer', 'experimental_design.p')
try:
    # NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: fetch the design for `analysis_id` from PIMP_HOST.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists before writing; otherwise open(..., 'wb')
    # fails after the fetch has already been done.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

# Display the loaded design (last expression => rich display).
experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

### PALS analysis using KEGG database exported from PiMP

In [11]:
# Build the PALS data source from the three inputs loaded above,
# selecting the KEGG database exported from PiMP (see the section heading).
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_PIMP_KEGG,
)

2019-11-25 13:55:32.781 | DEBUG    | pals.feature_extraction:__init__:34 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-11-25 13:55:32.803 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-25 13:55:32.807 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-25 13:55:33.713 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [12]:
# Run PALS on the current data source and collect the per-pathway results table.
# NOTE(review): this cell's log output reports p-values obtained by resampling, and no
# random seed is set anywhere in this notebook -- results may differ between runs;
# confirm and seed if reproducibility matters.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-11-25 13:55:33.780 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:250 - Setting the zero intensity values in the dataframe
2019-11-25 13:55:33.806 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:230 - Scaling the data across the sample: zero mean and unit variance
2019-11-25 13:55:33.812 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:69 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-25 13:55:33.813 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:70 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-25 13:55:34.504 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:81 - Calculating plage p-values with resampling
2019-11-25 13:55:34.505 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:85 - Comparison beer1/beer2
2019-11-25 13:55:34.506 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:93 - Resampling 0/1000
2019-11-25 13:55:34.596 | DEBUG    | pals.pathway_an

In [13]:
# Order pathways by the 'PiMP_KEGG beer1/beer2 comb_p' column, smallest values first.
# Re-bind the name instead of sorting in place, so the frame returned by
# get_pathway_df() is never mutated behind the back of anything still referencing it.
pathway_df = pathway_df.sort_values('PiMP_KEGG beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG beer1/beer2 comb_p,PiMP_KEGG beer3/beer4 comb_p
map00380,Tryptophan metabolism,0.031331,0.047160,64,34,53.12,2.024112e-08,14.26,22.28,0.001855,0.003294
map00330,Arginine and proline metabolism,0.093292,0.028817,79,50,63.29,7.494293e-16,17.61,22.29,0.002121,0.000306
map00300,Lysine biosynthesis,0.044239,0.031231,27,20,74.07,3.670262e-09,6.02,22.30,0.002520,0.001533
map00460,Cyanoamino acid metabolism,0.043567,0.038918,40,25,62.50,1.448042e-08,8.92,22.30,0.002842,0.002423
map00400,"Phenylalanine, tyrosine and tryptophan biosynt...",0.064449,0.029298,30,22,73.33,9.098577e-10,6.69,22.30,0.003812,0.001208
...,...,...,...,...,...,...,...,...,...,...,...
map00903,Limonene and pinene degradation,1.000000,0.034551,17,7,41.18,2.999476e-02,3.79,22.29,1.000000,0.015726
map00941,Flavonoid biosynthesis,1.000000,0.029627,38,13,34.21,3.612196e-02,8.47,22.29,1.000000,0.013825
map01040,Biosynthesis of unsaturated fatty acids,1.000000,0.065115,42,4,9.52,9.770476e-01,9.36,22.29,1.000000,0.137349
map04971,Gastric acid secretion,1.000000,0.201626,13,1,7.69,8.537256e-01,2.90,22.31,1.000000,0.269881


In [14]:
# Write the pathway table of the PiMP-KEGG section to CSV under test_data/beer.
output = os.path.join(
    os.getcwd(), 'test_data', 'beer', 'pathway_df_pimp_kegg.csv'
)
pathway_df.to_csv(path_or_buf=output)

### PALS analysis using KEGG database exported from Reactome

In [15]:
# Rebuild the data source against the Reactome KEGG database, restricted to
# Homo sapiens and to metabolic pathways. Per this cell's log output, the pathway
# data is loaded from a bundled JSON zip file.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
)

2019-11-25 13:55:37.255 | DEBUG    | pals.feature_extraction:__init__:57 - Loading ..\pals\data\reactome\metabolic_pathways\COMPOUND\Homo sapiens.json.zip
2019-11-25 13:55:37.274 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-25 13:55:37.275 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-25 13:55:38.191 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [16]:
# Run PALS on the Reactome-backed data source and collect the per-pathway results table.
# NOTE(review): this cell's log output reports p-values obtained by resampling, and no
# random seed is set anywhere in this notebook -- results may differ between runs;
# confirm and seed if reproducibility matters.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-11-25 13:55:38.258 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:250 - Setting the zero intensity values in the dataframe
2019-11-25 13:55:38.283 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:230 - Scaling the data across the sample: zero mean and unit variance
2019-11-25 13:55:38.288 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:69 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-25 13:55:38.289 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:70 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-25 13:55:38.677 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:81 - Calculating plage p-values with resampling
2019-11-25 13:55:38.677 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:85 - Comparison beer1/beer2
2019-11-25 13:55:38.679 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:93 - Resampling 0/1000
2019-11-25 13:55:38.771 | DEBUG    | pals.pathway_an

In [17]:
# Order pathways by the 'COMPOUND beer1/beer2 comb_p' column, smallest values first.
# Re-bind the name instead of sorting in place, so the frame returned by
# get_pathway_df() is never mutated behind the back of anything still referencing it.
pathway_df = pathway_df.sort_values('COMPOUND beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-71240,Tryptophan catabolism,0.017412,0.057719,27,14,51.85,0.020106,9.12,33.78,0.006722,0.025828
R-HSA-2024096,HS-GAG degradation,0.012464,0.067119,5,1,20.00,0.659417,1.69,33.80,0.017071,0.082578
R-HSA-163685,Integration of energy metabolism,0.033601,1.000000,1,1,100.00,0.113509,0.34,34.00,0.021092,1.000000
R-HSA-351143,Agmatine biosynthesis,0.025101,0.102275,5,2,40.00,0.327631,1.69,33.80,0.022334,0.091493
R-HSA-71182,Phenylalanine and tyrosine catabolism,0.053678,0.117380,24,13,54.17,0.015294,8.11,33.79,0.022589,0.056009
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-170822,Regulation of Glucokinase by Glucokinase Regul...,1.000000,0.057700,2,2,100.00,0.037938,0.68,34.00,1.000000,0.029250
R-HSA-2142700,Synthesis of Lipoxins (LX),1.000000,0.377872,10,1,10.00,0.931926,3.38,33.80,1.000000,0.494908
R-HSA-1855183,"Synthesis of IP2, IP, and Ins in the cytosol",1.000000,0.054336,11,3,27.27,0.622386,3.72,33.82,1.000000,0.065280
R-HSA-351200,Interconversion of polyamines,1.000000,0.928706,9,1,11.11,0.904169,3.04,33.78,1.000000,0.954850


In [18]:
# Write the pathway table of the offline Reactome section to CSV under test_data/beer.
output = os.path.join(
    os.getcwd(), 'test_data', 'beer', 'pathway_df_reactome_kegg.csv'
)
pathway_df.to_csv(path_or_buf=output)

### PALS analysis of compounds by connecting to Reactome

In [19]:
# Same Reactome configuration as the previous section, but with reactome_query=True.
# Per this cell's log output, the pathway data is then retrieved from Reactome rather
# than loaded from a local file.
# NOTE(review): presumably this needs the Neo4j instance mentioned in the import log
# (bolt://localhost:7687) to be reachable -- verify.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
    reactome_query=True,
)

2019-11-25 13:55:41.381 | DEBUG    | pals.feature_extraction:__init__:40 - Retrieving data for Homo sapiens from Reactome COMPOUND metabolic_pathway_only=True
2019-11-25 13:55:41.898 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-25 13:55:41.899 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-25 13:55:42.820 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [20]:
# Run PALS on the Reactome-query data source and collect the per-pathway results table.
# NOTE(review): this cell's log output reports p-values obtained by resampling, and no
# random seed is set anywhere in this notebook -- results may differ between runs;
# confirm and seed if reproducibility matters.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-11-25 13:55:42.895 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:250 - Setting the zero intensity values in the dataframe
2019-11-25 13:55:42.920 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:230 - Scaling the data across the sample: zero mean and unit variance
2019-11-25 13:55:42.925 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:69 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-25 13:55:42.926 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:70 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-25 13:55:43.314 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:81 - Calculating plage p-values with resampling
2019-11-25 13:55:43.314 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:85 - Comparison beer1/beer2
2019-11-25 13:55:43.315 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:93 - Resampling 0/1000
2019-11-25 13:55:43.407 | DEBUG    | pals.pathway_an

In [21]:
# Order pathways by the 'COMPOUND beer1/beer2 comb_p' column, smallest values first.
# Re-bind the name instead of sorting in place, so the frame returned by
# get_pathway_df() is never mutated behind the back of anything still referencing it.
pathway_df = pathway_df.sort_values('COMPOUND beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-71240,Tryptophan catabolism,0.024180,0.055532,27,14,51.85,0.020106,9.12,33.78,0.009689,0.024717
R-HSA-2024096,HS-GAG degradation,0.019852,0.072908,5,1,20.00,0.659417,1.69,33.80,0.026416,0.089208
R-HSA-163685,Integration of energy metabolism,0.044067,1.000000,1,1,100.00,0.113509,0.34,34.00,0.028123,1.000000
R-HSA-71182,Phenylalanine and tyrosine catabolism,0.071307,0.123172,24,13,54.17,0.015294,8.11,33.79,0.031320,0.059280
R-HSA-351143,Agmatine biosynthesis,0.036666,0.108234,5,2,40.00,0.327631,1.69,33.80,0.032625,0.096881
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-1989781,PPARA activates gene expression,1.000000,0.325523,2,1,50.00,0.264649,0.68,34.00,1.000000,0.285392
R-HSA-351200,Interconversion of polyamines,1.000000,0.918032,9,1,11.11,0.904169,3.04,33.78,1.000000,0.947490
R-HSA-6806664,Metabolism of vitamin K,1.000000,0.087676,2,1,50.00,0.264649,0.68,34.00,1.000000,0.073215
R-HSA-75876,Synthesis of very long-chain fatty acyl-CoAs,1.000000,0.092694,13,2,15.38,0.905116,4.39,33.77,1.000000,0.148829


In [22]:
# Write the pathway table of the Reactome-query section to CSV under test_data/beer.
output = os.path.join(
    os.getcwd(), 'test_data', 'beer', 'pathway_df_reactome_query_kegg.csv'
)
pathway_df.to_csv(path_or_buf=output)