In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS
from common import *

# Load data

Generate token by logging in to PiMP

In [5]:
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*

In [6]:
# Obtain the PiMP API token through the pimp_tools helper rather than hard-coding
# credentials in the notebook; the token is passed to the PiMP fetch calls in the
# data-loading cells.
token = get_pimp_API_token_from_env()

In [7]:
# PiMP analysis to work on; passed to the PiMP fetch helpers whenever a local
# cache file under test_data/ is missing.
analysis_id = 1321 # example beer analysis

In [8]:
# Local pickle cache of the MS1 intensity table, so that PiMP is only contacted
# when the cache file is missing.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: download the intensities for this analysis from PiMP.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists; without it to_pickle() raises and the
    # cell fails after the download, before the cache is written.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

# Preview the first rows (one row per row_id, one column per sample file).
int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Annotation database requested from PiMP on a cache miss. This name was never
# assigned in the visible notebook, so a missing cache file ended in a NameError.
# NOTE(review): 'kegg' is an assumed identifier (the cached table holds KEGG-style
# compound ids such as C00148) — confirm the value expected by get_formula_df.
database_name = 'kegg'

# Local pickle cache of the row_id -> entity_id annotation table.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # Cache miss: fetch the positive-polarity annotations for this analysis from PiMP.
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # Make sure the cache directory exists before writing, otherwise to_pickle() raises.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

# Preview the first rows of the annotation table.
formula_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [10]:
# Local pickle cache of the experimental design (a dict with 'comparisons' and
# 'groups', as shown in the cell output).
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load cache
    # files that were written by this notebook.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: fetch the design for this analysis from PiMP.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists before writing, otherwise open() raises.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

# Display the loaded design.
experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# PALS analysis using KEGG database exported from PiMP

In [11]:
# Combine the intensity table, the row_id -> entity_id annotations and the
# experimental design into a DataSource that uses the PiMP KEGG export
# (the log output shows PiMP_KEGG.json.zip being loaded).
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_PIMP_KEGG)

2019-11-05 15:23:08.548 | DEBUG    | feature_extraction:__init__:33 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip


In [12]:
# Run PALS on the data source and collect the per-pathway results table.
# NOTE(review): min_replace=5000 is presumably the value used when replacing zero
# intensities (the log mentions "Setting the zero intensity values") — confirm in
# pathway_analysis.
# NOTE(review): the p-values are calculated with resampling (see the log output) and
# no random seed is set in the visible cells, so they may differ between runs.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

2019-11-05 15:23:10.528 | DEBUG    | pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-05 15:23:10.563 | DEBUG    | pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-05 15:23:10.567 | DEBUG    | pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-05 15:23:10.568 | DEBUG    | pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-05 15:23:11.651 | INFO     | pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-05 15:23:11.652 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:83 - Comparison beer1/beer2
2019-11-05 15:23:11.653 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-05 15:23:11.746 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - R

In [13]:
# Display the pathway results obtained with the PiMP KEGG database.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG beer1/beer2 comb_p,PiMP_KEGG beer3/beer4 comb_p
map00903,Limonene and pinene degradation,1.000000,0.036342,17,7,41.18,2.999476e-02,3.79,22.29,1.000000,0.016634
map00040,Pentose and glucuronate interconversions,0.142520,1.000000,23,14,60.87,2.439305e-05,5.13,22.30,0.032533,1.000000
map00590,Arachidonic acid metabolism,0.143525,0.259994,19,10,52.63,1.416461e-03,4.23,22.26,0.051613,0.111928
map00061,Fatty acid biosynthesis,0.173073,0.386406,10,7,70.00,5.053093e-04,2.23,22.30,0.058378,0.176747
map00943,Isoflavonoid biosynthesis,0.134400,0.048090,31,4,12.90,8.726532e-01,6.91,22.29,0.194625,0.079569
...,...,...,...,...,...,...,...,...,...,...,...
map02010,ABC transporters,0.109829,0.054413,79,39,49.37,3.120709e-08,17.61,22.29,0.011758,0.004223
map00440,Phosphonate and phosphinate metabolism,0.608104,1.000000,44,4,9.09,9.836618e-01,9.81,22.30,0.754262,1.000000
map05146,Amoebiasis,0.038655,0.050819,8,2,25.00,3.236155e-01,1.78,22.25,0.034231,0.045045
map00983,Drug metabolism - other enzymes,0.415817,0.452513,38,3,7.89,9.850392e-01,8.47,22.29,0.586018,0.621261


In [14]:
# Save the PiMP KEGG pathway table as CSV under test_data/ in the current
# working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_pimp_kegg.csv')
pathway_df.to_csv(output)

# PALS analysis using KEGG database exported from Reactome

In [15]:
# Rebuild the data source on the same inputs, this time against the Reactome
# KEGG-compound export for Homo sapiens, restricted to metabolic pathways
# (the log output shows "Homo sapiens.json.zip" being loaded from the package data).
# Note that this rebinds `ds`, replacing the PiMP KEGG data source.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_REACTOME_KEGG, 
                reactome_species=REACTOME_SPECIES_HOMO_SAPIENS, reactome_metabolic_pathway_only=True)

2019-11-05 15:23:23.472 | DEBUG    | feature_extraction:__init__:41 - Loading ../pals\data\reactome\metabolic_pathways\COMPOUND\Homo sapiens.json.zip


In [16]:
# Run PALS with the same min_replace setting on the Reactome-backed data source.
# This rebinds `pals` and `pathway_df`, so the previous pathway table is no longer
# available in the kernel after this cell.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

2019-11-05 15:23:25.168 | DEBUG    | pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-05 15:23:25.199 | DEBUG    | pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-05 15:23:25.205 | DEBUG    | pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-05 15:23:25.206 | DEBUG    | pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-05 15:23:25.824 | INFO     | pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-05 15:23:25.825 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:83 - Comparison beer1/beer2
2019-11-05 15:23:25.825 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-05 15:23:25.931 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - R

In [17]:
# Display the pathway results obtained with the exported Reactome data.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-1614558,Degradation of cysteine and homocysteine,0.095154,0.130615,14,3,21.43,0.806371,4.73,33.79,0.132496,0.175639
R-HSA-70614,Amino acid synthesis and interconversion (tran...,0.056865,0.035311,33,13,39.39,0.219808,11.15,33.79,0.044331,0.027155
R-HSA-434316,Fatty Acids bound to GPR40 (FFAR1) regulate in...,0.905603,0.230810,6,2,33.33,0.439733,2.03,33.83,0.895966,0.226138
R-HSA-389599,Alpha-oxidation of phytanate,0.929793,0.131441,13,2,15.38,0.905116,4.39,33.77,0.955696,0.200258
R-HSA-141334,PAOs oxidise polyamines to amines,1.000000,0.929363,7,1,14.29,0.815076,2.36,33.71,1.000000,0.947208
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-71336,Pentose phosphate pathway,0.048790,0.124670,17,3,17.65,0.910559,5.74,33.76,0.086772,0.193255
R-HSA-2022928,HS-GAG biosynthesis,0.684306,0.101420,7,1,14.29,0.815076,2.36,33.71,0.740966,0.141658
R-HSA-947581,Molybdenum cofactor biosynthesis,0.048064,0.080705,12,2,16.67,0.872928,4.05,33.75,0.079570,0.125172
R-HSA-1483152,Hydrolysis of LPE,0.182512,1.000000,2,1,50.00,0.264649,0.68,34.00,0.155861,1.000000


In [18]:
# Save the Reactome (exported data) pathway table as CSV under test_data/ in the
# current working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_reactome_kegg.csv')
pathway_df.to_csv(output)

# PALS analysis by connecting to local Reactome

In [19]:
# Same configuration as the exported-Reactome data source, but with
# reactome_query=True: the log output shows the data being retrieved from Reactome
# instead of being loaded from the packaged zip file.
# NOTE(review): presumably requires a reachable Reactome instance (the section
# heading says "local Reactome") — confirm the connection settings.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_REACTOME_KEGG, 
                reactome_species=REACTOME_SPECIES_HOMO_SAPIENS, reactome_metabolic_pathway_only=True, reactome_query=True)

2019-11-05 15:23:55.201 | DEBUG    | feature_extraction:__init__:45 - Retrieving data for Homo sapiens from Reactome COMPOUND


In [20]:
# Run PALS on the queried Reactome data source.
# NOTE(review): in the recorded outputs the per-pathway counts and `sf` values match
# the exported-Reactome run, while the p-values differ slightly (e.g. 0.095154 vs
# 0.087371 for R-HSA-1614558); this looks like run-to-run variation from the
# resampling step, since no seed is set in the visible cells.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

2019-11-05 15:24:03.752 | DEBUG    | pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-05 15:24:03.779 | DEBUG    | pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-05 15:24:03.784 | DEBUG    | pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-05 15:24:03.785 | DEBUG    | pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-05 15:24:04.403 | INFO     | pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-05 15:24:04.403 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:83 - Comparison beer1/beer2
2019-11-05 15:24:04.404 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-05 15:24:04.498 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - R

In [21]:
# Display the pathway results obtained by querying Reactome.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-1614558,Degradation of cysteine and homocysteine,0.087371,0.129187,14,3,21.43,0.806371,4.73,33.79,0.122779,0.173934
R-HSA-70614,Amino acid synthesis and interconversion (tran...,0.051624,0.033859,33,13,39.39,0.219808,11.15,33.79,0.040125,0.026010
R-HSA-434316,Fatty Acids bound to GPR40 (FFAR1) regulate in...,0.895040,0.231183,6,2,33.33,0.439733,2.03,33.83,0.884871,0.226500
R-HSA-389599,Alpha-oxidation of phytanate,0.928261,0.125975,13,2,15.38,0.905116,4.39,33.77,0.954649,0.193185
R-HSA-141334,PAOs oxidise polyamines to amines,1.000000,0.917867,7,1,14.29,0.815076,2.36,33.71,1.000000,0.938186
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-71336,Pentose phosphate pathway,0.045889,0.119614,17,3,17.65,0.910559,5.74,33.76,0.082305,0.186615
R-HSA-2022928,HS-GAG biosynthesis,0.665729,0.097737,7,1,14.29,0.815076,2.36,33.71,0.724314,0.137077
R-HSA-947581,Molybdenum cofactor biosynthesis,0.045176,0.078174,12,2,16.67,0.872928,4.05,33.75,0.075356,0.121747
R-HSA-1483152,Hydrolysis of LPE,0.170216,1.000000,2,1,50.00,0.264649,0.68,34.00,0.144989,1.000000


In [None]:
# Save the Reactome (queried) pathway table as CSV under test_data/ in the current
# working directory.
# NOTE(review): this cell has no execution count in the saved notebook, so the CSV
# may not have been written during the recorded run.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_reactome_query_kegg.csv')
pathway_df.to_csv(output)