In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS
from common import *

# Load data

Generate token by logging in to PiMP

In [5]:
# Optional alternative to reading the token from the environment: log in to PiMP
# with a username and password to obtain an API token. Replace the placeholder
# values before uncommenting, and do not commit real credentials.
# NOTE(review): get_authentication_token is not among the names imported explicitly
# from pimp_tools in the import cell — confirm where it lives and import it first.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*

In [6]:
# Obtain the PiMP API token via the environment-based helper; the token is passed
# to every PiMP fetch call below.
token = get_pimp_API_token_from_env()

In [7]:
# Identifier of the PiMP analysis to pull data from; it is passed to the
# get_ms1_intensities / get_formula_df / get_experimental_design calls below.
analysis_id = 1321 # example beer analysis

In [8]:
# Load the MS1 intensity table, using a local pickle as a cache so that PiMP is
# only queried when the cached file is missing.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that were
    # produced by this notebook or come from a trusted source.
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: fetch from PiMP, then store the result for subsequent runs.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Create the cache directory first; without it to_pickle fails on a fresh
    # checkout and the freshly fetched data is never cached.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Load the table mapping peak row ids to entity ids (see the `entity_id` column in
# the output), using a local pickle as a cache so that PiMP is only queried when
# the cached file is missing.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that were
    # produced by this notebook or come from a trusted source.
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # NOTE(review): `database_name` is not assigned in any visible cell of this
    # notebook, so this branch raises NameError on a cache miss unless the name is
    # provided by `from common import *`. Define it explicitly before relying on
    # this fallback — TODO confirm the intended value.
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # Create the cache directory first; without it to_pickle fails on a fresh
    # checkout and the freshly fetched data is never cached.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [10]:
# Load the experimental design (the recorded output shows a dict with 'comparisons'
# and 'groups' keys), using a local pickle as a cache so that PiMP is only queried
# when the cached file is missing.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that were
    # produced by this notebook or come from a trusted source.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: fetch from PiMP, then store the result for subsequent runs.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Create the cache directory first; without it the open(..., 'wb') below fails
    # on a fresh checkout and the fetched design is never cached.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# PALS analysis using KEGG database exported from PiMP

In [11]:
# Bundle the intensities, the peak-to-entity annotations and the experimental design
# into a DataSource backed by the KEGG database exported from PiMP.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_PIMP_KEGG)

2019-11-01 10:41:33.418 | DEBUG    | feature_extraction:__init__:42 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json


In [12]:
# Run PALS on the PiMP KEGG data source and compute the pathway table with
# resampling enabled (the log below reports 500 resampling iterations per comparison).
# NOTE(review): min_replace presumably is the value substituted for zero
# intensities — confirm against PALS.
# NOTE(review): no random seed is set in any visible cell, so the resampled
# p-values may differ between runs — confirm how PALS draws its random numbers.
pals = PALS(ds, min_replace=5000, num_resamples=500)
pathway_df = pals.get_pathway_df(resample=True)

2019-11-01 10:41:34.474 | DEBUG    | pathway_analysis:_change_zero_peak_ints:243 - Setting the zero intensity values in the dataframe
2019-11-01 10:41:34.504 | DEBUG    | pathway_analysis:_standardize_intensity_df:217 - Scaling the data across the sample: zero mean and unit variance
2019-11-01 10:41:34.510 | DEBUG    | pathway_analysis:_standardize_intensity_df:229 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-01 10:41:34.511 | DEBUG    | pathway_analysis:_standardize_intensity_df:230 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-01 10:41:35.622 | INFO     | pathway_analysis:set_up_resample_plage_p_df:69 - Calculating plage p-values with resampling
2019-11-01 10:41:35.623 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:73 - Comparison beer1/beer2
2019-11-01 10:41:35.623 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 0/500
2019-11-01 10:41:35.721 | DEBUG    | pathway_analysis:set_up_resample_plage_p_

In [13]:
# Display the pathway table computed from the PiMP KEGG database.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
map04960,Aldosterone-regulated sodium reabsorption,0.076472,0.740572,6,2,33.33,1.893178e-01,1.34,22.33,0.057730,0.677213
map00073,"Cutin, suberine and wax biosynthesis",0.562182,0.015700,18,12,66.67,2.114165e-05,4.01,22.28,0.257988,0.001789
map00984,Steroid degradation,0.104815,1.000000,12,3,25.00,3.253097e-01,2.67,22.25,0.093577,1.000000
map04970,Salivary secretion,0.096035,0.240913,16,2,12.50,7.662196e-01,3.57,22.31,0.127837,0.292106
map00908,Zeatin biosynthesis,0.244191,0.115535,30,5,16.67,7.206403e-01,6.69,22.30,0.286122,0.144646
...,...,...,...,...,...,...,...,...,...,...,...
map00440,Phosphonate and phosphinate metabolism,0.632006,1.000000,44,4,9.09,9.836618e-01,9.81,22.30,0.773232,1.000000
map00340,Histidine metabolism,0.881671,0.090063,41,20,48.78,6.556217e-05,9.14,22.29,0.659240,0.019492
map04974,Protein digestion and absorption,0.041403,0.021765,42,25,59.52,6.038427e-08,9.36,22.29,0.003083,0.001275
map04726,Serotonergic synapse,0.223791,0.508238,20,10,50.00,2.384251e-03,4.46,22.30,0.097110,0.296933


In [14]:
# Write the PiMP KEGG pathway table to CSV in the test_data directory under the
# current working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_pimp_kegg.csv')
pathway_df.to_csv(output)

# PALS analysis using KEGG database exported from Reactome

In [15]:
# Rebuild the data source from the same inputs, this time against the KEGG database
# exported from Reactome, with the species set to Homo sapiens and the
# metabolic-pathway-only option enabled.
# Note: this rebinds `ds`, replacing the PiMP KEGG data source created earlier.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_REACTOME_KEGG, 
                reactome_species=REACTOME_SPECIES_HOMO_SAPIENS, reactome_metabolic_pathway_only=True)

2019-11-01 10:41:37.529 | DEBUG    | feature_extraction:__init__:42 - Loading C:\Users\joewa\Work\git\PALS\pals\data\reactome\metabolic_pathways\KEGG\Homo sapiens.json.zip


In [16]:
# Run PALS again on the Reactome-backed data source with the same settings as the
# PiMP KEGG run. This rebinds `pals` and `pathway_df`, so the earlier PiMP KEGG
# results are no longer available under these names.
# NOTE(review): no random seed is set in any visible cell, so the resampled
# p-values may differ between runs — confirm how PALS draws its random numbers.
pals = PALS(ds, min_replace=5000, num_resamples=500)
pathway_df = pals.get_pathway_df(resample=True)

2019-11-01 10:41:38.527 | DEBUG    | pathway_analysis:_change_zero_peak_ints:243 - Setting the zero intensity values in the dataframe
2019-11-01 10:41:38.567 | DEBUG    | pathway_analysis:_standardize_intensity_df:217 - Scaling the data across the sample: zero mean and unit variance
2019-11-01 10:41:38.573 | DEBUG    | pathway_analysis:_standardize_intensity_df:229 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-01 10:41:38.574 | DEBUG    | pathway_analysis:_standardize_intensity_df:230 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-01 10:41:39.210 | INFO     | pathway_analysis:set_up_resample_plage_p_df:69 - Calculating plage p-values with resampling
2019-11-01 10:41:39.211 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:73 - Comparison beer1/beer2
2019-11-01 10:41:39.211 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 0/500
2019-11-01 10:41:39.309 | DEBUG    | pathway_analysis:set_up_resample_plage_p_

In [17]:
# Display the pathway table computed from the Reactome KEGG database.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
R-HSA-8849175,Threonine catabolism,1.000000,0.078801,10,1,10.00,0.915664,3.21,32.10,1.000000,0.132256
R-HSA-1660662,Glycosphingolipid metabolism,0.083410,0.291544,10,1,10.00,0.915664,3.21,32.10,0.138807,0.394246
R-HSA-74217,Purine salvage,0.145503,0.015719,27,13,48.15,0.030998,8.66,32.07,0.080543,0.006649
R-HSA-1855167,Synthesis of pyrophosphates in the cytosol,0.336771,0.098248,7,1,14.29,0.786975,2.25,32.14,0.398587,0.133410
R-HSA-141333,Biogenic amines are oxidatively deaminated to ...,0.393340,0.143729,11,5,45.45,0.149815,3.53,32.09,0.319611,0.106295
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-2408499,Formation of selenosugars for excretion,0.241030,0.016758,4,1,25.00,0.515172,1.28,32.00,0.247657,0.018896
R-HSA-1237112,Methionine salvage pathway,0.373375,0.132532,9,2,22.22,0.674184,2.89,32.11,0.409780,0.157614
R-HSA-6814848,Glycerophospholipid catabolism,0.157409,1.000000,2,1,50.00,0.242187,0.64,32.00,0.130764,1.000000
R-HSA-196807,Nicotinate metabolism,0.404487,0.088461,17,3,17.65,0.883992,5.46,32.12,0.498938,0.137928


In [18]:
# Write the Reactome KEGG pathway table to CSV in the test_data directory under the
# current working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_reactome_kegg.csv')
pathway_df.to_csv(output)