In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

# Add the sibling '../pals' directory to the import path (presumably where the
# project modules imported below live — confirm). The path is relative, so it is
# resolved against the current working directory of the kernel.
sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
# Project imports: PiMP API helpers, the data source wrapper and the PALS analysis class.
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS
# NOTE(review): the wildcard import hides where names come from. The cells visible in
# this notebook use DATABASE_PIMP_KEGG, DATABASE_REACTOME_KEGG and
# REACTOME_SPECIES_HOMO_SAPIENS without importing them explicitly, so they appear to
# come from here; consider importing them by name.
from common import *

# Load data

Generate a token by logging in to PiMP (optional; the cell below is left commented out).

In [5]:
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Obtain the PiMP API token through the project helper (per its name, read from the
# environment) instead of hard-coding credentials in the notebook.
token = get_pimp_API_token_from_env()

In [7]:
analysis_id = 1321 # PiMP analysis to download: the example beer analysis

In [8]:
# Load the MS1 intensity table, preferring a local pickle cache over the PiMP API.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    # SECURITY: unpickling can execute arbitrary code; only load cache files you created.
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: download from PiMP, then store the result for later runs.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Create the cache directory if needed; otherwise the write below would fail on a
    # fresh checkout and the downloaded data would be lost.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Load the table linking each peak (row_id) to a compound (entity_id), preferring a
# local pickle cache over the PiMP API.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
# `database_name` was never defined in this notebook, so a cache miss raised NameError.
# NOTE(review): 'kegg' is assumed here because the cached entity IDs are KEGG compound
# IDs (e.g. C00148) — confirm against pimp_tools.get_formula_df.
database_name = 'kegg'
try:
    # SECURITY: unpickling can execute arbitrary code; only load cache files you created.
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # Cache miss: download from PiMP, then store the result for later runs.
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # Create the cache directory if needed so the write below cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [10]:
# Load the experimental design (a plain dict with 'comparisons' and 'groups'),
# preferring a local pickle cache over the PiMP API.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files you created.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: download from PiMP, then store the result for later runs.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Create the cache directory if needed so the write below cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# PALS analysis using KEGG database exported from PiMP

In [11]:
# Build the data source from the intensity table, the peak-to-compound table and the
# experimental design, selecting the KEGG pathway database exported from PiMP.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_PIMP_KEGG)

2019-11-01 16:34:32.948 | DEBUG    | feature_extraction:__init__:38 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip


In [12]:
# Run PALS on the data source and collect the per-pathway results table.
# NOTE(review): min_replace=5000 is an unexplained magic number — presumably the value
# substituted for zero intensities (the log mentions "Setting the zero intensity
# values"); confirm against pathway_analysis.PALS.
# NOTE(review): the log shows p-values are computed by resampling and no random seed is
# set in this notebook, so results may differ between runs — TODO confirm.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

2019-11-01 16:34:33.941 | DEBUG    | pathway_analysis:_change_zero_peak_ints:243 - Setting the zero intensity values in the dataframe
2019-11-01 16:34:33.969 | DEBUG    | pathway_analysis:_standardize_intensity_df:217 - Scaling the data across the sample: zero mean and unit variance
2019-11-01 16:34:33.975 | DEBUG    | pathway_analysis:_standardize_intensity_df:229 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-01 16:34:33.976 | DEBUG    | pathway_analysis:_standardize_intensity_df:230 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-01 16:34:35.138 | INFO     | pathway_analysis:set_up_resample_plage_p_df:69 - Calculating plage p-values with resampling
2019-11-01 16:34:35.138 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:73 - Comparison beer1/beer2
2019-11-01 16:34:35.139 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 0/1000
2019-11-01 16:34:35.238 | DEBUG    | pathway_analysis:set_up_resample_plage_p

In [13]:
# Display the pathway results for the PiMP KEGG database.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
map00960,"Tropane, piperidine and pyridine alkaloid bios...",0.224350,0.032757,51,25,49.02,0.000009,11.37,22.29,0.056491,0.004048
map04977,Vitamin digestion and absorption,0.074429,0.020100,30,6,20.00,0.554894,6.69,22.30,0.082497,0.023585
map00625,Chloroalkane and chloroalkene degradation,0.158972,0.159610,32,4,12.50,0.889609,7.13,22.28,0.229904,0.230688
map00010,Glycolysis / Gluconeogenesis,0.840771,0.041923,20,5,25.00,0.319348,4.46,22.30,0.812239,0.036944
map00550,Peptidoglycan biosynthesis,0.181969,0.023358,26,5,19.23,0.578747,5.79,22.27,0.197304,0.027985
...,...,...,...,...,...,...,...,...,...,...,...
map07217,Renin-angiotensin system inhibitors,0.188659,0.999734,2,1,50.00,0.126786,0.45,22.50,0.137942,0.999245
map01057,Biosynthesis of type II polyketide products,0.799221,0.059428,102,3,2.94,1.000000,22.73,22.28,0.971730,0.328155
map07227,Histamine H2/H3 receptor agonists/antagonists,1.000000,0.208634,10,1,10.00,0.741182,2.23,22.30,1.000000,0.251897
map00944,Flavone and flavonol biosynthesis,1.000000,0.039060,33,9,27.27,0.208761,7.36,22.30,1.000000,0.029615


In [14]:
# Save the PiMP KEGG pathway results as CSV under 'test_data' in the current working
# directory (the directory must already exist).
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_pimp_kegg.csv')
pathway_df.to_csv(output)

# PALS analysis using KEGG database exported from Reactome

In [15]:
# Rebuild the data source from the same inputs, this time selecting the Reactome
# database with the Homo sapiens species constant and metabolic pathways only.
# Note that this rebinds `ds`, replacing the PiMP KEGG data source created earlier.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_REACTOME_KEGG, 
                reactome_species=REACTOME_SPECIES_HOMO_SAPIENS, reactome_metabolic_pathway_only=True)

2019-11-01 16:34:38.004 | DEBUG    | feature_extraction:__init__:38 - Loading C:\Users\joewa\Work\git\PALS\pals\data\reactome\metabolic_pathways\COMPOUND\Homo sapiens.json.zip


In [16]:
# Run PALS again on the Reactome data source.
# Note that `pals` and `pathway_df` are rebound here, so the PiMP KEGG results from the
# earlier cell are no longer available under these names.
# NOTE(review): min_replace=5000 is an unexplained magic number — presumably the value
# substituted for zero intensities; confirm against pathway_analysis.PALS.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

2019-11-01 16:34:39.045 | DEBUG    | pathway_analysis:_change_zero_peak_ints:243 - Setting the zero intensity values in the dataframe
2019-11-01 16:34:39.073 | DEBUG    | pathway_analysis:_standardize_intensity_df:217 - Scaling the data across the sample: zero mean and unit variance
2019-11-01 16:34:39.079 | DEBUG    | pathway_analysis:_standardize_intensity_df:229 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-01 16:34:39.080 | DEBUG    | pathway_analysis:_standardize_intensity_df:230 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-01 16:34:39.697 | INFO     | pathway_analysis:set_up_resample_plage_p_df:69 - Calculating plage p-values with resampling
2019-11-01 16:34:39.698 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:73 - Comparison beer1/beer2
2019-11-01 16:34:39.699 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 0/1000
2019-11-01 16:34:39.804 | DEBUG    | pathway_analysis:set_up_resample_plage_p

In [17]:
# Display the pathway results for the Reactome database.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
R-HSA-400042,"Adrenaline,noradrenaline inhibits insulin secr...",0.102443,0.123733,4,2,50.00,0.215222,1.35,33.75,0.081093,0.098753
R-HSA-75896,Plasmalogen biosynthesis,1.000000,0.121155,7,1,14.29,0.815076,2.36,33.71,1.000000,0.165859
R-HSA-209931,Serotonin and melatonin biosynthesis,1.000000,0.200388,14,4,28.57,0.613379,4.73,33.79,1.000000,0.221419
R-HSA-389661,Glyoxylate metabolism and glycine degradation,0.060927,0.154972,20,3,15.00,0.961907,6.76,33.80,0.121156,0.258539
R-HSA-211981,Xenobiotics,0.259071,0.092699,20,2,10.00,0.990138,6.76,33.80,0.429986,0.200077
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-1483152,Hydrolysis of LPE,0.169778,1.000000,2,1,50.00,0.264649,0.68,34.00,0.144602,1.000000
R-HSA-140180,COX reactions,0.297507,0.153609,4,1,25.00,0.548821,1.35,33.75,0.309523,0.164238
R-HSA-71032,Propionyl-CoA catabolism,0.300065,0.092031,6,1,16.67,0.747290,2.03,33.83,0.350703,0.120609
R-HSA-156581,Methylation,0.063400,0.047066,25,5,20.00,0.924829,8.45,33.80,0.112161,0.087008


In [18]:
# Save the Reactome pathway results as CSV under 'test_data' in the current working
# directory (the directory must already exist). `output` is rebound from the earlier export.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_reactome_kegg.csv')
pathway_df.to_csv(output)