In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

# Add ../pals to the module search path. The relative entry is resolved against
# the current working directory, so this presumably expects the notebook to be
# run from its own folder (verify) for the pimp_tools / feature_extraction /
# pathway_analysis imports below to resolve.
sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS

# Load data

Generate a token by logging in to PiMP (optional; the cell below is commented out).

In [5]:
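# Optional: uncomment to request a token with your PiMP credentials.
# NOTE: get_authentication_token is not imported in this notebook; import it
# first (presumably from pimp_tools - verify) before uncommenting.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)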

Assume the token is stored in the environment variable *PIMP_API_TOKEN*

In [6]:
# Obtain the PiMP API token through the pimp_tools helper; the notebook text
# states it is expected in the PIMP_API_TOKEN environment variable.
token = get_pimp_API_token_from_env()

In [7]:
# Identifier of the PiMP analysis whose data the following cells fetch.
analysis_id = 1321 # example beer analysis

In [8]:
# Load the MS1 intensity table, using a local pickle as a cache so that PiMP is
# only queried when the cached file is missing.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: fetch from PiMP, then store the result for the next run.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Create the cache folder first so that a missing test_data directory does
    # not make to_pickle fail after the download has already been done.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Load the formula annotation table, using a local pickle as a cache so that
# PiMP is only queried when the cached file is missing.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # Cache miss: fetch from PiMP, then store the result for the next run.
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id)
    # Write to the same path that is read above. The previous code wrote to
    # 'formula_df.p' in the working directory, so the cache was never found
    # and the data was re-downloaded on every run.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

Unnamed: 0_level_0,db,identifier,formula
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3033929,hmdb,HMDB34208,C5H9NO2
3036581,hmdb,HMDB34208,C5H9NO2
3036855,hmdb,HMDB34208,C5H9NO2
3038249,hmdb,HMDB34208,C5H9NO2
3033929,hmdb,HMDB00162,C5H9NO2


In [10]:
# Load the experimental design, using a local pickle as a cache so that PiMP is
# only queried when the cached file is missing.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # were written by this notebook.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: fetch from PiMP, then store the result for the next run.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Create the cache folder first so that a missing test_data directory does
    # not make the write fail after the download has already been done.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

In [11]:
# Show the design: 'comparisons' (case/control pairs) and 'groups' (the sample
# files belonging to each group).
experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# Set up PALS analysis

In [13]:
# Bundle the intensity table, formula annotations and experimental design into
# a DataSource that uses the KEGG database (database_name='kegg').
ds = DataSource(int_df, formula_df, experimental_design, database_name='kegg')

2019-10-28 11:26:55.446 | DEBUG    | feature_extraction:__init__:24 - Loading C:\Users\joewa\Work\git\PALS\pals\data\kegg.json


In [14]:
# Configure the analysis. num_resamples=500 is the number of resampling
# iterations per comparison (the resampling log counts up to 500).
# min_intensity=5000 is presumably the value substituted for zero/missing
# intensities - TODO confirm against pathway_analysis.
pals = PALS(ds, min_intensity=5000, num_resamples=500)

In [15]:
# Compute the PLAGE activity table: one row per pathway, with the pathway name
# and one activity value per sample file.
activity_df = pals.get_plage_activity_df()
activity_df

2019-10-28 11:27:01.777 | DEBUG    | pathway_analysis:_change_zero_peak_ints:232 - Setting the zero intensity values in the dataframe
2019-10-28 11:27:01.859 | DEBUG    | pathway_analysis:_standardize_intensity_df:205 - Scaling the data across the sample: zero mean and unit variance
2019-10-28 11:27:01.880 | DEBUG    | pathway_analysis:_standardize_intensity_df:218 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-10-28 11:27:01.884 | DEBUG    | pathway_analysis:_standardize_intensity_df:219 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]


Unnamed: 0_level_0,pw name,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
Pathway ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
map05143,African trypanosomiasis,0.037773,-0.184129,0.055488,-0.430682,-0.369160,-0.501579,-0.357647,-0.163865,-0.108555,0.652512,0.679081,0.690762
map00350,Tyrosine metabolism,0.289527,0.362744,0.252238,-0.295386,-0.222753,-0.154678,0.381827,0.410490,0.301220,-0.438664,-0.452152,-0.434414
map00471,D-Glutamine and D-glutamate metabolism,0.291376,0.139590,0.207059,-0.314899,-0.449626,-0.408032,-0.450940,-0.535296,-0.322498,0.506672,0.594792,0.741803
map04915,Estrogen signaling pathway,0.144515,0.175845,0.225877,-0.464802,-0.455986,-0.459414,-0.492532,-0.464295,-0.442494,0.688366,0.753994,0.790926
map00642,Ethylbenzene degradation,-0.246150,-0.163631,-0.258231,0.592905,0.618916,0.637517,-0.240181,-0.260303,-0.332467,-0.071048,-0.142688,-0.134639
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map04070,Phosphatidylinositol signaling system,-0.404419,-0.421453,-0.414690,-0.351470,-0.351817,-0.248602,0.003240,0.041895,0.006014,0.694027,0.737845,0.709431
map04111,Cell cycle - yeast,-0.006434,0.160731,0.100175,0.397342,0.296042,0.278561,0.333344,0.269441,0.602917,-0.857337,-0.758894,-0.815887
map04723,Retrograde endocannabinoid signaling,0.062923,0.020491,0.102829,-0.349890,-0.360309,-0.379749,-0.413356,-0.355344,-0.405105,0.671274,0.697594,0.708640
map07225,Glucocorticoid and meneralocorticoid receptor ...,0.963967,0.934091,0.875042,-0.421650,-0.544690,-0.389231,-0.105968,-0.174625,-0.267033,-0.358778,-0.241596,-0.269529


In [16]:
# Derive per-comparison PLAGE p-values by resampling. This is the slow step:
# in the recorded run each comparison took roughly six minutes at 500 resamples.
plage_df = pals.set_up_resample_plage_p_df(activity_df)
plage_df

2019-10-28 11:27:07.530 | INFO     | pathway_analysis:set_up_resample_plage_p_df:58 - Calculating plage p-values with resampling
2019-10-28 11:27:07.532 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:62 - Comparison beer1/beer2
2019-10-28 11:27:07.533 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 0/500
2019-10-28 11:28:20.736 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 100/500
2019-10-28 11:29:31.002 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 200/500
2019-10-28 11:30:44.483 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 300/500
2019-10-28 11:32:00.988 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 400/500
2019-10-28 11:33:17.307 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:76 - Total time 369
2019-10-28 11:33:18.592 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:62 - Comparison beer3/beer4
2019-10-28 11:33:18.593 | DEB

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
map05143,African trypanosomiasis,0.294516,0.076869,7,3,42.86
map00350,Tyrosine metabolism,0.149899,0.030247,53,42,79.25
map00471,D-Glutamine and D-glutamate metabolism,0.149042,0.078429,7,4,57.14
map04915,Estrogen signaling pathway,0.067900,0.021456,7,1,14.29
map00642,Ethylbenzene degradation,0.070062,0.233091,14,10,71.43
...,...,...,...,...,...,...
map04070,Phosphatidylinositol signaling system,0.501927,0.019880,9,2,22.22
map04111,Cell cycle - yeast,0.350552,0.071888,2,0,0.00
map04723,Retrograde endocannabinoid signaling,0.098977,0.014790,15,7,46.67
map07225,Glucocorticoid and meneralocorticoid receptor ...,0.071042,0.703851,3,2,66.67


In [20]:
# Save the PLAGE p-value table as CSV, then display where it was written.
# The path is defined before it is displayed so that the notebook runs top to
# bottom on a fresh kernel (previously `output` was shown in a cell placed
# before the one that defined it).
output = os.path.join(os.getcwd(), 'test_data', 'plage_df.csv')
plage_df.to_csv(output)
output

In [17]:
# Extend the PLAGE results with hypergeometric p-values and the combined
# p-values for each comparison.
pathway_df = pals.calculate_hg_values(plage_df)

2019-10-28 11:39:36.659 | INFO     | pathway_analysis:calculate_hg_values:137 - Calculating the hyper-geometric p-values
2019-10-28 11:39:37.162 | INFO     | pathway_analysis:calculate_hg_values:162 - Calculating the combined p-values


In [18]:
# Show the final pathway table: PLAGE p-values, formula coverage, and the
# combined p-value for each comparison.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
map05143,African trypanosomiasis,0.294516,0.076869,7,3,42.86,1.761629e-01,2.03,29.0,0.238185,0.056921
map00350,Tyrosine metabolism,0.149899,0.030247,53,42,79.25,1.002741e-14,15.37,29.0,0.005917,0.000417
map00471,D-Glutamine and D-glutamate metabolism,0.149042,0.078429,7,4,57.14,5.033628e-02,2.03,29.0,0.089751,0.043617
map04915,Estrogen signaling pathway,0.067900,0.021456,7,1,14.29,7.249243e-01,2.03,29.0,0.089238,0.030877
map00642,Ethylbenzene degradation,0.070062,0.233091,14,10,71.43,4.710541e-04,4.06,29.0,0.018073,0.086416
...,...,...,...,...,...,...,...,...,...,...,...
map04070,Phosphatidylinositol signaling system,0.501927,0.019880,9,2,22.22,5.905696e-01,2.61,29.0,0.519799,0.024340
map04111,Cell cycle - yeast,0.350552,0.071888,2,0,0.00,6.423378e-01,0.58,29.0,0.380240,0.086606
map04723,Retrograde endocannabinoid signaling,0.098977,0.014790,15,7,46.67,6.167200e-02,4.35,29.0,0.058839,0.007433
map07225,Glucocorticoid and meneralocorticoid receptor ...,0.071042,0.703851,3,2,66.67,7.624094e-02,0.87,29.0,0.042700,0.596580
