In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib

sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df
from feature_extraction import DataSource
from pathway_analysis import PALS

# Load data

Generate a token by logging in to PiMP (uncomment the cell below and fill in your own credentials).

In [5]:
# Optional: obtain a token by logging in to a PiMP server directly.
# Uncomment the lines below and fill in your own credentials (never commit real ones).
# NOTE(review): get_authentication_token is not among the names imported above;
# it has to be imported before this cell can run.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Alternatively, assume the token is already stored in the environment variable *PIMP_API_TOKEN*:

In [6]:
# Fetch the PiMP API token; per the note above it is expected in the
# PIMP_API_TOKEN environment variable.
token = get_pimp_API_token_from_env()

In [7]:
# ID of the PiMP analysis to fetch; it is passed to the get_ms1_intensities and
# get_formula_df calls in the cells below.
analysis_id = 1321 # example beer analysis

In [8]:
# Load the MS1 intensity table (rows: peak ids, columns: sample files).
# A pickled copy under ./test_data is used as a local cache so that the PiMP
# server is only queried when no cached copy exists yet.
# NOTE: pickle files can execute arbitrary code when loaded; only use a cache
# file that this notebook (or someone you trust) produced.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # The cache folder may not exist on a fresh checkout; create it so that
    # to_pickle does not fail after the (slow) download has already finished.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Look up a single peak by its id in the index. In the output below this peak
# has intensities only for the three Beer_1 replicates; every other sample is NaN.
int_df.loc[3040968]

Beer_1_full1.mzXML    191345.43750
Beer_1_full2.mzXML    114476.37500
Beer_1_full3.mzXML    135545.34375
Beer_2_full1.mzXML             NaN
Beer_2_full2.mzXML             NaN
Beer_2_full3.mzXML             NaN
Beer_3_full1.mzXML             NaN
Beer_3_full2.mzXML             NaN
Beer_3_full3.mzXML             NaN
Beer_4_full1.mzXML             NaN
Beer_4_full2.mzXML             NaN
Beer_4_full3.mzXML             NaN
Name: 3040968, dtype: float64

In [10]:
# Load the formula annotation table (index: peak id; columns: db, identifier,
# formula). As with the intensity table, a pickled copy under ./test_data acts
# as a local cache so the PiMP server is only queried when needed.
# NOTE: pickle files can execute arbitrary code when loaded; only use a cache
# file that this notebook (or someone you trust) produced.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id)
    # The cache folder may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    # Write to the same path that read_pickle checks above, so that the next
    # run actually finds the cached file.
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

Unnamed: 0_level_0,db,identifier,formula
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3033929,hmdb,HMDB34208,C5H9NO2
3036581,hmdb,HMDB34208,C5H9NO2
3036855,hmdb,HMDB34208,C5H9NO2
3038249,hmdb,HMDB34208,C5H9NO2
3033929,hmdb,HMDB00162,C5H9NO2


In [11]:
# Each experimental group maps to the sample files (columns of int_df) that
# are its replicates.
sample_groups = {
    'beer1': ['Beer_1_full1.mzXML', 'Beer_1_full2.mzXML', 'Beer_1_full3.mzXML'],
    'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'],
    'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'],
    'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML'],
}

# (case, control) group pairs; every pair becomes one comparison that is
# named "<case>/<control>".
case_control_pairs = [
    ('beer1', 'beer2'),
    ('beer3', 'beer4'),
]

experiment_design = {
    'groups': sample_groups,
    'comparisons': [
        {'name': case + '/' + control, 'case': case, 'control': control}
        for case, control in case_control_pairs
    ],
}

# Set-up PALS Analysis

In [12]:
# Bundle the intensity table, the formula annotations and the experimental
# design into one DataSource. With database_name='kegg' the log below shows
# the bundled kegg.json pathway file being loaded.
ds = DataSource(int_df, formula_df, experiment_design, database_name='kegg')

2019-09-25 13:38:45.636 | DEBUG    | feature_extraction:__init__:24 - Loading C:\Users\joewa\Work\git\PALS\pals\data\kegg.json


In [13]:
# Create the analysis object on top of the data source.
# num_resamples=500 matches the "Resampling n/500" progress lines logged by
# the p-value step further down.
# NOTE(review): min_intensity=5000 is presumably the value substituted for
# zero/missing peak intensities — confirm in pathway_analysis.
pals = PALS(ds, min_intensity=5000, num_resamples=500)

In [14]:
# Compute the PLAGE activity table. In the output below it is indexed by
# pathway id and has a 'pw name' column followed by one activity column per
# sample file. The log shows the intensities being standardised first.
activity_df = pals.get_plage_activity_df()
activity_df

2019-09-25 13:38:46.845 | DEBUG    | pathway_analysis:_change_zero_peak_ints:232 - Setting the zero intensity values in the dataframe
2019-09-25 13:38:46.873 | DEBUG    | pathway_analysis:_standardize_intensity_df:205 - Scaling the data across the sample: zero mean and unit variance
2019-09-25 13:38:46.880 | DEBUG    | pathway_analysis:_standardize_intensity_df:218 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-09-25 13:38:46.881 | DEBUG    | pathway_analysis:_standardize_intensity_df:219 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]


Unnamed: 0_level_0,pw name,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
Pathway ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ingenza00007,IG-Amino-acid Biosynthesis 2,0.263669,0.187365,0.306593,-0.460182,-0.533728,-0.532693,-0.159428,-0.221997,-0.026864,0.342805,0.393389,0.441071
map01051,Biosynthesis of ansamycins,0.253022,0.308315,0.250116,-0.208555,-0.280498,-0.214890,0.379466,0.400231,0.531792,-0.580774,-0.492698,-0.345526
map00402,Benzoxazinoid biosynthesis,-0.736055,-0.597802,-0.685732,0.437853,0.511491,0.504818,-0.235427,-0.193896,-0.299671,0.476862,0.403499,0.414060
map04976,Bile secretion,-0.352355,-0.441831,-0.315502,0.255690,0.208314,0.207608,-0.354780,-0.389102,-0.269384,0.497308,0.494296,0.459740
map00900,Terpenoid backbone biosynthesis,0.504266,0.469856,0.541813,-0.456269,-0.502469,-0.376558,-0.011178,-0.004378,0.075747,-0.078900,-0.156159,-0.005771
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map00640,Propanoate metabolism,-0.524332,-0.373759,-0.451440,0.242844,0.294527,0.269335,-0.261373,-0.205368,-0.396562,0.506947,0.528012,0.371169
map04111,Cell cycle - yeast,-0.006434,0.160731,0.100175,0.397342,0.296042,0.278561,0.333344,0.269441,0.602917,-0.857337,-0.758894,-0.815887
map07011,Penicillins,0.227627,-0.292492,0.159689,0.234420,0.000646,-0.151676,-0.598995,-0.918964,-0.801896,0.344107,0.876485,0.921050
map00010,Glycolysis / Gluconeogenesis,0.296674,0.283708,0.340206,0.330664,0.338202,0.244056,0.025705,0.031351,0.117860,-0.659264,-0.684640,-0.664522


In [15]:
# Resampling-based PLAGE p-values: the output has one '<comparison> p-value'
# column per comparison in experiment_design, plus the formula-coverage
# columns unq_pw_F, tot_ds_F and F_coverage.
# This is the slow step of the notebook: the log below reports "Total time 129"
# for the first comparison alone (timestamps span roughly two minutes).
# NOTE(review): no random seed is set anywhere in this notebook; if the
# resampling is randomised, the p-values will differ between runs — confirm.
plage_df = pals.set_up_resample_plage_p_df(activity_df)
plage_df

2019-09-25 13:38:48.077 | INFO     | pathway_analysis:set_up_resample_plage_p_df:58 - Calculating plage p-values with resampling
2019-09-25 13:38:48.078 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:62 - Comparison beer1/beer2
2019-09-25 13:38:48.078 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 0/500
2019-09-25 13:39:13.607 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 100/500
2019-09-25 13:39:39.537 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 200/500
2019-09-25 13:40:06.098 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 300/500
2019-09-25 13:40:32.054 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 400/500
2019-09-25 13:40:58.057 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:76 - Total time 129
2019-09-25 13:40:58.533 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:62 - Comparison beer3/beer4
2019-09-25 13:40:58.533 | DEB

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ingenza00007,IG-Amino-acid Biosynthesis 2,0.094049,0.107099,6,6,100.00
map01051,Biosynthesis of ansamycins,0.098259,0.070697,30,5,16.67
map00402,Benzoxazinoid biosynthesis,0.072554,0.044092,9,6,66.67
map04976,Bile secretion,0.110940,0.034324,89,25,28.09
map00900,Terpenoid backbone biosynthesis,0.078358,0.644828,31,6,19.35
...,...,...,...,...,...,...
map00640,Propanoate metabolism,0.104791,0.084376,28,16,57.14
map04111,Cell cycle - yeast,0.351059,0.066734,2,0,0.00
map07011,Penicillins,1.000000,0.126603,2,1,50.00
map00010,Glycolysis / Gluconeogenesis,1.000000,0.028392,20,8,40.00


In [16]:
# Final step: per the two INFO log lines below, this calculates the
# hyper-geometric p-values and then the combined p-values from plage_df.
pathway_df = pals.calculate_hg_values(plage_df)

2019-09-25 13:43:06.741 | INFO     | pathway_analysis:calculate_hg_values:137 - Calculating the hyper-geometric p-values
2019-09-25 13:43:06.910 | INFO     | pathway_analysis:calculate_hg_values:162 - Calculating the combined p-values


In [17]:
# Show the final result. Compared with plage_df, the table below has the extra
# columns sf, exp_F, Ex_Cov and one '<comparison> comb_p' column per comparison.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
ingenza00007,IG-Amino-acid Biosynthesis 2,0.094049,0.107099,6,6,100.00,0.000170,1.74,29.00,0.023109,0.027384
map01051,Biosynthesis of ansamycins,0.098259,0.070697,30,5,16.67,0.922740,8.70,29.00,0.161764,0.122446
map00402,Benzoxazinoid biosynthesis,0.072554,0.044092,9,6,66.67,0.008556,2.61,29.00,0.028958,0.016196
map04976,Bile secretion,0.110940,0.034324,89,25,28.09,0.550742,25.82,29.01,0.120436,0.039173
map00900,Terpenoid backbone biosynthesis,0.078358,0.644828,31,6,19.35,0.863634,8.99,29.00,0.120277,0.718800
...,...,...,...,...,...,...,...,...,...,...,...
map00640,Propanoate metabolism,0.104791,0.084376,28,16,57.14,0.000789,8.12,29.00,0.032152,0.024464
map04111,Cell cycle - yeast,0.351059,0.066734,2,0,0.00,0.642338,0.58,29.00,0.380751,0.080772
map07011,Penicillins,1.000000,0.126603,2,1,50.00,0.203509,0.58,29.00,1.000000,0.099743
map00010,Glycolysis / Gluconeogenesis,1.000000,0.028392,20,8,40.00,0.124076,5.80,29.00,1.000000,0.018109
