In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

# Add the sibling 'pals' directory to the import path so the project modules
# imported below can be found. The path is relative, so it only resolves when
# the kernel's working directory is the notebook's own directory.
sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS

# Load data

In [5]:
# Name of the compound/pathway database; passed to get_formula_df and DataSource below.
database_name = 'kegg'

Generate token by logging in to PiMP

In [6]:
# Optional alternative (left disabled): obtain a token by logging in to PiMP directly.
# NOTE(review): get_authentication_token is not among the imports shown in this
# notebook -- it would need to be imported before enabling these lines.
# Do not commit real credentials; prefer environment variables or getpass.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*.

In [7]:
# Obtain the PiMP API token through the pimp_tools helper; it is passed to every
# PiMP export call in the cells below.
token = get_pimp_API_token_from_env()

In [8]:
# Identifier of the PiMP analysis whose data is exported in the cells below.
analysis_id = 1321 # example beer analysis

In [9]:
# Load the MS1 intensity table from the local pickle cache when it exists;
# otherwise fetch it from PiMP and write the cache for the next run.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    # NOTE: unpickling can execute arbitrary code -- only load cache files that
    # were produced by this notebook.
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # The cache directory may be missing on a fresh checkout; create it so that
    # writing the cache does not raise a second FileNotFoundError.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=1321 <Response [200]>


Unnamed: 0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [10]:
# Load the formula/annotation table from the local pickle cache when it exists;
# otherwise fetch the positive-polarity annotations for `database_name` from
# PiMP and write the cache for the next run.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    # NOTE: unpickling can execute arbitrary code -- only load cache files that
    # were produced by this notebook.
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # The cache directory may be missing on a fresh checkout; create it so that
    # writing the cache does not raise a second FileNotFoundError.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=1321 <Response [200]>


Unnamed: 0_level_0,sec_id,mass,rt,polarity,cmpd_id,formula,adduct,identified,rc_id,compound,db,identifier,frank_annot,inchikey
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3033929,1,116.07055,577.986827,positive,2,C5H9NO2,M+H,True,15367697,L-Proline,kegg,C00148,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",ONIBWKKTOPOVIA-BYPYZUCNSA-N
3036581,2653,157.09719,469.781817,positive,2,C5H9NO2,M+ACN+H,True,15390527,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3036855,2927,157.097154,569.55776,positive,2,C5H9NO2,M+ACN+H,True,15392569,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3033929,1,116.07055,577.986827,positive,3,C3H6O2,M+ACN+H,True,15367700,Propanoate,kegg,C00163,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",XBDQKXXYIPTUBI-UHFFFAOYSA-N
3033929,1,116.07055,577.986827,positive,5,C5H9NO2,M+H,True,15367704,3-Acetamidopropanal,kegg,C18170,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",ARJPPNFIEQKVBB-UHFFFAOYSA-N


In [11]:
# Load the experimental design (comparisons and sample groups) from the local
# pickle cache when it exists; otherwise fetch it from PiMP and write the cache.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # NOTE: pickle.load can execute arbitrary code -- only load cache files that
    # were produced by this notebook.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # The cache directory may be missing on a fresh checkout; create it so that
    # opening the cache file for writing does not raise FileNotFoundError.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=1321 <Response [200]>


{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# Set up KEGG Data Source

In [12]:
# Bundle the intensity table, the formula annotations and the experimental design
# into a DataSource for the selected database (the log below shows it loading kegg.json).
ds = DataSource(int_df, formula_df, experimental_design, database_name)

2019-10-29 16:04:55.694 | DEBUG    | feature_extraction:__init__:24 - Loading C:\Users\joewa\Work\git\PALS\pals\data\kegg.json


# Set up PALS Analysis

In [13]:
# Create the PALS analysis on top of the data source.
# NOTE(review): min_intensity=5000 and num_resamples=500 are unexplained here --
# consider naming them as constants in a configuration cell.
# NOTE(review): the resampling step is presumably stochastic and no random seed is
# set anywhere in the visible notebook -- confirm whether results are reproducible.
pals = PALS(ds, min_intensity=5000, num_resamples=500)

In [14]:
# Compute the PLAGE activity table; per the displayed output it has one row per
# pathway and one column per sample file. The log shows the intensities being
# standardised (zero mean, unit variance) as part of this call.
activity_df = pals.get_plage_activity_df()
activity_df

2019-10-29 16:04:56.855 | DEBUG    | pathway_analysis:_change_zero_peak_ints:232 - Setting the zero intensity values in the dataframe
2019-10-29 16:04:56.888 | DEBUG    | pathway_analysis:_standardize_intensity_df:205 - Scaling the data across the sample: zero mean and unit variance
2019-10-29 16:04:56.897 | DEBUG    | pathway_analysis:_standardize_intensity_df:218 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-10-29 16:04:56.898 | DEBUG    | pathway_analysis:_standardize_intensity_df:219 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]


Unnamed: 0_level_0,pw name,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
Pathway ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
map05204,Chemical carcinogenesis,0.122196,-0.024922,0.163056,-0.372108,-0.435128,-0.474277,-0.262568,-0.282725,-0.138195,0.474577,0.544222,0.685873
map00945,"Stilbenoid, diarylheptanoid and gingerol biosy...",0.041457,-0.134889,0.060969,-0.216975,-0.192954,-0.106657,-0.468973,-0.488755,-0.294233,0.342099,0.668985,0.789926
map05133,Pertussis,0.678897,0.635445,0.511368,0.152206,0.146002,0.152419,-0.021038,-0.063060,-0.049718,-0.767651,-0.590952,-0.783917
map00630,Glyoxylate and dicarboxylate metabolism,0.135207,0.095897,0.097274,-0.296609,-0.370700,-0.294633,-0.364338,-0.364911,-0.370765,0.576299,0.552414,0.604864
map04971,Gastric acid secretion,0.150060,-0.537054,0.361751,0.507736,0.093939,0.725309,0.185912,0.347716,0.384850,-0.860628,-0.761720,-0.597871
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map00061,Fatty acid biosynthesis,-0.681530,-1.260187,-0.157384,0.429501,-0.005009,0.805965,-0.211665,0.116893,1.054955,-0.445789,-0.108412,0.462661
map00633,Nitrotoluene degradation,0.035434,-0.052081,0.113923,-0.169521,-0.280401,-0.320492,-0.514672,-0.529943,-0.316949,0.580656,0.648879,0.805167
map00460,Cyanoamino acid metabolism,-0.114333,-0.153257,-0.138454,0.324407,0.303574,0.311525,0.353373,0.402342,0.333251,-0.525004,-0.533766,-0.563659
map07229,Angiotensin receptor and endothelin receptor a...,-0.319186,0.194753,0.653303,0.152896,-0.032244,0.660872,0.437385,0.143903,0.822189,-1.162649,-0.910977,-0.640243


In [15]:
# Calculate PLAGE p-values with resampling for each comparison in the experimental design.
# Slow cell: by the log timestamps below, the first comparison alone takes over two minutes.
plage_df = pals.set_up_resample_plage_p_df(activity_df)
plage_df

2019-10-29 16:04:58.002 | INFO     | pathway_analysis:set_up_resample_plage_p_df:58 - Calculating plage p-values with resampling
2019-10-29 16:04:58.002 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:62 - Comparison beer1/beer2
2019-10-29 16:04:58.003 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 0/500
2019-10-29 16:05:24.273 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 100/500
2019-10-29 16:05:50.971 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 200/500
2019-10-29 16:06:18.554 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 300/500
2019-10-29 16:06:45.341 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:70 - Resampling 400/500
2019-10-29 16:07:12.982 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:76 - Total time 134
2019-10-29 16:07:13.493 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:62 - Comparison beer3/beer4
2019-10-29 16:07:13.494 | DEB

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
map05204,Chemical carcinogenesis,0.184515,0.081708,78,26,33.33
map00945,"Stilbenoid, diarylheptanoid and gingerol biosy...",0.665306,0.129836,25,9,36.00
map05133,Pertussis,0.164547,0.076483,5,1,20.00
map00630,Glyoxylate and dicarboxylate metabolism,0.107260,0.012073,48,13,27.08
map04971,Gastric acid secretion,1.000000,0.076050,13,2,15.38
...,...,...,...,...,...,...
map00061,Fatty acid biosynthesis,0.516708,1.000000,10,1,10.00
map00633,Nitrotoluene degradation,0.318740,0.070304,19,8,42.11
map00460,Cyanoamino acid metabolism,0.055540,0.018800,40,30,75.00
map07229,Angiotensin receptor and endothelin receptor a...,1.000000,0.164432,3,1,33.33


In [17]:
# Write the PLAGE p-value table to test_data/plage_df.csv under the current
# working directory, then display the path that was written.
output = os.path.join(os.getcwd(), 'test_data', 'plage_df.csv')
plage_df.to_csv(output)
output

'C:\\Users\\joewa\\Work\\git\\PALS\\notebooks\\test_data\\plage_df.csv'

In [18]:
# Add the hyper-geometric and combined p-values to the PLAGE results (see the log
# messages below); the displayed result keeps the plage_df columns and adds
# sf, exp_F, Ex_Cov and one comb_p column per comparison.
pathway_df = pals.calculate_hg_values(plage_df)

2019-10-29 16:09:47.615 | INFO     | pathway_analysis:calculate_hg_values:137 - Calculating the hyper-geometric p-values
2019-10-29 16:09:47.819 | INFO     | pathway_analysis:calculate_hg_values:162 - Calculating the combined p-values


In [19]:
# Display the final pathway table.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
map05204,Chemical carcinogenesis,0.184515,0.081708,78,26,33.33,4.527451e-02,19.68,25.23,0.112608,0.044703
map00945,"Stilbenoid, diarylheptanoid and gingerol biosy...",0.665306,0.129836,25,9,36.00,9.455884e-02,6.31,25.24,0.564015,0.086470
map05133,Pertussis,0.164547,0.076483,5,1,20.00,4.715615e-01,1.26,25.20,0.165779,0.078480
map00630,Glyoxylate and dicarboxylate metabolism,0.107260,0.012073,48,13,27.08,3.445729e-01,12.11,25.23,0.097560,0.011026
map04971,Gastric acid secretion,1.000000,0.076050,13,2,15.38,7.260156e-01,3.28,25.23,1.000000,0.099132
...,...,...,...,...,...,...,...,...,...,...,...
map00061,Fatty acid biosynthesis,0.516708,1.000000,10,1,10.00,8.080971e-01,2.52,25.20,0.583899,1.000000
map00633,Nitrotoluene degradation,0.318740,0.070304,19,8,42.11,4.261595e-02,4.79,25.21,0.211968,0.037338
map00460,Cyanoamino acid metabolism,0.055540,0.018800,40,30,75.00,1.326647e-11,10.09,25.22,0.002056,0.000410
map07229,Angiotensin receptor and endothelin receptor a...,1.000000,0.164432,3,1,33.33,2.654903e-01,0.76,25.33,1.000000,0.140001


In [20]:
# Write the final pathway table to test_data/pathway_df.csv under the current
# working directory, then display the path that was written.
# Note: this rebinds the `output` name used by the earlier export cell.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df.csv')
pathway_df.to_csv(output)
output

'C:\\Users\\joewa\\Work\\git\\PALS\\notebooks\\test_data\\pathway_df.csv'