In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the local .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

# Make the local modules imported below (pimp_tools, feature_extraction,
# pathway_analysis) importable -- presumably they live in ../pals.
# NOTE: the path is relative to the kernel's working directory, so the notebook
# is expected to be run from its own folder.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if '../pals' not in sys.path:
    sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS

# Load data

In [5]:
# Pathway database used throughout: passed to get_formula_df and DataSource below.
database_name = "kegg"

Generate token by logging in to PiMP

In [6]:
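# Alternative: obtain a token by logging in with PiMP credentials.
# NOTE: get_authentication_token is not imported in this notebook; import it
# (presumably from pimp_tools) before uncommenting the lines below, and never
# commit a real password.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)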

Assume the token is stored in the environment variable *PIMP_API_TOKEN*.

In [7]:
# Read the PiMP API token through the pimp_tools helper (expected in the
# PIMP_API_TOKEN environment variable) rather than hardcoding credentials.
token = get_pimp_API_token_from_env()

In [8]:
# PiMP analysis whose data is exported below (the example beer analysis).
analysis_id = 1321

In [9]:
# Load the MS1 peak intensities, using a local pickle as a cache so the PiMP
# server is only queried when no cached copy exists.
# NOTE: unpickling can execute arbitrary code; only load a cache file that this
# notebook wrote itself.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Create the cache folder first: without it to_pickle raises on a fresh
    # checkout, and only after the download has already been done.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    int_df.to_pickle(int_df_filename)

int_df.head()

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=1321 <Response [200]>


Unnamed: 0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [10]:
# Load the peak-to-formula annotations for the chosen database, using a local
# pickle as a cache so the PiMP server is only queried when no cached copy exists.
# NOTE: unpickling can execute arbitrary code; only load a cache file that this
# notebook wrote itself.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # Create the cache folder first: without it to_pickle raises on a fresh
    # checkout, and only after the download has already been done.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=1321 <Response [200]>


Unnamed: 0_level_0,sec_id,mass,rt,polarity,cmpd_id,unique_id,adduct,identified,rc_id,compound,db,entity_id,frank_annot,inchikey
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3033929,1,116.07055,577.986827,positive,2,C5H9NO2,M+H,True,15367697,L-Proline,kegg,C00148,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",ONIBWKKTOPOVIA-BYPYZUCNSA-N
3036581,2653,157.09719,469.781817,positive,2,C5H9NO2,M+ACN+H,True,15390527,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3036855,2927,157.097154,569.55776,positive,2,C5H9NO2,M+ACN+H,True,15392569,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3038249,4321,114.055969,577.210902,negative,2,C5H9NO2,M-H,True,15402470,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3033929,1,116.07055,577.986827,positive,3,C3H6O2,M+ACN+H,True,15367700,Propanoate,kegg,C00163,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",XBDQKXXYIPTUBI-UHFFFAOYSA-N


In [11]:
# Load the experimental design (groups and comparisons), using a local pickle as
# a cache so the PiMP server is only queried when no cached copy exists.
# NOTE: pickle.load can execute arbitrary code; only load a cache file that this
# notebook wrote itself.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Create the cache folder first: without it the open(..., 'wb') below raises
    # on a fresh checkout, and only after the download has already been done.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=1321 <Response [200]>


{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# Set up KEGG Data Source

In [12]:
# Bundle the intensities, formula annotations and experimental design into a
# DataSource for the chosen pathway database.
ds = DataSource(int_df, formula_df, experimental_design, database_name)

2019-10-30 14:37:07.384 | DEBUG    | feature_extraction:__init__:24 - Loading /home/joewandy/git/PALS/pals/data/kegg.json


# Set up PALS Analysis

In [13]:
# Configure the PALS analysis on the data source.
# NOTE(review): min_replace looks like the value substituted for zero peak
# intensities, and num_resamples the number of resampling rounds used for the
# p-values -- confirm against pathway_analysis.PALS.
# NOTE(review): no random seed is set; if the resampling is randomised, the
# p-values shown below will not reproduce exactly between runs.
pals = PALS(ds, min_replace=5000, num_resamples=500)

In [14]:
# Compute the PLAGE activity table; in the recorded output it has one row per
# pathway id and one column per sample file, plus the pathway name.
activity_df = pals.get_plage_activity_df()
activity_df

2019-10-30 14:37:10.140 | DEBUG    | pathway_analysis:_change_zero_peak_ints:239 - Setting the zero intensity values in the dataframe
2019-10-30 14:37:10.176 | DEBUG    | pathway_analysis:_standardize_intensity_df:212 - Scaling the data across the sample: zero mean and unit variance
2019-10-30 14:37:10.185 | DEBUG    | pathway_analysis:_standardize_intensity_df:225 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-10-30 14:37:10.186 | DEBUG    | pathway_analysis:_standardize_intensity_df:226 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]


Unnamed: 0_level_0,pw name,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
Pathway ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
map05012,Parkinson's disease,0.042117,-0.069642,0.076953,0.080161,-0.085852,-0.094464,-0.539034,-0.639061,-0.342899,0.422353,0.535947,0.613423
ingenza00003,KIV Synthesis,0.122320,-0.111537,0.160061,-0.027054,-0.160889,-0.145318,-0.539882,-0.507523,-0.298415,0.377185,0.537356,0.593695
ingenza00001,Glycerol Utilisation,-0.133697,0.026740,-0.215009,0.062752,0.132144,0.285194,-0.499739,-0.508644,-0.827241,0.620363,0.565916,0.491222
map04730,Long-term depression,0.721003,0.624199,0.651539,-0.503173,-0.561980,-0.550388,-0.535185,-0.526405,-0.423775,0.357156,0.307239,0.439770
map04626,Plant-pathogen interaction,-0.108662,0.011175,0.203120,-0.808164,-0.570265,0.054773,-0.012979,-0.299946,0.445347,-0.005134,-0.068363,1.159098
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map00590,Arachidonic acid metabolism,0.664353,0.781839,0.637379,-0.052103,-0.037756,0.099034,-0.008452,-0.158991,-0.301291,-0.568394,-0.571009,-0.484609
map07033,Anticonvulsants,0.494183,0.227265,0.352513,-0.715258,-0.804111,-0.706295,-0.123672,0.016648,-0.123027,0.336287,0.527287,0.518180
map04971,Gastric acid secretion,-0.920821,0.379637,-0.873143,-0.349875,-0.187644,-0.520062,0.084067,0.070588,-0.224172,1.100925,0.800792,0.639707
map05146,Amoebiasis,0.245916,0.184697,0.209837,-0.616993,-0.639292,-0.633359,-0.285664,-0.148425,-0.242181,0.688866,0.660072,0.576527


In [15]:
# Derive PLAGE p-values for each comparison in the experimental design by
# resampling. This is the slow step: the recorded run logged "Total time 158"
# (presumably seconds) for the first comparison alone.
plage_df = pals.set_up_resample_plage_p_df(activity_df)
plage_df

2019-10-30 14:37:14.947 | INFO     | pathway_analysis:set_up_resample_plage_p_df:60 - Calculating plage p-values with resampling
2019-10-30 14:37:14.948 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:64 - Comparison beer1/beer2
2019-10-30 14:37:14.948 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:72 - Resampling 0/500
2019-10-30 14:37:37.733 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:72 - Resampling 100/500
2019-10-30 14:37:59.799 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:72 - Resampling 200/500
2019-10-30 14:38:22.203 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:72 - Resampling 300/500
2019-10-30 14:38:45.117 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:72 - Resampling 400/500
2019-10-30 14:39:52.966 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:78 - Total time 158
2019-10-30 14:39:53.921 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:64 - Comparison beer3/beer4
2019-10-30 14:39:53.922 | DEB

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
map05012,Parkinson's disease,1.000000,0.088902,15,5,33.33
ingenza00003,KIV Synthesis,0.929011,0.093916,6,5,83.33
ingenza00001,Glycerol Utilisation,0.572844,0.086032,3,2,66.67
map04730,Long-term depression,0.055931,0.049359,7,2,28.57
map04626,Plant-pathogen interaction,0.939666,1.000000,7,1,14.29
...,...,...,...,...,...,...
map00590,Arachidonic acid metabolism,0.160483,0.251305,19,10,52.63
map07033,Anticonvulsants,0.131471,0.138919,4,2,50.00
map04971,Gastric acid secretion,1.000000,0.198666,13,1,7.69
map05146,Amoebiasis,0.046100,0.049110,8,2,25.00


In [16]:
# Save the PLAGE p-values as CSV and echo the path that was written.
output = os.path.join(os.getcwd(), 'test_data', 'plage_df.csv')
# Make sure the target folder exists so the write does not fail on a missing
# directory after the long resampling step.
os.makedirs(os.path.dirname(output), exist_ok=True)
plage_df.to_csv(output)
output

'/home/joewandy/git/PALS/notebooks/test_data/plage_df.csv'

In [17]:
# Extend the PLAGE results with hypergeometric and combined p-values (see the
# log messages and the extra columns in the table displayed afterwards).
pathway_df = pals.calculate_hg_values(plage_df)

2019-10-30 14:43:37.900 | INFO     | pathway_analysis:calculate_hg_values:139 - Calculating the hyper-geometric p-values
2019-10-30 14:43:38.057 | INFO     | pathway_analysis:calculate_hg_values:169 - Calculating the combined p-values


In [18]:
# Display the final per-pathway results table.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
map05012,Parkinson's disease,1.000000,0.088902,15,5,33.33,0.123522,3.34,22.27,1.000000,0.060764
ingenza00003,KIV Synthesis,0.929011,0.093916,6,5,83.33,0.000677,1.34,22.33,0.791461,0.027436
ingenza00001,Glycerol Utilisation,0.572844,0.086032,3,2,66.67,0.036616,0.67,22.33,0.432007,0.045476
map04730,Long-term depression,0.055931,0.049359,7,2,28.57,0.254428,1.56,22.29,0.045651,0.040180
map04626,Plant-pathogen interaction,0.939666,1.000000,7,1,14.29,0.561104,1.56,22.29,0.939668,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
map00590,Arachidonic acid metabolism,0.160483,0.251305,19,10,52.63,0.001397,4.23,22.26,0.059439,0.106811
map07033,Anticonvulsants,0.131471,0.138919,4,2,50.00,0.076516,0.89,22.25,0.084108,0.089418
map04971,Gastric acid secretion,1.000000,0.198666,13,1,7.69,0.853037,2.89,22.23,1.000000,0.266273
map05146,Amoebiasis,0.046100,0.049110,8,2,25.00,0.322650,1.78,22.25,0.040800,0.043475


In [19]:
# Save the final pathway results as CSV and echo the path that was written.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df.csv')
# Make sure the target folder exists so the write does not fail on a missing
# directory.
os.makedirs(os.path.dirname(output), exist_ok=True)
pathway_df.to_csv(output)
output

'/home/joewandy/git/PALS/notebooks/test_data/pathway_df.csv'