In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS

# Load data

In [5]:
# Name of the pathway database; passed to get_formula_df and DataSource below.
database_name = 'kegg'

Generate token by logging in to PiMP

In [6]:
# Optional alternative (left disabled): request a token with explicit credentials.
# NOTE(review): get_authentication_token is not among the names imported from
# pimp_tools in this notebook; import it before uncommenting, and never commit
# a real password.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*

In [7]:
# API token handed to every PiMP export call below.
# Presumably read from the PIMP_API_TOKEN environment variable, going by the
# helper's name -- confirm in pimp_tools.
token = get_pimp_API_token_from_env()

In [8]:
# PiMP analysis whose intensities, formulas and design are fetched below.
analysis_id = 1321 # example beer analysis

In [9]:
# MS1 intensity table, cached as a pickle under ./test_data so the PiMP server
# is only contacted when no local copy exists.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    # Cache miss: download from PiMP, then store the result for the next run.
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Create the cache directory first; without it the write below fails and
    # the freshly downloaded table is lost.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=1321 <Response [200]>


Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [10]:
# Formula annotations (row_id -> database entity_id) for positive-mode peaks,
# cached as a pickle under ./test_data to avoid repeated server requests.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # Cache miss: query PiMP for the chosen database, then store the result.
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # Create the cache directory first; without it the write below fails and
    # the freshly downloaded table is lost.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=1321 <Response [200]>


Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [11]:
# Experimental design (comparisons and sample groups), cached as a pickle
# under ./test_data so the PiMP server is only contacted on a cache miss.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    # Cache miss: download from PiMP, then store the result for the next run.
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Create the cache directory first; without it the open() below fails and
    # the freshly downloaded design is lost.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=1321 <Response [200]>


{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# Set up KEGG Data Source

In [12]:
# Bundle the intensity table, formula annotations and experimental design
# together with the chosen database name into one DataSource.
ds = DataSource(int_df, formula_df, experimental_design, database_name)

2019-10-30 22:02:12.387 | DEBUG    | feature_extraction:__init__:28 - Loading /home/joewandy/git/PALS/pals/data/kegg.json


# Set up PALS Analysis

In [13]:
# Configure the PALS analysis on the data source.
# num_resamples is the number of resampling iterations used for the p-values.
# NOTE(review): min_replace is presumably the value substituted for zero
# intensities -- confirm against PALS before changing it.
pals = PALS(ds, min_replace=5000, num_resamples=500)

In [14]:
# PLAGE activity table: one row per pathway id, with the pathway name and one
# column per sample file.
activity_df = pals.get_plage_activity_df()
activity_df

2019-10-30 22:02:15.186 | DEBUG    | pathway_analysis:_change_zero_peak_ints:243 - Setting the zero intensity values in the dataframe
2019-10-30 22:02:15.223 | DEBUG    | pathway_analysis:_standardize_intensity_df:217 - Scaling the data across the sample: zero mean and unit variance
2019-10-30 22:02:15.231 | DEBUG    | pathway_analysis:_standardize_intensity_df:229 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-10-30 22:02:15.232 | DEBUG    | pathway_analysis:_standardize_intensity_df:230 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]


Unnamed: 0_level_0,pw name,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
Pathway ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
map00051,Fructose and mannose metabolism,-0.236780,-0.208168,-0.247665,-0.397355,-0.439309,-0.360497,0.070748,0.043260,-0.038722,0.625452,0.632694,0.556341
map07232,Potassium channel blocking and opening drugs,-0.709094,-0.735175,-0.683822,0.928083,0.854990,1.018371,0.038907,0.092642,0.181136,-0.328679,-0.328679,-0.328679
map00010,Glycolysis / Gluconeogenesis,0.277307,0.246524,0.321167,0.387165,0.399970,0.292816,-0.024253,-0.026879,0.073653,-0.632469,-0.674344,-0.640657
map04626,Plant-pathogen interaction,-0.108662,0.011175,0.203120,-0.808164,-0.570265,0.054773,-0.012979,-0.299946,0.445347,-0.005134,-0.068363,1.159098
map04726,Serotonergic synapse,0.653154,0.751441,0.586765,-0.078374,-0.100623,0.139543,-0.076644,-0.225899,-0.349086,-0.468090,-0.465122,-0.367066
...,...,...,...,...,...,...,...,...,...,...,...,...,...
map00270,Cysteine and methionine metabolism,0.166028,0.097307,0.161346,-0.434696,-0.511479,-0.474801,-0.236638,-0.273937,-0.159875,0.470283,0.574388,0.622074
map04930,Type II diabetes mellitus,0.062137,0.178450,0.009795,-0.639208,-0.734853,-0.564963,0.240651,0.227063,0.134440,0.241470,0.400607,0.444413
map05204,Chemical carcinogenesis,0.136851,0.027640,0.168538,-0.410504,-0.443479,-0.447836,-0.263377,-0.238244,-0.156844,0.452790,0.528884,0.645578
map00942,Anthocyanin biosynthesis,-0.314692,-1.291212,0.201712,-0.026023,0.084119,-0.967483,0.018624,0.761336,0.973530,-0.057549,0.176870,0.440768


In [15]:
# Resampling-based PLAGE p-values, one column per comparison in the
# experimental design, plus formula coverage counts per pathway.
# NOTE(review): no random seed is set in this notebook; if the resampling is
# random, the p-values will differ between runs -- confirm.
plage_df = pals.set_up_resample_plage_p_df(activity_df)
plage_df

2019-10-30 22:02:17.120 | INFO     | pathway_analysis:set_up_resample_plage_p_df:69 - Calculating plage p-values with resampling
2019-10-30 22:02:17.123 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:73 - Comparison beer1/beer2
2019-10-30 22:02:17.125 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 0/500
2019-10-30 22:02:17.329 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 100/500
2019-10-30 22:02:17.449 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 200/500
2019-10-30 22:02:17.556 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 300/500
2019-10-30 22:02:17.667 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:81 - Resampling 400/500
2019-10-30 22:02:17.769 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:87 - Total time 0
2019-10-30 22:02:17.918 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:73 - Comparison beer3/beer4
2019-10-30 22:02:17.919 | DEBUG

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
map00051,Fructose and mannose metabolism,0.217957,0.061012,17,9,52.94
map07232,Potassium channel blocking and opening drugs,0.044913,0.082334,5,1,20.00
map00010,Glycolysis / Gluconeogenesis,0.857148,0.041737,20,5,25.00
map04626,Plant-pathogen interaction,0.928035,1.000000,7,1,14.29
map04726,Serotonergic synapse,0.191750,0.479746,20,10,50.00
...,...,...,...,...,...,...
map00270,Cysteine and methionine metabolism,0.079247,0.062316,52,21,40.38
map04930,Type II diabetes mellitus,0.140658,0.529109,5,2,40.00
map05204,Chemical carcinogenesis,0.120720,0.075321,78,13,16.67
map00942,Anthocyanin biosynthesis,1.000000,0.985992,50,1,2.00


In [16]:
# Export the PLAGE p-value table as CSV into ./test_data and echo the path.
output = str(pathlib.Path.cwd() / 'test_data' / 'plage_df.csv')
plage_df.to_csv(path_or_buf=output)
output

'/home/joewandy/git/PALS/notebooks/test_data/plage_df.csv'

In [17]:
# Extend the PLAGE table with hypergeometric values and a combined p-value
# per comparison (the sf, exp_F, Ex_Cov and "comb_p" columns shown below).
pathway_df = pals.calculate_hg_values(plage_df)

2019-10-30 22:02:53.951 | INFO     | pathway_analysis:calculate_hg_values:148 - Calculating the hyper-geometric p-values
2019-10-30 22:02:54.130 | INFO     | pathway_analysis:calculate_hg_values:180 - Calculating the combined p-values


In [18]:
# Display the final per-pathway results table.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,beer1/beer2 comb_p,beer3/beer4 comb_p
map00051,Fructose and mannose metabolism,0.217957,0.061012,17,9,52.94,0.002127,3.79,22.29,0.092650,0.018904
map07232,Potassium channel blocking and opening drugs,0.044913,0.082334,5,1,20.00,0.400780,1.11,22.20,0.043387,0.078997
map00010,Glycolysis / Gluconeogenesis,0.857148,0.041737,20,5,25.00,0.319348,4.46,22.30,0.830156,0.036779
map04626,Plant-pathogen interaction,0.928035,1.000000,7,1,14.29,0.562067,1.56,22.29,0.928344,1.000000
map04726,Serotonergic synapse,0.191750,0.479746,20,10,50.00,0.002384,4.46,22.30,0.079561,0.273155
...,...,...,...,...,...,...,...,...,...,...,...
map00270,Cysteine and methionine metabolism,0.079247,0.062316,52,21,40.38,0.001214,11.59,22.29,0.023997,0.017845
map04930,Type II diabetes mellitus,0.140658,0.529109,5,2,40.00,0.128750,1.11,22.20,0.100534,0.440203
map05204,Chemical carcinogenesis,0.120720,0.075321,78,13,16.67,0.871671,17.38,22.28,0.177174,0.117635
map00942,Anthocyanin biosynthesis,1.000000,0.985992,50,1,2.00,0.999964,11.14,22.28,1.000000,0.998318


In [19]:
# Export the final pathway table as CSV into ./test_data and echo the path.
output = str(pathlib.Path.cwd() / 'test_data' / 'pathway_df.csv')
pathway_df.to_csv(path_or_buf=output)
output

'/home/joewandy/git/PALS/notebooks/test_data/pathway_df.csv'