In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *

# Beer Analysis

### Load data

Generate a token by logging in to PiMP (optional; the code below is left commented out):

In [5]:
# Disabled example: obtain a PiMP API token by logging in with explicit
# credentials instead of reading it from the environment.
# NOTE(review): get_authentication_token is not among the names imported
# explicitly from pals.pimp_tools above — confirm its origin before enabling.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Assume the token is stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Obtain the PiMP API token through the pals.pimp_tools helper (per the note
# above, it is expected in the PIMP_API_TOKEN environment variable).
token = get_pimp_API_token_from_env()

In [7]:
# Identifier of the PiMP analysis to pull data from; it is passed to the PiMP
# fetch helpers in the cells below.
analysis_id = 1321 # example beer analysis

In [8]:
# Load the MS1 intensity table from a local pickle cache. On a cache miss the
# table is fetched from PiMP and written to the cache for subsequent runs.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists: without it, to_pickle raises after
    # the data has already been fetched and nothing gets cached.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [9]:
# Load the formula/annotation table from a local pickle cache. On a cache miss
# the table is fetched from PiMP (positive polarity) and cached for later runs.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    # NOTE(review): `database_name` is not defined in any visible cell. Unless
    # `from pals.common import *` provides it, this fallback raises NameError
    # whenever the cache file is missing — define it explicitly before this cell.
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # Make sure the cache directory exists: without it, to_pickle raises after
    # the data has already been fetched and nothing gets cached.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [10]:
# Load the experimental design (comparisons and sample groups) from a local
# pickle cache. On a cache miss it is fetched from PiMP and cached.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # Caution: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists: without it, open(..., 'wb') raises
    # after the design has already been fetched and nothing gets cached.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

### PALS analysis using KEGG database exported from PiMP

In [11]:
# Combine the intensity table, the formula annotations and the experimental
# design into a DataSource that uses the PiMP-exported KEGG database.
ds = DataSource(int_df, formula_df, experimental_design, DATABASE_PIMP_KEGG)

2019-11-13 13:18:09.219 | DEBUG    | pals.feature_extraction:__init__:32 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-11-13 13:18:09.243 | DEBUG    | pals.feature_extraction:__init__:60 - Mapping pathway to unique ids
2019-11-13 13:18:09.250 | DEBUG    | pals.feature_extraction:__init__:74 - Creating dataset to pathway mapping
2019-11-13 13:18:10.213 | DEBUG    | pals.feature_extraction:__init__:99 - Computing unique id counts


In [12]:
# Run the PALS pathway analysis on the data source and collect the per-pathway
# results table.
# NOTE(review): the variable `pals` has the same name as the `pals` package;
# harmless here because only `from pals... import` forms are used — confirm.
# NOTE(review): the log output shows a resampling step (1000 iterations) and no
# random seed is set in this notebook, so p-values may vary between runs — confirm.
# min_replace=5000 presumably is the value substituted for zero intensities
# (the log mentions "Setting the zero intensity values") — TODO confirm.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

2019-11-13 13:18:10.289 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-13 13:18:10.318 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-13 13:18:10.324 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-11-13 13:18:10.325 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-13 13:18:11.418 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-13 13:18:11.419 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:83 - Comparison beer1/beer2
2019-11-13 13:18:11.419 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-13 13:18:11.518 | DEBUG    | pals.pathway_an

In [13]:
# Display the pathway results table computed with the PiMP KEGG database.
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG beer1/beer2 comb_p,PiMP_KEGG beer3/beer4 comb_p
map00310,Lysine degradation,0.064377,0.047836,32,17,53.12,0.000050,7.13,22.28,0.012141,0.008266
map02020,Two-component system,0.146308,0.019298,34,15,44.12,0.001689,7.58,22.29,0.054051,0.004619
ingenza00001,Glycerol Utilisation,0.565172,0.083981,3,2,66.67,0.036777,0.67,22.33,0.424648,0.044290
map00760,Nicotinate and nicotinamide metabolism,0.103418,0.048513,40,21,52.50,0.000010,8.92,22.30,0.019069,0.006893
map00500,Starch and sucrose metabolism,0.283011,0.070189,19,11,57.89,0.000288,4.23,22.26,0.107858,0.016962
...,...,...,...,...,...,...,...,...,...,...,...
map05032,Morphine addiction,1.000000,0.063120,8,3,37.50,0.118536,1.78,22.25,1.000000,0.041701
map04068,FoxO signaling pathway,0.110979,0.116863,4,2,50.00,0.076830,0.89,22.25,0.069781,0.073879
map04626,Plant-pathogen interaction,0.941764,1.000000,7,1,14.29,0.562067,1.56,22.29,0.941781,1.000000
map00564,Glycerophospholipid metabolism,0.241965,0.033273,23,8,34.78,0.065948,5.13,22.30,0.163078,0.018104


In [14]:
# Save the PiMP-KEGG pathway results as CSV under test_data/ in the current
# working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_pimp_kegg.csv')
pathway_df.to_csv(output)

### PALS analysis using KEGG database exported from Reactome

In [None]:
# Rebuild the data source from the same inputs, this time using the
# Reactome-exported KEGG database with the Homo sapiens species constant and
# the metabolic-pathway-only flag enabled.
ds = DataSource(
    int_df,
    formula_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
)

In [None]:
# Re-run the PALS analysis on the Reactome-KEGG data source; this overwrites the
# `pals` and `pathway_df` variables from the previous analysis.
# min_replace=5000 presumably is the value substituted for zero intensities — TODO confirm.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

In [None]:
# Display the pathway results table computed with the Reactome KEGG database.
pathway_df

In [None]:
# Save the Reactome-KEGG pathway results as CSV under test_data/ in the current
# working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_reactome_kegg.csv')
pathway_df.to_csv(output)

### PALS analysis of compounds by connecting to Reactome

In [None]:
# Rebuild the data source with the same Reactome KEGG settings as the previous
# section, additionally enabling reactome_query (per the section heading, the
# compounds are analysed by connecting to Reactome).
ds = DataSource(
    int_df,
    formula_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
    reactome_query=True,
)

In [None]:
# Re-run the PALS analysis on the Reactome-query data source; this overwrites
# the `pals` and `pathway_df` variables from the previous analysis.
# min_replace=5000 presumably is the value substituted for zero intensities — TODO confirm.
pals = PALS(ds, min_replace=5000)
pathway_df = pals.get_pathway_df()

In [None]:
# Display the pathway results table computed via the Reactome query.
pathway_df

In [None]:
# Save the Reactome-query pathway results as CSV under test_data/ in the
# current working directory.
output = os.path.join(os.getcwd(), 'test_data', 'pathway_df_reactome_query_kegg.csv')
pathway_df.to_csv(output)