In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

# Add the sibling `pals` directory to the import path so the project modules
# imported further down (pimp_tools, feature_extraction, pathway_analysis) can
# be resolved -- presumably that is where they live. The path is relative, so
# it depends on the notebook's working directory.
sys.path.append('../pals')

In [3]:
import pandas as pd

In [4]:
from pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from feature_extraction import DataSource
from pathway_analysis import PALS

# Load data

In [5]:
# Name of the compound/pathway database to use; passed to get_formula_df and
# to DataSource further down.
database_name = 'kegg'

Generate token by logging in to PiMP

In [6]:
# Disabled example: request a token with explicit credentials instead of
# reading it from the environment. To use it, uncomment the lines below and
# import get_authentication_token (it is not imported in this notebook --
# presumably it comes from pimp_tools, TODO confirm).
# Do not commit real credentials to the notebook.
# username = 'joewandy' # PiMP username
# password = 'enter' # PiMP password
# host = 'localhost:8000' # server address and port
# token = get_authentication_token(host, username, password)

Here we assume the token is stored in the environment variable *PIMP_API_TOKEN*

In [7]:
# Obtain the PiMP API token from the environment; it is passed to every
# PiMP request made below.
token = get_pimp_API_token_from_env()

In [8]:
# ID of the PiMP analysis whose data is fetched in the cells below.
analysis_id = 1321 # example beer analysis

In [9]:
# Load the MS1 intensity table. A local pickle under ./test_data acts as a
# cache so the data is only requested from PIMP_HOST when no cached copy exists.
# NOTE: unpickling can execute arbitrary code -- only use cache files that were
# generated locally by this notebook.
int_df_filename = os.path.join(os.getcwd(), 'test_data', 'int_df.p')
try:
    int_df = pd.read_pickle(int_df_filename)
except FileNotFoundError:
    int_df = get_ms1_intensities(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists; otherwise the write below raises
    # and the freshly fetched data is never cached.
    os.makedirs(os.path.dirname(int_df_filename), exist_ok=True)
    int_df.to_pickle(int_df_filename)

int_df.head()

Unnamed: 0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [10]:
# Load the peak-to-formula annotation table (positive polarity) for the chosen
# database. A local pickle under ./test_data acts as a cache so the data is
# only requested from PIMP_HOST when no cached copy exists.
# NOTE: unpickling can execute arbitrary code -- only use cache files that were
# generated locally by this notebook.
formula_df_filename = os.path.join(os.getcwd(), 'test_data', 'formula_df.p')
try:
    formula_df = pd.read_pickle(formula_df_filename)
except FileNotFoundError:
    formula_df = get_formula_df(token, PIMP_HOST, analysis_id, database_name, polarity='positive')
    # Make sure the cache directory exists; otherwise the write below raises
    # and the freshly fetched data is never cached.
    os.makedirs(os.path.dirname(formula_df_filename), exist_ok=True)
    formula_df.to_pickle(formula_df_filename)

formula_df.head()

Unnamed: 0_level_0,sec_id,mass,rt,polarity,cmpd_id,formula,adduct,identified,rc_id,compound,db,identifier,frank_annot,inchikey
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3033929,1,116.07055,577.986827,positive,2,C5H9NO2,M+H,True,15367697,L-Proline,kegg,C00148,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",ONIBWKKTOPOVIA-BYPYZUCNSA-N
3036581,2653,157.09719,469.781817,positive,2,C5H9NO2,M+ACN+H,True,15390527,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3036855,2927,157.097154,569.55776,positive,2,C5H9NO2,M+ACN+H,True,15392569,L-Proline,kegg,C00148,,ONIBWKKTOPOVIA-BYPYZUCNSA-N
3033929,1,116.07055,577.986827,positive,3,C3H6O2,M+ACN+H,True,15367700,Propanoate,kegg,C00163,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",XBDQKXXYIPTUBI-UHFFFAOYSA-N
3033929,1,116.07055,577.986827,positive,5,C5H9NO2,M+H,True,15367704,3-Acetamidopropanal,kegg,C18170,"{'frank_cmpd_name': 'L-Proline', 'inchikey': N...",ARJPPNFIEQKVBB-UHFFFAOYSA-N


In [11]:
# Load the experimental design (sample groups and case/control comparisons).
# A local pickle under ./test_data acts as a cache so the design is only
# requested from PIMP_HOST when no cached copy exists.
experimental_design_filename = os.path.join(os.getcwd(), 'test_data', 'experimental_design.p')
try:
    # NOTE: pickle.load can execute arbitrary code -- only use cache files that
    # were generated locally by this notebook.
    with open(experimental_design_filename, 'rb') as f:
        experimental_design = pickle.load(f)
except FileNotFoundError:
    experimental_design = get_experimental_design(token, PIMP_HOST, analysis_id)
    # Make sure the cache directory exists; otherwise opening the file for
    # writing raises and the freshly fetched design is never cached.
    os.makedirs(os.path.dirname(experimental_design_filename), exist_ok=True)
    with open(experimental_design_filename, 'wb') as f:
        pickle.dump(experimental_design, f)

experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

# Set-up KEGG Data Source

In [12]:
# Build the data source from the intensity table, the formula annotations and
# the experimental design loaded above, for the selected database.
ds = DataSource(int_df, formula_df, experimental_design, database_name)

2019-10-29 16:04:03.734 | DEBUG    | feature_extraction:__init__:24 - Loading C:\Users\joewa\Work\git\PALS\pals\data\kegg.json


We pick one particular pathway, map00730, to check in detail

In [13]:
# Pathway ID used as the lookup key in all the checks that follow.
mapid = 'map00730'

In [14]:
# Display the entry stored for this pathway ID (its human-readable name).
ds.pathway_dict[mapid]

'Thiamine metabolism'

In [15]:
# How many compounds the pathway lists, followed by their names in sorted order.
pathway_compound_names = ds.pathway_cmpd_dict[mapid]
len(pathway_compound_names), sorted(pathway_compound_names)

(26,
 ['1-Deoxy-D-xylulose 5-phosphate',
  '2-Methyl-4-amino-5-hydroxymethylpyrimidine diphosphate',
  '4-Amino-2-methyl-5-phosphomethylpyrimidine',
  '4-Amino-5-hydroxymethyl-2-methylpyrimidine',
  '4-Methyl-5-(2-phosphoethyl)-thiazole',
  '5-(2-Hydroxyethyl)-4-methylthiazole',
  'Aminoimidazole ribotide',
  'C15810',
  'C15813',
  'C15814',
  'C15815',
  'D-Glyceraldehyde 3-phosphate',
  'Glycine',
  'Heteropyrithiamine',
  'Iminoglycine',
  'L-Cysteine',
  'L-Tyrosine',
  'Pyruvate',
  'Thiamin diphosphate',
  'Thiamin monophosphate',
  'Thiamin triphosphate',
  'Thiamine',
  'Thiamine acetic acid',
  'Thiamine aldehyde',
  '[Enzyme]-S-sulfanylcysteine',
  '[Enzyme]-cysteine'])

In [16]:
# How many compound IDs are stored for the pathway, followed by the IDs
# themselves (kept in their stored order).
pathway_compound_ids = ds.pw_cmpd_id_dict[mapid]
len(pathway_compound_ids), pathway_compound_ids

(20,
 ['C00022',
  'C00068',
  'C00118',
  'C00037',
  'C00082',
  'C03373',
  'C00097',
  'C00378',
  'C01081',
  'C01279',
  'C02691',
  'C02892',
  'C03028',
  'C04294',
  'C04327',
  'C04556',
  'C04752',
  'C05856',
  'C11437',
  'C15809'])

In [17]:
# How many formulas are stored for the pathway, followed by the formulas in
# sorted order.
pathway_formulas = ds.pw_cmpd_formula_dict[mapid]
len(pathway_formulas), sorted(pathway_formulas)

(20,
 ['C11H13N4',
  'C12H15N4O2S',
  'C12H15N4OS',
  'C12H17N4OS',
  'C12H18N4O4PS',
  'C12H19N4O7P2S',
  'C12H20N4O10P3S',
  'C2H3NO2',
  'C2H5NO2',
  'C3H4O3',
  'C3H7NO2S',
  'C3H7O6P',
  'C5H11O7P',
  'C6H10N3O4P',
  'C6H10NO4PS',
  'C6H11N3O7P2',
  'C6H9N3O',
  'C6H9NOS',
  'C8H14N3O7P',
  'C9H11NO3'])

With our data source, 6 of these 20 formulas are found in the dataset

In [18]:
# Ask the data source for its count for this pathway; the method takes a list
# of pathway IDs and returns one value per ID (presumably the number of pathway
# formulas observed in the dataset -- TODO confirm against DataSource).
ds.get_ds_pw_compounds([mapid])

[6]

But PiMP reports only 4 assigned formulas for the same pathway
```
From PiMP
Assigned formulas 4
Total formulas 20

cid	Name				Formula		pid
C00022	Pyruvate			C3H4O3		66, 4595
C11437	1-Deoxy-D-xylulose-5-phosphate	C5H11O7P	758, 1109, 5208, 5209
C03373	Aminoimidazole ribotide		C8H14N3O7P	2307
C00082	L-Tyrosine			C9H11NO3	143
```

Here are the 6 formulas we found

In [19]:
# Formulas that are both stored for the pathway and present in ds.ds_formulas.
ds.pw_cmpd_formula_dict[mapid].intersection(ds.ds_formulas)

{'C11H13N4', 'C3H4O3', 'C3H7O6P', 'C5H11O7P', 'C8H14N3O7P', 'C9H11NO3'}

And their corresponding peaks

In [20]:
# Raise pandas' row display limit so the tables below are not truncated.
pd.set_option('display.max_rows', 500)

In [21]:
# Peak IDs the data source associates with this pathway; reused below to
# select the matching rows of formula_df.
pids = ds.ds_pathways_peak_ids[mapid]
pids

[3033994,
 3036235,
 3034686,
 3035037,
 3035414,
 3035634,
 3034071,
 3034508,
 3035214,
 3035622,
 3035754,
 3035904,
 3036017,
 3037535]

In [22]:
# Cross-check: take the distinct formulas annotated on the pathway's peaks and
# keep those that are also stored for the pathway, in sorted order.
pathway_formula_set = ds.pw_cmpd_formula_dict[mapid]
peak_formulas = formula_df.loc[pids]['formula'].unique()
sorted(pathway_formula_set.intersection(peak_formulas))

['C11H13N4', 'C3H4O3', 'C3H7O6P', 'C5H11O7P', 'C8H14N3O7P', 'C9H11NO3']

Here are the peaks annotated with the 4 formulas that PiMP also found

In [23]:
# Rows of formula_df annotated with one of the 4 formulas that PiMP also
# reports for this pathway, ordered by formula.
pimp_found_formulas = ['C3H4O3', 'C5H11O7P', 'C8H14N3O7P', 'C9H11NO3']
has_pimp_found_formula = formula_df['formula'].isin(pimp_found_formulas)
formula_df[has_pimp_found_formula].sort_values('formula')

Unnamed: 0_level_0,sec_id,mass,rt,polarity,cmpd_id,formula,adduct,identified,rc_id,compound,db,identifier,frank_annot,inchikey
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3033994,66,130.049887,483.560944,positive,641,C3H4O3,M+ACN+H,True,15368350,Pyruvate,kegg,C00022,"{'frank_cmpd_name': 'L-Pyroglutamic acid', 'in...",LCTONWCANYUPML-UHFFFAOYSA-N
3033994,66,130.049887,483.560944,positive,642,C3H4O3,M+ACN+H,True,15368353,3-Oxopropanoate,kegg,C00222,"{'frank_cmpd_name': 'L-Pyroglutamic acid', 'in...",OAKURXIZZOAYBC-UHFFFAOYSA-N
3033994,66,130.049887,483.560944,positive,650,C3H4O3,M+ACN+H,True,15368363,Ethylene carbonate,kegg,C20363,"{'frank_cmpd_name': 'L-Pyroglutamic acid', 'in...",KMTRUDSVKNLOMY-UHFFFAOYSA-N
3033994,66,130.049887,483.560944,positive,652,C3H4O3,M+ACN+H,True,15368365,3-Hydroxypropenoate,kegg,C12069,"{'frank_cmpd_name': 'L-Pyroglutamic acid', 'in...",ZJKIBABOSPFBNO-OWOJBTEDSA-N
3035037,1109,256.058814,1255.710347,positive,5511,C5H11O7P,M+ACN+H,True,15375924,5-Deoxyribose-1-phosphate,kegg,C16637,,XXQFKXPJJNBLSU-TXICZTDVSA-N
3034686,758,256.057793,422.286094,positive,5514,C5H11O7P,M+ACN+H,True,15373309,1-Deoxy-D-xylulose 5-phosphate,kegg,C11437,,AJPADPZSRRUGHI-RFZPGFLSSA-N
3035037,1109,256.058814,1255.710347,positive,5514,C5H11O7P,M+ACN+H,True,15375928,1-Deoxy-D-xylulose 5-phosphate,kegg,C11437,,AJPADPZSRRUGHI-RFZPGFLSSA-N
3034686,758,256.057793,422.286094,positive,5525,C5H11O7P,M+ACN+H,True,15373321,2-Deoxy-D-ribose 5-phosphate,kegg,C00673,,KKZFLSZAWCYPOC-PYHARJCCSA-N
3035037,1109,256.058814,1255.710347,positive,5525,C5H11O7P,M+ACN+H,True,15375932,2-Deoxy-D-ribose 5-phosphate,kegg,C00673,,KKZFLSZAWCYPOC-PYHARJCCSA-N
3034686,758,256.057793,422.286094,positive,5526,C5H11O7P,M+ACN+H,True,15373322,2-Deoxy-D-ribose 1-phosphate,kegg,C00672,,KBDKAJNTYKVSEK-VPENINKCSA-N


And here are the peaks annotated with the 2 formulas that are not found in PiMP

In [24]:
# Rows of formula_df annotated with one of the 2 formulas found here but not
# listed in the PiMP result for this pathway.
pimp_missing_formulas = ['C11H13N4', 'C3H7O6P']
has_pimp_missing_formula = formula_df['formula'].isin(pimp_missing_formulas)
formula_df[has_pimp_missing_formula]

Unnamed: 0_level_0,sec_id,mass,rt,polarity,cmpd_id,formula,adduct,identified,rc_id,compound,db,identifier,frank_annot,inchikey
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3035414,1486,224.102981,454.038681,positive,8828,C11H13N4,M+Na,True,15378999,Heteropyrithiamine,kegg,C02691,,SPQICHFDXHERAC-UHFFFAOYSA-N
3035634,1706,212.032018,430.311987,positive,9471,C3H7O6P,M+ACN+H,True,15381038,Glycerone phosphate,kegg,C00111,,GNGACRATGGDKBX-UHFFFAOYSA-N
3035634,1706,212.032018,430.311987,positive,9472,C3H7O6P,M+ACN+H,True,15381040,DL-Glyceraldehyde 3-phosphate,kegg,C00661,,LXJXRIRHZLFYRP-UHFFFAOYSA-N
3035634,1706,212.032018,430.311987,positive,9473,C3H7O6P,M+ACN+H,True,15381045,(2S)-2-Phospholactate,kegg,C19156,,CSZRNWHGZPKNKY-REOHCLBHSA-N
3035634,1706,212.032018,430.311987,positive,9474,C3H7O6P,M+ACN+H,True,15381046,D-Glyceraldehyde 3-phosphate,kegg,C00118,,LXJXRIRHZLFYRP-VKHMYHEASA-N
