In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `pals` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [4]:
import pandas as pd

In [5]:
from pals.pimp_tools import get_authentication_token, get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *



### Load data

Generate a token by logging in to PiMP (uncomment the cell below and fill in your own credentials):

In [6]:
# Optional, disabled by default: request a token by logging in to PiMP.
# NOTE(review): never commit real credentials in this cell; the cell that
# follows reads the token through get_pimp_API_token_from_env() instead.
# username = 'karen' # PiMP username
# password = 'temp' # PiMP password
# host = 'polyomics.mvls.gla.ac.uk' # server address and port
# token = get_authentication_token(host, username, password)
# print (token)

Alternatively, assume the token is stored in the environment variable *PIMP_API_TOKEN*:

In [7]:
# Obtain the PiMP API token via the pals.pimp_tools helper; per the note above
# it is expected to come from the PIMP_API_TOKEN environment variable.
token = get_pimp_API_token_from_env()

In [8]:
# Identifier of the PiMP analysis that the next cell downloads.
analysis_id = 1321 # example beer analysis

In [9]:
# Download the intensity frame, annotation frame and experimental design for
# the selected analysis. The last argument is 'kegg' (presumably the annotation
# database to request -- confirm against pals.pimp_tools).
int_df, annotation_df, experimental_design = download_from_pimp(
    token,
    PIMP_HOST,
    analysis_id,
    'kegg',
)

2019-12-20 10:51:54.925 | DEBUG    | pals.pimp_tools:download_from_pimp:119 - Trying to load data from temp file: /var/folders/t8/0r47by2j56v3kx2h3nz4bpd40000gn/T/pimp_analysis_1321.p
2019-12-20 10:51:54.927 | DEBUG    | pals.pimp_tools:download_from_pimp:123 - Retrieving data for analysis 1321 from PiMP
2019-12-20 10:51:56.619 | DEBUG    | pals.pimp_tools:get_data:33 - http://polyomics.mvls.gla.ac.uk/export/get_ms1_intensities?analysis_id=1321
2019-12-20 10:52:16.530 | DEBUG    | pals.pimp_tools:get_data:33 - http://polyomics.mvls.gla.ac.uk/export/get_ms1_peaks?analysis_id=1321
2019-12-20 10:52:30.450 | DEBUG    | pals.pimp_tools:get_data:33 - http://polyomics.mvls.gla.ac.uk/export/get_experimental_design?analysis_id=1321
2019-12-20 10:52:30.451 | DEBUG    | pals.pimp_tools:download_from_pimp:132 - Caching analysis data for next use
2019-12-20 10:52:30.452 | DEBUG    | pals.common:save_obj:76 - Saving <class 'dict'> to /var/folders/t8/0r47by2j56v3kx2h3nz4bpd40000gn/T/pimp_analysis_132

# CRC Data Analysis 

In [24]:
# Load the CRC table. The path is relative, so 'crc_with_kegg.csv' must be in
# the notebook's working directory.
crc_table = pd.read_csv('crc_with_kegg.csv')

In [25]:
# Preview the first five rows of the raw CRC table.
display(crc_table.head(n=5))

Unnamed: 0.1,Unnamed: 0,Metabolites,Kegg Id,Patient group:CRC,Patient group:CRC.1,Patient group:CRC.2,Patient group:CRC.3,Patient group:CRC.4,Patient group:CRC.5,Patient group:CRC.6,...,Patient group:Polyp.66,Patient group:Polyp.67,Patient group:Polyp.68,Patient group:Polyp.69,Patient group:Polyp.70,Patient group:Polyp.71,Patient group:Polyp.72,Patient group:Polyp.73,Patient group:Polyp.74,Patient group:Polyp.75
0,1,1-Methyladenosine,C02494,390953.9725,567984.4359,558842.7545,476949.0,398317.0,411224.0,412250.6387,...,390493.7447,375952.1076,399195.9585,1018448.0,503210.6702,311604.8089,514713.0726,407366.5548,463101.8944,510398.3294
1,2,1-Methylhistamine,C05127,34627.3365,52845.9284,31507.3427,29397.0,38877.0,25493.0,44478.1854,...,18266.089,30752.978,41086.0892,20943.03,27630.5961,30531.1781,24293.0652,29915.2947,32606.5463,29862.9664
2,3,2-Aminoadipate,C00956,141257.364,528024.6865,188272.3016,107076.0918,158299.071,136600.4976,235936.4772,...,149966.9484,188581.2257,153364.1518,2666709.0,228074.4199,132828.406,215838.8101,179582.5152,227973.0218,216277.0586
3,4,2-Deoxyuridine,C00526,13115.8131,14327.696,12756.8649,10539.0,11689.0,12425.0,15692.6312,...,11790.1935,11041.2359,13050.2462,10944.57,13652.0542,9250.8437,14597.4711,14137.262,13281.2552,12653.4161
4,5,4-Pyridoxic acid,C00847,333815.0605,402460.7838,389636.3191,373058.7101,323706.9435,358042.2763,355881.3356,...,379298.6241,356057.1511,412505.9735,426002.7,358592.6192,314369.6151,327164.9675,361262.106,376591.8864,368266.5694


#### Reset index position and name

In [26]:
# Use the CSV's 'Unnamed: 0' column as the index and name it 'row_id'.
# The frame is rebound rather than mutated in place, and the step is guarded on
# the index name so re-running this cell is a no-op instead of a KeyError
# (after the first run 'Unnamed: 0' is no longer a column).
if crc_table.index.name != 'row_id':
    crc_table = crc_table.set_index('Unnamed: 0').rename_axis('row_id')

#### Get the annotation dataframe using the index and the KEGG IDs

In [27]:
# Annotation frame: keep only the KEGG id column, exposed under the name
# 'entity_id', indexed by the table's row index.
crc_annotation_df = crc_table[['Kegg Id']].rename(columns={'Kegg Id': 'entity_id'})
display(crc_annotation_df.head(n=5))

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
1,C02494
2,C05127
3,C00956
4,C00526
5,C00847


#### Drop columns not required by the intensity DF

In [28]:
# Intensity frame: the table without its two descriptive columns, leaving only
# the per-sample columns. `columns` is kept for building the group lists later.
crc_int_df = crc_table.drop(['Metabolites', 'Kegg Id'], axis='columns')
columns = crc_int_df.columns
display(crc_int_df.head(n=5))



Unnamed: 0_level_0,Patient group:CRC,Patient group:CRC.1,Patient group:CRC.2,Patient group:CRC.3,Patient group:CRC.4,Patient group:CRC.5,Patient group:CRC.6,Patient group:CRC.7,Patient group:CRC.8,Patient group:CRC.9,...,Patient group:Polyp.66,Patient group:Polyp.67,Patient group:Polyp.68,Patient group:Polyp.69,Patient group:Polyp.70,Patient group:Polyp.71,Patient group:Polyp.72,Patient group:Polyp.73,Patient group:Polyp.74,Patient group:Polyp.75
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,390953.9725,567984.4359,558842.7545,476949.0,398317.0,411224.0,412250.6387,414501.1114,416166.0,384549.0,...,390493.7447,375952.1076,399195.9585,1018448.0,503210.6702,311604.8089,514713.0726,407366.5548,463101.8944,510398.3294
2,34627.3365,52845.9284,31507.3427,29397.0,38877.0,25493.0,44478.1854,27448.735,34757.0,26494.0,...,18266.089,30752.978,41086.0892,20943.03,27630.5961,30531.1781,24293.0652,29915.2947,32606.5463,29862.9664
3,141257.364,528024.6865,188272.3016,107076.0918,158299.071,136600.4976,235936.4772,419826.6337,351043.7015,124681.8762,...,149966.9484,188581.2257,153364.1518,2666709.0,228074.4199,132828.406,215838.8101,179582.5152,227973.0218,216277.0586
4,13115.8131,14327.696,12756.8649,10539.0,11689.0,12425.0,15692.6312,15743.5808,8418.0,9693.0,...,11790.1935,11041.2359,13050.2462,10944.57,13652.0542,9250.8437,14597.4711,14137.262,13281.2552,12653.4161
5,333815.0605,402460.7838,389636.3191,373058.7101,323706.9435,358042.2763,355881.3356,347876.8878,347660.2188,354733.2473,...,379298.6241,356057.1511,412505.9735,426002.7,358592.6192,314369.6151,327164.9675,361262.106,376591.8864,368266.5694


### Make group lists for the experimental design

In [29]:
# Assign each sample column to a patient group by substring match on its name.
# The checks are an if/elif chain (CRC, then Healthy, then Polyp), so a column
# lands in at most one group; columns matching none are ignored.
crc, healthy, polyp = [], [], []
for sample_col in columns:
    if 'CRC' in sample_col:
        crc.append(sample_col)
    elif 'Healthy' in sample_col:
        healthy.append(sample_col)
    elif 'Polyp' in sample_col:
        polyp.append(sample_col)

# Comparisons to run. Each comparison must have a unique 'name': the second one
# (polyp vs healthy) previously reused 'crc/healthy', so both comparisons would
# be labelled identically wherever results are keyed by comparison name.
# NOTE(review): this duplicate is the likely cause of the "truth value of a
# Series is ambiguous" error recorded further down -- confirm by re-running.
crc_experimental_design = {
    'comparisons': [
        {'case': 'CRC', 'control': 'healthy', 'name': 'crc/healthy'},
        {'case': 'polyp', 'control': 'healthy', 'name': 'polyp/healthy'},
    ],
}


In [30]:
# Attach the per-group sample lists under the same labels that the comparisons
# use for 'case' and 'control'.
crc_experimental_design['groups'] = {
    'healthy': healthy,
    'CRC': crc,
    'polyp': polyp,
}

In [31]:
# Build the data source for the CRC data from the intensity frame, annotation
# frame and experimental design, using the PiMP KEGG database constant.
crc_ds = DataSource(
    crc_int_df,
    crc_annotation_df,
    crc_experimental_design,
    DATABASE_PIMP_KEGG,
)

2019-12-20 10:59:09.429 | DEBUG    | pals.feature_extraction:__init__:40 - Loading /Users/Karen/PALS/pals/data/PiMP_KEGG.json.zip
2019-12-20 10:59:09.478 | DEBUG    | pals.feature_extraction:__init__:92 - Mapping pathway to unique ids
2019-12-20 10:59:09.486 | DEBUG    | pals.feature_extraction:__init__:106 - Creating dataset to pathway mapping
2019-12-20 10:59:09.498 | DEBUG    | pals.feature_extraction:__init__:134 - Computing unique id counts


In [32]:
# Run the PALS pathway analysis on the CRC data source.
# NOTE(review): the saved output of this cell ends in a ValueError.
pals = PALS(
    crc_ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-12-20 10:59:12.158 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:385 - Setting the zero intensity values in the dataframe
2019-12-20 10:59:12.263 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:366 - Scaling the data across the sample: zero mean and unit variance
2019-12-20 10:59:12.267 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:205 - Mean values of the rows in the DF is [-0. -0. -0.  0. -0. -0.  0.  0.  0.  0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0.  0.  0. -0. -0.  0. -0.  0. -0. -0. -0.  0. -0.  0.  0.  0. -0. -0.
  0.  0.  0. -0. -0. -0. -0.  0.  0. -0. -0. -0.  0. -0.  0. -0.  0.  0.
  0. -0.  0. -0.  0.  0. -0. -0. -0. -0.  0. -0. -0.  0.  0.  0. -0.  0.
 -0. -0. -0.  0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0. -0. -0.  0.  0.
  0.  0. -0.  0. -0. -0. -0. -0. -0. -0.  0. -0.  0. -0. -0.]
2019-12-20 10:59:12.268 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:206 - Variance in the rows of the DF is [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

### Back to Beer

In [11]:
# Show the experimental design that was downloaded from PiMP for the beer analysis.
experimental_design

{'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'},
  {'case': 'beer3', 'control': 'beer4', 'name': 'beer3/beer4'}],
 'groups': {'beer4': ['Beer_4_full3.mzXML',
   'Beer_4_full2.mzXML',
   'Beer_4_full1.mzXML'],
  'beer3': ['Beer_3_full3.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full1.mzXML'],
  'beer2': ['Beer_2_full3.mzXML', 'Beer_2_full1.mzXML', 'Beer_2_full2.mzXML'],
  'beer1': ['Beer_1_full2.mzXML', 'Beer_1_full1.mzXML', 'Beer_1_full3.mzXML']}}

### PALS analysis using KEGG database exported from PiMP

In [102]:
# Inspect the annotation frame downloaded from PiMP (row_id index, entity_id column).
annotation_df

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163
...,...
3040926,C20522
3040929,C20582
3041077,C20499
3041172,C20504


In [12]:
# Data source for the beer data, using the PiMP KEGG database constant.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_PIMP_KEGG,
)

2019-12-18 13:29:54.631 | DEBUG    | pals.feature_extraction:get_data:96 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-18 13:29:54.657 | DEBUG    | pals.feature_extraction:__init__:46 - Mapping pathway to unique ids
2019-12-18 13:29:54.663 | DEBUG    | pals.feature_extraction:__init__:60 - Creating dataset to pathway mapping
2019-12-18 13:29:55.564 | DEBUG    | pals.feature_extraction:__init__:88 - Computing unique id counts


In [13]:
# Run the PALS pathway analysis on the PiMP KEGG data source.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-12-18 13:29:55.640 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:385 - Setting the zero intensity values in the dataframe
2019-12-18 13:29:55.670 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:366 - Scaling the data across the sample: zero mean and unit variance
2019-12-18 13:29:55.677 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:205 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-12-18 13:29:55.678 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:206 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-12-18 13:29:56.386 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:217 - Calculating plage p-values with resampling
2019-12-18 13:29:56.387 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:221 - Comparison beer1/beer2
2019-12-18 13:29:56.387 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:229 - Resampling 0/1000
2019-12-18 13:29:56.486 | DEBUG    | pals.pathw

In [14]:
# Order pathways by the combined p-value of the beer1/beer2 comparison,
# smallest first. The sorted copy is rebound instead of sorting in place, so
# the frame returned by PALS is not mutated.
pathway_df = pathway_df.sort_values(by='PiMP_KEGG beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG beer1/beer2 comb_p,PiMP_KEGG beer3/beer4 comb_p
map00380,Tryptophan metabolism,0.026410,0.050835,64,34,53.12,2.024112e-08,14.26,22.28,0.001464,0.003665
map00330,Arginine and proline metabolism,0.076544,0.031235,79,50,63.29,7.494293e-16,17.61,22.29,0.001512,0.000348
map00460,Cyanoamino acid metabolism,0.033922,0.042044,40,25,62.50,1.448042e-08,8.92,22.30,0.001998,0.002702
map00300,Lysine biosynthesis,0.037750,0.034624,27,20,74.07,3.670262e-09,6.02,22.30,0.002007,0.001775
map00400,"Phenylalanine, tyrosine and tryptophan biosynt...",0.051530,0.031751,30,22,73.33,9.098577e-10,6.69,22.30,0.002736,0.001355
...,...,...,...,...,...,...,...,...,...,...,...
map07227,Histamine H2/H3 receptor agonists/antagonists,1.000000,0.212082,10,1,10.00,7.411824e-01,2.23,22.30,1.000000,0.255651
map00523,Polyketide sugar unit biosynthesis,1.000000,0.053513,19,2,10.53,8.566036e-01,4.23,22.26,1.000000,0.085109
map05032,Morphine addiction,1.000000,0.071093,8,3,37.50,1.185365e-01,1.78,22.25,1.000000,0.047353
map01040,Biosynthesis of unsaturated fatty acids,1.000000,0.069916,42,4,9.52,9.770476e-01,9.36,22.29,1.000000,0.145428


### PALS analysis using KEGG database exported from Reactome

In [15]:
# Data source using the Reactome KEGG database constant, restricted to
# Homo sapiens and to metabolic pathways only.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
)

2019-12-18 13:29:59.211 | DEBUG    | pals.feature_extraction:get_data:120 - Loading ..\pals\data\reactome\metabolic_pathways\COMPOUND\Homo sapiens.json.zip
2019-12-18 13:29:59.234 | DEBUG    | pals.feature_extraction:__init__:46 - Mapping pathway to unique ids
2019-12-18 13:29:59.236 | DEBUG    | pals.feature_extraction:__init__:60 - Creating dataset to pathway mapping
2019-12-18 13:30:00.161 | DEBUG    | pals.feature_extraction:__init__:88 - Computing unique id counts


In [16]:
# Run the PALS pathway analysis on the Reactome-backed data source.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-12-18 13:30:00.247 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:385 - Setting the zero intensity values in the dataframe
2019-12-18 13:30:00.276 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:366 - Scaling the data across the sample: zero mean and unit variance
2019-12-18 13:30:00.285 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:205 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-12-18 13:30:00.286 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:206 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-12-18 13:30:00.725 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:217 - Calculating plage p-values with resampling
2019-12-18 13:30:00.726 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:221 - Comparison beer1/beer2
2019-12-18 13:30:00.727 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:229 - Resampling 0/1000
2019-12-18 13:30:00.852 | DEBUG    | pals.pathw

In [17]:
# Order pathways by the combined p-value of the beer1/beer2 comparison,
# smallest first. The sorted copy is rebound instead of sorting in place, so
# the frame returned by PALS is not mutated.
pathway_df = pathway_df.sort_values(by='COMPOUND beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-71240,Tryptophan catabolism,0.016712,0.057485,27,14,51.85,0.020106,9.12,33.78,0.006423,0.025709
R-HSA-2024096,HS-GAG degradation,0.011545,0.078462,5,1,20.00,0.659417,1.69,33.80,0.015887,0.095534
R-HSA-163685,Integration of energy metabolism,0.032313,1.000000,1,1,100.00,0.113509,0.34,34.00,0.020237,1.000000
R-HSA-351143,Agmatine biosynthesis,0.023579,0.115855,5,2,40.00,0.327631,1.69,33.80,0.020981,0.103781
R-HSA-71182,Phenylalanine and tyrosine catabolism,0.051213,0.131596,24,13,54.17,0.015294,8.11,33.79,0.021405,0.064101
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-1989781,PPARA activates gene expression,1.000000,0.341515,2,1,50.00,0.264649,0.68,34.00,1.000000,0.300218
R-HSA-2046106,alpha-linolenic acid (ALA) metabolism,1.000000,0.094125,14,1,7.14,0.983784,4.73,33.79,1.000000,0.191954
R-HSA-1855183,"Synthesis of IP2, IP, and Ins in the cytosol",1.000000,0.064520,11,3,27.27,0.622386,3.72,33.82,1.000000,0.076749
R-HSA-70221,Glycogen breakdown (glycogenolysis),1.000000,0.064520,10,3,30.00,0.541649,3.38,33.80,1.000000,0.071064


### PALS analysis of compounds by connecting to Reactome

In [18]:
# Same Reactome configuration as the previous section, but with
# reactome_query=True so the pathway data is retrieved from Reactome itself.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_REACTOME_KEGG,
    reactome_species=REACTOME_SPECIES_HOMO_SAPIENS,
    reactome_metabolic_pathway_only=True,
    reactome_query=True,
)

2019-12-18 13:30:03.748 | DEBUG    | pals.feature_extraction:get_data:102 - Retrieving data for Homo sapiens from Reactome COMPOUND metabolic_pathway_only=True
2019-12-18 13:30:08.208 | DEBUG    | pals.feature_extraction:__init__:46 - Mapping pathway to unique ids
2019-12-18 13:30:08.210 | DEBUG    | pals.feature_extraction:__init__:60 - Creating dataset to pathway mapping
2019-12-18 13:30:09.122 | DEBUG    | pals.feature_extraction:__init__:88 - Computing unique id counts


In [19]:
# Run the PALS pathway analysis on the data source built with reactome_query=True.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2019-12-18 13:30:09.200 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:385 - Setting the zero intensity values in the dataframe
2019-12-18 13:30:09.229 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:366 - Scaling the data across the sample: zero mean and unit variance
2019-12-18 13:30:09.238 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:205 - Mean values of the rows in the DF is [ 0.  0. -0. ... -0. -0. -0.]
2019-12-18 13:30:09.239 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:206 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-12-18 13:30:09.645 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:217 - Calculating plage p-values with resampling
2019-12-18 13:30:09.646 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:221 - Comparison beer1/beer2
2019-12-18 13:30:09.647 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:229 - Resampling 0/1000
2019-12-18 13:30:09.741 | DEBUG    | pals.pathw

In [20]:
# Order pathways by the combined p-value of the beer1/beer2 comparison,
# smallest first. The sorted copy is rebound instead of sorting in place, so
# the frame returned by PALS is not mutated.
pathway_df = pathway_df.sort_values(by='COMPOUND beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND beer1/beer2 comb_p,COMPOUND beer3/beer4 comb_p
R-HSA-71240,Tryptophan catabolism,0.025106,0.053509,27,14,51.85,0.020106,9.12,33.78,0.010104,0.023696
R-HSA-2024096,HS-GAG degradation,0.018223,0.066147,5,1,20.00,0.659417,1.69,33.80,0.024378,0.081462
R-HSA-163685,Integration of energy metabolism,0.044891,1.000000,1,1,100.00,0.113509,0.34,34.00,0.028682,1.000000
R-HSA-71182,Phenylalanine and tyrosine catabolism,0.069151,0.115252,24,13,54.17,0.015294,8.11,33.79,0.030229,0.054815
R-HSA-351143,Agmatine biosynthesis,0.034558,0.100506,5,2,40.00,0.327631,1.69,33.80,0.030748,0.089895
...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-351200,Interconversion of polyamines,1.000000,0.926579,9,1,11.11,0.904169,3.04,33.78,1.000000,0.953391
R-HSA-2408517,SeMet incorporation into proteins,1.000000,0.080412,3,1,33.33,0.415382,1.01,33.67,1.000000,0.078244
R-HSA-164378,PKA activation in glucagon signalling,1.000000,0.080412,5,1,20.00,0.659417,1.69,33.80,1.000000,0.097748
R-HSA-77289,Mitochondrial Fatty Acid Beta-Oxidation,1.000000,0.080412,8,1,12.50,0.866228,2.70,33.75,1.000000,0.123507


### ORA Analysis

In [21]:
# Rebuild the PiMP KEGG data source for the over-representation analysis.
ds = DataSource(
    int_df,
    annotation_df,
    experimental_design,
    DATABASE_PIMP_KEGG,
)

2019-12-18 13:30:12.351 | DEBUG    | pals.feature_extraction:get_data:96 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2019-12-18 13:30:12.406 | DEBUG    | pals.feature_extraction:__init__:46 - Mapping pathway to unique ids
2019-12-18 13:30:12.412 | DEBUG    | pals.feature_extraction:__init__:60 - Creating dataset to pathway mapping
2019-12-18 13:30:13.446 | DEBUG    | pals.feature_extraction:__init__:88 - Computing unique id counts


In [22]:
# Compute the ORA result table; only min_replace is set here, the remaining
# PALS options are left at their defaults.
pals = PALS(
    ds,
    min_replace=5000,
)
pathway_df = pals.get_ora_df()

2019-12-18 13:30:13.526 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2019-12-18 13:30:13.528 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:385 - Setting the zero intensity values in the dataframe
2019-12-18 13:30:19.806 | DEBUG    | pals.pathway_analysis:get_ora_df:109 - Correcting for multiple t-tests
2019-12-18 13:30:19.812 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:471 - Calculating dataset formula coverage


In [23]:
# Order the ORA results by the combined p-value of the beer1/beer2 comparison,
# smallest first. The sorted copy is rebound instead of sorting in place, so
# the frame returned by PALS is not mutated.
pathway_df = pathway_df.sort_values(by='PiMP_KEGG beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0_level_0,pw_name,beer1/beer2 p-value,beer3/beer4 p-value,PiMP_KEGG beer1/beer2 comb_p,PiMP_KEGG beer3/beer4 comb_p,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
map00350,Tyrosine metabolism,3.621191e-16,1.681454e-16,8.183892e-14,3.800087e-14,53,39,73.58
map00330,Arginine and proline metabolism,1.775486e-12,1.686603e-12,2.006299e-10,1.905861e-10,79,50,63.29
map00290,"Valine, leucine and isoleucine biosynthesis",9.591121e-12,2.028069e-11,7.225312e-10,1.527812e-09,17,16,94.12
map00400,"Phenylalanine, tyrosine and tryptophan biosynt...",2.824918e-10,7.042130e-10,1.596078e-08,3.978804e-08,30,22,73.33
map00660,C5-Branched dibasic acid metabolism,5.693949e-09,1.159169e-08,1.838332e-07,2.619722e-07,21,16,76.19
...,...,...,...,...,...,...,...,...
map00254,Aflatoxin biosynthesis,9.905000e-01,9.925788e-01,1.000000e+00,1.000000e+00,24,3,12.50
map00195,Photosynthesis,1.000000e+00,8.696482e-01,1.000000e+00,9.839475e-01,10,1,10.00
map00981,Insect hormone biosynthesis,9.829573e-01,1.000000e+00,1.000000e+00,1.000000e+00,21,2,9.52
map00942,Anthocyanin biosynthesis,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,50,1,2.00
