# Demonstrates how to load input CSV files and run them through PALS

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.feature_extraction import DataSource
from pals.PLAGE import PLAGE
from pals.ORA import ORA
from pals.GSEA import GSEA
from pals.common import *

2021-01-07 16:58:15.549 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Beer Analysis

### Load data

In [5]:
# Both beer example inputs live in the same directory, so build that path once
# and derive the two CSV locations from it.
beer_dir = os.path.join('test_data', 'beer')
intensity_csv = os.path.join(beer_dir, 'int_df.csv')
annotation_csv = os.path.join(beer_dir, 'annotation_df.csv')

In [6]:
# Read the two CSV files; the result of load_data is unpacked into the
# intensity frame, the annotation frame and the sample groups.
# NOTE(review): load_data is not imported by name -- presumably it is provided
# by the `from pals.common import *` wildcard import; confirm before narrowing it.
int_df, annotation_df, groups = load_data(intensity_csv, annotation_csv)

2021-01-07 16:58:16.361 | DEBUG    | pals.common:load_data:176 - Loaded 7375 x 12 peak intensities from test_data\beer\int_df.csv
2021-01-07 16:58:16.362 | DEBUG    | pals.common:load_data:177 - Loaded groups: {'beer1': ['Beer_1_full1.mzXML', 'Beer_1_full2.mzXML', 'Beer_1_full3.mzXML'], 'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'], 'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'], 'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML']}
2021-01-07 16:58:16.372 | DEBUG    | pals.common:load_data:180 - Loaded 14549 peak annotations from test_data\beer\annotation_df.csv


In [7]:
# Preview the first rows of the intensity table loaded from intensity_csv.
int_df.head()

Unnamed: 0_level_0,Beer_1_full1.mzXML,Beer_1_full2.mzXML,Beer_1_full3.mzXML,Beer_2_full1.mzXML,Beer_2_full2.mzXML,Beer_2_full3.mzXML,Beer_3_full1.mzXML,Beer_3_full2.mzXML,Beer_3_full3.mzXML,Beer_4_full1.mzXML,Beer_4_full2.mzXML,Beer_4_full3.mzXML
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3033929,2235291000.0,2000478000.0,2170697000.0,2242760000.0,2279882000.0,1959480000.0,2079356000.0,2110473000.0,2243653000.0,1817065000.0,1746443000.0,1779827000.0
3033930,44334910.0,42873870.0,48948530.0,47604480.0,42172800.0,39084520.0,38257780.0,37701920.0,40871890.0,33304770.0,31536300.0,31024100.0
3033931,1723985000.0,1764235000.0,1585143000.0,1543961000.0,1579320000.0,1555666000.0,1698130000.0,1481824000.0,1508645000.0,1642510000.0,1723919000.0,1697806000.0
3033932,625423700.0,650341700.0,591497500.0,463592900.0,429838200.0,403874700.0,429283700.0,370876100.0,477893200.0,390316500.0,408099500.0,430989200.0
3033933,1075022000.0,929347400.0,1092635000.0,1130720000.0,1118146000.0,1192834000.0,1231442000.0,1262046000.0,1460653000.0,1009838000.0,908511100.0,996717600.0


In [8]:
# Preview the first rows of the annotation table loaded from annotation_csv.
annotation_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
3033929,C00148
3036581,C00148
3036855,C00148
3038249,C00148
3033929,C00163


In [9]:
# Display the sample groups returned by load_data.
groups

{'beer1': ['Beer_1_full1.mzXML', 'Beer_1_full2.mzXML', 'Beer_1_full3.mzXML'],
 'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'],
 'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'],
 'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML']}

Define some comparisons. These should be specified by the user through the interface.

For simplicity, we can just let the user specify one comparison at a time (which is currently the case in PALS Viewer), although the code allows us to specify multiple comparisons.

In [10]:
# Each entry is a (case, control) pair of group names; the pairs are unpacked
# in that order when the experimental design is built.
comparisons = [
    ('beer1', 'beer2'), 
    # Uncomment the next line to add a second comparison:
    # ('beer3', 'beer4')
]

Create experimental design dictionary

In [11]:
# Bundle the sample groups with one descriptor per requested comparison.
# Each descriptor records the case group, the control group and a
# "case/control" name built from the two.
experimental_design = {
    'groups': groups,
    'comparisons': [
        {
            'case': case_group,
            'control': control_group,
            'name': '%s/%s' % (case_group, control_group),
        }
        for case_group, control_group in comparisons
    ],
}
experimental_design

{'groups': {'beer1': ['Beer_1_full1.mzXML',
   'Beer_1_full2.mzXML',
   'Beer_1_full3.mzXML'],
  'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'],
  'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'],
  'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML']},
 'comparisons': [{'case': 'beer1', 'control': 'beer2', 'name': 'beer1/beer2'}]}

### PALS analysis using KEGG database exported from PiMP

In [12]:
# Build the data source from the intensities, annotations and experimental
# design, selecting the database identified by DATABASE_PIMP_KEGG.
# NOTE(review): DATABASE_PIMP_KEGG is not imported by name -- presumably it is
# provided by the `from pals.common import *` wildcard import; confirm.
ds = DataSource(int_df, annotation_df, experimental_design, DATABASE_PIMP_KEGG)

2021-01-07 16:58:16.989 | DEBUG    | pals.feature_extraction:__init__:43 - Using PiMP_KEGG as database
2021-01-07 16:58:16.989 | DEBUG    | pals.loader:load_data:42 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2021-01-07 16:58:17.014 | DEBUG    | pals.feature_extraction:__init__:56 - Mapping pathway to unique ids
2021-01-07 16:58:17.020 | DEBUG    | pals.feature_extraction:__init__:70 - Creating dataset to pathway mapping
2021-01-07 16:58:17.874 | DEBUG    | pals.feature_extraction:__init__:98 - Computing unique id counts


In [13]:
# Run PLAGE on the data source and collect the results into pathway_df.
# NOTE(review): the log output reports that p-values are calculated with
# resampling and no random seed is set in the visible cells, so results may
# differ between runs -- confirm whether PLAGE exposes a seed.
plage = PLAGE(ds)
pathway_df = plage.get_results()

2021-01-07 16:58:17.972 | DEBUG    | pals.PLAGE:__init__:29 - PLAGE initialised
2021-01-07 16:58:18.101 | DEBUG    | pals.preprocessing:process:20 - Performing min-value imputation
2021-01-07 16:58:18.111 | DEBUG    | pals.preprocessing:process:36 - Performing row average imputation
2021-01-07 16:58:18.121 | DEBUG    | pals.preprocessing:process:46 - Applying log normalisation
2021-01-07 16:58:18.122 | DEBUG    | pals.preprocessing:process:53 - Scaling the data across the sample: zero mean and unit variance
2021-01-07 16:58:18.185 | DEBUG    | pals.PLAGE:get_plage_activity_df:84 - Mean values of the rows in the DF is [ 0.  0. -0. ...  0. -0. -0.]
2021-01-07 16:58:18.185 | DEBUG    | pals.PLAGE:get_plage_activity_df:85 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2021-01-07 16:58:18.374 | DEBUG    | pals.PLAGE:set_up_resample_plage_p_df:96 - Calculating plage p-values with resampling
2021-01-07 16:58:18.375 | DEBUG    | pals.PLAGE:set_up_resample_plage_p_df:103 - Comparis

In [14]:
# Rank the pathways by the combined p-value of the beer1/beer2 comparison,
# smallest first. The sorted copy is rebound to the same name instead of using
# inplace=True, so the frame object returned by plage.get_results() is not
# mutated while later cells still see a sorted pathway_df.
pathway_df = pathway_df.sort_values('PiMP_KEGG beer1/beer2 comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,beer1/beer2 p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG beer1/beer2 comb_p
map04740,Olfactory transduction,0.027830,5,1,20.00,7.168639e-01,1.11,22.20,0.027830
map07016,Sulfonamide derivatives - sulfa drugs,0.050534,2,1,50.00,3.961500e-01,0.45,22.50,0.050534
map05146,Amoebiasis,0.064102,8,2,25.00,5.620670e-01,1.78,22.25,0.064102
map00380,Tryptophan metabolism,0.070210,64,34,53.12,5.093550e-08,14.26,22.28,0.070210
map00460,Cyanoamino acid metabolism,0.086354,40,25,62.50,4.257882e-08,8.92,22.30,0.086354
...,...,...,...,...,...,...,...,...,...
map07216,Catecholamine transferase inhibitors,1.000000,5,3,60.00,7.682999e-02,1.11,22.20,1.000000
map00254,Aflatoxin biosynthesis,1.000000,24,3,12.50,9.288490e-01,5.35,22.29,1.000000
map04745,Phototransduction - fly,1.000000,6,1,16.67,7.800807e-01,1.34,22.33,1.000000
map00190,Oxidative phosphorylation,1.000000,13,4,30.77,3.253097e-01,2.90,22.31,1.000000
