In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_authentication_token, get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.feature_extraction import DataSource
from pals.PALS import PALS
from pals.ORA import ORA
from pals.common import *

2020-01-06 13:17:33.856 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# CRC Data Analysis 

In [5]:
# Load the CRC metabolite table; the relative path is resolved against the current working directory.
crc_csv_path = 'crc_with_kegg.csv'
crc_table = pd.read_csv(crc_csv_path)

In [6]:
# Preview the first five rows of the raw table.
crc_table.head(n=5)

Unnamed: 0.1,Unnamed: 0,Metabolites,Kegg Id,Patient group:CRC,Patient group:CRC.1,Patient group:CRC.2,Patient group:CRC.3,Patient group:CRC.4,Patient group:CRC.5,Patient group:CRC.6,...,Patient group:Polyp.66,Patient group:Polyp.67,Patient group:Polyp.68,Patient group:Polyp.69,Patient group:Polyp.70,Patient group:Polyp.71,Patient group:Polyp.72,Patient group:Polyp.73,Patient group:Polyp.74,Patient group:Polyp.75
0,1,1-Methyladenosine,C02494,390953.9725,567984.4359,558842.7545,476949.0,398317.0,411224.0,412250.6387,...,390493.7447,375952.1076,399195.9585,1018448.0,503210.6702,311604.8089,514713.0726,407366.5548,463101.8944,510398.3294
1,2,1-Methylhistamine,C05127,34627.3365,52845.9284,31507.3427,29397.0,38877.0,25493.0,44478.1854,...,18266.089,30752.978,41086.0892,20943.03,27630.5961,30531.1781,24293.0652,29915.2947,32606.5463,29862.9664
2,3,2-Aminoadipate,C00956,141257.364,528024.6865,188272.3016,107076.0918,158299.071,136600.4976,235936.4772,...,149966.9484,188581.2257,153364.1518,2666709.0,228074.4199,132828.406,215838.8101,179582.5152,227973.0218,216277.0586
3,4,2-Deoxyuridine,C00526,13115.8131,14327.696,12756.8649,10539.0,11689.0,12425.0,15692.6312,...,11790.1935,11041.2359,13050.2462,10944.57,13652.0542,9250.8437,14597.4711,14137.262,13281.2552,12653.4161
4,5,4-Pyridoxic acid,C00847,333815.0605,402460.7838,389636.3191,373058.7101,323706.9435,358042.2763,355881.3356,...,379298.6241,356057.1511,412505.9735,426002.7,358592.6192,314369.6151,327164.9675,361262.106,376591.8864,368266.5694


#### Reset index position and name

In [7]:
# Move the CSV's leftover 'Unnamed: 0' column into the index and label it 'row_id'.
# The guard keeps this cell safe to re-run: once the column has been consumed by
# set_index, a second set_index('Unnamed: 0') would raise KeyError. On a table that
# has not been re-indexed yet, a missing column still raises KeyError as before.
if crc_table.index.name != 'row_id':
    crc_table = crc_table.set_index('Unnamed: 0')
    crc_table.index.name = 'row_id'

#### Get the annotation dataframe using the index and the KEGG IDs

In [8]:
# Annotation frame: keeps crc_table's index and exposes the 'Kegg Id' column as 'entity_id'.
crc_annotation_df = crc_table[['Kegg Id']].rename(columns={'Kegg Id': 'entity_id'})
crc_annotation_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
1,C02494
2,C05127
3,C00956
4,C00526
5,C00847


#### Drop columns not required by the intensity DF

In [9]:
# Everything except the descriptive columns is kept for the intensity frame.
non_intensity_columns = ['Metabolites', 'Kegg Id']
crc_int_df = crc_table.drop(columns=non_intensity_columns)
crc_int_df.head()

Unnamed: 0_level_0,Patient group:CRC,Patient group:CRC.1,Patient group:CRC.2,Patient group:CRC.3,Patient group:CRC.4,Patient group:CRC.5,Patient group:CRC.6,Patient group:CRC.7,Patient group:CRC.8,Patient group:CRC.9,...,Patient group:Polyp.66,Patient group:Polyp.67,Patient group:Polyp.68,Patient group:Polyp.69,Patient group:Polyp.70,Patient group:Polyp.71,Patient group:Polyp.72,Patient group:Polyp.73,Patient group:Polyp.74,Patient group:Polyp.75
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,390953.9725,567984.4359,558842.7545,476949.0,398317.0,411224.0,412250.6387,414501.1114,416166.0,384549.0,...,390493.7447,375952.1076,399195.9585,1018448.0,503210.6702,311604.8089,514713.0726,407366.5548,463101.8944,510398.3294
2,34627.3365,52845.9284,31507.3427,29397.0,38877.0,25493.0,44478.1854,27448.735,34757.0,26494.0,...,18266.089,30752.978,41086.0892,20943.03,27630.5961,30531.1781,24293.0652,29915.2947,32606.5463,29862.9664
3,141257.364,528024.6865,188272.3016,107076.0918,158299.071,136600.4976,235936.4772,419826.6337,351043.7015,124681.8762,...,149966.9484,188581.2257,153364.1518,2666709.0,228074.4199,132828.406,215838.8101,179582.5152,227973.0218,216277.0586
4,13115.8131,14327.696,12756.8649,10539.0,11689.0,12425.0,15692.6312,15743.5808,8418.0,9693.0,...,11790.1935,11041.2359,13050.2462,10944.57,13652.0542,9250.8437,14597.4711,14137.262,13281.2552,12653.4161
5,333815.0605,402460.7838,389636.3191,373058.7101,323706.9435,358042.2763,355881.3356,347876.8878,347660.2188,354733.2473,...,379298.6241,356057.1511,412505.9735,426002.7,358592.6192,314369.6151,327164.9675,361262.106,376591.8864,368266.5694


### Make group lists for the experimental design

In [10]:
# Bucket every intensity column by the group label contained in its header.
# Labels are tried in the order listed and a column is filed under the first
# label it contains; columns matching no label are left out of every list.
columns = crc_int_df.columns
crc, healthy, polyp = [], [], []
group_buckets = (('CRC', crc), ('Healthy', healthy), ('Polyp', polyp))

for c in columns:
    for label, bucket in group_buckets:
        if label in c:
            bucket.append(c)
            break

# Both comparisons use the 'healthy' group as the control.
crc_experimental_design = {
    'comparisons': [
        dict(case='CRC', control='healthy', name='crc/healthy'),
        dict(case='polyp', control='healthy', name='polyp/healthy'),
    ],
}

In [11]:
# Attach each group's sample columns under the group names used by the comparisons.
crc_experimental_design['groups'] = dict(healthy=healthy, CRC=crc, polyp=polyp)

In [12]:
# Bundle intensities, annotations and experimental design into a DataSource
# backed by the PiMP KEGG database constant.
crc_ds = DataSource(
    crc_int_df,
    crc_annotation_df,
    crc_experimental_design,
    DATABASE_PIMP_KEGG,
)

2020-01-06 13:17:34.516 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-01-06 13:17:34.517 | DEBUG    | pals.feature_extraction:get_database:105 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-06 13:17:34.539 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-01-06 13:17:34.544 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-01-06 13:17:34.552 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts


### PALS Analysis

In [13]:
# Run PALS on the CRC data source and collect the per-pathway results.
# NOTE(review): plage_weight=5 / hg_weight=1 are unexplained here — confirm they are the intended settings.
pals = PALS(
    crc_ds,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2020-01-06 13:17:34.633 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-06 13:17:34.697 | DEBUG    | pals.feature_extraction:standardize_intensity_df:261 - Scaling the data across the sample: zero mean and unit variance
2020-01-06 13:17:34.701 | DEBUG    | pals.PALS:get_plage_activity_df:76 - Mean values of the rows in the DF is [-0. -0. -0.  0. -0. -0.  0.  0.  0.  0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0.  0.  0. -0. -0.  0. -0.  0. -0. -0. -0.  0. -0.  0.  0.  0. -0. -0.
  0.  0.  0. -0. -0. -0. -0.  0.  0. -0. -0. -0.  0. -0.  0. -0.  0.  0.
  0. -0.  0. -0.  0.  0. -0. -0. -0. -0.  0. -0. -0.  0.  0.  0. -0.  0.
 -0. -0. -0.  0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0. -0. -0.  0.  0.
  0.  0. -0.  0. -0. -0. -0. -0. -0. -0.  0. -0.  0. -0. -0.]
2020-01-06 13:17:34.702 | DEBUG    | pals.PALS:get_plage_activity_df:77 - Variance in the rows of the DF is [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 

In [14]:
# Lift the row display limit so result tables are rendered in full.
pd.options.display.max_rows = None

In [15]:
# Order the PALS results by the crc/healthy combined p-value column, smallest first
# (ascending is the default); the frame is sorted in place and then displayed.
pathway_df.sort_values(by='PiMP_KEGG crc/healthy comb_p', inplace=True)
pathway_df

Unnamed: 0,pw_name,crc/healthy p-value,polyp/healthy p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG crc/healthy comb_p,PiMP_KEGG polyp/healthy comb_p
map00340,Histidine metabolism,2.920302e-07,1.0,41,6,14.63,0.001439225,1.28,3.12,2.080276e-08,1.0
map00410,beta-Alanine metabolism,1.582456e-05,1.0,31,9,29.03,2.107099e-07,0.97,3.13,1.961981e-07,1.0
map00660,C5-Branched dibasic acid metabolism,7.391061e-05,1.0,21,8,38.1,9.426139e-08,0.65,3.1,1.053728e-06,1.0
map00240,Pyrimidine metabolism,0.0002088803,0.975598,56,13,23.21,6.773139e-09,1.74,3.11,2.394464e-06,0.793406
map00300,Lysine biosynthesis,0.0006417282,0.999364,27,5,18.52,0.001228408,0.84,3.11,8.803399e-05,0.99485
map00250,"Alanine, aspartate and glutamate metabolism",0.01018012,1.0,23,11,47.83,1.424652e-11,0.72,3.13,0.000172071,1.0
map04964,Proximal tubule bicarbonate reclamation,0.005840953,1.0,16,7,43.75,2.042991e-07,0.5,3.12,0.0002641784,1.0
map04724,Glutamatergic synapse,0.001234589,1.0,7,2,28.57,0.01818771,0.22,3.14,0.0003641173,1.0
map00471,D-Glutamine and D-glutamate metabolism,0.005225202,1.0,7,4,57.14,2.865671e-05,0.22,3.14,0.0004835471,1.0
map04727,GABAergic synapse,0.005225202,1.0,9,4,44.44,9.831903e-05,0.28,3.11,0.0005954872,1.0


### ORA Analysis

In [16]:
# Build the DataSource used for the ORA run.
# NOTE(review): the arguments are identical to the ones used for crc_ds; presumably a
# fresh instance is wanted so ORA does not see state left behind by PALS — confirm.
ds = DataSource(
    crc_int_df,
    crc_annotation_df,
    crc_experimental_design,
    DATABASE_PIMP_KEGG,
)

2020-01-06 13:17:42.580 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-01-06 13:17:42.580 | DEBUG    | pals.feature_extraction:get_database:105 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-06 13:17:42.601 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-01-06 13:17:42.606 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-01-06 13:17:42.614 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts


In [17]:
# Run ORA on the freshly built data source.
ora = ORA(ds)
# NOTE: this rebinds pathway_df, so the PALS result assigned to the same name
# earlier in the notebook is no longer reachable after this cell runs.
pathway_df = ora.get_pathway_df()

2020-01-06 13:17:42.687 | DEBUG    | pals.ORA:get_pathway_df:33 - Calculating ORA
2020-01-06 13:17:42.688 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-06 13:17:44.267 | DEBUG    | pals.ORA:get_pathway_df:93 - Correcting for multiple t-tests
2020-01-06 13:17:44.272 | DEBUG    | pals.feature_extraction:_calculate_coverage_df:301 - Calculating dataset formula coverage


In [18]:
# Order the ORA results by the crc/healthy combined p-value column, smallest first
# (ascending is the default); the frame is sorted in place and then displayed.
pathway_df.sort_values(by='PiMP_KEGG crc/healthy comb_p', inplace=True)
pathway_df

Unnamed: 0_level_0,pw_name,crc/healthy p-value,polyp/healthy p-value,PiMP_KEGG crc/healthy comb_p,PiMP_KEGG polyp/healthy comb_p,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
map00970,Aminoacyl-tRNA biosynthesis,5.536095e-11,1.0,8.304143e-09,1.0,23,18,78.26
map00250,"Alanine, aspartate and glutamate metabolism",3.002766e-09,1.0,2.252074e-07,1.0,23,11,47.83
map00330,Arginine and proline metabolism,8.094736e-09,0.154929,3.035526e-07,0.663983,79,19,24.05
map02010,ABC transporters,8.094736e-09,0.01053,3.035526e-07,0.225644,79,27,34.18
map04974,Protein digestion and absorption,1.147609e-08,1.0,3.442826e-07,1.0,42,20,47.62
map00410,beta-Alanine metabolism,3.020148e-08,1.0,7.55037e-07,1.0,31,9,29.03
map04964,Proximal tubule bicarbonate reclamation,5.981264e-07,0.033152,1.281699e-05,0.320417,16,7,43.75
map02020,Two-component system,1.574146e-06,0.069348,2.951523e-05,0.358696,34,10,29.41
map04727,GABAergic synapse,1.784262e-06,1.0,2.973769e-05,1.0,9,4,44.44
map00260,"Glycine, serine and threonine metabolism",4.969895e-06,0.083116,7.454842e-05,0.415579,41,19,46.34
