In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_authentication_token, get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *

2020-01-06 11:17:23.445 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# CRC Data Analysis 

In [5]:
# Load the CRC table (metabolite names, KEGG ids and per-sample intensities)
# from a CSV resolved relative to the notebook's working directory.
crc_csv_path = 'crc_with_kegg.csv'
crc_table = pd.read_csv(crc_csv_path)

In [6]:
# Preview the first five rows of the loaded table.
crc_table.head(n=5)

Unnamed: 0.1,Unnamed: 0,Metabolites,Kegg Id,Patient group:CRC,Patient group:CRC.1,Patient group:CRC.2,Patient group:CRC.3,Patient group:CRC.4,Patient group:CRC.5,Patient group:CRC.6,...,Patient group:Polyp.66,Patient group:Polyp.67,Patient group:Polyp.68,Patient group:Polyp.69,Patient group:Polyp.70,Patient group:Polyp.71,Patient group:Polyp.72,Patient group:Polyp.73,Patient group:Polyp.74,Patient group:Polyp.75
0,1,1-Methyladenosine,C02494,390953.9725,567984.4359,558842.7545,476949.0,398317.0,411224.0,412250.6387,...,390493.7447,375952.1076,399195.9585,1018448.0,503210.6702,311604.8089,514713.0726,407366.5548,463101.8944,510398.3294
1,2,1-Methylhistamine,C05127,34627.3365,52845.9284,31507.3427,29397.0,38877.0,25493.0,44478.1854,...,18266.089,30752.978,41086.0892,20943.03,27630.5961,30531.1781,24293.0652,29915.2947,32606.5463,29862.9664
2,3,2-Aminoadipate,C00956,141257.364,528024.6865,188272.3016,107076.0918,158299.071,136600.4976,235936.4772,...,149966.9484,188581.2257,153364.1518,2666709.0,228074.4199,132828.406,215838.8101,179582.5152,227973.0218,216277.0586
3,4,2-Deoxyuridine,C00526,13115.8131,14327.696,12756.8649,10539.0,11689.0,12425.0,15692.6312,...,11790.1935,11041.2359,13050.2462,10944.57,13652.0542,9250.8437,14597.4711,14137.262,13281.2552,12653.4161
4,5,4-Pyridoxic acid,C00847,333815.0605,402460.7838,389636.3191,373058.7101,323706.9435,358042.2763,355881.3356,...,379298.6241,356057.1511,412505.9735,426002.7,358592.6192,314369.6151,327164.9675,361262.106,376591.8864,368266.5694


#### Use the `Unnamed: 0` column as the index and rename the index to `row_id`

In [7]:
# Promote the 'Unnamed: 0' column to the index and label the index 'row_id'.
# The membership check keeps this cell re-runnable: once the column has been
# moved into the index, a second execution skips the move instead of raising
# KeyError. Rebinding (rather than inplace=True) avoids mutating the frame
# object that the earlier preview cell displayed.
if 'Unnamed: 0' in crc_table.columns:
    crc_table = crc_table.set_index('Unnamed: 0')
crc_table.index.name = 'row_id'

#### Get the annotation dataframe using the index and the KEGG IDs

In [8]:
# Annotation frame: shares crc_table's index and carries a single column,
# the KEGG id, exposed under the name 'entity_id'.
crc_annotation_df = crc_table[['Kegg Id']].rename(columns={'Kegg Id': 'entity_id'})
crc_annotation_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
1,C02494
2,C05127
3,C00956
4,C00526
5,C00847


#### Drop columns not required by the intensity DF

In [9]:
# Intensity frame: everything in crc_table except the two descriptive columns,
# leaving only the per-sample measurement columns.
descriptive_columns = ['Metabolites', 'Kegg Id']
crc_int_df = crc_table.drop(descriptive_columns, axis='columns')
crc_int_df.head()

Unnamed: 0_level_0,Patient group:CRC,Patient group:CRC.1,Patient group:CRC.2,Patient group:CRC.3,Patient group:CRC.4,Patient group:CRC.5,Patient group:CRC.6,Patient group:CRC.7,Patient group:CRC.8,Patient group:CRC.9,...,Patient group:Polyp.66,Patient group:Polyp.67,Patient group:Polyp.68,Patient group:Polyp.69,Patient group:Polyp.70,Patient group:Polyp.71,Patient group:Polyp.72,Patient group:Polyp.73,Patient group:Polyp.74,Patient group:Polyp.75
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,390953.9725,567984.4359,558842.7545,476949.0,398317.0,411224.0,412250.6387,414501.1114,416166.0,384549.0,...,390493.7447,375952.1076,399195.9585,1018448.0,503210.6702,311604.8089,514713.0726,407366.5548,463101.8944,510398.3294
2,34627.3365,52845.9284,31507.3427,29397.0,38877.0,25493.0,44478.1854,27448.735,34757.0,26494.0,...,18266.089,30752.978,41086.0892,20943.03,27630.5961,30531.1781,24293.0652,29915.2947,32606.5463,29862.9664
3,141257.364,528024.6865,188272.3016,107076.0918,158299.071,136600.4976,235936.4772,419826.6337,351043.7015,124681.8762,...,149966.9484,188581.2257,153364.1518,2666709.0,228074.4199,132828.406,215838.8101,179582.5152,227973.0218,216277.0586
4,13115.8131,14327.696,12756.8649,10539.0,11689.0,12425.0,15692.6312,15743.5808,8418.0,9693.0,...,11790.1935,11041.2359,13050.2462,10944.57,13652.0542,9250.8437,14597.4711,14137.262,13281.2552,12653.4161
5,333815.0605,402460.7838,389636.3191,373058.7101,323706.9435,358042.2763,355881.3356,347876.8878,347660.2188,354733.2473,...,379298.6241,356057.1511,412505.9735,426002.7,358592.6192,314369.6151,327164.9675,361262.106,376591.8864,368266.5694


### Make group lists for the experimental design

In [10]:
columns = crc_int_df.columns

# Sort the sample columns into patient groups using the label embedded in each
# column name. Labels are tested in the order CRC, Healthy, Polyp and the first
# match wins; a column matching none of them is left out of every group.
crc, healthy, polyp = [], [], []
label_to_group = (('CRC', crc), ('Healthy', healthy), ('Polyp', polyp))

for column_name in columns:
    matching_group = next(
        (group for label, group in label_to_group if label in column_name),
        None,
    )
    # Compare against None explicitly: an empty (falsy) list is still a valid target.
    if matching_group is not None:
        matching_group.append(column_name)

# Two case/control comparisons, both using the 'healthy' group as control.
# The 'groups' entry that these names refer to is attached in the next cell.
crc_experimental_design = {
    'comparisons': [
        dict(case='CRC', control='healthy', name='crc/healthy'),
        dict(case='polyp', control='healthy', name='polyp/healthy'),
    ]
}

In [11]:
# Attach the per-group sample-column lists under the group names used by the
# 'case' / 'control' fields of the comparisons.
crc_experimental_design.update(
    {'groups': {'healthy': healthy, 'CRC': crc, 'polyp': polyp}}
)

In [12]:
# Bundle the intensity frame, the annotation frame and the experimental design
# into a DataSource backed by the PiMP KEGG database constant.
# NOTE(review): DATABASE_PIMP_KEGG is not defined in this notebook; presumably
# it arrives via `from pals.common import *` -- confirm.
crc_ds = DataSource(
    crc_int_df,
    crc_annotation_df,
    crc_experimental_design,
    DATABASE_PIMP_KEGG,
)

2020-01-06 11:17:25.259 | DEBUG    | pals.feature_extraction:__init__:38 - Using PiMP_KEGG as database
2020-01-06 11:17:25.261 | DEBUG    | pals.feature_extraction:get_database:101 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-06 11:17:25.284 | DEBUG    | pals.feature_extraction:__init__:51 - Mapping pathway to unique ids
2020-01-06 11:17:25.291 | DEBUG    | pals.feature_extraction:__init__:65 - Creating dataset to pathway mapping
2020-01-06 11:17:25.300 | DEBUG    | pals.feature_extraction:__init__:93 - Computing unique id counts


In [13]:
# Run the PALS pathway analysis on the CRC data source and collect the
# per-pathway result table.
# NOTE(review): the meaning of min_replace / plage_weight / hg_weight is defined
# in pals.pathway_analysis, not in this notebook -- confirm before tuning them.
pals = PALS(
    crc_ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_df = pals.get_pathway_df()

2020-01-06 11:17:25.379 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:407 - Setting the zero intensity values in the dataframe
2020-01-06 11:17:25.445 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:388 - Scaling the data across the sample: zero mean and unit variance
2020-01-06 11:17:25.449 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:215 - Mean values of the rows in the DF is [-0. -0. -0.  0. -0. -0.  0.  0.  0.  0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0.  0.  0. -0. -0.  0. -0.  0. -0. -0. -0.  0. -0.  0.  0.  0. -0. -0.
  0.  0.  0. -0. -0. -0. -0.  0.  0. -0. -0. -0.  0. -0.  0. -0.  0.  0.
  0. -0.  0. -0.  0.  0. -0. -0. -0. -0.  0. -0. -0.  0.  0.  0. -0.  0.
 -0. -0. -0.  0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0. -0. -0.  0.  0.
  0.  0. -0.  0. -0. -0. -0. -0. -0. -0.  0. -0.  0. -0. -0.]
2020-01-06 11:17:25.450 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:216 - Variance in the rows of the DF is [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 

In [14]:
# Order pathways by the crc/healthy 'comb_p' column, smallest value first.
# Rebinding to the sorted copy (instead of inplace=True) leaves the frame object
# returned by get_pathway_df() unmodified, so anything else holding a reference
# to it does not see its rows silently reordered.
pathway_df = pathway_df.sort_values(by='PiMP_KEGG crc/healthy comb_p', ascending=True)
pathway_df

Unnamed: 0,pw_name,crc/healthy p-value,polyp/healthy p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,PiMP_KEGG crc/healthy comb_p,PiMP_KEGG polyp/healthy comb_p
map00340,Histidine metabolism,3.403727e-10,1.000000,41,6,14.63,1.439225e-03,1.28,3.12,1.620502e-11,1.000000
map00410,beta-Alanine metabolism,9.410025e-07,1.000000,31,9,29.03,2.107099e-07,0.97,3.13,7.338076e-09,0.999998
map00660,C5-Branched dibasic acid metabolism,9.386299e-06,1.000000,21,8,38.10,9.426139e-08,0.65,3.10,9.055904e-08,1.000000
map00240,Pyrimidine metabolism,3.942062e-05,0.967447,56,13,23.21,6.773139e-09,1.74,3.11,3.098430e-07,0.756460
map00300,Lysine biosynthesis,1.432349e-04,0.998993,27,5,18.52,1.228408e-03,0.84,3.11,1.656879e-05,0.992540
...,...,...,...,...,...,...,...,...,...,...,...
map00440,Phosphonate and phosphinate metabolism,1.000000e+00,0.908577,44,2,4.55,4.007934e-01,1.37,3.11,1.000000e+00,0.895603
map00380,Tryptophan metabolism,1.000000e+00,0.973119,64,6,9.38,1.353545e-02,1.99,3.11,1.000000e+00,0.927549
map00230,Purine metabolism,1.000000e+00,1.000000,78,12,15.38,3.177677e-06,2.43,3.12,1.000000e+00,1.000000
map00360,Phenylalanine metabolism,1.000000e+00,0.999998,55,7,12.73,1.346355e-03,1.71,3.11,1.000000e+00,0.999958


### Over-representation analysis (ORA)

In [15]:
# Build a second DataSource for the ORA run from the same four inputs that were
# used for crc_ds.
# NOTE(review): presumably rebuilt so ORA starts from a source the earlier PALS
# run has not touched -- confirm whether crc_ds could simply be reused.
ds = DataSource(
    crc_int_df,
    crc_annotation_df,
    crc_experimental_design,
    DATABASE_PIMP_KEGG,
)

2020-01-06 11:17:33.075 | DEBUG    | pals.feature_extraction:__init__:38 - Using PiMP_KEGG as database
2020-01-06 11:17:33.075 | DEBUG    | pals.feature_extraction:get_database:101 - Loading C:\Users\joewa\Work\git\PALS\pals\data\PiMP_KEGG.json.zip
2020-01-06 11:17:33.096 | DEBUG    | pals.feature_extraction:__init__:51 - Mapping pathway to unique ids
2020-01-06 11:17:33.101 | DEBUG    | pals.feature_extraction:__init__:65 - Creating dataset to pathway mapping
2020-01-06 11:17:33.109 | DEBUG    | pals.feature_extraction:__init__:93 - Computing unique id counts


In [16]:
# ORA run on the freshly built data source. This rebinds both `pals` and
# `pathway_df`, so the earlier PALS result table is no longer reachable under
# that name after this cell executes.
ora_settings = {'min_replace': 5000}
pals = PALS(ds, **ora_settings)
pathway_df = pals.get_ora_df()

2020-01-06 11:17:33.183 | DEBUG    | pals.pathway_analysis:get_ora_df:50 - Calculating ORA
2020-01-06 11:17:33.185 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:407 - Setting the zero intensity values in the dataframe
2020-01-06 11:17:34.543 | DEBUG    | pals.pathway_analysis:get_ora_df:111 - Correcting for multiple t-tests
2020-01-06 11:17:34.547 | DEBUG    | pals.pathway_analysis:_calculate_coverage_df:493 - Calculating dataset formula coverage


In [17]:
# Order the ORA results by the crc/healthy 'comb_p' column, smallest value first.
# Rebinding to the sorted copy (instead of inplace=True) leaves the frame object
# returned by get_ora_df() unmodified, so anything else holding a reference to
# it does not see its rows silently reordered.
pathway_df = pathway_df.sort_values(by='PiMP_KEGG crc/healthy comb_p', ascending=True)
pathway_df

Unnamed: 0_level_0,pw_name,crc/healthy p-value,polyp/healthy p-value,PiMP_KEGG crc/healthy comb_p,PiMP_KEGG polyp/healthy comb_p,unq_pw_F,tot_ds_F,F_coverage
mapids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
map00970,Aminoacyl-tRNA biosynthesis,5.536095e-11,1.000000,8.304143e-09,1.000000,23,18,78.26
map00250,"Alanine, aspartate and glutamate metabolism",3.002766e-09,1.000000,2.252074e-07,1.000000,23,11,47.83
map02010,ABC transporters,8.094736e-09,0.010530,3.035526e-07,0.225644,79,27,34.18
map00330,Arginine and proline metabolism,8.094736e-09,0.154929,3.035526e-07,0.663983,79,19,24.05
map04974,Protein digestion and absorption,1.147609e-08,1.000000,3.442826e-07,1.000000,42,20,47.62
...,...,...,...,...,...,...,...,...
map00930,Caprolactam degradation,1.000000e+00,1.000000,1.000000e+00,1.000000,19,3,15.79
map04150,mTOR signaling pathway,1.000000e+00,1.000000,1.000000e+00,1.000000,6,1,16.67
map05143,African trypanosomiasis,1.000000e+00,1.000000,1.000000e+00,1.000000,7,2,28.57
map04970,Salivary secretion,1.000000e+00,1.000000,1.000000e+00,1.000000,16,1,6.25
