In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle
import pandas as pd

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, get_ms1_intensities, get_ms1_peaks, get_formula_df, get_experimental_design
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *

# Zebrafish Analysis

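Pathway-level analysis (PALS) of the multi-omics zebrafish data (transcripts, proteins and compounds) from https://www.pnas.org/content/114/5/E717.short, comparing the Distal and Proximal sample groups.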

### Load data

In [5]:
def get_data_source(measurement_csv, design_csv, database_name, species_name, comparisons, metabolic_pathway_only):
    """Build a ``DataSource`` from a measurement CSV and an experimental-design CSV.

    The measurement file must contain an 'Identifier' column; every other
    column is passed on as the intensity table. The design file must contain
    'group' and 'sample' columns.

    Args:
        measurement_csv: path (or anything ``pd.read_csv`` accepts) of the measurement table.
        design_csv: path (or anything ``pd.read_csv`` accepts) of the design table.
        database_name: database identifier forwarded to ``DataSource``.
        species_name: forwarded to ``DataSource`` as ``reactome_species``.
        comparisons: stored under the 'comparisons' key of the experimental design.
        metabolic_pathway_only: forwarded as ``reactome_metabolic_pathway_only``.

    Returns:
        The ``DataSource`` constructed with ``reactome_query=True``.
    """
    measurements = pd.read_csv(measurement_csv)

    # Intensity table: everything except the identifier column.
    intensities = measurements.drop(columns='Identifier')

    # Annotation table: only the identifiers, exposed under the name 'entity_id'.
    entity_annotations = measurements[['Identifier']].rename(columns={'Identifier': 'entity_id'})

    # Experimental design: group label -> list of the sample names in that group.
    design = pd.read_csv(design_csv)
    samples_by_group = {
        label: members['sample'].values.tolist()
        for label, members in design.groupby('group')
    }

    return DataSource(
        intensities,
        entity_annotations,
        {'comparisons': comparisons, 'groups': samples_by_group},
        database_name,
        reactome_species=species_name,
        reactome_metabolic_pathway_only=metabolic_pathway_only,
        reactome_query=True,
    )

In [6]:
# Species name handed to every data source built below.
species_name = 'Danio rerio'

# Flag handed to every data source built below (reactome_metabolic_pathway_only).
metabolic_pathway_only = True

# Contrasts to evaluate; each entry names its case group, control group and label.
comparisons = [
    dict(case='Distal', control='Proximal', name='Distal_vs_Proximal'),
]

### PALS analysis of transcripts

In [7]:
# Transcript data: build the data source against the Reactome/Ensembl database constant.
database_name = DATABASE_REACTOME_ENSEMBL
ds = get_data_source(
    measurement_csv='test_data/gene_data.csv',
    design_csv='test_data/gene_design.csv',
    database_name=database_name,
    species_name=species_name,
    comparisons=comparisons,
    metabolic_pathway_only=metabolic_pathway_only,
)


        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rs.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rs.identifier AS entity_id, 
            rs.geneName[0] AS display_name
        
MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (rg:ReferenceGeneProduct)-[:referenceGene]->
              (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
              rs.databaseName = {database_name} AND            
              s.displayName IN {species} AND
         (p)-[:hasEvent]->(rle) AND  tp.displayName = 'Metabolism' 


2019-11-15 10:41:00.412 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-15 10:41:00.414 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-15 10:41:02.532 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [8]:
# Run PALS on the transcript data source and fetch its pathway results DataFrame.
# NOTE(review): min_replace is 100 here but 5000 in the protein and compound cells — confirm this is intentional.
pals = PALS(ds, min_replace=100)
pathway_transcripts_df = pals.get_pathway_df()

2019-11-15 10:41:02.602 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-15 10:41:02.667 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-15 10:41:02.709 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [-0. -0. -0. ...  0. -0.  0.]
2019-11-15 10:41:02.709 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [0. 0. 0. ... 0. 0. 0.]
2019-11-15 10:41:03.704 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-15 10:41:03.705 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:83 - Comparison Distal_vs_Proximal
2019-11-15 10:41:03.705 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
  tvalues = (m1 - m2) / se_total
2019-11-15 1

In [9]:
# Column to sort by, built as "<database_name> <comparison name> comb_p".
# database_name is rebound by the later protein and compound cells, so this
# cell must run before them (i.e. in top-to-bottom order) to build the right name.
sort_column_transcripts = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to the sorted copy instead of sorting in place, so the frame object
# returned by pals.get_pathway_df() is not mutated.
pathway_transcripts_df = pathway_transcripts_df.sort_values(sort_column_transcripts)

### PALS analysis of proteins

In [10]:
# Protein data: build the data source against the Reactome/UniProt database constant.
database_name = DATABASE_REACTOME_UNIPROT
ds = get_data_source(
    measurement_csv='test_data/protein_data.csv',
    design_csv='test_data/protein_design.csv',
    database_name=database_name,
    species_name=species_name,
    comparisons=comparisons,
    metabolic_pathway_only=metabolic_pathway_only,
)


        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rg.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rg.identifier AS entity_id, 
            rg.description AS display_name
        
MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (re:ReferenceEntity)-[:referenceDatabase]->
              (rd:ReferenceDatabase)
        WHERE
              rle.speciesName IN {species} AND
              rd.displayName = {database_name} AND
         (p)-[:hasEvent]->(rle) AND  tp.displayName = 'Metabolism' 
            RETURN DISTINCT
        

2019-11-15 10:41:06.276 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-15 10:41:06.278 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-15 10:41:06.473 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [11]:
# Run PALS on the protein data source and fetch its pathway results DataFrame.
# NOTE(review): min_replace is 5000 here but 100 in the transcript cell — confirm this is intentional.
pals = PALS(ds, min_replace=5000)
pathway_proteins_df = pals.get_pathway_df()

2019-11-15 10:41:06.550 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-15 10:41:06.567 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-15 10:41:06.572 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0. -0.  0. ... -0. -0. -0.]
2019-11-15 10:41:06.572 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-15 10:41:07.167 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-15 10:41:07.168 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:83 - Comparison Distal_vs_Proximal
2019-11-15 10:41:07.169 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-15 10:41:07.268 | DEBUG    | pals.pat

In [12]:
# Column to sort by, built as "<database_name> <comparison name> comb_p".
# database_name is rebound by the transcript and compound cells, so this cell
# must run while it still holds the value set in the protein cell above.
sort_column_proteins = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to the sorted copy instead of sorting in place, so the frame object
# returned by pals.get_pathway_df() is not mutated.
pathway_proteins_df = pathway_proteins_df.sort_values(sort_column_proteins)

### PALS analysis of compounds

In [13]:
# Compound data: build the data source against the Reactome/KEGG database constant.
database_name = DATABASE_REACTOME_KEGG
ds = get_data_source(
    measurement_csv='test_data/compound_data.csv',
    design_csv='test_data/compound_design.csv',
    database_name=database_name,
    species_name=species_name,
    comparisons=comparisons,
    metabolic_pathway_only=metabolic_pathway_only,
)

2019-11-15 10:41:08.675 | DEBUG    | pals.feature_extraction:__init__:40 - Retrieving data for Danio rerio from Reactome COMPOUND metabolic_pathway_only=True
2019-11-15 10:41:09.005 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-15 10:41:09.006 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-15 10:41:09.017 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [14]:
# Run PALS on the compound data source and fetch its pathway results DataFrame.
# NOTE(review): min_replace is 5000 here but 100 in the transcript cell — confirm this is intentional.
pals = PALS(ds, min_replace=5000)
pathway_compounds_df = pals.get_pathway_df()

2019-11-15 10:41:09.091 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:248 - Setting the zero intensity values in the dataframe
2019-11-15 10:41:09.120 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:228 - Scaling the data across the sample: zero mean and unit variance
2019-11-15 10:41:09.126 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [-0. -0.  0.  0. -0. -0. -0.  0. -0.  0.  0. -0.  0.  0. -0. -0. -0. -0.
  0. -0.  0. -0. -0.  0. -0.  0.  0. -0.  0. -0. -0.  0. -0.  0. -0.  0.
 -0.  0. -0. -0.  0. -0.  0.  0.  0. -0. -0. -0. -0. -0. -0.  0.  0. -0.
  0. -0.  0.  0.  0.  0. -0.  0.  0. -0.  0. -0.  0.  0.  0. -0. -0. -0.
  0.  0.  0. -0. -0. -0.  0.  0.  0. -0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0. -0.  0. -0.  0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0. -0. -0.  0.
 -0. -0. -0. -0. -0. -0. -0. -0.  0.  0. -0.  0. -0.  0. -0. -0.  0. -0.
 -0.  0.  0.  0.]
2019-11-15 10:41:09.128 | DEBUG    | pals.pathway_anal

In [15]:
# Column to sort by, built as "<database_name> <comparison name> comb_p".
# database_name is rebound by the transcript and protein cells, so this cell
# must run while it still holds the value set in the compound cell above.
sort_column_compounds = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to the sorted copy instead of sorting in place, so the frame object
# returned by pals.get_pathway_df() is not mutated.
pathway_compounds_df = pathway_compounds_df.sort_values(sort_column_compounds)

### Show all the results together

In [24]:
# First 20 rows of the transcript pathway table (sorted by its comb_p column in an earlier cell).
pathway_transcripts_df.head(20)

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,ENSEMBL Distal_vs_Proximal comb_p
R-DRE-163358,PKA-mediated phosphorylation of key metabolic ...,2.101256e-13,1,1,100.00,0.926558,0.96,96.00,4.438915e-12
R-DRE-2022928,HS-GAG biosynthesis,8.021968e-02,35,35,100.00,0.250277,33.69,96.26,6.572022e-02
R-DRE-1971475,A tetrasaccharide linker sequence is required ...,9.332392e-02,32,32,100.00,0.281180,30.80,96.25,7.948649e-02
R-DRE-192105,Synthesis of bile acids and bile salts,8.262035e-02,18,18,100.00,0.482952,17.33,96.28,8.548471e-02
R-DRE-2162123,Synthesis of Prostaglandins (PG) and Thromboxa...,8.291247e-02,18,18,100.00,0.482952,17.33,96.28,8.577858e-02
...,...,...,...,...,...,...,...,...,...
R-DRE-1483248,Synthesis of PIPs at the ER membrane,1.000000e+00,6,6,100.00,0.765429,5.78,96.33,1.000000e+00
R-DRE-1483206,Glycerophospholipid biosynthesis,1.000000e+00,4,4,100.00,0.826262,3.85,96.25,1.000000e+00
R-DRE-6806664,Metabolism of vitamin K,1.000000e+00,2,2,100.00,0.891859,1.93,96.50,1.000000e+00
R-DRE-350562,Regulation of ornithine decarboxylase (ODC),1.000000e+00,58,57,98.28,0.342845,55.83,96.26,1.000000e+00


In [22]:
# First 20 rows of the protein pathway table (sorted by its comb_p column in an earlier cell).
pathway_proteins_df.head(20)

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,UniProt Distal_vs_Proximal comb_p
R-DRE-71182,Phenylalanine and tyrosine catabolism,0.002259,20,6,30.0,0.04116351,3.24,16.2,0.000889
R-DRE-350562,Regulation of ornithine decarboxylase (ODC),0.041468,87,41,47.13,2.002705e-12,14.09,16.2,0.001104
R-DRE-975634,Retinoid metabolism and transport,0.004729,38,12,31.58,0.006221257,6.15,16.18,0.001204
R-DRE-156590,Glutathione conjugation,0.017382,44,16,36.36,0.0003555361,7.12,16.18,0.003128
R-DRE-211994,Sterols are 12-hydroxylated by CYP8B1,0.011625,4,1,25.0,0.1870211,0.65,16.25,0.008207
R-DRE-71403,Citric acid cycle (TCA cycle),0.083767,35,18,51.43,3.887734e-07,5.67,16.2,0.010107
R-DRE-211979,Eicosanoids,0.011625,8,1,12.5,0.4415238,1.3,16.25,0.012095
R-DRE-163210,Formation of ATP by chemiosmotic coupling,0.053689,29,12,41.38,0.0003732602,4.7,16.21,0.012542
R-DRE-9026762,Biosynthesis of maresin conjugates in tissue r...,0.037401,5,3,60.0,0.007751427,0.81,16.2,0.01315
R-DRE-499943,Interconversion of nucleotide di- and triphosp...,0.027053,47,11,23.4,0.07566238,7.61,16.19,0.015004


In [23]:
# First 20 rows of the compound pathway table (sorted by its comb_p column in an earlier cell).
pathway_compounds_df.head(20)

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND Distal_vs_Proximal comb_p
R-DRE-70614,Amino acid synthesis and interconversion (tran...,0.003308,31,12,38.71,0.002659,5.94,19.16,0.000665
R-DRE-71240,Tryptophan catabolism,0.004767,21,9,42.86,0.003328,4.03,19.19,0.001055
R-DRE-1362409,Mitochondrial iron-sulfur cluster biogenesis,0.001454,4,1,25.0,0.245081,0.77,19.25,0.001126
R-DRE-70688,Proline catabolism,0.002866,10,3,30.0,0.139951,1.92,19.2,0.001744
R-DRE-2408508,"Metabolism of ingested SeMet, Sec, MeSec into ...",0.001454,8,1,12.5,0.540576,1.53,19.12,0.001869
R-DRE-71288,Creatine metabolism,0.007701,11,5,45.45,0.014212,2.11,19.18,0.002512
R-DRE-196780,Biotin transport and metabolism,0.005813,7,3,42.86,0.046999,1.34,19.14,0.002534
R-DRE-9026766,Biosynthesis of protectin and resolvin conjuga...,0.006284,1,1,100.0,0.036305,0.19,19.0,0.00256
R-DRE-9026762,Biosynthesis of maresin conjugates in tissue r...,0.006284,1,1,100.0,0.036305,0.19,19.0,0.00256
R-DRE-880009,Interconversion of 2-oxoglutarate and 2-hydrox...,0.002874,5,1,20.0,0.324019,0.96,19.2,0.002573
