In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle
import pandas as pd

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.feature_extraction import DataSource
from pals.pathway_analysis import PALS
from pals.common import *

2019-11-25 14:08:15.224 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Zebrafish Analysis

Analysis of multi-omics zebrafish data from https://www.pnas.org/content/114/5/E717.short

### Load data

In [5]:
def get_data_source(measurement_csv, design_csv, database_name, species_name, comparisons, metabolic_pathway_only):
    """Build a PALS ``DataSource`` from a measurement CSV and a design CSV.

    The measurement CSV must contain an 'Identifier' column. That column is
    turned into a one-column annotation table named 'entity_id', and every
    other column is forwarded as the intensity table. The design CSV must
    contain 'sample' and 'group' columns; the samples are collected into one
    list per group.

    Args:
        measurement_csv: path to the measurement CSV file.
        design_csv: path to the experimental design CSV file.
        database_name: database name forwarded to ``DataSource``.
        species_name: forwarded as ``reactome_species``.
        comparisons: stored under the 'comparisons' key of the experimental
            design passed to ``DataSource``.
        metabolic_pathway_only: forwarded as ``reactome_metabolic_pathway_only``.

    Returns:
        The ``DataSource`` constructed with ``reactome_query=True``.
    """
    raw_df = pd.read_csv(measurement_csv)

    # Intensities are everything except the identifier column.
    intensities = raw_df.drop(columns=['Identifier'])

    # The identifier column alone forms the annotation table, renamed to 'entity_id'.
    annotations = raw_df[['Identifier']].rename(columns={'Identifier': 'entity_id'})

    # Map each group label in the design file to the list of its sample names.
    design = pd.read_csv(design_csv)
    samples_by_group = {
        label: members['sample'].values.tolist()
        for label, members in design.groupby('group')
    }

    return DataSource(
        intensities,
        annotations,
        {'comparisons': comparisons, 'groups': samples_by_group},
        database_name,
        reactome_species=species_name,
        reactome_metabolic_pathway_only=metabolic_pathway_only,
        reactome_query=True,
    )

In [6]:
# Settings shared by every get_data_source call below.
species_name = 'Danio rerio'
metabolic_pathway_only = True

# A single case/control contrast; its 'name' is reused when building the
# combined p-value column names for sorting.
comparisons = [dict(case='Distal', control='Proximal', name='Distal_vs_Proximal')]

### PALS analysis of transcripts

In [7]:
# Transcript run: the gene measurement and design CSVs are loaded against the
# Reactome Ensembl database constant.
database_name = DATABASE_REACTOME_ENSEMBL
data_file, design_file = (
    os.path.join('test_data', 'zebrafish', csv_name)
    for csv_name in ('gene_data.csv', 'gene_design.csv')
)
ds = get_data_source(
    data_file,
    design_file,
    database_name,
    species_name,
    comparisons,
    metabolic_pathway_only,
)


        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rs.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rs.identifier AS entity_id, 
            rs.geneName[0] AS display_name
        
MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (rg:ReferenceGeneProduct)-[:referenceGene]->
              (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
              rs.databaseName = {database_name} AND            
              s.displayName IN {species} AND
         (p)-[:hasEvent]->(rle) AND  tp.displayName = 'Metabolism' 


2019-11-25 14:08:20.742 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-25 14:08:20.744 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-25 14:08:22.677 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [8]:
# Run PALS on the transcript DataSource and collect the per-pathway table.
# min_replace presumably sets the value substituted for zero intensities, and
# plage_weight / hg_weight presumably weight the two scores that are combined
# into comb_p -- TODO confirm against pals.pathway_analysis.PALS.
# NOTE(review): at the time of review the debug log for this run reports row
# variances of 0, whereas the protein run reports 1; worth confirming that the
# transcript input is scaled as intended.
pals = PALS(
    ds,
    min_replace=100,
    plage_weight=5,
    hg_weight=1,
)
pathway_transcripts_df = pals.get_pathway_df()

2019-11-25 14:08:22.737 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:250 - Setting the zero intensity values in the dataframe
2019-11-25 14:08:22.797 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:230 - Scaling the data across the sample: zero mean and unit variance
2019-11-25 14:08:22.836 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:69 - Mean values of the rows in the DF is [-0. -0. -0. ...  0. -0.  0.]
2019-11-25 14:08:22.837 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:70 - Variance in the rows of the DF is [0. 0. 0. ... 0. 0. 0.]
2019-11-25 14:08:23.482 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:81 - Calculating plage p-values with resampling
2019-11-25 14:08:23.483 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:85 - Comparison Distal_vs_Proximal
2019-11-25 14:08:23.483 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:93 - Resampling 0/1000
  tvalues = (m1 - m2) / se_total
2019-11-25 1

In [9]:
# Column holding the combined p-value for the first comparison; its name is
# built from the database name and the comparison name.
sort_column_transcripts = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to a sorted copy instead of sorting in place, so the frame returned by
# pals.get_pathway_df() is not mutated behind the PALS object's back and the
# cell stays idempotent when re-run.
pathway_transcripts_df = pathway_transcripts_df.sort_values(sort_column_transcripts)

In [10]:
# Display the transcript pathway table, ordered by the combined p-value column.
pathway_transcripts_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,ENSEMBL Distal_vs_Proximal comb_p
R-DRE-163358,PKA-mediated phosphorylation of key metabolic ...,2.101253e-13,1,1,100.0,0.926558,0.96,96.00,4.438909e-12
R-DRE-2022928,HS-GAG biosynthesis,8.019388e-02,35,35,100.0,0.250277,33.69,96.26,6.569850e-02
R-DRE-1971475,A tetrasaccharide linker sequence is required ...,9.332006e-02,32,32,100.0,0.281180,30.80,96.25,7.948314e-02
R-DRE-192105,Synthesis of bile acids and bile salts,8.261743e-02,18,18,100.0,0.482952,17.33,96.28,8.548178e-02
R-DRE-2162123,Synthesis of Prostaglandins (PG) and Thromboxa...,8.290953e-02,18,18,100.0,0.482952,17.33,96.28,8.577562e-02
...,...,...,...,...,...,...,...,...,...
R-DRE-351143,Agmatine biosynthesis,1.000000e+00,1,1,100.0,0.926558,0.96,96.00,1.000000e+00
R-DRE-6806664,Metabolism of vitamin K,1.000000e+00,2,2,100.0,0.891859,1.93,96.50,1.000000e+00
R-DRE-9018679,Biosynthesis of EPA-derived SPMs,1.000000e+00,2,2,100.0,0.891859,1.93,96.50,1.000000e+00
R-DRE-167826,The fatty acid cycling model,1.000000e+00,4,4,100.0,0.826262,3.85,96.25,1.000000e+00


### PALS analysis of proteins

In [11]:
# Protein run: the protein measurement and design CSVs are loaded against the
# Reactome UniProt database constant.
database_name = DATABASE_REACTOME_UNIPROT
data_file, design_file = (
    os.path.join('test_data', 'zebrafish', csv_name)
    for csv_name in ('protein_data.csv', 'protein_design.csv')
)
ds = get_data_source(
    data_file,
    design_file,
    database_name,
    species_name,
    comparisons,
    metabolic_pathway_only,
)


        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rg.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rg.identifier AS entity_id, 
            rg.description AS display_name
        
MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (re:ReferenceEntity)-[:referenceDatabase]->
              (rd:ReferenceDatabase)
        WHERE
              rle.speciesName IN {species} AND
              rd.displayName = {database_name} AND
         (p)-[:hasEvent]->(rle) AND  tp.displayName = 'Metabolism' 
            RETURN DISTINCT
        

2019-11-25 14:08:26.083 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-25 14:08:26.085 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-25 14:08:26.277 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [12]:
# Run PALS on the protein DataSource and collect the per-pathway table.
# min_replace presumably sets the value substituted for zero intensities, and
# plage_weight / hg_weight presumably weight the two scores that are combined
# into comb_p -- TODO confirm against pals.pathway_analysis.PALS.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_proteins_df = pals.get_pathway_df()

2019-11-25 14:08:26.343 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:250 - Setting the zero intensity values in the dataframe
2019-11-25 14:08:26.362 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:230 - Scaling the data across the sample: zero mean and unit variance
2019-11-25 14:08:26.365 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:69 - Mean values of the rows in the DF is [-0.  0.  0. ...  0.  0.  0.]
2019-11-25 14:08:26.366 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:70 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2019-11-25 14:08:26.757 | INFO     | pals.pathway_analysis:set_up_resample_plage_p_df:81 - Calculating plage p-values with resampling
2019-11-25 14:08:26.757 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:85 - Comparison Distal_vs_Proximal
2019-11-25 14:08:26.758 | DEBUG    | pals.pathway_analysis:set_up_resample_plage_p_df:93 - Resampling 0/1000
2019-11-25 14:08:26.851 | DEBUG    | pals.pat

In [13]:
# Column holding the combined p-value for the first comparison; its name is
# built from the database name and the comparison name.
sort_column_proteins = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to a sorted copy instead of sorting in place, so the frame returned by
# pals.get_pathway_df() is not mutated behind the PALS object's back and the
# cell stays idempotent when re-run.
pathway_proteins_df = pathway_proteins_df.sort_values(sort_column_proteins)

In [14]:
# Display the protein pathway table, ordered by the combined p-value column.
pathway_proteins_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,UniProt Distal_vs_Proximal comb_p
R-DRE-71182,Phenylalanine and tyrosine catabolism,0.001014,20,6,30.00,4.116351e-02,3.24,16.20,0.000380
R-DRE-975634,Retinoid metabolism and transport,0.002993,38,12,31.58,6.221257e-03,6.15,16.18,0.000723
R-DRE-350562,Regulation of ornithine decarboxylase (ODC),0.034200,87,41,47.13,2.002705e-12,14.09,16.20,0.000824
R-DRE-156590,Glutathione conjugation,0.012949,44,16,36.36,3.555361e-04,7.12,16.18,0.002197
R-DRE-211994,Sterols are 12-hydroxylated by CYP8B1,0.006596,4,1,25.00,1.870211e-01,0.65,16.25,0.004597
...,...,...,...,...,...,...,...,...,...
R-DRE-174403,Glutathione synthesis and recycling,1.000000,22,1,4.55,9.072509e-01,3.56,16.18,1.000000
R-DRE-77289,Mitochondrial Fatty Acid Beta-Oxidation,1.000000,40,7,17.50,3.416790e-01,6.48,16.20,1.000000
R-DRE-170822,Regulation of Glucokinase by Glucokinase Regul...,1.000000,55,8,14.55,5.669582e-01,8.91,16.20,1.000000
R-DRE-196780,Biotin transport and metabolism,1.000000,20,4,20.00,2.444762e-01,3.24,16.20,1.000000


### PALS analysis of compounds

In [15]:
# Compound run: the compound measurement and design CSVs are loaded against the
# Reactome KEGG database constant.
database_name = DATABASE_REACTOME_KEGG
data_file, design_file = (
    os.path.join('test_data', 'zebrafish', csv_name)
    for csv_name in ('compound_data.csv', 'compound_design.csv')
)
ds = get_data_source(
    data_file,
    design_file,
    database_name,
    species_name,
    comparisons,
    metabolic_pathway_only,
)

2019-11-25 14:08:28.298 | DEBUG    | pals.feature_extraction:__init__:40 - Retrieving data for Danio rerio from Reactome COMPOUND metabolic_pathway_only=True
2019-11-25 14:08:28.627 | DEBUG    | pals.feature_extraction:__init__:85 - Mapping pathway to unique ids
2019-11-25 14:08:28.628 | DEBUG    | pals.feature_extraction:__init__:99 - Creating dataset to pathway mapping
2019-11-25 14:08:28.638 | DEBUG    | pals.feature_extraction:__init__:124 - Computing unique id counts


In [16]:
# Run PALS on the compound DataSource and collect the per-pathway table.
# min_replace presumably sets the value substituted for zero intensities, and
# plage_weight / hg_weight presumably weight the two scores that are combined
# into comb_p -- TODO confirm against pals.pathway_analysis.PALS.
pals = PALS(
    ds,
    min_replace=5000,
    plage_weight=5,
    hg_weight=1,
)
pathway_compounds_df = pals.get_pathway_df()

2019-11-25 14:08:28.700 | DEBUG    | pals.pathway_analysis:_change_zero_peak_ints:250 - Setting the zero intensity values in the dataframe
2019-11-25 14:08:28.728 | DEBUG    | pals.pathway_analysis:_standardize_intensity_df:230 - Scaling the data across the sample: zero mean and unit variance
2019-11-25 14:08:28.734 | DEBUG    | pals.pathway_analysis:get_plage_activity_df:69 - Mean values of the rows in the DF is [-0. -0.  0.  0. -0. -0. -0.  0. -0.  0.  0. -0.  0.  0. -0. -0. -0. -0.
  0. -0.  0. -0. -0.  0. -0.  0.  0. -0.  0. -0. -0.  0. -0.  0. -0.  0.
 -0.  0. -0. -0.  0. -0.  0.  0.  0. -0. -0. -0. -0. -0. -0.  0.  0. -0.
  0. -0.  0.  0.  0.  0. -0.  0.  0. -0.  0. -0.  0.  0.  0. -0. -0. -0.
  0.  0.  0. -0. -0. -0.  0.  0.  0. -0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0. -0.  0. -0.  0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0. -0. -0.  0.
 -0. -0. -0. -0. -0. -0. -0. -0.  0.  0. -0.  0. -0.  0. -0. -0.  0. -0.
 -0.  0.  0.  0.]
2019-11-25 14:08:28.735 | DEBUG    | pals.pathway_anal

In [17]:
# Column holding the combined p-value for the first comparison; its name is
# built from the database name and the comparison name.
sort_column_compounds = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to a sorted copy instead of sorting in place, so the frame returned by
# pals.get_pathway_df() is not mutated behind the PALS object's back and the
# cell stays idempotent when re-run.
pathway_compounds_df = pathway_compounds_df.sort_values(sort_column_compounds)

In [18]:
# Display the compound pathway table, ordered by the combined p-value column.
pathway_compounds_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND Distal_vs_Proximal comb_p
R-DRE-70614,Amino acid synthesis and interconversion (tran...,0.003214,31,12,38.71,0.002659,5.94,19.16,0.000644
R-DRE-71240,Tryptophan catabolism,0.004594,21,9,42.86,0.003328,4.03,19.19,0.001012
R-DRE-1362409,Mitochondrial iron-sulfur cluster biogenesis,0.001442,4,1,25.00,0.245081,0.77,19.25,0.001117
R-DRE-70688,Proline catabolism,0.002865,10,3,30.00,0.139951,1.92,19.20,0.001744
R-DRE-2408508,"Metabolism of ingested SeMet, Sec, MeSec into ...",0.001442,8,1,12.50,0.540576,1.53,19.12,0.001854
...,...,...,...,...,...,...,...,...,...
R-DRE-1483226,Synthesis of PI,0.566958,6,2,33.33,0.132027,1.15,19.17,0.478597
R-DRE-77108,Utilization of Ketone Bodies,0.956438,10,2,20.00,0.354530,1.92,19.20,0.945684
R-DRE-2024096,HS-GAG degradation,0.992308,5,1,20.00,0.324019,0.96,19.20,0.988891
R-DRE-1483076,Synthesis of CL,0.999997,1,1,100.00,0.036305,0.19,19.00,0.999976
