In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle
import pandas as pd

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.feature_extraction import DataSource
from pals.PALS import PALS
from pals.ORA import ORA
from pals.common import *

2020-01-06 13:24:04.688 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Zebrafish Analysis

Pathway analysis of multi-omics zebrafish data (transcripts, proteins and compounds) taken from https://www.pnas.org/content/114/5/E717.short, comparing distal against proximal samples.

### Load data

In [5]:
def get_data_source(measurement_csv, design_csv, database_name, species_name, comparisons, metabolic_pathway_only, min_replace):
    """Build a PALS ``DataSource`` from a measurement CSV and a design CSV.

    Parameters
    ----------
    measurement_csv
        Path of a CSV file containing an 'Identifier' column. That column
        becomes the annotation table; all remaining columns are passed on
        as the intensity table.
    design_csv
        Path of a CSV file containing 'sample' and 'group' columns.
    database_name
        Forwarded to ``DataSource`` as its database name argument.
    species_name
        Forwarded to ``DataSource`` as ``reactome_species``.
    comparisons
        Stored unchanged under the 'comparisons' key of the experimental design.
    metabolic_pathway_only
        Forwarded to ``DataSource`` as ``reactome_metabolic_pathway_only``.
    min_replace
        Forwarded to ``DataSource`` as ``min_replace``.

    Returns
    -------
    DataSource
        Data source created with ``reactome_query=True``.
    """
    id_column = 'Identifier'
    raw_df = pd.read_csv(measurement_csv)

    # annotation table: just the identifiers, exposed under the name 'entity_id'
    entity_df = raw_df[[id_column]].rename(columns={id_column: 'entity_id'})

    # intensity table: everything except the identifier column
    intensity_df = raw_df.drop(columns=id_column)

    # experimental design: map each group label to the list of its sample names
    design_table = pd.read_csv(design_csv)
    samples_by_group = {
        label: members['sample'].values.tolist()
        for label, members in design_table.groupby('group')
    }
    design = {'comparisons': comparisons, 'groups': samples_by_group}

    return DataSource(
        intensity_df,
        entity_df,
        design,
        database_name,
        reactome_species=species_name,
        reactome_metabolic_pathway_only=metabolic_pathway_only,
        reactome_query=True,
        min_replace=min_replace,
    )

In [6]:
# One two-group comparison: 'Distal' is the case group, 'Proximal' the control group.
# The 'name' value is later used to build the combined p-value column label
# that the result tables are sorted by.
comparisons = [
    {'case': 'Distal', 'control': 'Proximal', 'name': 'Distal_vs_Proximal'}
]
# Shared by all three analyses; get_data_source() forwards these to DataSource as
# reactome_species and reactome_metabolic_pathway_only.
species_name = 'Danio rerio'
metabolic_pathway_only = True

### PALS analysis of transcripts

In [7]:
# Transcript data set, mapped through the Reactome ENSEMBL database.
# NOTE: database_name, data_file, design_file, min_replace and ds are reassigned by the
# protein and compound sections further down, so re-run this cell before re-running
# any of the transcript cells.
database_name = DATABASE_REACTOME_ENSEMBL
# Paths are relative to the directory the notebook is run from.
data_file = os.path.join('test_data', 'zebrafish', 'gene_data.csv')
design_file = os.path.join('test_data', 'zebrafish', 'gene_design.csv')
# Forwarded unchanged to DataSource(min_replace=...).
# NOTE(review): presumably the replacement floor for zero intensities — confirm in pals.feature_extraction.
min_replace = 100
ds = get_data_source(data_file, design_file, database_name, species_name, comparisons, metabolic_pathway_only, min_replace)

2020-01-06 13:24:05.075 | DEBUG    | pals.feature_extraction:__init__:42 - Using ENSEMBL as database
2020-01-06 13:24:05.781 | DEBUG    | pals.reactome:get_gene_entity_dict:441 - 
        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rs.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rs.identifier AS entity_id, 
            rs.geneName[0] AS display_name
        
2020-01-06 13:24:15.023 | DEBUG    | pals.reactome:get_gene_mapping_dict:304 - MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (rg:ReferenceGeneProduct)-[:referenceGe

In [8]:
# Run PALS on the transcript data source and collect the per-pathway result table.
# The logged output shows the PLAGE p-values are obtained by resampling (N/1000 steps).
# NOTE(review): no random seed is set in this notebook, so p-values may differ between
# runs unless PALS seeds internally — confirm.
pals = PALS(ds, plage_weight=5, hg_weight=1)
pathway_transcripts_df = pals.get_pathway_df()

2020-01-06 13:24:25.272 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-06 13:24:25.332 | DEBUG    | pals.feature_extraction:standardize_intensity_df:261 - Scaling the data across the sample: zero mean and unit variance
2020-01-06 13:24:25.366 | DEBUG    | pals.PALS:get_plage_activity_df:76 - Mean values of the rows in the DF is [-0. -0. -0. ...  0. -0.  0.]
2020-01-06 13:24:25.367 | DEBUG    | pals.PALS:get_plage_activity_df:77 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2020-01-06 13:24:26.020 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:88 - Calculating plage p-values with resampling
2020-01-06 13:24:26.021 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:92 - Comparison Distal_vs_Proximal
2020-01-06 13:24:26.021 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:100 - Resampling 0/1000
2020-01-06 13:24:26.116 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:100 - Resampling 100/1000
20

In [9]:
# Label of the combined p-value column for the first comparison:
# '<database_name> <comparison name> comb_p' (e.g. 'ENSEMBL Distal_vs_Proximal comb_p').
sort_column_transcripts = '%s %s comb_p' % (database_name, comparisons[0]['name'])
# Rank pathways by ascending combined p-value. Rebind the sorted copy instead of
# sorting in place, so the frame returned by PALS is not mutated behind its back.
pathway_transcripts_df = pathway_transcripts_df.sort_values(sort_column_transcripts)

In [10]:
# Display the transcript pathway table (pandas truncates the middle rows in the rendered output).
pathway_transcripts_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,ENSEMBL Distal_vs_Proximal comb_p
R-DRE-163358,PKA-mediated phosphorylation of key metabolic ...,1.149077e-13,1,1,100.00,0.962588,0.96,96.00,4.002482e-12
R-DRE-2022928,HS-GAG biosynthesis,8.013032e-02,35,35,100.00,0.260186,33.69,96.26,6.642149e-02
R-DRE-1971475,A tetrasaccharide linker sequence is required ...,9.336779e-02,32,32,100.00,0.292295,30.80,96.25,8.047650e-02
R-DRE-192105,Synthesis of bile acids and bile salts,8.265327e-02,18,18,100.00,0.501901,17.33,96.28,8.698369e-02
R-DRE-2162123,Synthesis of Prostaglandins (PG) and Thromboxa...,8.294567e-02,18,18,100.00,0.501901,17.33,96.28,8.728159e-02
...,...,...,...,...,...,...,...,...,...
R-DRE-351200,Interconversion of polyamines,1.000000e+00,3,2,66.67,0.995956,2.89,96.33,1.000000e+00
R-DRE-163359,Glucagon signaling in metabolic regulation,1.000000e+00,4,4,100.00,0.858443,3.85,96.25,1.000000e+00
R-DRE-71737,Pyrophosphate hydrolysis,1.000000e+00,3,3,100.00,0.891859,2.89,96.33,1.000000e+00
R-DRE-70370,Galactose catabolism,1.000000e+00,5,5,100.00,0.826262,4.81,96.20,1.000000e+00


### PALS analysis of proteins

In [11]:
# Protein data set, mapped through the Reactome UniProt database.
# NOTE: this overwrites database_name, data_file, design_file, min_replace and ds
# from the transcript section above.
database_name = DATABASE_REACTOME_UNIPROT
# Paths are relative to the directory the notebook is run from.
data_file = os.path.join('test_data', 'zebrafish', 'protein_data.csv')
design_file = os.path.join('test_data', 'zebrafish', 'protein_design.csv')
# Forwarded unchanged to DataSource(min_replace=...); larger than the value used for transcripts.
# NOTE(review): presumably the replacement floor for zero intensities — confirm in pals.feature_extraction.
min_replace = 5000
ds = get_data_source(data_file, design_file, database_name, species_name, comparisons, metabolic_pathway_only, min_replace)

2020-01-06 13:24:27.747 | DEBUG    | pals.feature_extraction:__init__:42 - Using UniProt as database
2020-01-06 13:24:27.877 | DEBUG    | pals.reactome:get_protein_entity_dict:389 - 
        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rg.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rg.identifier AS entity_id, 
            rg.description AS display_name
        
2020-01-06 13:24:28.194 | DEBUG    | pals.reactome:get_protein_mapping_dict:200 - MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (re:ReferenceEntity)-[:referenceD

In [12]:
# Run PALS on the protein data source (same weights as the transcript analysis) and
# collect the per-pathway result table. This rebinds `pals` to a new PALS object.
# NOTE(review): the logged output shows resampling-based p-values and no seed is set in
# this notebook — confirm whether results are reproducible between runs.
pals = PALS(ds, plage_weight=5, hg_weight=1)
pathway_proteins_df = pals.get_pathway_df()

2020-01-06 13:24:29.792 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-06 13:24:29.811 | DEBUG    | pals.feature_extraction:standardize_intensity_df:261 - Scaling the data across the sample: zero mean and unit variance
2020-01-06 13:24:29.814 | DEBUG    | pals.PALS:get_plage_activity_df:76 - Mean values of the rows in the DF is [-0.  0.  0. ...  0.  0.  0.]
2020-01-06 13:24:29.814 | DEBUG    | pals.PALS:get_plage_activity_df:77 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2020-01-06 13:24:30.211 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:88 - Calculating plage p-values with resampling
2020-01-06 13:24:30.211 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:92 - Comparison Distal_vs_Proximal
2020-01-06 13:24:30.212 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:100 - Resampling 0/1000
2020-01-06 13:24:30.312 | DEBUG    | pals.PALS:set_up_resample_plage_p_df:100 - Resampling 100/1000
20

In [13]:
# Label of the combined p-value column for the first comparison:
# '<database_name> <comparison name> comb_p' (e.g. 'UniProt Distal_vs_Proximal comb_p').
sort_column_proteins = '%s %s comb_p' % (database_name, comparisons[0]['name'])
# Rank pathways by ascending combined p-value. Rebind the sorted copy instead of
# sorting in place, so the frame returned by PALS is not mutated behind its back.
pathway_proteins_df = pathway_proteins_df.sort_values(sort_column_proteins)

In [14]:
# Display the protein pathway table (pandas truncates the middle rows in the rendered output).
pathway_proteins_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,UniProt Distal_vs_Proximal comb_p
R-DRE-975634,Retinoid metabolism and transport,0.002226,38,12,31.58,1.346074e-02,6.15,16.18,0.000634
R-DRE-71182,Phenylalanine and tyrosine catabolism,0.001286,20,6,30.00,9.039448e-02,3.24,16.20,0.000644
R-DRE-350562,Regulation of ornithine decarboxylase (ODC),0.029959,87,41,47.13,6.315088e-12,14.09,16.20,0.000754
R-DRE-156590,Glutathione conjugation,0.010649,44,16,36.36,8.655119e-04,7.12,16.18,0.002037
R-DRE-71403,Citric acid cycle (TCA cycle),0.068577,35,18,51.43,1.315025e-06,5.67,16.20,0.008682
...,...,...,...,...,...,...,...,...,...
R-DRE-8849175,Threonine catabolism,1.000000,3,1,33.33,4.114654e-01,0.49,16.33,1.000000
R-DRE-196780,Biotin transport and metabolism,1.000000,20,4,20.00,4.104198e-01,3.24,16.20,1.000000
R-DRE-77289,Mitochondrial Fatty Acid Beta-Oxidation,1.000000,40,7,17.50,4.767353e-01,6.48,16.20,1.000000
R-DRE-73843,5-Phosphoribose 1-diphosphate biosynthesis,1.000000,6,1,16.67,6.538366e-01,0.97,16.17,1.000000


### PALS analysis of compounds

In [15]:
# Compound data set, mapped through the Reactome KEGG database constant
# (reported as 'COMPOUND' in the log output and in the result column label).
# NOTE: this overwrites database_name, data_file, design_file, min_replace and ds
# from the protein section above.
database_name = DATABASE_REACTOME_KEGG
# Paths are relative to the directory the notebook is run from.
data_file = os.path.join('test_data', 'zebrafish', 'compound_data.csv')
design_file = os.path.join('test_data', 'zebrafish', 'compound_design.csv')
# Forwarded unchanged to DataSource(min_replace=...).
# NOTE(review): presumably the replacement floor for zero intensities — confirm in pals.feature_extraction.
min_replace = 5000
ds = get_data_source(data_file, design_file, database_name, species_name, comparisons, metabolic_pathway_only, min_replace)

2020-01-06 13:24:31.800 | DEBUG    | pals.feature_extraction:__init__:42 - Using COMPOUND as database
2020-01-06 13:24:31.801 | DEBUG    | pals.feature_extraction:get_database:111 - Retrieving data for Danio rerio from Reactome COMPOUND metabolic_pathway_only=True
2020-01-06 13:24:32.162 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-01-06 13:24:32.163 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-01-06 13:24:32.175 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts


In [16]:
# Run PALS on the compound data source (same weights as the other two analyses) and
# collect the per-pathway result table. This rebinds `pals` to a new PALS object.
# NOTE(review): no random seed is set in this notebook — confirm whether the
# p-values are reproducible between runs.
pals = PALS(ds, plage_weight=5, hg_weight=1)
pathway_compounds_df = pals.get_pathway_df()

2020-01-06 13:24:32.237 | DEBUG    | pals.feature_extraction:change_zero_peak_ints:280 - Setting the zero intensity values in the dataframe
2020-01-06 13:24:32.264 | DEBUG    | pals.feature_extraction:standardize_intensity_df:261 - Scaling the data across the sample: zero mean and unit variance
2020-01-06 13:24:32.270 | DEBUG    | pals.PALS:get_plage_activity_df:76 - Mean values of the rows in the DF is [-0. -0.  0.  0. -0. -0. -0.  0. -0.  0.  0. -0.  0.  0. -0. -0. -0. -0.
  0. -0.  0. -0. -0.  0. -0.  0.  0. -0.  0. -0. -0.  0. -0.  0. -0.  0.
 -0.  0. -0. -0.  0. -0.  0.  0.  0. -0. -0. -0. -0. -0. -0.  0.  0. -0.
  0. -0.  0.  0.  0.  0. -0.  0.  0. -0.  0. -0.  0.  0.  0. -0. -0. -0.
  0.  0.  0. -0. -0. -0.  0.  0.  0. -0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0. -0.  0. -0.  0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0. -0. -0.  0.
 -0. -0. -0. -0. -0. -0. -0. -0.  0.  0. -0.  0. -0.  0. -0. -0.  0. -0.
 -0.  0.  0.  0.]
2020-01-06 13:24:32.271 | DEBUG    | pals.PALS:get_plage_activit

In [17]:
# Label of the combined p-value column for the first comparison:
# '<database_name> <comparison name> comb_p' (e.g. 'COMPOUND Distal_vs_Proximal comb_p').
sort_column_compounds = '%s %s comb_p' % (database_name, comparisons[0]['name'])
# Rank pathways by ascending combined p-value. Rebind the sorted copy instead of
# sorting in place, so the frame returned by PALS is not mutated behind its back.
pathway_compounds_df = pathway_compounds_df.sort_values(sort_column_compounds)

In [18]:
# Display the compound pathway table (pandas truncates the middle rows in the rendered output).
pathway_compounds_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND Distal_vs_Proximal comb_p
R-DRE-70614,Amino acid synthesis and interconversion (tran...,0.003237,31,12,38.71,0.006442,5.94,19.16,0.000795
R-DRE-71240,Tryptophan catabolism,0.004623,21,9,42.86,0.008818,4.03,19.19,0.001274
R-DRE-1362409,Mitochondrial iron-sulfur cluster biogenesis,0.001454,4,1,25.00,0.574761,0.77,19.25,0.001973
R-DRE-70688,Proline catabolism,0.003424,10,3,30.00,0.296009,1.92,19.20,0.002919
R-DRE-2408508,"Metabolism of ingested SeMet, Sec, MeSec into ...",0.001454,8,1,12.50,0.821109,1.53,19.12,0.003080
...,...,...,...,...,...,...,...,...,...
R-DRE-1483226,Synthesis of PI,0.565316,6,2,33.33,0.324019,1.15,19.17,0.528597
R-DRE-77108,Utilization of Ketone Bodies,0.952332,10,2,20.00,0.601718,1.92,19.20,0.954108
R-DRE-2024096,HS-GAG degradation,0.990893,5,1,20.00,0.657181,0.96,19.20,0.991685
R-DRE-1483076,Synthesis of CL,0.999994,1,1,100.00,0.191667,0.19,19.00,0.999982
