In [1]:
# Reload imported modules automatically before each cell is executed, so that
# edits to the locally imported package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pathlib
import pickle
import pandas as pd

sys.path.append('..')

In [3]:
import pandas as pd

In [4]:
from pals.feature_extraction import DataSource
from pals.PLAGE import PLAGE
from pals.ORA import ORA
from pals.common import *

2021-01-07 17:05:09.519 | INFO     | pals.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Zebrafish Analysis

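Pathway analysis with PALS of multi-omics (transcript, protein and compound) zebrafish data from https://www.pnas.org/content/114/5/E717.short, comparing the Distal and Proximal sample groups.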

### Load data

In [5]:
def get_data_source(measurement_csv, design_csv, database_name, species_name, comparisons, metabolic_pathway_only, min_replace):
    """Build a ``DataSource`` from a measurement CSV and a design CSV.

    Parameters
    ----------
    measurement_csv
        CSV file with an 'Identifier' column; every other column is passed
        on as the intensity table.
    design_csv
        CSV file with 'sample' and 'group' columns.
    database_name
        Forwarded to ``DataSource`` as its fourth positional argument.
    species_name
        Forwarded to ``DataSource`` as ``reactome_species``.
    comparisons
        Stored unchanged under the 'comparisons' key of the experimental design.
    metabolic_pathway_only
        Forwarded to ``DataSource`` as ``reactome_metabolic_pathway_only``.
    min_replace
        Forwarded to ``DataSource`` as ``min_replace``.

    Returns
    -------
    The ``DataSource`` constructed with ``reactome_query=True``.
    """
    measurements = pd.read_csv(measurement_csv)

    # Everything except 'Identifier' forms the intensity table.
    intensities = measurements.drop(columns='Identifier')

    # The identifiers become a one-column annotation table named 'entity_id'.
    annotations = measurements[['Identifier']].rename(columns={'Identifier': 'entity_id'})

    # Map each group label in the design file to the list of its sample names.
    design = pd.read_csv(design_csv)
    samples_by_group = {
        label: members['sample'].values.tolist()
        for label, members in design.groupby('group')
    }
    design_spec = {'comparisons': comparisons, 'groups': samples_by_group}

    return DataSource(
        intensities, annotations, design_spec, database_name,
        reactome_species=species_name,
        reactome_metabolic_pathway_only=metabolic_pathway_only,
        reactome_query=True,
        min_replace=min_replace,
    )

In [6]:
# Passed to get_data_source, which forwards them to DataSource as
# reactome_metabolic_pathway_only and reactome_species respectively.
metabolic_pathway_only = True
species_name = 'Danio rerio'

# One contrast only: the 'Distal' group is the case, 'Proximal' the control.
comparisons = [{'case': 'Distal', 'control': 'Proximal', 'name': 'Distal_vs_Proximal'}]

### PALS analysis of transcripts

In [7]:
# Transcript layer: gene measurements and design, using the ENSEMBL database constant.
# DATABASE_REACTOME_ENSEMBL and SMALL are not defined in this notebook; they arrive
# through the wildcard import `from pals.common import *`.
min_replace = SMALL
database_name = DATABASE_REACTOME_ENSEMBL
design_file = os.path.join('test_data', 'zebrafish', 'gene_design.csv')
data_file = os.path.join('test_data', 'zebrafish', 'gene_data.csv')
ds = get_data_source(
    measurement_csv=data_file,
    design_csv=design_file,
    database_name=database_name,
    species_name=species_name,
    comparisons=comparisons,
    metabolic_pathway_only=metabolic_pathway_only,
    min_replace=min_replace,
)

2021-01-07 17:05:09.865 | DEBUG    | pals.feature_extraction:__init__:43 - Using ENSEMBL as database
2021-01-07 17:05:09.918 | DEBUG    | pals.reactome:get_gene_entity_dict:389 - 
        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rs.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rs.identifier AS entity_id, 
            rs.geneName[0] AS display_name
        
2021-01-07 17:05:10.184 | DEBUG    | pals.reactome:get_gene_mapping_dict:252 - MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (rg:ReferenceGeneProduct)-[:referenceGe

In [8]:
# Run PLAGE on the transcript data source and keep the resulting pathway table.
# NOTE: `ds` and `plage` are rebound by the protein and compound cells further
# down, so `ds` must still be the transcript data source when this cell runs.
plage = PLAGE(ds)
pathway_transcripts_df = plage.get_results()

2021-01-07 17:05:13.667 | DEBUG    | pals.PLAGE:__init__:29 - PLAGE initialised
2021-01-07 17:05:13.791 | DEBUG    | pals.preprocessing:process:20 - Performing min-value imputation
2021-01-07 17:05:13.810 | DEBUG    | pals.preprocessing:process:36 - Performing row average imputation
2021-01-07 17:05:13.827 | DEBUG    | pals.preprocessing:process:46 - Applying log normalisation
2021-01-07 17:05:13.832 | DEBUG    | pals.preprocessing:process:53 - Scaling the data across the sample: zero mean and unit variance
2021-01-07 17:05:14.085 | DEBUG    | pals.PLAGE:get_plage_activity_df:84 - Mean values of the rows in the DF is [-0. -0. -0. ...  0. -0.  0.]
2021-01-07 17:05:14.085 | DEBUG    | pals.PLAGE:get_plage_activity_df:85 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2021-01-07 17:05:14.150 | DEBUG    | pals.PLAGE:set_up_resample_plage_p_df:96 - Calculating plage p-values with resampling
2021-01-07 17:05:14.151 | DEBUG    | pals.PLAGE:set_up_resample_plage_p_df:103 - Comparis

In [9]:
# Combined p-value column for the first comparison, e.g.
# 'ENSEMBL Distal_vs_Proximal comb_p'.
# NOTE: `database_name` is reassigned by the protein and compound cells below;
# re-run the transcript load cell before re-running this one out of order.
sort_column_transcripts = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to a sorted copy instead of sorting in place, so the frame returned by
# plage.get_results() is never mutated behind the scenes.
pathway_transcripts_df = pathway_transcripts_df.sort_values(sort_column_transcripts)

In [10]:
# Display the transcript pathway table, ordered by the combined p-value column sorted on above.
pathway_transcripts_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,ENSEMBL Distal_vs_Proximal comb_p
R-DRE-170822,Regulation of Glucokinase by Glucokinase Regul...,1.109894e-13,1,1,100.00,0.861082,0.86,86.00,1.109894e-13
R-DRE-975634,Retinoid metabolism and transport,5.334716e-02,37,33,89.19,0.399819,31.86,86.11,5.334716e-02
R-DRE-193048,Androgen biosynthesis,5.511272e-02,13,11,84.62,0.733958,11.19,86.08,5.511272e-02
R-DRE-156590,Glutathione conjugation,7.228642e-02,24,17,70.83,0.987741,20.67,86.12,7.228642e-02
R-DRE-1971475,A tetrasaccharide linker sequence is required ...,7.432583e-02,24,23,95.83,0.132846,20.67,86.12,7.432583e-02
...,...,...,...,...,...,...,...,...,...
R-DRE-1483206,Glycerophospholipid biosynthesis,1.000000e+00,4,4,100.00,0.549483,3.44,86.00,1.000000e+00
R-DRE-2393930,Phosphate bond hydrolysis by NUDT proteins,1.000000e+00,7,6,85.71,0.747516,6.03,86.14,1.000000e+00
R-DRE-9673163,Oleoyl-phe metabolism,1.000000e+00,1,1,100.00,0.861082,0.86,86.00,1.000000e+00
R-DRE-167827,The proton buffering model,1.000000e+00,5,4,80.00,0.855465,4.31,86.20,1.000000e+00


### PALS analysis of proteins

In [11]:
# Protein layer: protein measurements and design, using the UniProt database constant.
# This rebinds database_name, data_file, design_file, min_replace and ds, which
# the transcript cells above also used.
min_replace = SMALL
database_name = DATABASE_REACTOME_UNIPROT
design_file = os.path.join('test_data', 'zebrafish', 'protein_design.csv')
data_file = os.path.join('test_data', 'zebrafish', 'protein_data.csv')
ds = get_data_source(
    measurement_csv=data_file,
    design_csv=design_file,
    database_name=database_name,
    species_name=species_name,
    comparisons=comparisons,
    metabolic_pathway_only=metabolic_pathway_only,
    min_replace=min_replace,
)

2021-01-07 17:05:15.728 | DEBUG    | pals.feature_extraction:__init__:43 - Using UniProt as database
2021-01-07 17:05:15.776 | DEBUG    | pals.reactome:get_protein_entity_dict:337 - 
        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rg.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rg.identifier AS entity_id, 
            rg.description AS display_name
        
2021-01-07 17:05:15.986 | DEBUG    | pals.reactome:get_protein_mapping_dict:200 - MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (re:ReferenceEntity)-[:referenceD

In [12]:
# Run PLAGE on the protein data source and keep the resulting pathway table.
# NOTE: `ds` must be the protein data source here; it is shared with, and
# rebound by, the transcript and compound cells.
plage = PLAGE(ds)
pathway_proteins_df = plage.get_results()

2021-01-07 17:05:16.613 | DEBUG    | pals.PLAGE:__init__:29 - PLAGE initialised
2021-01-07 17:05:16.648 | DEBUG    | pals.preprocessing:process:20 - Performing min-value imputation
2021-01-07 17:05:16.654 | DEBUG    | pals.preprocessing:process:36 - Performing row average imputation
2021-01-07 17:05:16.660 | DEBUG    | pals.preprocessing:process:46 - Applying log normalisation
2021-01-07 17:05:16.661 | DEBUG    | pals.preprocessing:process:53 - Scaling the data across the sample: zero mean and unit variance
2021-01-07 17:05:16.693 | DEBUG    | pals.PLAGE:get_plage_activity_df:84 - Mean values of the rows in the DF is [-0.  0.  0. ...  0.  0.  0.]
2021-01-07 17:05:16.693 | DEBUG    | pals.PLAGE:get_plage_activity_df:85 - Variance in the rows of the DF is [1. 1. 1. ... 1. 1. 1.]
2021-01-07 17:05:16.731 | DEBUG    | pals.PLAGE:set_up_resample_plage_p_df:96 - Calculating plage p-values with resampling
2021-01-07 17:05:16.732 | DEBUG    | pals.PLAGE:set_up_resample_plage_p_df:103 - Comparis

In [13]:
# Combined p-value column for the first comparison, e.g.
# 'UniProt Distal_vs_Proximal comb_p'.
# NOTE: `database_name` is shared with the transcript and compound cells;
# re-run the protein load cell before re-running this one out of order.
sort_column_proteins = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to a sorted copy instead of sorting in place, so the frame returned by
# plage.get_results() is never mutated behind the scenes.
pathway_proteins_df = pathway_proteins_df.sort_values(sort_column_proteins)

In [14]:
# Display the protein pathway table, ordered by the combined p-value column sorted on above.
pathway_proteins_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,UniProt Distal_vs_Proximal comb_p
R-DRE-8964208,Phenylalanine metabolism,0.003387,6,3,50.00,0.093602,1.18,19.67,0.003387
R-DRE-499943,Interconversion of nucleotide di- and triphosp...,0.007733,27,7,25.93,0.267003,5.29,19.59,0.007733
R-DRE-8963684,Tyrosine catabolism,0.009309,5,1,20.00,0.664430,0.98,19.60,0.009309
R-DRE-211945,Phase I - Functionalization of compounds,0.010343,18,6,33.33,0.122023,3.53,19.61,0.010343
R-DRE-156590,Glutathione conjugation,0.013735,18,9,50.00,0.003494,3.53,19.61,0.013735
...,...,...,...,...,...,...,...,...,...
R-DRE-400206,Regulation of lipid metabolism by PPARalpha,1.000000,17,1,5.88,0.975947,3.33,19.59,1.000000
R-DRE-434313,Intracellular metabolism of fatty acids regula...,1.000000,3,1,33.33,0.480411,0.59,19.67,1.000000
R-DRE-71288,Creatine metabolism,1.000000,17,4,23.53,0.434297,3.33,19.59,1.000000
R-DRE-197264,Nicotinamide salvaging,1.000000,22,2,9.09,0.948616,4.31,19.59,1.000000


### PALS analysis of compounds

In [15]:
# Compound layer: compound measurements and design, using the KEGG database constant.
# This rebinds database_name, data_file, design_file, min_replace and ds, which
# the transcript and protein cells above also used.
min_replace = SMALL
database_name = DATABASE_REACTOME_KEGG
design_file = os.path.join('test_data', 'zebrafish', 'compound_design.csv')
data_file = os.path.join('test_data', 'zebrafish', 'compound_data.csv')
ds = get_data_source(
    measurement_csv=data_file,
    design_csv=design_file,
    database_name=database_name,
    species_name=species_name,
    comparisons=comparisons,
    metabolic_pathway_only=metabolic_pathway_only,
    min_replace=min_replace,
)

2021-01-07 17:05:18.043 | DEBUG    | pals.feature_extraction:__init__:43 - Using COMPOUND as database
2021-01-07 17:05:18.044 | DEBUG    | pals.loader:load_data:56 - Retrieving data for Danio rerio from Reactome COMPOUND metabolic_pathway_only=True
2021-01-07 17:05:18.218 | DEBUG    | pals.feature_extraction:__init__:56 - Mapping pathway to unique ids
2021-01-07 17:05:18.220 | DEBUG    | pals.feature_extraction:__init__:70 - Creating dataset to pathway mapping
2021-01-07 17:05:18.228 | DEBUG    | pals.feature_extraction:__init__:98 - Computing unique id counts


In [16]:
# Run PLAGE on the compound data source and keep the resulting pathway table.
# NOTE: `ds` must be the compound data source here; it is shared with, and
# rebound by, the transcript and protein cells.
plage = PLAGE(ds)
pathway_compounds_df = plage.get_results()

2021-01-07 17:05:18.308 | DEBUG    | pals.PLAGE:__init__:29 - PLAGE initialised
2021-01-07 17:05:18.363 | DEBUG    | pals.preprocessing:process:20 - Performing min-value imputation
2021-01-07 17:05:18.369 | DEBUG    | pals.preprocessing:process:36 - Performing row average imputation
2021-01-07 17:05:18.377 | DEBUG    | pals.preprocessing:process:46 - Applying log normalisation
2021-01-07 17:05:18.379 | DEBUG    | pals.preprocessing:process:53 - Scaling the data across the sample: zero mean and unit variance
2021-01-07 17:05:18.384 | DEBUG    | pals.PLAGE:get_plage_activity_df:84 - Mean values of the rows in the DF is [-0. -0.  0.  0. -0. -0. -0.  0. -0.  0.  0. -0.  0.  0. -0. -0. -0. -0.
  0. -0.  0. -0. -0.  0. -0.  0.  0. -0.  0. -0. -0.  0. -0.  0. -0.  0.
 -0.  0. -0. -0.  0. -0.  0.  0.  0. -0. -0. -0. -0. -0. -0.  0.  0. -0.
  0. -0.  0.  0.  0.  0. -0.  0.  0. -0.  0. -0.  0.  0.  0. -0. -0. -0.
  0.  0.  0. -0. -0. -0.  0.  0.  0. -0. -0. -0. -0. -0.  0.  0.  0. -0.
 -0. -0.  

In [17]:
# Combined p-value column for the first comparison, e.g.
# 'COMPOUND Distal_vs_Proximal comb_p'.
# NOTE: `database_name` is shared with the transcript and protein cells;
# re-run the compound load cell before re-running this one out of order.
sort_column_compounds = '%s %s comb_p' % (database_name, comparisons[0]['name'])

# Rebind to a sorted copy instead of sorting in place, so the frame returned by
# plage.get_results() is never mutated behind the scenes.
pathway_compounds_df = pathway_compounds_df.sort_values(sort_column_compounds)

In [18]:
# Display the compound pathway table, ordered by the combined p-value column sorted on above.
pathway_compounds_df

Unnamed: 0,pw_name,Distal_vs_Proximal p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND Distal_vs_Proximal comb_p
R-DRE-2408508,"Metabolism of ingested SeMet, Sec, MeSec into ...",0.001561,7,1,14.29,0.771919,1.32,18.86,0.001561
R-DRE-1362409,Mitochondrial iron-sulfur cluster biogenesis,0.001561,3,1,33.33,0.467114,0.57,19.00,0.001561
R-DRE-389542,NADPH regeneration,0.003046,6,1,16.67,0.717729,1.13,18.83,0.003046
R-DRE-880009,Interconversion of 2-oxoglutarate and 2-hydrox...,0.003046,5,1,20.00,0.650902,0.94,18.80,0.003046
R-DRE-211945,Phase I - Functionalization of compounds,0.003393,16,2,12.50,0.840088,3.02,18.88,0.003393
...,...,...,...,...,...,...,...,...,...
R-DRE-77108,Utilization of Ketone Bodies,0.953054,9,2,22.22,0.531351,1.70,18.89,0.953054
R-DRE-2024096,HS-GAG degradation,0.991322,5,1,20.00,0.650902,0.94,18.80,0.991322
R-DRE-2022854,Keratan sulfate biosynthesis,0.999996,5,2,40.00,0.239114,0.94,18.80,0.999996
R-DRE-9037629,Lewis blood group biosynthesis,0.999996,2,1,50.00,0.342277,0.38,19.00,0.999996
