In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# library source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os, sys

In [3]:
import pandas as pd
from loguru import logger

In [4]:
sys.path.append('..')

from pyMultiOmics.constants import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info
from pyMultiOmics.analysis import *
from pyMultiOmics.query import *
from pyMultiOmics.pipelines import *

2021-02-16 22:56:33.761 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Location of the bundled zebrafish test data, expressed relative to the
# current working directory and then resolved to an absolute path.
_relative_data_dir = os.path.join('test_data', 'zebrafish_data')
DATA_FOLDER = os.path.abspath(_relative_data_dir)
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks\\test_data\\zebrafish_data'

Read the transcriptomics data (rows are identified by gene IDs)

In [6]:
# Gene measurements table, indexed by its 'Identifier' column.
gene_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'gene_data_combined.csv'),
    index_col='Identifier',
)

# Matching design table, indexed by its 'sample' column.
gene_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'gene_design.csv'),
    index_col='sample',
)

Read proteomics data

In [7]:
# Protein measurements table, indexed by its 'Uniprot' column.
protein_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_data.csv'),
    index_col='Uniprot',
)

# Matching design table, indexed by its 'sample' column.
protein_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_design.csv'),
    index_col='sample',
)

Read metabolomics data

In [8]:
# Compound measurements table, indexed by its 'Identifier' column.
compound_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_data_kegg.csv'),
    index_col='Identifier',
)

# Matching design table, indexed by its 'sample' column.
compound_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_design.csv'),
    index_col='sample',
)

In [9]:
# Adjust logging through the pyMultiOmics helper (judging by its name, this
# sets the log level to INFO — confirm against pyMultiOmics.common).
set_log_level_info()

1

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [10]:
# Assemble the mapping object: register the gene, protein and compound tables
# (each with its design table) and then build the mapping.
m = (
    Mapper(DANIO_RERIO, metabolic_pathway_only=True)
    .set_gene(gene_data, gene_design)
    .set_protein(protein_data, protein_design)
    .set_compound(compound_data, compound_design)
    .build()
)

2021-02-16 22:56:34.778 | INFO     | pyMultiOmics.functions:reactome_mapping:72 - Mapping genes -> proteins
2021-02-16 22:56:35.545 | INFO     | pyMultiOmics.functions:reactome_mapping:77 - Mapping proteins -> reactions
2021-02-16 22:56:37.618 | INFO     | pyMultiOmics.functions:reactome_mapping:85 - Mapping compounds -> reactions
2021-02-16 22:56:38.408 | INFO     | pyMultiOmics.functions:reactome_mapping:91 - Mapping reactions -> pathways
2021-02-16 22:56:38.730 | INFO     | pyMultiOmics.functions:reactome_mapping:102 - Mapping reactions -> proteins
2021-02-16 22:56:40.954 | INFO     | pyMultiOmics.functions:reactome_mapping:109 - Mapping reactions -> compounds
2021-02-16 22:56:42.179 | INFO     | pyMultiOmics.functions:reactome_mapping:121 - Mapping proteins -> genes
2021-02-16 22:56:58.192 | INFO     | pyMultiOmics.mapping:_add_nodes:181 - Processing nodes: genes
2021-02-16 22:57:00.523 | INFO     | pyMultiOmics.mapping:_add_nodes:181 - Processing nodes: proteins
2021-02-16 22:57:0

## Differential expression (DE) analysis

In [11]:
method = INFERENCE_T_TEST
ap = AnalysisPipeline(m)

# Pairwise comparisons to test, as (case, control) group labels, in run order.
DE_COMPARISONS = [
    ('Distal', 'Proximal'),
    ('Distal', 'Middle'),
    ('Proximal', 'Middle'),
]

# Run every comparison for each data type. This issues the same nine run_de
# calls, in the same order, as writing them out one by one. The loop variables
# are prefixed with `de_` so they do not clash with the `case` / `control`
# query parameters defined later in the notebook.
for de_data_type in (GENES, PROTEINS, COMPOUNDS):
    for de_case, de_control in DE_COMPARISONS:
        ap.run_de(method, de_data_type, de_case, de_control)

2021-02-16 22:57:07.914 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Proximal
2021-02-16 22:57:20.867 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Middle
2021-02-16 22:57:33.921 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Proximal, control is Middle
2021-02-16 22:57:47.381 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Proximal
2021-02-16 22:57:48.779 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Middle
2021-02-16 22:57:50.157 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Proximal, control is Middle
2021-02-16 22:57:51.612 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Proximal
2021-02-16 22:57:51.705 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Middle
2021-02-16 22:57:51.798 | INFO     | pyMultiOmics.pipe

In [12]:
# NOTE(review): this repeats the COMPOUNDS Distal-vs-Proximal comparison that
# the DE analysis cell above already runs — TODO confirm whether this cell is
# still needed or is a leftover.
ap.run_de(method, COMPOUNDS, 'Distal', 'Proximal')



In [13]:
# Fetch the DE results for genes, Distal (case) vs Proximal (control),
# and preview the first 10 rows.
df = ap.de_results(GENES, 'Distal', 'Proximal')
df.head(10)

Unnamed: 0,padj_Distal_vs_Proximal,FC_Distal_vs_Proximal
ENSDARG00000000001,0.0244604,0.715686
ENSDARG00000000002,0.5147844,0.127781
ENSDARG00000000018,0.00363021,-0.620668
ENSDARG00000000019,0.04935084,0.349941
ENSDARG00000000068,0.3961284,-0.113797
ENSDARG00000000069,0.03747644,-0.296828
ENSDARG00000000086,0.02716414,-0.457724
ENSDARG00000000103,1.91061e-07,-1.54394
ENSDARG00000000142,0.008461476,0.354184
ENSDARG00000000151,0.2412045,0.112137


## Query results

Retrieve a single node

In [14]:
# Look up one entity by its identifier.
node_id = '15366'
res = (
    QueryBuilder(ap)
    .add(Entity(node_id))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15366,Acetic acid,compounds,False


Retrieve multiple nodes

In [15]:
# Look up several entities at once by passing a list of identifiers.
node_id = ['15366', 'ENSDARG00000037781', 'F1QAA7']
res = (
    QueryBuilder(ap)
    .add(Entity(node_id))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15366,Acetic acid,compounds,False
ENSDARG00000037781,Acss2,genes,True
F1QAA7,F1QAA7,proteins,True


Retrieve nodes connected to a single node

In [16]:
# Start from one entity, then expand to the entities connected to it.
query_id = 'F1QAA7'
res = (
    QueryBuilder(ap)
    .add(Entity(query_id))
    .add(Connected())
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSDARG00000037781,Acss2,genes,True
15346,Coenzyme a,compounds,False
16027,Adenosine 5-monophosphate,compounds,True
15351,Acetyl-coa,compounds,False
15366,Acetic acid,compounds,False
29888,Diphosphoric acid,compounds,False
30616,Atp(4-),compounds,False
R-DRE-71735,Acetate + coa + atp => acetyl-coa + amp + pyro...,reactions,
R-DRE-71384,Ethanol oxidation,pathways,


Retrieve top-10 significantly changing genes

In [18]:
# Group labels of the comparison whose DE results are queried.
case, control = 'Distal', 'Proximal'

# Arguments handed to SignificantDE in the queries that follow.
# NOTE(review): presumably `pval` is a significance cut-off, `fc_lte` / `fc_gte`
# are the lower / upper fold-change cut-offs, and `N` caps the number of
# results — confirm against the SignificantDE implementation.
pval = 0.05
fc_lte, fc_gte = -2, 2
N = 10

In [19]:
# Select the genes, then keep those passing the SignificantDE filter for the
# chosen comparison.
res = (
    QueryBuilder(ap)
    .add(Select(GENES))
    .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N))
    .run()
)
res

Unnamed: 0,display_name,data_type,observed,padj_Distal_vs_Proximal,FC_Distal_vs_Proximal
ENSDARG00000045854,Fgf23,genes,True,3.56955e-10,24.000289
ENSDARG00000071091,Chrm3a,genes,True,3.307603e-11,22.486196
ENSDARG00000104259,Cabz01072885.1,genes,True,2.782933e-10,20.545646
ENSDARG00000005522,Galr1a,genes,True,2.091894e-09,20.531569
ENSDARG00000091254,Si:ch73-59p9.2,genes,True,3.736906e-10,19.689389
ENSDARG00000093089,Tac3a,genes,True,3.56955e-10,-22.875241
ENSDARG00000038894,Tmx3a,genes,True,4.745826e-10,-23.095898
ENSDARG00000039067,Srd5a2b,genes,True,6.49333e-11,-23.483296
ENSDARG00000076270,Adamts13,genes,True,6.923704e-11,-23.815791
ENSDARG00000025783,Si:ch211-125e6.11,genes,True,6.804488e-11,-26.987938


Plot a heatmap using Plotly (this step is currently disabled; the cell below is commented out)

In [20]:
# NOTE(review): this heatmap query is disabled (entirely commented out) and,
# unlike the other queries in this notebook, has no trailing .run() call.
# TODO: restore it as a working cell or remove it.
# res = QueryBuilder(ap) \
#         .add(Select(GENES)) \
#         .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N)) \
#         .add(Heatmap())

Find the compounds that are connected to the DE genes above

In [21]:
# Same gene selection and SignificantDE filter as before, followed by an
# expansion to connected entities restricted to the COMPOUNDS data type.
res = (
    QueryBuilder(ap)
    .add(Select(GENES))
    .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N))
    .add(Connected(data_type=COMPOUNDS))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
78584,Aflatoxin b1 exo-cysteinylglycine conjugate,compounds,False
15377,Water,compounds,False
29985,L-glutamate(1-),compounds,False
78587,Aflatoxin b1 exo glutathione conjugate,compounds,False
78579,Aflatoxin b1 endo-cysteinylglycine conjugate,compounds,False
78581,Aflatoxin b1 endo glutathione conjugate,compounds,False
28666,Leukotriene d4,compounds,False
16978,Leukotriene c4,compounds,False
4047,L-cysteinylglycine,compounds,True
16856,Glutathione,compounds,True
