In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os, sys

In [3]:
import pandas as pd
from loguru import logger

In [4]:
sys.path.append('..')

from pyMultiOmics.constants import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info
from pyMultiOmics.analysis import *
from pyMultiOmics.query import *
from pyMultiOmics.pipelines import *

2021-04-29 16:09:59.966 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Absolute location of the zebrafish test data; the relative part is resolved
# against the current working directory.
_relative_data_dir = os.path.join('test_data', 'zebrafish_data')
DATA_FOLDER = os.path.abspath(_relative_data_dir)
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks\\test_data\\zebrafish_data'

Read transcriptomics data (identified by their gene ids)

In [6]:
# Gene measurements are indexed by the 'Identifier' column; the matching
# design table is indexed by 'sample'.
gene_data_path = os.path.join(DATA_FOLDER, 'gene_data_combined.csv')
gene_design_path = os.path.join(DATA_FOLDER, 'gene_design.csv')
gene_data = pd.read_csv(gene_data_path, index_col='Identifier')
gene_design = pd.read_csv(gene_design_path, index_col='sample')

Read proteomics data

In [7]:
# Protein measurements are indexed by the 'Uniprot' column; the matching
# design table is indexed by 'sample'.
protein_data_path = os.path.join(DATA_FOLDER, 'protein_data.csv')
protein_design_path = os.path.join(DATA_FOLDER, 'protein_design.csv')
protein_data = pd.read_csv(protein_data_path, index_col='Uniprot')
protein_design = pd.read_csv(protein_design_path, index_col='sample')

Read metabolomics data

In [8]:
# Compound measurements are indexed by the 'Identifier' column; the matching
# design table is indexed by 'sample'.
compound_data_path = os.path.join(DATA_FOLDER, 'compound_data_chebi.csv')
compound_design_path = os.path.join(DATA_FOLDER, 'compound_design.csv')
compound_data = pd.read_csv(compound_data_path, index_col='Identifier')
compound_design = pd.read_csv(compound_design_path, index_col='sample')

In [9]:
# Adjust logging through pyMultiOmics' own helper; presumably this limits
# output to INFO level and above — confirm in pyMultiOmics.common.
set_log_level_info()

1

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [10]:
# Register the gene, protein and compound tables (each with its design table)
# and build the mapping object.
# NOTE(review): metabolic_pathway_only=True presumably restricts the mapping to
# metabolic pathways — confirm against Mapper.
m = (
    Mapper(DANIO_RERIO, metabolic_pathway_only=True)
    .set_gene(gene_data, gene_design)
    .set_protein(protein_data, protein_design)
    .set_compound(compound_data, compound_design)
    .build()
)

2021-04-29 16:10:02.161 | INFO     | pyMultiOmics.functions:remove_dupes:385 - Removing 0 rows with duplicate identifiers
2021-04-29 16:10:02.161 | INFO     | pyMultiOmics.functions:reactome_mapping:78 - There are 124 observed compound ids
2021-04-29 16:10:02.161 | INFO     | pyMultiOmics.functions:reactome_mapping:81 - Mapping genes -> proteins
2021-04-29 16:10:03.025 | INFO     | pyMultiOmics.functions:reactome_mapping:86 - Mapping proteins -> reactions
2021-04-29 16:10:04.654 | INFO     | pyMultiOmics.functions:reactome_mapping:94 - Mapping compounds -> reactions
2021-04-29 16:10:05.522 | INFO     | pyMultiOmics.functions:reactome_mapping:100 - Mapping reactions -> pathways
2021-04-29 16:10:05.831 | INFO     | pyMultiOmics.functions:reactome_mapping:111 - Mapping reactions -> proteins
2021-04-29 16:10:07.926 | INFO     | pyMultiOmics.functions:reactome_mapping:118 - Mapping reactions -> compounds
2021-04-29 16:10:09.025 | INFO     | pyMultiOmics.functions:reactome_mapping:130 - Mapp

## Differential expression (DE) analysis

In [11]:
# Wrap the built mapping in an analysis pipeline; the DE runs and the queries
# in the rest of the notebook all go through `ap`.
ap = AnalysisPipeline(m)

In [12]:
method = INFERENCE_T_TEST

# Pairwise group comparisons, given as (case, control) in the order they are run.
for de_case, de_control in [('Distal', 'Proximal'),
                            ('Distal', 'Middle'),
                            ('Proximal', 'Middle')]:
    ap.run_de(method, GENES, de_case, de_control)

2021-04-29 16:10:32.704 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Distal, control is Proximal
2021-04-29 16:10:46.857 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Distal, control is Middle
2021-04-29 16:11:01.222 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Proximal, control is Middle


In [13]:
method = INFERENCE_T_TEST

# Same three (case, control) comparisons, first for all proteins and then for
# all compounds.
group_pairs = [('Distal', 'Proximal'), ('Distal', 'Middle'), ('Proximal', 'Middle')]
for omics_type in (PROTEINS, COMPOUNDS):
    for de_case, de_control in group_pairs:
        ap.run_de(method, omics_type, de_case, de_control)

2021-04-29 16:11:15.065 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Distal, control is Proximal
2021-04-29 16:11:16.668 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Distal, control is Middle
2021-04-29 16:11:18.165 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Proximal, control is Middle
2021-04-29 16:11:19.690 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Distal, control is Proximal
2021-04-29 16:11:19.784 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Distal, control is Middle
2021-04-29 16:11:19.846 | INFO     | pyMultiOmics.pipelines:run_ttest:134 - t-test case is Proximal, control is Middle


## Query results

Retrieve a single node

In [14]:
# Look up one entity by its identifier.
node_id = '15366'
res = (
    QueryBuilder(ap)
    .add(Entity(node_id))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15366,Acetic acid,compounds,False


Retrieve multiple nodes

In [15]:
# Entity also accepts a list of identifiers; here one compound, one gene and
# one protein id are looked up together.
node_id = ['15366', 'ENSDARG00000037781', 'F1QAA7']
res = (
    QueryBuilder(ap)
    .add(Entity(node_id))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15366,Acetic acid,compounds,False
ENSDARG00000037781,Acss2,genes,True
F1QAA7,F1QAA7,proteins,True


Retrieve nodes connected to a single node

In [16]:
# Start from one entity, then expand to everything connected to it.
query_id = 'F1QAA7'
res = (
    QueryBuilder(ap)
    .add(Entity(query_id))
    .add(Connected())
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSDARG00000037781,Acss2,genes,True,F1QAA7
15366,Acetic acid,compounds,False,F1QAA7
16027,Adenosine 5-monophosphate,compounds,True,F1QAA7
29888,Diphosphoric acid,compounds,False,F1QAA7
15351,Acetyl-coa,compounds,False,F1QAA7
15346,Coenzyme a,compounds,False,F1QAA7
30616,Atp(4-),compounds,False,F1QAA7
R-DRE-71735,Acetate + coa + atp => acetyl-coa + amp + pyro...,reactions,,F1QAA7
R-DRE-71384,Ethanol oxidation,pathways,,F1QAA7


Retrieve the top `N` significantly changing genes (`N = 20` in the parameter cell below)

In [17]:
# Comparison and thresholds handed to SignificantDE in the queries that follow.
case, control = 'Distal', 'Proximal'
pval = 0.05
# NOTE(review): presumably keeps entities whose fold change is <= fc_lte or
# >= fc_gte — confirm against SignificantDE.
fc_lte, fc_gte = -2, 2
N = 20

In [18]:
# Select all genes, then keep those passing the SignificantDE filter for the
# chosen case/control comparison.
res = (
    QueryBuilder(ap)
    .add(Select(GENES))
    .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N))
    .run()
)
res

Unnamed: 0,display_name,data_type,observed,padj_Distal_vs_Proximal,FC_Distal_vs_Proximal
ENSDARG00000045854,Fgf23,genes,True,3.56955e-10,24.000289
ENSDARG00000071091,Chrm3a,genes,True,3.307603e-11,22.486196
ENSDARG00000104259,Cabz01072885.1,genes,True,2.782933e-10,20.545646
ENSDARG00000005522,Galr1a,genes,True,2.091894e-09,20.531569
ENSDARG00000091254,Si:ch73-59p9.2,genes,True,3.736906e-10,19.689389
ENSDARG00000008541,Chia.4,genes,True,3.736906e-10,19.689389
ENSDARG00000093738,Pth1a,genes,True,3.736906e-10,19.689389
ENSDARG00000037946,Prl,genes,True,3.736906e-10,19.689389
ENSDARG00000036171,Rnasel3,genes,True,1.848146e-11,19.636782
ENSDARG00000026484,Rab15,genes,True,3.56955e-10,19.138412


Find the compounds that are connected to the DE genes above

In [19]:
# Same gene filter as before, followed by an expansion to connected entities
# limited to the compounds data type.
res = (
    QueryBuilder(ap)
    .add(Select(GENES))
    .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N))
    .add(Connected(data_type=COMPOUNDS))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed,source_id
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
78579,Aflatoxin b1 endo-cysteinylglycine conjugate,compounds,False,ENSDARG00000091254
29985,L-glutamate(1-),compounds,True,ENSDARG00000091254
15377,Water,compounds,False,ENSDARG00000091254
78584,Aflatoxin b1 exo-cysteinylglycine conjugate,compounds,False,ENSDARG00000091254
78587,Aflatoxin b1 exo glutathione conjugate,compounds,False,ENSDARG00000091254
78581,Aflatoxin b1 endo glutathione conjugate,compounds,False,ENSDARG00000091254
16978,Leukotriene c4,compounds,False,ENSDARG00000091254
28666,Leukotriene d4,compounds,False,ENSDARG00000091254
4047,L-cysteinylglycine,compounds,True,ENSDARG00000091254
16856,Glutathione,compounds,True,ENSDARG00000091254


### Retrieve entity info

In [25]:
# Filter the genes, expand to every connected entity, then add the Info() step,
# which appends the infos / images / links columns seen in the result.
res = (
    QueryBuilder(ap)
    .add(Select(GENES))
    .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N))
    .add(Connected())
    .add(Info())
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed,source_id,infos,images,links
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q1L8M3,Q1L8M3,proteins,False,ENSDARG00000045854,"[{'key': 'Name', 'value': 'Q1L8M3'}, {'key': '...",[],"[{'text': 'Link to UniProt', 'href': 'http://w..."
X1WHZ7,X1WHZ7,proteins,False,ENSDARG00000071091,"[{'key': 'Name', 'value': 'Muscarinic acetylch...",[],"[{'text': 'Link to UniProt', 'href': 'http://w..."
A0A0G2L3I9,A0A0G2L3I9,proteins,False,ENSDARG00000104259,"[{'key': 'Name', 'value': 'A0A0G2L3I9'}, {'key...",[],"[{'text': 'Link to UniProt', 'href': 'http://w..."
F1QSA8,F1QSA8,proteins,False,ENSDARG00000005522,"[{'key': 'Name', 'value': 'F1QSA8'}, {'key': '...",[],"[{'text': 'Link to UniProt', 'href': 'http://w..."
E7F2C6,E7F2C6,proteins,False,ENSDARG00000091254,"[{'key': 'Name', 'value': 'E7F2C6'}, {'key': '...",[],"[{'text': 'Link to UniProt', 'href': 'http://w..."
...,...,...,...,...,...,...,...
16474,Nadph,compounds,False,ENSDARG00000039067,"[{'key': 'PiMP Peak ID', 'value': 'None'}, {'k...",[http://www.ebi.ac.uk/chebi/displayImage.do?de...,"[{'text': 'Link to ChEBI database', 'href': 'h..."
R-DRE-469659,Testosterone is converted to 5-alpha-dihydroxy...,reactions,,ENSDARG00000039067,"[{'key': 'Summary', 'value': 'The conversion o...",[https://reactome.org/ContentService/exporter/...,"[{'text': 'Link to Reactome database', 'href':..."
R-DRE-193048,Androgen biosynthesis,pathways,,ENSDARG00000039067,"[{'key': 'Summary', 'value': 'Androgens are th...",[https://reactome.org/ContentService/exporter/...,"[{'text': 'Link to Reactome database', 'href':..."
A0A286Y999,A0A286Y999,proteins,False,ENSDARG00000076270,"[{'key': 'Name', 'value': 'A0A286Y999'}, {'key...",[],"[{'text': 'Link to UniProt', 'href': 'http://w..."


In [29]:
from pyMultiOmics.info import get_info

In [30]:
# One example identifier per data type; the two lists are paired by position.
entity_ids = [
    'ENSDARG00000091254',
    'F1QAA7',
    '15378',
    'R-DRE-469659',
    'R-DRE-174403',
]
data_types = [
    'genes',
    'proteins',
    'compounds',
    'reactions',
    'pathways',
]
for entity_id, data_type in zip(entity_ids, data_types):
    print(entity_id, data_type)
    # end='\n\n' leaves one blank line between consecutive entries.
    print(get_info(entity_id, data_type), end='\n\n')

ENSDARG00000091254 genes
{'infos': [{'key': 'Description', 'value': 'si:ch73-59p9.2 '}, {'key': 'Species', 'value': 'danio_rerio'}], 'links': [{'text': 'Link to Ensembl', 'href': 'https://www.ensembl.org/id/ENSDARG00000091254'}, {'text': 'Link to GeneCard', 'href': 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=si:ch73-59p9.2'}, {'text': 'Transcript: si:ch73-59p9.2-201', 'href': 'https://www.ensembl.org/id/ENSDART00000111526'}], 'images': []}

F1QAA7 proteins
{'infos': [{'key': 'Name', 'value': 'Propionate--CoA ligase'}, {'key': 'EC Number', 'value': 'EC6.2.1.17'}, {'key': 'Catalytic Activity', 'value': '\n\nATP + CoA + propanoate = AMP + diphosphate + propanoyl-CoA\n\n\n\n\n\n\n\n\n\n\n\n\n'}, {'key': 'Catalytic Activity', 'value': '\n\nacetate + ATP + CoA = acetyl-CoA + AMP + diphosphate\n\n\n\n\n\n\n\n\n\n\n\n\n'}, {'key': 'Gene_ontologies', 'value': 'acetate-CoA ligase activity; propionate-CoA ligase activity'}], 'links': [{'text': 'Link to UniProt', 'href': 'http://www.unipro

### Plot a heatmap using Plotly

In [None]:
# Re-run the gene query so that `res` holds the filtered genes again; its index
# selects the heatmap rows further down.
res = (
    QueryBuilder(ap)
    .add(Select(GENES))
    .add(SignificantDE(case, control, pval, fc_lte=fc_lte, fc_gte=fc_gte, N=N))
    .run()
)
res

In [None]:
# Fetch the DE analysis stored for this data type and comparison, and pull the
# data / design frames from its `wi` attribute.
data_type = GENES
analysis = ap.get_de_analysis(data_type, case, control)
wi = analysis.wi
data_df = wi.data_df
design_df = wi.design_df
data_df

In [None]:
# Sample names (design index) per group; case samples are listed first, then
# control samples.
group_labels = design_df['group']
case_group = design_df[group_labels == case].index.values.tolist()
control_group = design_df[group_labels == control].index.values.tolist()
selection = case_group + control_group
selection

In [None]:
# Rows: the entities returned by the last query; columns: the selected samples.
queried_rows = wi.data_df.loc[res.index.values]
heatmap_df = queried_rows[selection]
heatmap_df

In [None]:
import plotly.express as px

# Standardise via the helper on `wi` before plotting.
# NOTE(review): log=True / axis=0 presumably log-transform and scale per row or
# column — confirm against standardize_df.
standardized_df = wi.standardize_df(heatmap_df, log=True, axis=0)
px.imshow(standardized_df)