In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to library source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os, sys

In [3]:
import pandas as pd
from loguru import logger

In [18]:
# Add the parent directory to the module search path so the pyMultiOmics
# imports below can resolve (presumably against the repository checkout that
# contains this notebook -- confirm). The entry is appended, so an installed
# copy found earlier on sys.path would still take precedence.
sys.path.append('..')

from pyMultiOmics.constants import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info
from pyMultiOmics.analysis import *
from pyMultiOmics.query import *

# Demonstration of pyMultiOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Absolute location of the zebrafish test data, resolved against the current
# working directory. The bare name on the last line displays the result.
DATA_FOLDER = os.path.abspath(
    os.path.join('test_data', 'zebrafish_data')
)
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks\\test_data\\zebrafish_data'

Read transcriptomics data (identified by their gene ids)

In [6]:
# Transcriptomics input: the data table is indexed by its 'Identifier' column,
# the accompanying design table by its 'sample' column.
gene_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'gene_data_combined.csv'),
    index_col='Identifier',
)
gene_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'gene_design.csv'),
    index_col='sample',
)

Read proteomics data

In [7]:
# Proteomics input: the data table is indexed by its 'Uniprot' column, the
# accompanying design table by its 'sample' column.
protein_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_data.csv'),
    index_col='Uniprot',
)
protein_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_design.csv'),
    index_col='sample',
)

Read metabolomics data

In [8]:
# Metabolomics input: the data table is indexed by its 'Identifier' column,
# the accompanying design table by its 'sample' column.
compound_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_data_kegg.csv'),
    index_col='Identifier',
)
compound_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_design.csv'),
    index_col='sample',
)

In [9]:
# Switch the library's logging to INFO (going by the helper's name). The call
# returns a value that the notebook would otherwise echo as cell output; the
# trailing semicolon suppresses that stray output.
set_log_level_info();

1

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [10]:
# Assemble the mapping object for DANIO_RERIO, restricted to metabolic
# pathways: register each omics data table together with its design table,
# then call build() to construct the mapping.
m = (
    Mapper(DANIO_RERIO, metabolic_pathway_only=True)
    .set_gene(gene_data, gene_design)
    .set_protein(protein_data, protein_design)
    .set_compound(compound_data, compound_design)
    .build()
)

2021-02-10 10:56:33.573 | INFO     | pyMultiOmics.functions:reactome_mapping:72 - Mapping genes -> proteins
2021-02-10 10:56:34.567 | INFO     | pyMultiOmics.functions:reactome_mapping:77 - Mapping proteins -> reactions
2021-02-10 10:56:36.904 | INFO     | pyMultiOmics.functions:reactome_mapping:85 - Mapping compounds -> reactions
2021-02-10 10:56:37.821 | INFO     | pyMultiOmics.functions:reactome_mapping:91 - Mapping reactions -> pathways
2021-02-10 10:56:38.176 | INFO     | pyMultiOmics.functions:reactome_mapping:102 - Mapping reactions -> proteins
2021-02-10 10:56:40.807 | INFO     | pyMultiOmics.functions:reactome_mapping:109 - Mapping reactions -> compounds
2021-02-10 10:56:42.195 | INFO     | pyMultiOmics.functions:reactome_mapping:121 - Mapping proteins -> genes
2021-02-10 10:56:58.388 | INFO     | pyMultiOmics.mapping:_add_nodes:181 - Processing nodes: genes
2021-02-10 10:57:01.102 | INFO     | pyMultiOmics.mapping:_add_nodes:181 - Processing nodes: proteins
2021-02-10 10:57:0

## DE analysis

In [11]:
# The two groups compared in the DE analysis (case vs. control).
case, control = 'Distal', 'Proximal'

# Fetch the gene-level data and design frames from the mapping object.
data_type = GENES
data_df, design_df = m.get_dfs(data_type)

In [12]:
# t-test of case vs. control on the selected data type. Whatever run() returns
# is shown as the cell output.
analysis = TTestAnalysis(
    data_df,
    design_df,
    data_type,
    case,
    control,
)
analysis.run()

2021-02-10 10:57:10.519 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Proximal


Unnamed: 0,padj,log2FoldChange
ENSDARG00000000001,0.024460,0.715686
ENSDARG00000000002,0.514784,0.127781
ENSDARG00000000018,0.003630,-0.620668
ENSDARG00000000019,0.049351,0.349941
ENSDARG00000000068,0.396128,-0.113797
...,...,...
ENSDARG00000105304,1.000000,0.000000
ENSDARG00000105305,0.391027,0.253777
ENSDARG00000105306,0.628077,-0.350978
ENSDARG00000105307,0.170340,-0.998356


In [13]:
# Alternative to the t-test cell above (currently disabled): DESeq2Analysis is
# constructed with the same arguments. Uncomment both lines to use it instead.
# analysis = DESeq2Analysis(data_df, design_df, data_type, case, control)
# analysis.run()

In [14]:
# Alternative to the t-test cell above (currently disabled): LimmaAnalysis is
# constructed with the same arguments. Uncomment both lines to use it instead.
# analysis = LimmaAnalysis(data_df, design_df, data_type, case, control)
# analysis.run()

Write the DE analysis results back to the graph

In [None]:
# Feed the DE analysis results back into the mapping object.
# NOTE(review): the Significant(...) queries further down presumably rely on
# this update having been run first -- confirm.
m.update(analysis)

In [None]:
# Plot the mapping as a network. The plot object is deliberately not called
# `np`: that is the conventional alias for numpy, and binding it here would
# clobber numpy for the rest of the session if it is in scope under that name.
network_plot = NetworkPlot(m)
network_plot.plot()

## Try various queries

Retrieve a single node

In [29]:
# Look up a single node by its identifier; the bare `res` displays the result.
node_id = '15366'
res = (
    QueryBuilder(m)
    .add(Entity(node_id))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15366,Acetic acid,compounds,False


Retrieve multiple nodes

In [30]:
# Look up several nodes in one query by passing a list of identifiers.
node_id = [
    '15366',
    'ENSDARG00000037781',
    'F1QAA7',
]
res = (
    QueryBuilder(m)
    .add(Entity(node_id))
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15366,Acetic acid,compounds,False
ENSDARG00000037781,Acss2,genes,True
F1QAA7,F1QAA7,proteins,True


Retrieve nodes connected to a single node

In [16]:
# Start from one node and retrieve everything connected to it.
# The starting node is selected with Entity, the same query class the other
# lookup cells in this notebook use for a single identifier, rather than the
# older SingleEntity name this cell was last run with.
query_id = 'F1QAA7'
res = (
    QueryBuilder(m)
    .add(Entity(query_id))
    .add(Connected())
    .run()
)
res

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSDARG00000037781,Acss2,genes,True
15346,Coenzyme a,compounds,False
30616,Atp(4-),compounds,False
29888,Diphosphoric acid,compounds,False
16027,Adenosine 5-monophosphate,compounds,True
15351,Acetyl-coa,compounds,False
15366,Acetic acid,compounds,False
R-DRE-71735,Acetate + coa + atp => acetyl-coa + amp + pyro...,reactions,
R-DRE-71384,Ethanol oxidation,pathways,


Retrieve compounds connected to the top-10 significantly changing genes in the Distal vs. Proximal comparison

In [32]:
# Comparison and filtering parameters.
case, control = 'Distal', 'Proximal'
pval = 0.05
top = 10
sort_col = 'FC_%s_%s' % (case, control)  # column used to rank the genes

# Pipeline: all genes -> significant ones for case vs. control -> top N by
# sort_col -> observed compounds connected to them.
# NOTE(review): the last recorded run of this cell failed with
# "NameError: name 'Select' is not defined"; `Select` has to be provided by one
# of the star imports at the top of the notebook -- confirm it exists there.
res = (
    QueryBuilder(m)
    .add(Select(GENES))
    .add(Significant(case, control, th=pval))
    .add(TopN(top, sort_by=sort_col))
    .add(Connected(dest_type=COMPOUNDS, observed=True))
    .run()
)

NameError: name 'Select' is not defined

Retrieve the top-10 significantly changing genes and proteins in the Distal vs. Proximal comparison, and show a heatmap for each

In [33]:
# Comparison and filtering parameters.
case = 'Distal'
control = 'Proximal'
sort_col = 'FC_%s_%s' % (case, control)  # column used to rank the entities
pval = 0.05
top = 10

# NOTE(review): the last recorded run of this cell failed with
# "NameError: name 'Select' is not defined"; `Select` has to be provided by one
# of the star imports at the top of the notebook -- confirm it exists there.

# Top-N significant genes for case vs. control (query is built, not yet run).
qb1 = QueryBuilder(m) \
        .add(Select(GENES)) \
        .add(Significant(case, control, th=pval)) \
        .add(TopN(top, sort_by=sort_col))

# Same selection for proteins. All builders use the mapper `m` created
# earlier; no variable named `mapping` is defined in this notebook.
qb2 = QueryBuilder(m) \
        .add(Select(PROTEINS)) \
        .add(Significant(case, control, th=pval)) \
        .add(TopN(top, sort_by=sort_col))

# Combine both selections and add the heatmap step.
res = QueryBuilder(m) \
        .add(Concat(qb1, qb2)) \
        .add(HeatMap()) \
        .run()

NameError: name 'Select' is not defined