In [1]:
# Load IPython's autoreload extension and reload all modules before each cell
# runs, so edits to the package source are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os, sys

In [3]:
import pandas as pd
from loguru import logger

In [4]:
# Make the parent directory importable so the pyMultiOmics package can be
# imported when this notebook is run from its own folder.
sys.path.append('..')

# NOTE(review): the wildcard imports presumably supply names used further down
# (e.g. DANIO_RERIO, GENES, TTestAnalysis) -- confirm, and prefer explicit imports.
from pyMultiOmics.constants import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info
from pyMultiOmics.analysis import *

2021-02-08 16:00:19.158 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Location of the zebrafish test data, relative to the working directory,
# resolved to an absolute path.
relative_data_folder = os.path.join('test_data', 'zebrafish_data')
DATA_FOLDER = os.path.abspath(relative_data_folder)
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks\\test_data\\zebrafish_data'

Read the transcriptomics data (rows are identified by gene ID)

In [6]:
# Gene measurements, indexed by the 'Identifier' column of the CSV.
gene_data_path = os.path.join(DATA_FOLDER, 'gene_data_combined.csv')
gene_data = pd.read_csv(gene_data_path, index_col='Identifier')

# Matching design table, indexed by the 'sample' column.
gene_design_path = os.path.join(DATA_FOLDER, 'gene_design.csv')
gene_design = pd.read_csv(gene_design_path, index_col='sample')

Read proteomics data

In [7]:
# Protein measurements, indexed by the 'Uniprot' column of the CSV.
protein_data_path = os.path.join(DATA_FOLDER, 'protein_data.csv')
protein_data = pd.read_csv(protein_data_path, index_col='Uniprot')

# Matching design table, indexed by the 'sample' column.
protein_design_path = os.path.join(DATA_FOLDER, 'protein_design.csv')
protein_design = pd.read_csv(protein_design_path, index_col='sample')

Read metabolomics data

In [8]:
# Compound measurements, indexed by the 'Identifier' column of the CSV.
compound_data_path = os.path.join(DATA_FOLDER, 'compound_data_kegg.csv')
compound_data = pd.read_csv(compound_data_path, index_col='Identifier')

# Matching design table, indexed by the 'sample' column.
compound_design_path = os.path.join(DATA_FOLDER, 'compound_design.csv')
compound_design = pd.read_csv(compound_design_path, index_col='sample')

In [9]:
# Presumably switches the package's logging to INFO level (helper imported from
# pyMultiOmics.common) -- TODO confirm. Its return value is echoed as the cell output.
set_log_level_info()

1

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [10]:
# Build the mapper: register the data/design pair of each omics layer,
# then call build() to run the mapping.
m = (
    Mapper(DANIO_RERIO, metabolic_pathway_only=True)
    .set_gene(gene_data, gene_design)
    .set_protein(protein_data, protein_design)
    .set_compound(compound_data, compound_design)
    .build()
)

2021-02-08 16:00:22.781 | INFO     | pyMultiOmics.functions:reactome_mapping:72 - Mapping genes -> proteins
2021-02-08 16:00:23.581 | INFO     | pyMultiOmics.functions:reactome_mapping:77 - Mapping proteins -> reactions
2021-02-08 16:00:25.608 | INFO     | pyMultiOmics.functions:reactome_mapping:85 - Mapping compounds -> reactions
2021-02-08 16:00:26.348 | INFO     | pyMultiOmics.functions:reactome_mapping:91 - Mapping reactions -> pathways
2021-02-08 16:00:26.645 | INFO     | pyMultiOmics.functions:reactome_mapping:102 - Mapping reactions -> proteins
2021-02-08 16:00:28.902 | INFO     | pyMultiOmics.functions:reactome_mapping:109 - Mapping reactions -> compounds
2021-02-08 16:00:30.019 | INFO     | pyMultiOmics.functions:reactome_mapping:121 - Mapping proteins -> genes
2021-02-08 16:00:42.962 | INFO     | pyMultiOmics.mapping:_add_nodes:220 - Processing nodes: genes
2021-02-08 16:00:45.249 | INFO     | pyMultiOmics.mapping:_add_nodes:220 - Processing nodes: proteins
2021-02-08 16:00:4

## Differential expression (DE) analysis

In [11]:
# The two sample groups to compare.
case, control = 'Distal', 'Proximal'

# Fetch the data and design frames for the gene layer from the mapper.
data_type = GENES
data_df, design_df = m.get_dfs(data_type)

In [12]:
# Configure a t-test comparing `case` against `control` on the selected layer.
analysis = TTestAnalysis(
    data_df,
    design_df,
    data_type,
    case,
    control,
)
# The value returned by run() is displayed as the cell output.
analysis.run()

2021-02-08 16:01:03.407 | INFO     | pyMultiOmics.pipelines:run_ttest:120 - t-test case is Distal, control is Proximal


Unnamed: 0,padj,log2FoldChange
ENSDARG00000000001,0.024460,0.715686
ENSDARG00000000002,0.514784,0.127781
ENSDARG00000000018,0.003630,-0.620668
ENSDARG00000000019,0.049351,0.349941
ENSDARG00000000068,0.396128,-0.113797
...,...,...
ENSDARG00000105304,1.000000,0.000000
ENSDARG00000105305,0.391027,0.253777
ENSDARG00000105306,0.628077,-0.350978
ENSDARG00000105307,0.170340,-0.998356


In [None]:
# Optional alternative to the t-test above: uncomment to use DESeq2Analysis instead.
# analysis = DESeq2Analysis(data_df, design_df, data_type, case, control)
# analysis.run()

In [None]:
# Optional alternative to the t-test above: uncomment to use LimmaAnalysis instead.
# analysis = LimmaAnalysis(data_df, design_df, data_type, case, control)
# analysis.run()

Write the analysis results back to the mapping graph

In [None]:
# Pass the finished analysis object to the mapper.
# NOTE(review): this cell has no execution count while later cells do --
# confirm it runs as part of a clean Restart & Run All.
m.update(analysis)

In [13]:
# Display the gene-level data frame obtained from m.get_dfs(data_type).
data_df

Unnamed: 0_level_0,US-1584693,US-1584700,US-1584706,US-1584712,US-1584722,US-1584724,US-1584725,US-1584732,US-1584738,US-1584744,...,US-1584753,US-1584754,US-1584758,US-1584765,FC_distal_vs_proximal,padj_distal_vs_proximal,FC_distal_vs_middle,padj_distal_vs_middle,FC_middle_vs_proximal,padj_middle_vs_proximal
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDARG00000000001,51,40,69,78,89,47,88,86,110,55,...,58,104,43,72,0.869331,0.000008,0.748943,4.380000e-05,0.114026,0.630834
ENSDARG00000000002,283,129,164,269,211,171,146,256,283,150,...,142,272,260,256,0.287823,0.031298,1.005337,1.310000e-13,-0.724987,0.000001
ENSDARG00000000018,545,503,547,387,332,559,623,499,436,488,...,462,287,495,299,-0.437271,0.000389,-0.404770,6.868400e-04,-0.040193,0.760679
ENSDARG00000000019,437,469,538,557,550,404,544,443,623,502,...,470,460,329,480,0.521291,0.000015,0.271082,1.936266e-02,0.242435,0.041606
ENSDARG00000000068,266,249,247,236,195,247,283,259,299,232,...,231,236,274,241,0.064820,0.595522,0.142243,2.579239e-01,-0.084764,0.528336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000105304,1,0,2,1,0,0,1,0,1,0,...,0,0,0,0,,,,,,
ENSDARG00000105305,14,20,15,21,21,20,30,25,34,27,...,20,24,24,18,0.416577,0.166487,0.261947,3.745709e-01,0.148963,0.635113
ENSDARG00000105306,6,1,1,2,2,3,1,3,1,6,...,1,4,1,3,-0.204962,0.863278,0.466864,7.126342e-01,-0.679447,0.472924
ENSDARG00000105307,7,8,4,6,0,3,4,3,1,2,...,4,1,4,3,-0.845544,0.330570,-0.836304,3.287140e-01,-0.017126,1.000000


In [16]:
# Look up the entities connected to a single query entity in the mapping.
# NOTE(review): '15366' looks like a compound identifier (possibly ChEBI) --
# confirm and document which entity it refers to.
query_id = '15366'
m.get_connected(query_id)

Unnamed: 0_level_0,display_name,data_type,observed
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSDARG00000087120,Slc5a8,genes,True
ENSDARG00000014138,Acot8,genes,True
ENSDARG00000037781,Acss2,genes,True
ENSDARG00000005154,Aspa,genes,True
ENSDARG00000093003,Acy3.1,genes,True
ENSDARG00000032199,Gpc3,genes,True
ENSDARG00000086269,Ndst2a,genes,True
ENSDARG00000103647,Gpc5a,genes,True
ENSDARG00000041776,Ndst3,genes,True
ENSDARG00000103606,Ndst1b,genes,True
