In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os, sys

In [3]:
import pylab as plt
import matplotlib

import numpy as np
import pandas as pd

In [4]:
# Add the parent directory to the import path so that the pyWebOmics package
# can be imported below when this notebook is run from its own folder.
sys.path.append('..')

from pyWebOmics.constants import DANIO_RERIO, REACTIONS, PROTEOMICS, METABOLOMICS, GENOMICS, TRANSCRIPTOMICS, PATHWAYS
from pyWebOmics.mapping import Mapper
from pyWebOmics.common import set_log_level_info, set_log_level_debug

2021-01-22 23:05:19.547 | INFO     | pyWebOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyWebOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Location of the zebrafish test data, given relative to the current working
# directory and then resolved to an absolute path.
relative_data_dir = os.path.join('test_data', 'zebrafish_data')
DATA_FOLDER = os.path.abspath(relative_data_dir)
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyWebOmics\\notebooks\\test_data\\zebrafish_data'

Read transcriptomics data (identified by their gene ids)

In [6]:
# Transcript measurements are indexed by the 'Identifier' column; the matching
# design table is indexed by 'sample'.
transcript_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'gene_data_combined.csv'),
    index_col='Identifier',
)
transcript_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'gene_design.csv'),
    index_col='sample',
)

In [7]:
# Preview the first rows of the transcript table.
transcript_data.head()

Unnamed: 0_level_0,US-1584693,US-1584700,US-1584706,US-1584712,US-1584722,US-1584724,US-1584725,US-1584732,US-1584738,US-1584744,...,US-1584753,US-1584754,US-1584758,US-1584765,FC_distal_vs_proximal,padj_distal_vs_proximal,FC_distal_vs_middle,padj_distal_vs_middle,FC_middle_vs_proximal,padj_middle_vs_proximal
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDARG00000000001,51,40,69,78,89,47,88,86,110,55,...,58,104,43,72,0.869331,8e-06,0.748943,4.38e-05,0.114026,0.630834
ENSDARG00000000002,283,129,164,269,211,171,146,256,283,150,...,142,272,260,256,0.287823,0.031298,1.005337,1.31e-13,-0.724987,1e-06
ENSDARG00000000018,545,503,547,387,332,559,623,499,436,488,...,462,287,495,299,-0.437271,0.000389,-0.40477,0.00068684,-0.040193,0.760679
ENSDARG00000000019,437,469,538,557,550,404,544,443,623,502,...,470,460,329,480,0.521291,1.5e-05,0.271082,0.01936266,0.242435,0.041606
ENSDARG00000000068,266,249,247,236,195,247,283,259,299,232,...,231,236,274,241,0.06482,0.595522,0.142243,0.2579239,-0.084764,0.528336


In [8]:
# Show the design table: one row per sample with its 'group' label.
transcript_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
US-1584752,Proximal
US-1584732,Proximal
US-1584724,Proximal
US-1584693,Proximal
US-1584758,Proximal
US-1584725,Middle
US-1584706,Middle
US-1584700,Middle
US-1584744,Middle
US-1584753,Middle


Read proteomics data

In [9]:
# Protein measurements are indexed by the 'Uniprot' column; the matching
# design table is indexed by 'sample'.
protein_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_data.csv'),
    index_col='Uniprot',
)
protein_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_design.csv'),
    index_col='sample',
)

In [10]:
# Preview the first rows of the protein table.
protein_data.head()

Unnamed: 0_level_0,Distal#3_01,Distal#3_02,Distal#3_03,Distal#3_04,Middle#3_01,Middle#3_02,Middle#3_03,Middle#3_04,Proximal#3_01,Proximal#3_02,Proximal#3_03,Proximal#3_04
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A0A0A0MPL4,995526.4,4946580.0,1377194.0,2208140.0,2907807.0,4231976.0,1560849.0,2852904.0,1781795.086,2668135.0,3079148.0,2840473.0
A0A0B4J1A5,2982519.0,8816655.0,7668431.0,4632309.0,7672153.0,7776017.0,6633781.0,8242783.0,5475654.544,5703832.0,8294364.0,13348740.0
A0A0B4J1A7,15530490.0,1037155.0,18561370.0,17678590.0,13757360.0,17479980.0,15175070.0,23944650.0,3157387.719,17947750.0,23004300.0,20638000.0
A0AUQ3,2012699.0,3088982.0,2455865.0,944833.1,2866780.0,2661669.0,2100352.0,2133662.0,1738244.989,2629396.0,2900560.0,2416018.0
A0AUR9,3640487.0,25884770.0,34159890.0,2868569.0,1971142.0,2472776.0,5615177.0,1303356.0,3263299.566,6866769.0,2465929.0,4515643.0


In [11]:
# Show the design table: one row per sample with its 'group' label.
protein_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
Distal#3_01,Distal
Distal#3_02,Distal
Distal#3_03,Distal
Distal#3_04,Distal
Middle#3_01,Middle
Middle#3_02,Middle
Middle#3_03,Middle
Middle#3_04,Middle
Proximal#3_01,Proximal
Proximal#3_02,Proximal


Read metabolomics data

In [12]:
# Compound measurements are indexed by the 'Identifier' column; the matching
# design table is indexed by 'sample'.
compound_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_data_kegg.csv'),
    index_col='Identifier',
)
compound_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_design.csv'),
    index_col='sample',
)

In [13]:
# Preview the first rows of the compound table.
compound_data.head()

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
C00565,75170.0,57052,39170.0,84057,38608.0,64126.0,50214.0,75680,165178,121856,77061,98015.0,113765,96098,84198,117644,169459,169669
C00037,64511.0,33658,23565.0,52102,49508.0,37498.0,30417.0,55728,88519,103871,45974,73101.0,72725,66008,54220,95341,110192,291598
C01104,5787534.0,4351239,4401036.0,8187282,8431125.0,5082056.0,5138937.0,7341351,7837293,9256269,9934066,10243285.0,7344406,5524811,4809250,9279874,9047339,9211255
C00134,3430897.0,1877785,1225710.0,2326620,2421267.0,2595529.0,2003627.0,2120053,2269318,3220850,4596854,3155377.0,3760854,2658833,2488025,2506550,4000703,3292566
C00213,112845.0,129977,122292.0,63219,50113.0,100343.0,156651.0,176682,379322,160906,56802,107161.0,235982,181200,142994,116132,94589,167280


In [14]:
# Show the design table: one row per sample with its 'group' label.
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
distal_M1,Distal
distal_M2,Distal
distal_M3,Distal
distal_F1,Distal
distal_F2,Distal
distal_F3,Distal
middle_M1,Middle
middle_M2,Middle
middle_M3,Middle
middle_F1,Middle


In [15]:
# Switch pyWebOmics logging to INFO level. The trailing semicolon stops the
# call's return value from being echoed as a stray cell output.
set_log_level_info();

1

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [16]:
# Build the mapper for DANIO_RERIO from the three (data, design) table pairs
# loaded above. Parentheses are used instead of backslash continuations so
# that the chain can be edited line by line.
m = (
    Mapper(DANIO_RERIO)
    .set_transcript(transcript_data, transcript_design)
    .set_protein(protein_data, protein_design)
    .set_compound(compound_data, compound_design)
    .build()
)

2021-01-22 23:05:23.223 | INFO     | pyWebOmics.mapping:build:85 - Created a multi-omics network with 39154 nodes and 13154 edges
2021-01-22 23:05:23.389 | INFO     | pyWebOmics.mapping:build:86 - node_counts = {'Genomics': 0, 'Transcriptomics': 31953, 'Proteomics': 3061, 'Metabolomics': 128, 'Reactions': 4012, 'Pathways': 0}


In [17]:
# Display the mapper object (shows its default repr).
m

<pyWebOmics.mapping.Mapper at 0x1c2016da240>

## Querying the mapping object

Below are some example queries we can perform with the mapping object.

##### Find out the number of entities mapped to each reaction (showing the first 10 reactions that have at least one transcript, protein and compound)

In [46]:
# Walk over the reaction nodes and print the first 10 reactions that have at
# least one transcript, one protein and one compound connected to them.
reactions = m.get_nodes(types=REACTIONS)
count = 0
for reaction_id, reaction_data in reactions:
    reaction_name = reaction_data['name']
    transcripts = m.get_connected(reaction_id, TRANSCRIPTOMICS)['results']
    proteins = m.get_connected(reaction_id, PROTEOMICS)['results']
    compounds = m.get_connected(reaction_id, METABOLOMICS)['results']

    # Guard clause: skip reactions that lack any of the three entity types.
    sizes = (len(transcripts), len(proteins), len(compounds))
    if min(sizes) == 0:
        continue

    print('%s\t%s (transcripts=%d proteins=%d compounds=%d)' % ((reaction_id, reaction_name) + sizes))
    count += 1
    if count >= 10:
        break

R-DRE-70967	isocitrate + NAD+ => alpha-ketoglutarate + CO2 + NADH + H+ [IDH3] (transcripts=2 proteins=2 compounds=2)
R-DRE-5688289	SIRT3 deacetylates ACCS2, GLUD, IDH2, SOD2 (transcripts=4 proteins=4 compounds=1)
R-DRE-71735	acetate + CoA + ATP => acetyl-CoA + AMP + pyrophosphate [cytosolic] (transcripts=1 proteins=1 compounds=1)
R-DRE-8870469	RGGT geranylgeranylates RAB proteins (transcripts=13 proteins=13 compounds=1)
R-DRE-8870466	RGGT:CHM binds RABs (transcripts=13 proteins=13 compounds=1)
R-DRE-5617816	RAB3IP stimulates nucleotide exchange on RAB8A  (transcripts=21 proteins=21 compounds=1)
R-DRE-5623521	RAB3IP stimulates nucleotide exchange on RAB8A (transcripts=2 proteins=2 compounds=1)
R-DRE-5623519	RAB3IP and RAB8A bind to the ciliary targeting complex (transcripts=2 proteins=2 compounds=1)
R-DRE-5623513	ASAP1 stimulates GTPase activity of ARF4 (transcripts=1 proteins=1 compounds=1)
R-DRE-390597	Release Of ADP From Myosin (transcripts=3 proteins=3 compounds=1)


##### What are the nodes directly connected to reaction 'R-DRE-71735' in the graph?

In [47]:
# List the ids of the nodes adjacent to this reaction in the mapping graph.
query_id = 'R-DRE-71735'
m.get_neighbours(query_id)

['F1QAA7', 'C00020']

##### Find transcripts connected to protein 'F1QAA7'

In [53]:
# get_connected returns a dict: 'results' holds the ids of the requested
# type, 'path' holds (node id, integer) tuples.
# NOTE(review): the integer looks like a node-type code - confirm against
# pyWebOmics.constants.
query_id = 'F1QAA7'
m.get_connected(query_id, TRANSCRIPTOMICS)

{'results': ['ENSDARG00000037781'],
 'path': [('F1QAA7', 2),
  ('ENSDARG00000037781', 1),
  ('R-DRE-5688289', 4),
  ('R-DRE-71735', 4),
  ('C00153', 3),
  ('C00020', 3)]}

##### Find compounds connected to protein 'F1QAA7'

In [55]:
# Same protein as the previous query, now asking for the connected compounds
# (returned under the 'results' key of the dict).
query_id = 'F1QAA7'
m.get_connected(query_id, METABOLOMICS)

{'results': ['C00153', 'C00020'],
 'path': [('F1QAA7', 2),
  ('ENSDARG00000037781', 1),
  ('R-DRE-5688289', 4),
  ('R-DRE-71735', 4),
  ('C00153', 3),
  ('C00020', 3)]}

##### Find transcripts connected to compound 'C00153'

In [56]:
# Start from a compound id and ask for the connected transcripts; the ids
# are returned under the 'results' key of the dict.
query_id = 'C00153'
m.get_connected(query_id, TRANSCRIPTOMICS)

{'results': ['ENSDARG00000019529',
  'ENSDARG00000071017',
  'ENSDARG00000030598',
  'ENSDARG00000036096',
  'ENSDARG00000006389',
  'ENSDARG00000058984',
  'ENSDARG00000032725',
  'ENSDARG00000020001',
  'ENSDARG00000038030',
  'ENSDARG00000020504',
  'ENSDARG00000033009',
  'ENSDARG00000068436',
  'ENSDARG00000045248',
  'ENSDARG00000068434',
  'ENSDARG00000086591',
  'ENSDARG00000068820',
  'ENSDARG00000068941',
  'ENSDARG00000101720',
  'ENSDARG00000094154',
  'ENSDARG00000098970',
  'ENSDARG00000103819',
  'ENSDARG00000101074',
  'ENSDARG00000008816',
  'ENSDARG00000044562',
  'ENSDARG00000037781',
  'ENSDARG00000042644'],
 'path': [('C00153', 3),
  ('R-DRE-5685953', 4),
  ('R-DRE-8948800', 4),
  ('R-DRE-9667952', 4),
  ('R-DRE-3640858', 4),
  ('R-DRE-5687653', 4),
  ('R-DRE-8940074', 4),
  ('R-DRE-197250', 4),
  ('R-DRE-8870346', 4),
  ('R-DRE-8938073', 4),
  ('R-DRE-5359451', 4),
  ('R-DRE-8938076', 4),
  ('R-DRE-2187325', 4),
  ('R-DRE-5696655', 4),
  ('R-DRE-3371467', 4),
  ('

##### Find compounds connected to transcript 'ENSDARG00000042644'

In [59]:
# Reverse direction of the previous query: start from a transcript id and ask
# for the connected compounds.
query_id = 'ENSDARG00000042644'
m.get_connected(query_id, METABOLOMICS)

{'results': ['C00153'],
 'path': [('ENSDARG00000042644', 1),
  ('Q6P980', 2),
  ('R-DRE-5688289', 4),
  ('R-DRE-3299680', 4),
  ('C00153', 3)]}