In [1]:
# Load the autoreload extension and reload all modules before each cell
# executes, so edits to the project source are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os, sys

In [3]:
import pylab as plt
import matplotlib

import numpy as np
import pandas as pd

In [4]:
# Put the parent directory on the import path so the pyWebOmics imports
# below resolve. Guarded so that re-running this cell does not keep
# appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from pyWebOmics.constants import HOMO_SAPIENS, REACTIONS, PROTEOMICS, METABOLOMICS, GENOMICS, TRANSCRIPTOMICS, PATHWAYS
from pyWebOmics.mapping import Mapper
from pyWebOmics.common import set_log_level_info, set_log_level_debug

2021-01-21 17:52:57.365 | INFO     | pyWebOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyWebOmics

## Load the processed COVID-19 data from [1]

[1] [Shen, Bo, et al. "Proteomic and metabolomic characterization of COVID-19 patient sera." Cell 182.1 (2020): 59-72.](https://www.sciencedirect.com/science/article/pii/S0092867420306279)

In [5]:
# Absolute location of the COVID-19 example data, resolved against the
# current working directory.
DATA_FOLDER = os.path.abspath(
    os.path.join('test_data', 'covid19_data')
)
DATA_FOLDER  # echo the resolved path as the cell output

'C:\\Users\\joewa\\Work\\git\\pyWebOmics\\notebooks\\test_data\\covid19_data'

Read proteomics data

In [6]:
# Protein measurements, indexed by the 'Identifier' column.
protein_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_data.csv'),
    index_col='Identifier',
)
# Experimental design for the protein samples, indexed by the 'sample' column.
protein_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'protein_design.csv'),
    index_col='sample',
)

In [7]:
# Preview the first rows of the protein table (one column per sample).
protein_data.head()

Unnamed: 0_level_0,h_F1_131N,h_F1_131C,h_F1_132C,h_F2_131N,h_F2_131C,h_F2_132C,h_F3_131N,h_F3_131C,h_F3_132C,h_F4_131N,...,s_F3_128N,s_F3_128C,s_F3_129C,s_F4_128N,s_F4_128C,s_F5_128N,s_F5_128C,s_F6_128N,s_F6_128C,s_F6_133N
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P04114,0.75,0.853,0.822,1.191,1.175,1.078,0.693,0.947,0.931,1.057,...,1.044,1.305,1.657,1.323,1.624,1.17,0.981,0.791,1.029,1.195
P01024,0.782,1.057,0.994,0.864,0.917,0.79,0.823,1.152,0.816,0.92,...,1.1,0.986,1.114,1.21,1.289,1.104,1.111,1.007,1.159,0.979
P02768,1.183,1.101,1.045,1.086,1.041,1.187,1.234,1.079,1.011,1.099,...,0.786,0.706,0.947,0.831,0.717,0.795,0.776,0.938,0.903,0.743
P01023,1.066,1.278,0.959,0.811,0.789,0.931,0.971,0.769,1.011,0.866,...,0.817,0.728,0.861,0.798,0.751,0.917,0.809,0.78,1.195,0.706
P02751,1.085,0.947,0.993,1.343,1.13,0.778,0.731,1.084,1.107,0.909,...,0.566,0.854,1.109,0.63,0.85,0.661,0.848,0.829,0.76,0.811


In [8]:
# Show the protein design table: one row per sample with its 'group' label.
protein_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
h_F1_131N,healthy
h_F1_131C,healthy
h_F1_132C,healthy
h_F2_131N,healthy
h_F2_131C,healthy
...,...
s_F5_128N,severe
s_F5_128C,severe
s_F6_128N,severe
s_F6_128C,severe


Read metabolomics data

In [9]:
# Compound measurements, indexed by the 'Identifier' column.
compound_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_data.csv'),
    index_col='Identifier',
)
# Experimental design for the compound samples, indexed by the 'sample' column.
compound_design = pd.read_csv(
    os.path.join(DATA_FOLDER, 'compound_design.csv'),
    index_col='sample',
)

In [10]:
# Preview the first rows of the compound table (one column per sample;
# missing measurements appear as NaN).
compound_data.head()

Unnamed: 0_level_0,h_jkdz1,h_jkdz2,h_jkdz3,h_jkdz4,h_jkdz5,h_jkdz6,h_jkdz7,h_jkdz8,h_jkdz9,h_jkdz10,...,s_ZX12,s_ZX13,s_ZX14,s_ZX15,s_ZX16,s_ZX17,s_ZX18,s_ZX19,s_ZX20,s_ZX21
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C21482,19413052.0,6381812.0,9748316.0,5326872.0,19980720.0,3580375.0,8256121.0,8079382.0,15596590.0,15203630.0,...,1904349.0,3226016.0,737814.7,2817698.0,3329101.0,3206752.75,1466174.0,2779301.0,2117668.0,2184310.0
C18218,2711915.25,2056393.0,1445594.0,2038765.0,2536996.0,2638198.0,2285757.0,1973140.0,2015425.0,2290842.0,...,1409720.0,1413307.0,3218834.0,1602131.0,1317878.0,2930312.75,1168094.0,2946776.0,1417311.0,1474166.0
C05127,87727.25,,92387.06,,159787.9,,,,90551.3,121411.4,...,,,,,,,,138278.8,,
C01152,58832828.0,58439340.0,55521330.0,45162140.0,54789520.0,39412590.0,29878760.0,67517260.0,46660310.0,91185240.0,...,28813140.0,31643580.0,25387670.0,33076040.0,39156980.0,24400592.0,25933750.0,64138680.0,40205880.0,49044880.0
C02918,,181554.9,224039.2,160939.7,320619.4,717655.7,326818.2,513581.0,273458.2,,...,333724.5,,434715.2,35321.18,,655827.25,835970.6,4034381.0,283935.8,80621.6


In [11]:
# Show the compound design table: one row per sample with its 'group' label.
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
h_jkdz1,healthy
h_jkdz2,healthy
h_jkdz3,healthy
h_jkdz4,healthy
h_jkdz5,healthy
...,...
s_ZX17,severe
s_ZX18,severe
s_ZX19,severe
s_ZX20,severe


In [12]:
# Presumably switches pyWebOmics logging to INFO level so the Mapper build
# messages are emitted — TODO confirm against pyWebOmics.common.
set_log_level_info()

1

## Create a mapping object

The mapping object uses Reactome to map the different biological entities in the data:
- Transcripts (or genes) are connected to the proteins they encode
- Proteins and compounds are connected to reactions they're involved in
- Reactions are connected to pathways

In [13]:
# Build the mapping object for human data from the protein and compound
# tables together with their design tables.
m = (
    Mapper(HOMO_SAPIENS)
    .set_protein(protein_data, protein_design)
    .set_compound(compound_data, compound_design)
    .build()
)

2021-01-21 17:53:00.414 | INFO     | pyWebOmics.mapping:build:78 - Created a multi-omics network with 6445 nodes and 13840 edges
2021-01-21 17:53:00.424 | INFO     | pyWebOmics.mapping:build:79 - node_counts = {'Genomics': 0, 'Transcriptomics': 0, 'Proteomics': 594, 'Metabolomics': 81, 'Reactions': 5770, 'Pathways': 0}


In [14]:
# Display the Mapper object's repr as the cell output.
m

<pyWebOmics.mapping.Mapper at 0x1c801de72b0>

## Query mapping object

Below are some example queries we can perform with the mapping object.

##### Find out the number of proteins and compounds mapped to each reaction (showing the first 10 reactions that have both)

In [15]:
# Walk over the reaction nodes and report the first ten that are connected
# to at least one protein AND at least one compound.
reactions = m.get_nodes(types=REACTIONS)
count = 0
for reaction_id, reaction_data in reactions:
    reaction_name = reaction_data['name']
    proteins = m.get_connected(reaction_id, REACTIONS, PROTEOMICS)
    compounds = m.get_connected(reaction_id, REACTIONS, METABOLOMICS)

    # Skip reactions that lack either kind of connected entity.
    if len(proteins) == 0 or len(compounds) == 0:
        continue

    row = (reaction_id, reaction_name, len(proteins), len(compounds))
    print('%s\t%s (num_proteins=%d num_compounds=%d)' % row)

    # Stop once ten matching reactions have been printed.
    count += 1
    if count >= 10:
        break

R-HSA-8952289	FAM20C phosphorylates FAM20C substrates (num_proteins=50 num_compounds=1)
R-HSA-749448	Liganded Gq-activating GPCRs bind inactive heterotrimeric Gq (num_proteins=6 num_compounds=9)
R-HSA-749452	The Ligand:GPCR:Gq complex dissociates (num_proteins=6 num_compounds=8)
R-HSA-379048	Liganded Gq/11-activating GPCRs act as GEFs for Gq/11 (num_proteins=6 num_compounds=9)
R-HSA-749454	The Ligand:GPCR:Gi complex dissociates (num_proteins=10 num_compounds=5)
R-HSA-749456	Liganded Gi-activating GPCRs bind inactive heterotrimeric G-protein Gi (num_proteins=10 num_compounds=6)
R-HSA-380073	Liganded Gi-activating GPCR acts as a GEF for Gi (num_proteins=10 num_compounds=6)
R-HSA-9606162	Phosphorylated BTK phosphorylates PLCG2 (num_proteins=45 num_compounds=1)
R-HSA-9606159	BTK autophosphorylates (num_proteins=45 num_compounds=1)
R-HSA-9606163	p-SYK and LYN phosphorylate BTK (num_proteins=45 num_compounds=1)


##### List all entities connected to reaction R-HSA-749448

In [16]:
# Reaction to inspect; get_neighbours returns the IDs of the nodes linked to
# it (both protein and compound identifiers appear in the output).
query_id = 'R-HSA-749448'
m.get_neighbours(query_id)

['P01042',
 'P0DJI8',
 'P00734',
 'P01019',
 'Q9HAV0',
 'P05067',
 'C00002',
 'C00035',
 'C00025',
 'C00015',
 'C02165',
 'C00780',
 'C00047',
 'C00062',
 'C00077']

##### Query the connections between proteins and compounds (through their shared reactions)

In [17]:
# Starting from a protein ID, look up the compounds connected to it.
query_id = 'P01042'
m.get_connected(query_id, PROTEOMICS, METABOLOMICS)

['C00035',
 'C00015',
 'C00042',
 'C00002',
 'C00025',
 'C02165',
 'C00186',
 'C00047',
 'C00062',
 'C00077',
 'C00029',
 'C00780']

In [18]:
# The reverse direction: starting from a compound ID, look up the proteins
# connected to it.
query_id = 'C00077'
m.get_connected(query_id, METABOLOMICS, PROTEOMICS)

['P01019', 'P0DJI8', 'P01042', 'Q9HAV0', 'P00734', 'P05067']