# Find Interesting Nodes

- Parse graph
- Collapse proteins/RNA to genes
- Remove association relations and pathologies
- Get the 10 genes with the lowest degrees in selected subgraphs
- Run INDRA machine 
- Deduplicate edges
- Shuffle
- Generate Excel sheets

In [1]:
import os
import sys
import itertools as itt
from operator import itemgetter
import logging
from collections import Counter

import matplotlib.pyplot as plt
import pybel
import pybel.constants as pbc 
from pybel_tools.io import from_pickle
from pybel_tools.summary import print_summary
from hbp.curation_utils import get_milestones
from hbp.indra_utils import get_write_no_duplicates
from hbp.find_interesting_nodes import FindInterestingNodes
import seaborn as sns

In [2]:
# Location of the curated BEL document, relative to the notebook's working directory.
graph_path = os.path.normpath(os.path.join(os.pardir, 'curation', 'bel', 'alzheimers.bel'))
# Pickled form of the graph, stored next to the BEL document.
graph_pickle_path = f'{graph_path}.gpickle'

assert os.path.exists(graph_path), f'{graph_path} does not exist'
# The pickle is the file that actually gets loaded, so fail early with a clear
# message if it is absent rather than later inside the loader.
assert os.path.exists(graph_pickle_path), f'{graph_pickle_path} does not exist'

In [3]:
# Display the installed PyBEL version so the results can be tied to a specific release.
pybel.utils.get_version()

'0.11.11-dev'

In [4]:
# Load the pickled graph via `graph_pickle_path` (defined together with
# `graph_path`) instead of rebuilding the same path string here.
graph = pybel.from_pickle(graph_pickle_path)

# Show the graph's string form followed by its summary statistics.
print(graph)
print_summary(graph)

Alzheimer's Disease Knowledge Assembly v5.0.5
Nodes: 2359
Edges: 6307
Citations: 819
Authors: 0
Network density: 0.0011338382122353854
Components: 60
Average degree: 2.6735905044510386


In [5]:
# Directory passed to FindInterestingNodes as `path`, relative to the notebook's
# working directory.
curation_path = os.path.normpath(os.path.join(os.pardir, 'curation', 'indra'))
# Give the assertion a message, matching the check on `graph_path`.
assert os.path.exists(curation_path), f'{curation_path} does not exist'

In [6]:
# Wrap the graph and the INDRA curation directory in the helper; judging by the
# logged output, construction also triggers preprocessing of the graph.
fin = FindInterestingNodes(graph=graph, path=curation_path)

INFO: [2018-07-23 15:20:59] indra/hbp.find_interesting_nodes - preprocessing Alzheimer's Disease Knowledge Assembly v5.0.5


## Get genes from target subgraphs

In [7]:
# Step 1: get a list of target NeuroMMSig subgraphs
# NOTE(review): the recorded output lists 'Non-amyloidogenic subgraph' twice;
# check whether the source of this list should be de-duplicated.

target_subgraphs = fin.get_target_subgraphs()
target_subgraphs

['Tau protein subgraph',
 'GSK3 subgraph',
 'DKK1 subgraph',
 'Inflammatory response subgraph',
 'Apoptosis signaling subgraph',
 'Acetylcholine signaling subgraph',
 'Myeloperoxidase subgraph',
 'Free radical formation subgraph',
 'Non-amyloidogenic subgraph',
 'Amyloidogenic subgraph',
 'Insulin signal transduction',
 'GABA subgraph',
 'Neurotransmitter release subgraph',
 'Hydrogen peroxide subgraph',
 'Nitric oxide subgraph',
 'Reactive oxygen species subgraph',
 'Non-amyloidogenic subgraph']

Filter the graph based on the edge predicate.

In [8]:
# Fetch the filtered graph from the helper and summarize it, for comparison
# with the summary of the full graph printed earlier.
filtered_graph = fin.get_filtered_graph()

print_summary(filtered_graph)

Nodes: 1100
Edges: 2552
Citations: 673
Authors: 0
Network density: 0.002111010009099181
Components: 38
Average degree: 2.32


Get all genes from the filtered graph

In [9]:
# Collect the target genes from the helper and report how many there are.
target_genes = fin.get_target_genes()
target_gene_count = len(target_genes)

print(f'There are {target_gene_count} target genes')

There are 394 target genes


## Preprocessing

The full graph must first be filtered to remove junk.

Get the degrees of the target genes

In [10]:
# Per the recorded output, each entry is a (node tuple, degree) pair.
degrees = fin.get_kept_node_degrees()
# Peek at the first five entries only.
degrees[:5]

[(('Gene', 'HGNC', 'APP', ('hgvs', 'c.275341G>C')), 2),
 (('Gene', 'HGNC', 'APP'), 280),
 (('Gene', 'HGNC', 'APP', ('hgvs', 'c.717G>C')), 2),
 (('Gene', 'HGNC', 'CD44'), 0),
 (('Gene', 'HGNC', 'NPHP1'), 0)]

In [11]:
# NOTE(review): in the recorded outputs the first genes listed here (CD44, NPHP1)
# have degree 0 in `degrees` — presumably "missing" means degree zero; confirm
# against hbp.find_interesting_nodes.
missing_genes = fin.get_missing_genes()
missing_genes

[('Gene', 'HGNC', 'CD44'),
 ('Gene', 'HGNC', 'NPHP1'),
 ('Gene', 'HGNC', 'CADPS2'),
 ('Gene', 'HGNC', 'FRMD4A'),
 ('Gene', 'HGNC', 'ANXA1'),
 ('Gene', 'HGNC', 'STIM2'),
 ('Gene', 'HGNC', 'ROBO2'),
 ('Gene', 'HGNC', 'BIN1'),
 ('Gene', 'HGNC', 'MIR124-1'),
 ('Gene', 'HGNC', 'MIR590'),
 ('Gene', 'HGNC', 'TRAF6'),
 ('Gene', 'HGNC', 'MIR16-1'),
 ('Gene', 'HGNC', 'MIR125B1'),
 ('Gene', 'HGNC', 'MIR128-1'),
 ('Gene', 'HGNC', 'MIR155'),
 ('Gene', 'HGNC', 'CD274'),
 ('Gene', 'HGNC', 'DEFB1'),
 ('Gene', 'HGNC', 'ARG1'),
 ('Gene', 'HGNC', 'MRC1'),
 ('Gene', 'HGNC', 'CHI3L2'),
 ('Gene', 'HGNC', 'IL4R'),
 ('Gene', 'HGNC', 'NUMB'),
 ('Gene', 'HGNC', 'HDLBP'),
 ('Gene', 'HGNC', 'SLC39A1'),
 ('Gene', 'HGNC', 'SOD2'),
 ('Gene', 'HGNC', 'LAMTOR1'),
 ('Gene', 'HGNC', 'AGO2'),
 ('Gene', 'HGNC', 'VIM'),
 ('Gene', 'HGNC', 'TPM1'),
 ('Gene', 'HGNC', 'S100P'),
 ('Gene', 'HGNC', 'INA'),
 ('Gene', 'HGNC', 'SNCB'),
 ('Gene', 'HGNC', 'CTTN'),
 ('Gene', 'HGNC', 'UCHL1'),
 ('Gene', 'HGNC', 'TMSB4X'),
 ('Gene', 'HGN

In [12]:
# Open a PyBEL manager and pass it, together with the missing genes, to the
# novel-gene lookup.
# NOTE(review): the recorded output shows the count message twice, so
# get_novel_genes presumably prints it as well — confirm before removing either.
manager = pybel.Manager()
novel_genes = fin.get_novel_genes(manager, missing_genes)
novel_gene_count = len(novel_genes)
print(f'there are {novel_gene_count} novel genes')

INFO: [2018-07-23 15:21:01] indra/pybel.constants - getting configured connection mysql+mysqldb://root@localhost/pybel?charset=utf8


skipping: g(HGNC:CD44)
skipping: g(HGNC:NPHP1)
skipping: g(HGNC:CADPS2)
skipping: g(HGNC:FRMD4A)
skipping: g(HGNC:ANXA1)
skipping: g(HGNC:STIM2)
skipping: g(HGNC:ROBO2)
skipping: g(HGNC:BIN1)
skipping miRNA: ('Gene', 'HGNC', 'MIR124-1')
skipping miRNA: ('Gene', 'HGNC', 'MIR590')
skipping miRNA: ('Gene', 'HGNC', 'MIR16-1')
skipping miRNA: ('Gene', 'HGNC', 'MIR125B1')
skipping miRNA: ('Gene', 'HGNC', 'MIR128-1')
skipping miRNA: ('Gene', 'HGNC', 'MIR155')
skipping: g(HGNC:SOD2)
skipping: g(HGNC:S100P)
skipping: g(HGNC:SNCB)
skipping: g(HGNC:UCHL1)
there are 26 novel genes
there are 26 novel genes


In [13]:
%%time
# Long-running step: the recorded run was interrupted (KeyboardInterrupt) at
# 25/26 genes after roughly 27 minutes, so this assignment did not complete
# in that session.
r = fin.run_novel_genes(novel_genes)

Genes:  96%|█████████▌| 25/26 [26:59<01:04, 64.77s/it]

KeyboardInterrupt: 

In [None]:
# Rank the keys of `r` by how many items each one maps to, largest first.
Counter({key: len(items) for key, items in r.items()}).most_common()