# Implementation of diffusion hetmech

In [1]:
import collections

import numpy
import pandas
from sklearn.preprocessing import normalize
from neo4j.v1 import GraphDatabase
import hetio.readwrite
import hetio.hetnet

In [2]:
%%time
# Read Hetionet v1.0 from a URL pinned to a specific commit of dhimmel/hetionet,
# so the same file is loaded on every run. This is the slow step of the
# notebook (see the timing reported below).
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
graph = hetio.readwrite.read_graph(url)
# Keep a handle on the metagraph; later cells use it to build metapaths.
metagraph = graph.metagraph

CPU times: user 1min 20s, sys: 1.97 s, total: 1min 22s
Wall time: 1min 25s


In [3]:
# MetaGraph node/edge count, i.e. how many metanodes and metaedges it has
metagraph.n_nodes, metagraph.n_edges

(11, 24)

In [4]:
# Graph node/edge count (the individual nodes and edges, not the metagraph's)
graph.n_nodes, graph.n_edges

(47031, 2250197)

In [5]:
def get_node_to_position(graph, metanode):
    """
    Given a metanode, return a dictionary of node to position.

    Parameters
    ==========
    graph : hetio.hetnet.Graph
        graph whose nodes are looked up
    metanode : hetio.hetnet.MetaNode or metanode identifier
        the metanode itself, or the identifier under which it is stored in
        graph.metagraph.node_dict

    Returns
    =======
    collections.OrderedDict
        the nodes belonging to metanode, in sorted order, each mapped to its
        0-based position in that order
    """
    if not isinstance(metanode, hetio.hetnet.MetaNode):
        # metanode is a name: resolve it through the metagraph.
        # graph.node_dict cannot be used here, since it is a plain dict keyed
        # by node id (not callable, and it does not hold metanodes).
        metanode = graph.metagraph.node_dict[metanode]
    metanode_to_nodes = graph.get_metanode_to_nodes()
    nodes = sorted(metanode_to_nodes[metanode])
    node_to_position = collections.OrderedDict((n, i) for i, n in enumerate(nodes))
    return node_to_position

def metaedge_to_adjacency_matrix(graph, metaedge):
    """
    Build the dense 0/1 adjacency matrix of a single metaedge.

    Columns correspond to the metaedge's source nodes and rows to its target
    nodes, both ordered as returned by get_node_to_position. metaedge may be
    a hetio.hetnet.MetaEdge or an abbreviation understood by
    graph.metagraph.metapath_from_abbrev.
    """
    already_metaedge = isinstance(metaedge, hetio.hetnet.MetaEdge)
    if not already_metaedge:
        # An abbreviation parses to a metapath; take its first metaedge
        metaedge = graph.metagraph.metapath_from_abbrev(metaedge)[0]

    column_nodes = list(get_node_to_position(graph, metaedge.source))
    row_of_target = get_node_to_position(graph, metaedge.target)

    shape = len(row_of_target), len(column_nodes)
    matrix = numpy.zeros(shape)
    for column, node in enumerate(column_nodes):
        for edge in node.edges[metaedge]:
            matrix[row_of_target[edge.target], column] = 1
    return matrix

In [6]:
def row_normalize(matrix, damping_exponent=0):
    """
    Rescale each row of a 2d numpy array by a power of its row sum.

    Every row whose sum is nonzero is multiplied by
    row_sum ** -damping_exponent; rows that sum to zero are left unchanged.
    With damping_exponent=1 each nonzero row is divided by its sum, while the
    default of 0 leaves all values unchanged.

    Parameters
    ==========
    matrix : 2d numpy.ndarray
        array to rescale; it is not modified
    damping_exponent : number
        exponent applied (negated) to each row sum

    Returns
    =======
    numpy.ndarray
        a new float array with the same shape as matrix
    """
    # Work on a float copy: integer input would otherwise fail, because numpy
    # refuses integer ** negative integer and cannot store the scaled values
    # back into an integer array.
    matrix = numpy.array(matrix, dtype=float)
    row_sums = matrix.sum(axis=1)
    # Only rows with a nonzero sum are rescaled; the others keep a factor of 1
    nonzero = row_sums != 0
    scale = numpy.ones_like(row_sums)
    scale[nonzero] = row_sums[nonzero] ** -damping_exponent
    # Broadcast one scaling factor per row across that row's columns
    return matrix * scale[:, numpy.newaxis]


def diffuse_along_metapath(graph, metapath, source_node_weights, damping_exponent=1):
    """
    Propagate node weights from the source to the target of a metapath.

    Parameters
    ==========
    graph : hetio.hetnet.Graph
        graph to extract adjacency matrixes along
    metapath : hetio.hetnet.MetaPath
        metapath to diffuse along, one metaedge at a time
    source_node_weights : dict
        dictionary of node to weight. Nodes not in dict are zero-weighted
    damping_exponent : number
        exponent handed to row_normalize for every metaedge

    Returns
    =======
    collections.OrderedDict
        nodes of the metapath's target metanode (in get_node_to_position
        order) mapped to their score
    """
    # Seed the score vector with the supplied source weights
    start_positions = get_node_to_position(graph, metapath.source())
    scores = numpy.zeros(len(start_positions))
    for node, weight in source_node_weights.items():
        scores[start_positions[node]] = weight

    # Push the scores across each metaedge in turn
    for metaedge in metapath:
        # Degree-damped row scaling first, then L1 column normalization
        transition = normalize(
            row_normalize(metaedge_to_adjacency_matrix(graph, metaedge), damping_exponent),
            norm='l1',
            axis=0,
        )
        # Can use @ in Python 3.5+ https://www.python.org/dev/peps/pep-0465/
        scores = transition.dot(scores)

    end_positions = get_node_to_position(graph, metapath.target())
    return collections.OrderedDict(zip(end_positions, scores))

In [7]:
# Uses the official neo4j-python-driver. See https://github.com/neo4j/neo4j-python-driver
# Fetch the genes associated with epilepsy syndrome from the public Hetionet
# Neo4j instance and load them into a DataFrame.

query = '''
MATCH (disease:Disease)-[assoc:ASSOCIATES_DaG]-(gene:Gene)
WHERE disease.name = 'epilepsy syndrome'
RETURN
 gene.name AS gene_symbol,
 gene.description AS gene_name,
 gene.identifier AS entrez_gene_id,
 assoc.sources AS sources
ORDER BY gene_symbol
'''

driver = GraphDatabase.driver("bolt://neo4j.het.io")
try:
    with driver.session() as session:
        result = session.run(query)
        # Records are consumed while the session is still open
        gene_df = pandas.DataFrame((x.values() for x in result), columns=result.keys())
finally:
    # Close the driver even if the query fails, so its connections to the
    # server are not left open for the lifetime of the kernel
    driver.close()

gene_df.head()

Unnamed: 0,gene_symbol,gene_name,entrez_gene_id,sources
0,ABAT,4-aminobutyrate aminotransferase,18,[DisGeNET]
1,ABCB1,"ATP-binding cassette, sub-family B (MDR/TAP), ...",5243,"[DISEASES, DOAF, DisGeNET]"
2,ABCC2,"ATP-binding cassette, sub-family C (CFTR/MRP),...",1244,[DisGeNET]
3,ABCG2,"ATP-binding cassette, sub-family G (WHITE), me...",9429,[DisGeNET]
4,ACKR4,atypical chemokine receptor 4,51554,[DISEASES]


In [None]:
# Look up the graph node for every Entrez Gene ID returned by the query,
# keeping only the genes for which the graph has a node.
epilepsy_genes = [
    node
    for node in (
        graph.node_dict.get(('Gene', entrez_gene_id))
        for entrez_gene_id in gene_df.entrez_gene_id
    )
    if node
]
len(epilepsy_genes)

399

In [None]:
# Diffuse a unit weight from every epilepsy gene along the GiGpBP metapath
metapath = metagraph.metapath_from_abbrev('GiGpBP')
source_node_weights = dict.fromkeys(epilepsy_genes, 1)
pathway_scores = diffuse_along_metapath(graph, metapath, source_node_weights, damping_exponent=1)

# One (name, score) row per target node, ranked from highest to lowest score
rows = [(target.name, value) for target, value in pathway_scores.items()]
target_df = (
    pandas.DataFrame(rows, columns=['target_node', 'score'])
    .sort_values('score', ascending=False)
)

In [None]:
# Number of rows in target_df: one per node of the metapath's target metanode,
# including nodes whose score is zero
len(target_df)

In [None]:
# Total score over all target nodes. Each diffusion step multiplies by a matrix
# whose non-empty columns sum to 1, so this should not exceed the total source
# weight; any shortfall is weight that sat on nodes without edges for the next
# metaedge.
sum(target_df.score)

In [None]:
# Show the metapath that the scores above were diffused along
metapath

In [None]:
# Target nodes with their scores, highest score first
target_df

# Diagnosing ubiquitin homeostasis

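The biological process [ubiquitin homeostasis](http://amigo.geneontology.org/amigo/term/GO:0010992) contains three genes: UBB, UBC, and IDE. The following Cypher query counts, for each of these genes, the `gene_target` nodes it is connected to: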

```cypher
MATCH (bp:BiologicalProcess)-[rel:PARTICIPATES_GpBP]-(gene)-[INTERACTS_GiG]-(gene_target)
WHERE bp.name ='ubiquitin homeostasis'
RETURN
  gene.name AS ubiquitin_homeostasis_gene,
  count(gene_target) AS n_interacting_genes
```

Returns the following table:

| ubiquitin_homeostasis_gene | n_interacting_genes |
|----------------------------|---------------------|
| IDE | 243 |
| UBC | 9371 |
| UBB | 1040 |
