# Create metapaths for analysis

In [109]:
import json
import collections
import math
import numpy

import hetio.readwrite
import hetio.neo4j

In [93]:
# Read the hetnet metagraph from the dhimmel/integrate repository, pinned to a
# single commit so that every download in this notebook refers to the same
# revision of the data.
commit = 'f076854ac2b9f785cb080f37ce7d186d7fcb7d0d'
metagraph_url_template = 'https://github.com/dhimmel/integrate/raw/{}/data/metagraph.json'
url = metagraph_url_template.format(commit)
metagraph = hetio.readwrite.read_metagraph(url)

## Construct list of metapaths

In [94]:
def remove(metapath):
    """
    Decide whether `metapath` is excluded from feature computation.

    Returns True only for metapaths whose length is exactly 1;
    every other metapath is kept (False).
    """
    return len(metapath) == 1

In [95]:
# Keyword arguments handed to hetio.neo4j.construct_dwpc_query whenever a
# Cypher DWPC query is generated for a metapath.
dwpc_query_options = dict(
    property='identifier',
    using=True,
    unique_nodes='labeled',
)

In [96]:
# Extract the compound-to-disease metapaths (max_length=4) and describe each
# retained one as an ordered record. Metapaths rejected by `remove` are only
# collected so that their count can be reported.
metapaths = metagraph.extract_metapaths('compound', 'disease', max_length=4)
obj = []
removed = []
for metapath in metapaths:
    if remove(metapath):
        removed.append(metapath)
        continue
    metaedges = metapath.edges
    item = collections.OrderedDict([
        ('length', len(metapath)),
        ('abbreviation', str(metapath)),
        ('edges', [str(metaedge) for metaedge in metaedges]),
        # An inverted metaedge is reported through its inverse.
        ('standard_edges', [
            str(metaedge.inverse if metaedge.inverted else metaedge)
            for metaedge in metaedges
        ]),
        ('edge_abbreviations', [metaedge.get_abbrev() for metaedge in metaedges]),
        ('standard_edge_abbreviations', [
            metaedge.get_standard_abbrev() for metaedge in metaedges
        ]),
        ('dwpc_query', hetio.neo4j.construct_dwpc_query(metapath, **dwpc_query_options)),
    ])
    obj.append(item)

# Order the records by length first, then by abbreviation.
obj.sort(key=lambda record: (record['length'], record['abbreviation']))
print('Removed {} metapaths'.format(len(removed)))

Removed 2 metapaths


## Calculate metapath complexity (mean/max degree products)

In [50]:
import pandas

In [58]:
# Load the per-metanode degree tables: the workbook holds one sheet per
# metanode, named after it, and `degree_dfs` maps that name to its DataFrame.
url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/degrees.xlsx'.format(commit)
degree_dfs = dict()
# Open the workbook a single time instead of fetching and re-reading it for
# every metanode. The sheet is passed positionally because pandas renamed the
# keyword from `sheetname` to `sheet_name` and later dropped the old spelling.
with pandas.ExcelFile(url) as degree_workbook:
    for metanode in metagraph.get_nodes():
        metanode_name = str(metanode)
        degree_dfs[metanode_name] = degree_workbook.parse(metanode_name)

In [74]:
# Collapse the per-metanode degree tables into two lookups keyed by column
# label: the column-wise maximum and the column-wise mean.
metaedge_to_max_degree = {}
metaedge_to_mean_degree = {}
for degree_df in degree_dfs.values():
    # The first two columns are skipped (presumably node identifier/name
    # columns rather than degrees -- confirm against degrees.xlsx).
    degree_columns = degree_df.iloc[:, 2:]
    metaedge_to_max_degree.update(degree_columns.max().items())
    metaedge_to_mean_degree.update(degree_columns.mean().items())

In [127]:
def get_optimal_join_index(degrees):
    """
    Choose where to split a path, described by the list `degrees`, so that
    the estimated traversal cost is as small as possible.

    Every split position from 0 through len(degrees) is scored as
    log10(product of the degrees before the split + product of the degrees
    from the split onward). Returns `(join_index, complexity)`: the lowest
    scoring position (the first one when several tie) and its log10 score.
    """
    log_degrees = numpy.log10(degrees)
    # Summing logs stands in for multiplying degrees; 10 ** undoes the log
    # so the two halves can be added before taking the log again.
    split_costs = [
        math.log10(10 ** sum(log_degrees[:split]) + 10 ** sum(log_degrees[split:]))
        for split in range(len(log_degrees) + 1)
    ]
    best_split = numpy.argmin(split_costs)
    return best_split, split_costs[best_split]

In [119]:
# Annotate every metapath record with two log10 complexity estimates: the sum
# of log10 degrees (i.e. the log of the degree product) along its metaedges,
# once using the maximum degree of each metaedge and once using the mean.
for item in obj:
    max_degrees = [metaedge_to_max_degree[x] for x in item['edges']]
    mean_degrees = [metaedge_to_mean_degree[x] for x in item['edges']]
    # Join-index based estimates are left disabled:
#     item['join_index_max'], item['complexity_max'] = get_optimal_join_index(max_degrees)
#     item['join_index_mean'], item['complexity_mean'] = get_optimal_join_index(mean_degrees)
    item['complexity_max'] = sum(numpy.log10(max_degrees))
    # The mean estimate must be built from the mean degrees; it was previously
    # computed from max_degrees and therefore duplicated complexity_max.
    item['complexity_mean'] = sum(numpy.log10(mean_degrees))

## Save metapaths as a JSON file 

In [99]:
# Persist the metapath records as JSON with two-space indentation.
path = 'data/all-features/metapaths.json'
with open(path, 'w') as write_file:
    json.dump(obj, write_file, indent=2)