# Extract data for hetnet visualization

In [1]:
import itertools
import collections

import pandas
import py2neo
import hetio.readwrite
import hetio.neo4j

## Metapath counts

In [2]:
# Load the metagraph from its JSON file; later cells query it for its
# metanodes and for the metapaths between them.
metagraph = hetio.readwrite.read_metagraph('../../data/metagraph.json')

In [3]:
# Order the metanodes by their string form so that the pairs generated
# from them come out in a stable order.
metanodes = sorted(metagraph.get_nodes(), key=str)

In [4]:
# Tabulate, for every unordered pair of metanodes (a metanode is also paired
# with itself), how many of the metapaths returned by extract_metapaths have
# each length from 1 through max_length.
rows = []
max_length = 4
for source, target in itertools.combinations_with_replacement(metanodes, 2):
    metapaths = metagraph.extract_metapaths(
        source.get_id(), target.get_id(), max_length=max_length)
    # A Counter yields 0 for lengths it never saw, so every length gets a
    # row even when no metapath of that length exists.
    counter = collections.Counter(len(metapath) for metapath in metapaths)
    rows.extend(
        (str(source), str(target), source.abbrev, target.abbrev, length, counter[length])
        for length in range(1, max_length + 1)
    )
count_df = pandas.DataFrame(
    rows,
    columns=['source', 'target', 'source_abbrev', 'target_abbrev', 'length', 'metapaths'],
)

In [5]:
# Preview the first rows of the metapath count table before it is written out.
count_df.head()

Unnamed: 0,source,target,source_abbrev,target_abbrev,length,metapaths
0,Anatomy,Anatomy,A,A,1,0
1,Anatomy,Anatomy,A,A,2,10
2,Anatomy,Anatomy,A,A,3,55
3,Anatomy,Anatomy,A,A,4,583
4,Anatomy,Biological Process,A,BP,1,0


In [6]:
# Save the metapath counts as a tab-separated file, omitting the index column.
count_df.to_csv('data/metapath-counts.tsv', sep='\t', index=False)

## Path counts

In [7]:
# Open a py2neo handle on the graph database served at localhost:7500; the
# Cypher queries in the cells below are executed through it.
neo = py2neo.Graph("http://localhost:7500/db/data/")

In [8]:
# Map each metanode's Neo4j label (as produced by hetio.neo4j.as_label) back
# to the metanode's string form, so query results can be relabelled later.
neo4j_label_to_kind = {
    hetio.neo4j.as_label(node): str(node)
    for node in metagraph.get_nodes()
}

In [9]:
%%time
# Count, for every ordered pair of node labels, the paths of a given length
# that join a node carrying the source label to a node carrying the target
# label. The bare `{}` is filled with the path length by str.format; the
# doubled braces survive formatting as `{ source }` / `{ target }`, which are
# bound through the source=/target= keyword arguments passed to execute.
query = '''
OPTIONAL MATCH path = (s)-[*{}]-(t)
WHERE {{ source }} in labels(s)
AND {{ target }} in labels(t)
// WITH path, s, t
//  LIMIT 100
RETURN
  {{ source }} AS source,
  {{ target }} AS target,
  count(path) AS path_count,
  count(DISTINCT [r IN relationships(path)| type(r)]) AS metapath_count,
  count(DISTINCT t) AS target_nodes
'''
# Longest path length to query. Only length 1 is queried by default; the
# recorded run of this cell already took over two minutes of wall time.
max_path_length = 1
# Column names mirror the RETURN aliases above, followed by the length that
# is appended to every row. Naming them explicitly keeps the frame's schema
# independent of the last query result and valid even when no query runs.
reach_columns = (
    'source', 'target', 'path_count', 'metapath_count', 'target_nodes', 'length',
)
rows = []
for source, target in itertools.product(neo4j_label_to_kind, repeat=2):
    for length in range(1, max_path_length + 1):
        length_query = query.format(length)
        records = neo.cypher.execute(length_query, source=source, target=target)
        rows.extend(tuple(r) + (length,) for r in records)

reach_df = pandas.DataFrame(rows, columns=reach_columns)
# The query echoes back Neo4j labels; translate them to metanode names.
for column in 'source', 'target':
    reach_df[column] = reach_df[column].map(neo4j_label_to_kind)

CPU times: user 360 ms, sys: 36 ms, total: 396 ms
Wall time: 2min 12s


In [10]:
# Preview the first rows of the raw path counts returned by the queries.
reach_df.head()

Unnamed: 0,source,target,path_count,metapath_count,target_nodes,length
0,Cellular Component,Cellular Component,0,0,0,1
1,Cellular Component,Pathway,0,0,0,1
2,Cellular Component,Symptom,0,0,0,1
3,Cellular Component,Disease,0,0,0,1
4,Cellular Component,Molecular Function,0,0,0,1


In [11]:
# Read the per-metanode summary table (tab-separated) and index the node
# count of each metanode by its name.
metanode_df = pandas.read_csv('../../data/summary/metanodes.tsv', sep='\t')
metanode_to_count = dict(zip(metanode_df['metanode'], metanode_df['nodes']))

In [12]:
# Attach the total number of nodes of each target metanode. No join key is
# given, so pandas joins on the columns the two frames share; with the
# default inner join, rows whose target has no entry in metanode_df are
# dropped.
reach_df = pandas.merge(
    reach_df,
    metanode_df[['metanode', 'nodes']].rename(
        columns={'metanode': 'target', 'nodes': 'total_targets'}),
)
# Share of the target metanode's nodes that were reached, as a percentage.
reach_df['percent_targets'] = (
    reach_df['target_nodes'].mul(100).div(reach_df['total_targets'])
)

In [13]:
# Preview the table again now that total_targets and percent_targets exist.
reach_df.head()

Unnamed: 0,source,target,path_count,metapath_count,target_nodes,length,total_targets,percent_targets
0,Cellular Component,Cellular Component,0,0,0,1,1391,0.0
1,Pathway,Cellular Component,0,0,0,1,1391,0.0
2,Symptom,Cellular Component,0,0,0,1,1391,0.0
3,Disease,Cellular Component,0,0,0,1,1391,0.0
4,Molecular Function,Cellular Component,0,0,0,1,1391,0.0


In [14]:
# Save the path counts, including the total_targets and percent_targets
# columns, as a tab-separated file without the index column.
reach_df.to_csv('data/path-counts.tsv', sep='\t', index=False)