# Prepare dataset for hiveplot

This notebook currently exports only a random subset of 1,000 nodes (and the edges among them) to a DOT file for import into [`jhive`](https://www.bcgsc.ca/wiki/display/jhive/Documentation).

In [1]:
import random

import pandas
import networkx

from networkx.drawing.nx_pydot import write_dot

In [2]:
# Load the tab-separated node and edge tables. The edge file is gzipped;
# pandas infers the compression from the `.gz` extension.
node_df = pandas.read_csv('../../data/nodes.tsv', sep='\t')
edge_df = pandas.read_csv('../../data/edges.sif.gz', sep='\t')

In [3]:
# Preview the first two node rows to check the columns (id, name, kind).
node_df.head(n=2)

Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy


In [4]:
# Preview the first two edge rows to check the columns (source, metaedge, target).
edge_df.head(n=2)

Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780


In [5]:
def make_dot_safe(identifier):
    """Return `identifier` with every colon replaced by an underscore.

    No colons allowed. See https://github.com/carlos-jenkins/pydotplus/issues/3
    """
    return identifier.replace(':', '_')


graph = networkx.MultiGraph()

# Add every node under its DOT-safe id, carrying its name and kind as attributes.
for raw_id, name, kind in zip(node_df['id'], node_df['name'], node_df['kind']):
    graph.add_node(make_dot_safe(raw_id), node_name=name, kind=kind)

# Add one edge per row, using the metaedge as the multigraph edge key.
edge_columns = zip(edge_df['source'], edge_df['metaedge'], edge_df['target'])
for raw_source, metaedge, raw_target in edge_columns:
    graph.add_edge(
        make_dot_safe(raw_source),
        make_dot_safe(raw_target),
        key=metaedge,
    )

# Number of nodes in the assembled graph.
len(graph)

47031

In [6]:
# Draw a reproducible random sample of 1000 nodes and keep the subgraph
# they induce (the sampled nodes plus the edges among them).
random.seed(0)

# random.sample needs a sequence: passing a set or set-like view (such as the
# node view returned by newer networkx releases) was deprecated in Python 3.9
# and raises TypeError on Python 3.11+. list() preserves the graph's node
# iteration order, so the seeded sample is the same one drawn previously.
node_subset = random.sample(list(graph.nodes()), 1000)
graph_subset = graph.subgraph(node_subset)

# Number of nodes in the sampled subgraph.
len(graph_subset)

1000

In [7]:
# Serialize the sampled subgraph to a DOT file.
dot_path = 'data/hetionet-v1.0-simple.dot'
write_dot(graph_subset, dot_path)