In [31]:
import networkx as nx
from indra_cogex.client.neo4j_client import Neo4jClient
from tabulate import tabulate
from IPython.display import HTML
from collections import Counter
from networkx.drawing.nx_agraph import to_agraph
from tqdm.auto import tqdm

In [32]:
# Single client instance reused by every query cell in this notebook.
# No URL or credentials are passed here; the log output below reports that
# the configured values are used.
client = Neo4jClient()

INFO: [2022-01-25 16:59:54] indra_cogex.client.neo4j_client - Using configured URL for INDRA neo4j connection
INFO: [2022-01-25 16:59:54] indra_cogex.client.neo4j_client - Using configured credentials for INDRA neo4j connection


In [9]:
# List every node label in the graph; each result row carries the label
# name in its first column.
[row[0] for row in client.query_tx("call db.labels();")]

['ClinicalTrial', 'Evidence', 'BioEntity', 'Publication']

In [33]:
%%time
[(labels[0], count) for labels, count in client.query_tx(f"MATCH (n) RETURN distinct labels(n), count(*)")]

CPU times: user 3.92 ms, sys: 2.59 ms, total: 6.51 ms
Wall time: 15.3 s


[('ClinicalTrial', 364937),
 ('Evidence', 18326675),
 ('BioEntity', 2612711),
 ('Publication', 33369469)]

In [14]:
# Render the node counts as an HTML table with one row per node type.
HTML(
    tabulate(
        node_count_results,
        tablefmt="html",
        headers=["Node Type", "Count"],
    )
)

Node Type,Count
['ClinicalTrial'],364937
['Evidence'],18326675
['BioEntity'],2612711
['Publication'],33369469


In [36]:
%%time
Counter({
    label[0]: client.query_tx(f"MATCH (n:{label[0]}) RETURN count(*)")[0][0]
    for label in client.query_tx("call db.labels();")
})

CPU times: user 7.98 ms, sys: 2.24 ms, total: 10.2 ms
Wall time: 1.71 s


Counter({'ClinicalTrial': 364937,
         'Evidence': 18326675,
         'BioEntity': 2612711,
         'Publication': 33369469})

In [19]:
# Collect the name of every relationship type; the name is the first column
# of each row returned by db.relationshipTypes().
relation_types = [
    row[0]
    for row in client.query_tx("call db.relationshipTypes();")
]
relation_types

['expressed_in',
 'copy_number_altered_in',
 'sensitive_to',
 'mutated_in',
 'has_indication',
 'tested_in',
 'has_trial',
 'indra_rel',
 'has_citation',
 'associated_with',
 'isa',
 'xref',
 'partof',
 'annotated_with',
 'haspart',
 'has_side_effect']

In [26]:
# Count the edges of each relationship type with one query per type;
# tqdm shows progress across the types.
edge_count_results = Counter({
    rel_type: client.query_tx(f"MATCH ()-[r:{rel_type}]->() RETURN count(*)")[0][0]
    for rel_type in tqdm(relation_types)
})

  0%|          | 0/16 [00:00<?, ?it/s]

In [38]:
# Display the edge count collected for each relationship type.
edge_count_results

Counter({'expressed_in': 4725039,
         'copy_number_altered_in': 1422111,
         'sensitive_to': 69271,
         'mutated_in': 1140475,
         'has_indication': 45902,
         'tested_in': 253578,
         'has_trial': 536104,
         'indra_rel': 6268226,
         'has_citation': 17975205,
         'associated_with': 158648,
         'isa': 534776,
         'xref': 1129208,
         'partof': 619379,
         'annotated_with': 290131450,
         'haspart': 428980,
         'has_side_effect': 308948})

In [27]:
# Render the edge counts as an HTML table, most frequent relationship type first.
HTML(
    tabulate(
        edge_count_results.most_common(),
        tablefmt="html",
        headers=["Edge Type", "Count"],
    )
)

Edge Type,Count
annotated_with,290131450
has_citation,17975205
indra_rel,6268226
expressed_in,4725039
copy_number_altered_in,1422111
mutated_in,1140475
xref,1129208
partof,619379
has_trial,536104
isa,534776


In [None]:
def write_schema(client: Neo4jClient, path, prog: str = "dot") -> None:
    """Generate a diagram of the Neo4j schema.

    Arguments
    ---------
    client : the neo4j client
    path : the path of the file (can end in pdf, svg, png, etc.)
    prog : the graphviz layout program (can be neato, dot, twopi, etc.)
    """
    # The first result row is unpacked as (schema nodes, schema relationships).
    schema_nodes, schema_relationships = client.query_tx("call db.schema.visualization();")[0]

    # A multigraph keeps several relationship types between the same pair of
    # node types as separate edges.
    graph = nx.MultiDiGraph()
    for node in schema_nodes:
        graph.add_node(node._id, label=node["name"])
    for edge in schema_relationships:
        graph.add_edge(edge.start_node._id, edge.end_node._id, id=edge._id, label=edge.type)

    agraph = to_agraph(graph)
    # Lay out with the caller's chosen program rather than a hard-coded "dot".
    agraph.draw(path, prog=prog)