# DWPC (degree-weighted path count) computation using Neo4j

In [1]:
import threading
import concurrent.futures
import gzip
import csv
import time

import pandas
import py2neo

import hetio.readwrite
import hetio.neo4j

In [2]:
# Read hetnet metagraph
# The URL embeds a commit hash of the dhimmel/integrate repository, so it
# always refers to the same revision of metagraph.json.
path = 'https://raw.githubusercontent.com/dhimmel/integrate/c56d312a53e351d26016d5fd4751a9715cc3f0c3/data/metagraph.json'
metagraph = hetio.readwrite.read_metagraph(path)

In [3]:
# Enumerate every compound-to-disease metapath of at most four edges.
metapaths = metagraph.extract_metapaths('compound', 'disease', max_length=4)

# Discard the first metapath returned.
# NOTE(review): presumably this is the direct compound-disease edge, removed so
# the outcome itself is not used as a feature -- confirm against the metagraph.
metapaths.pop(0)

# Map each metapath's string form to the Cypher query that computes its DWPC.
metapath_to_query = dict(
    (str(mp), hetio.neo4j.construct_dwpc_query(mp, 'identifier'))
    for mp in metapaths
)

# Display how many metapaths remain.
len(metapaths)

1979

In [4]:
# Load the compound-disease pairs for which DWPCs will be computed
# (tab-separated; compression is inferred from the .gz extension).
part_df = pandas.read_csv('data/partition.tsv.gz', sep='\t')
part_df.tail()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile
212619,DB09028,Cytisine,DOID:9744,type 1 diabetes mellitus,0,0.849434
212620,DB09028,Cytisine,DOID:9835,refractive error,0,0.262252
212621,DB09028,Cytisine,DOID:986,alopecia areata,0,0.584285
212622,DB09028,Cytisine,DOID:9917,Pleural cancer,0,0.032494
212623,DB09028,Cytisine,DOID:9970,obesity,0,0.733156


In [5]:
# Open neo4j connection
# `neo` is the module-level py2neo graph handle that compute_dwpc uses to
# execute Cypher queries; the URI targets a server on localhost, port 7474.
uri = 'http://localhost:7474/db/data/'
neo = py2neo.Graph(uri)

In [6]:
def generate_parameters(max_elems=None):
    """Generate compound, disease, metapath combinations.

    Iterates over the rows of the module-level ``part_df`` and, for each row,
    over every (metapath, query) pair in ``metapath_to_query``, yielding one
    keyword-argument dict per combination (the keys match the parameters of
    ``compute_dwpc``).

    Parameters
    ----------
    max_elems : int or float, optional
        Upper bound on the number of dicts to yield. ``None`` (the default)
        yields every combination. Non-integral values stop at the first
        count that reaches or exceeds the bound.

    Yields
    ------
    dict
        Keys ``compound_id``, ``disease_id``, ``metapath``, ``query``, ``w``.
    """
    n = 0
    for _, row in part_df.iterrows():
        for metapath, query in metapath_to_query.items():
            # Return (rather than break) so that reaching the cap ends the
            # whole generator; a break would only leave the inner loop and
            # the outer loop would still walk every remaining row.
            if max_elems is not None and n >= max_elems:
                return
            yield {
                'compound_id': row['compound_id'],
                'disease_id': row['disease_id'],
                'metapath': metapath,
                'query': query,
                'w': 0.4,  # damping exponent passed to the DWPC query
            }
            n += 1

In [None]:
def compute_dwpc(query, metapath, compound_id, disease_id, w):
    """Run one DWPC query against neo4j and append the result to the output.

    The query is executed through the module-level ``neo`` connection with
    ``compound_id`` bound to ``source``, ``disease_id`` bound to ``target`` and
    ``w`` passed through unchanged. The ``PC`` and ``DWPC`` fields of the single
    returned record, together with the elapsed wall-clock time formatted to
    four significant digits, are written as one row via the module-level
    ``writer`` while holding ``writer_lock``.

    Returns None.
    """
    began = time.time()
    record = neo.cypher.execute(
        query, source=compound_id, target=disease_id, w=w).one
    elapsed = time.time() - began
    seconds = format(elapsed, '.4g')
    row = (
        compound_id,
        disease_id,
        metapath,
        record['PC'],
        w,
        record['DWPC'],
        seconds,
    )
    # Rows come from many worker threads; serialise access to the csv writer.
    writer_lock.acquire()
    try:
        writer.writerow(row)
    finally:
        writer_lock.release()

In [None]:
%%time

# Parameters
workers = 16        # threads issuing Cypher queries concurrently
max_elems = 100000  # cap on the number of queries submitted in this run
max_pending = 5000  # cap on submitted-but-unfinished jobs (backpressure)

# Shared state used by compute_dwpc and by the completion callback below
path = 'data/dwpc-neo.tsv.gz'
writer_lock = threading.Lock()
pending_slots = threading.BoundedSemaphore(max_pending)
failures = []


def _job_finished(future):
    """Free a submission slot and remember the job's exception, if any."""
    pending_slots.release()
    if not future.cancelled() and future.exception() is not None:
        failures.append(future.exception())


# The context managers guarantee that the executor is drained and the gzip
# stream is flushed and closed even if submitting jobs raises part-way through.
with gzip.open(path, 'wt') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(['compound_id', 'disease_id', 'metapath', 'PC', 'w', 'DWPC', 'seconds'])

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        for params in generate_parameters(max_elems):
            # Block while too many jobs are outstanding, so the lazily
            # generated parameters are not all materialised in the queue.
            pending_slots.acquire()
            future = executor.submit(compute_dwpc, **params)
            future.add_done_callback(_job_finished)

# Exceptions raised inside worker threads are stored on their futures and
# would otherwise go unnoticed; surface a summary instead of dropping them.
if failures:
    print('{} queries failed; first error: {!r}'.format(len(failures), failures[0]))

In [None]:
# dwpc_df = pandas.read_table('data/dwpc-neo-TEST.tsv.gz')
# dwpc_df.sort_values('seconds', ascending=False)