# Degree-weighted path count (DWPC) computation using Neo4j

In [1]:
import csv
import gzip
import os
import queue
import threading
import time
import traceback

import pandas
import py2neo

import hetio.readwrite
import hetio.neo4j

In [2]:
%%time
# Read hetnet from its gzipped JSON export.
# This is the slow step of the setup: the recorded run took about 2.5 minutes.
path = 'download/hetnet.json.gz'
graph = hetio.readwrite.read_json(path)

CPU times: user 2min 18s, sys: 9.69 s, total: 2min 28s
Wall time: 2min 28s


In [5]:
# Enumerate the metapaths from 'compound' to 'disease' (max_length=4)
metapaths = graph.metagraph.extract_metapaths('compound', 'disease', max_length=4)
# Drop the first extracted metapath before any queries are built.
# NOTE(review): presumably this is the direct compound-disease metapath that
# should not be used as a feature -- confirm against extract_metapaths ordering.
metapaths.pop(0)
# Map each metapath's string form to the Cypher query that computes its DWPC
metapath_to_query = dict(
    (str(metapath), hetio.neo4j.construct_dwpc_query(metapath, 'identifier'))
    for metapath in metapaths
)
len(metapaths)

3662

In [6]:
# Read compound-disease pairs (one row per pair to compute DWPCs for)
# NOTE(review): the displayed output contains an 'Unnamed: 0' column, which
# suggests the file was written with its index; consider index_col=0 -- confirm.
part_df = pandas.read_table('data/partition.tsv.gz')
part_df.tail()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile
212619,DB09028,Cytisine,DOID:9744,type 1 diabetes mellitus,0,0.849434
212620,DB09028,Cytisine,DOID:9835,refractive error,0,0.262252
212621,DB09028,Cytisine,DOID:986,alopecia areata,0,0.584285
212622,DB09028,Cytisine,DOID:9917,Pleural cancer,0,0.032494
212623,DB09028,Cytisine,DOID:9970,obesity,0,0.733156


In [7]:
# Open neo4j connection
# The URI embeds the username and password, so allow it to be supplied through
# the HETNET_NEO4J_URI environment variable instead of editing the notebook.
# The fallback is the localhost instance this notebook was originally run against.
uri = os.environ.get('HETNET_NEO4J_URI', 'http://neo4j:hetnet@localhost:7474/db/data/')
neo = py2neo.Graph(uri)

In [8]:
def get_params_list(w=0.4):
    """Generate compound, disease, metapath combinations

    Lazily yields one dict per (row of ``part_df``, entry of
    ``metapath_to_query``) combination, iterating over all metapaths for a
    row before moving on to the next row. Each dict's keys match the
    keyword arguments of ``compute_dwpc``.

    Parameters
    ----------
    w : float, optional
        Value stored under the ``'w'`` key of every yielded dict and passed
        on to the query. Defaults to 0.4, the value previously hard-coded.

    Yields
    ------
    dict
        Keys ``'compound_id'``, ``'disease_id'``, ``'metapath'``,
        ``'query'`` and ``'w'``.
    """
    # Only two columns are needed, so iterate over them directly rather than
    # materializing a Series per row and re-reading it for every metapath.
    pairs = zip(part_df['compound_id'], part_df['disease_id'])
    for compound_id, disease_id in pairs:
        for metapath, query in metapath_to_query.items():
            yield {
                'compound_id': compound_id,
                'disease_id': disease_id,
                'metapath': metapath,
                'query': query,
                'w': w,
            }

In [9]:
def compute_dwpc(query, metapath, compound_id, disease_id, w):
    """Run a single DWPC query on neo4j and append the outcome to the output file

    The query is executed with the parameters ``source`` (the compound),
    ``target`` (the disease) and ``w``. The ``'PC'`` and ``'DWPC'`` fields
    of the returned record are written as one row, together with the inputs
    and the wall-clock seconds spent on the query.
    """
    began = time.time()
    record = neo.cypher.execute(query, source=compound_id, target=disease_id, w=w).one
    # Stop the clock as soon as the record is available, before any file I/O
    elapsed = time.time() - began
    path_count = record['PC']
    dwpc = record['DWPC']
    output_row = (compound_id, disease_id, metapath, path_count, w, dwpc, elapsed)
    # All worker threads share one csv writer, so writes must be serialized
    with writer_lock:
        writer.writerow(output_row)

In [10]:
def threader():
    """Process queries from the queue

    Worker loop intended to run in a daemon thread: it repeatedly takes a
    parameter dict from ``q``, passes it to ``compute_dwpc`` as keyword
    arguments, and marks the job as done. The loop never returns.

    A failing job is reported on stderr and skipped. Every job is
    acknowledged with ``q.task_done()`` whether it succeeded or not, so a
    single bad query can neither kill the worker nor leave ``q.join()``
    waiting forever on a job that will never be marked done.
    """
    while True:
        params = q.get()
        try:
            compute_dwpc(**params)
        except Exception:
            # Keep the worker alive; the traceback identifies the failed job
            traceback.print_exc()
        finally:
            # Must run for every q.get(), otherwise q.join() cannot return
            q.task_done()

In [11]:
# Prepare writer
# The lock serializes rows written by the worker threads to the shared csv writer
writer_lock = threading.Lock()
path = 'data/dwpc-neo.tsv.gz'
# newline='' leaves line endings to the csv module, as its documentation
# requires; without it, platforms that translate '\n' get doubled carriage returns
write_file = gzip.open(path, 'wt', newline='')
writer = csv.writer(write_file, delimiter='\t')
writer.writerow(['compound_id', 'disease_id', 'metapath', 'PC', 'w', 'DWPC', 'seconds'])

# Create the queue
# Bounded so that q.put() blocks once the workers fall behind. An unbounded
# queue would buffer every compound-disease-metapath job in memory, because
# jobs are generated far faster than neo4j can answer the queries.
q = queue.Queue(maxsize=100000)

In [None]:
%%time

# create threads
cores = 16
for i in range(cores):
    t = threading.Thread(target=threader)
    t.daemon = True
    t.start()

# assign jobs
params_generator = get_params_list()
for i, params in enumerate(params_generator):
    q.put(params)
#     if i >= 100000:
#         break        

# wait until the thread terminates and close file
q.join()
write_file.close()

In [None]:
# dwpc_df = pandas.read_table('data/dwpc-neo.tsv.gz')
# dwpc_df.sort_values('seconds', ascending=False)
# dwpc_df.head()