# DWPC (degree-weighted path count) computation using Neo4j

In [1]:
import multiprocessing
import gzip
import csv

import pandas
import py2neo

import hetio.readwrite
import hetio.neo4j

In [2]:
%%time
# Load the hetnet from its gzipped JSON export via hetio. The cell is timed
# because the load is slow (the recorded run took roughly 2 min 44 s).
# NOTE: `path` is rebound later in the notebook to the DWPC output file.
path = 'download/hetnet.json.gz'
graph = hetio.readwrite.read_json(path)

CPU times: user 2min 25s, sys: 19.7 s, total: 2min 44s
Wall time: 2min 44s


In [4]:
# Enumerate the metapaths from 'compound' to 'disease' (max_length=4).
metapaths = graph.metagraph.extract_metapaths('compound', 'disease', max_length=4)
# Discard the first extracted metapath.
# NOTE(review): presumably this is the direct compound-disease edge, dropped so
# the label being predicted is not used as a feature -- TODO confirm.
metapaths.pop(0)
# Display how many metapaths remain.
len(metapaths)

3662

In [5]:
# Build one DWPC query per metapath, keyed by the metapath's string form.
# The queries are executed later through py2neo's Cypher interface.
metapath_to_query = {
    str(mp): hetio.neo4j.construct_dwpc_query(mp, 'identifier')
    for mp in metapaths
}

In [6]:
# Read the compound-disease pairs to compute features for. The displayed tail
# shows compound/disease ids and names plus `indication` and `percentile`.
part_df = pandas.read_table('data/partition.tsv.gz')
part_df.tail()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile
212619,DB09028,Cytisine,DOID:9744,type 1 diabetes mellitus,0,0.849434
212620,DB09028,Cytisine,DOID:9835,refractive error,0,0.262252
212621,DB09028,Cytisine,DOID:986,alopecia areata,0,0.584285
212622,DB09028,Cytisine,DOID:9917,Pleural cancer,0,0.032494
212623,DB09028,Cytisine,DOID:9970,obesity,0,0.733156


In [10]:
def compute_dwpc(kwargs):
    """
    Execute one DWPC query against the local neo4j server.

    Parameters
    ----------
    kwargs : dict
        Must provide the keys 'compound_id', 'disease_id', 'metapath',
        'query' and 'w'. The ids and `w` are passed to the query as the
        `source`, `target` and `w` parameters.

    Returns
    -------
    tuple
        (compound_id, disease_id, metapath, PC, w, DWPC) -- note that PC
        comes *before* w. 'PC' and 'DWPC' are read from the first record
        of the query result.
    """
    # Imported inside the function (py2neo is also imported at notebook top).
    import py2neo
    graph_db = py2neo.Graph('http://neo4j:hetnet@localhost:7474/db/data/')
    compound, disease, weight = kwargs['compound_id'], kwargs['disease_id'], kwargs['w']
    # `.one` is the first record of the result.
    record = graph_db.cypher.execute(
        kwargs['query'], source=compound, target=disease, w=weight,
    ).one
    return compound, disease, kwargs['metapath'], record['PC'], weight, record['DWPC']

In [11]:
def get_params_list(w=0.4):
    """
    Lazily yield one parameter dict per (compound-disease pair, metapath).

    Reads the notebook-level `part_df` (columns 'compound_id' and
    'disease_id') and `metapath_to_query` (metapath string -> query).
    Pairs are visited in row order; for each pair every metapath is
    emitted before moving on to the next pair.

    Parameters
    ----------
    w : float, optional
        Value placed under the 'w' key of every yielded dict
        (default 0.4, the value previously hard-coded).

    Yields
    ------
    dict
        Keys 'compound_id', 'disease_id', 'metapath', 'query', 'w' --
        the argument shape `compute_dwpc` expects.
    """
    # Iterate the two id columns directly: DataFrame.iterrows() would build
    # a Series per row, and indexing it inside the inner loop would repeat
    # the same lookup once per metapath.
    id_pairs = zip(part_df['compound_id'], part_df['disease_id'])
    for compound_id, disease_id in id_pairs:
        for metapath, query in metapath_to_query.items():
            yield {
                'compound_id': compound_id,
                'disease_id': disease_id,
                'metapath': metapath,
                'query': query,
                'w': w,
            }

In [None]:
# Fan the (pair, metapath) queries out over 14 worker processes and stream
# the results into a gzipped TSV as they complete (order is not preserved).
path = 'data/dwpc-neo.tsv.gz'

# The header must follow the tuple order returned by compute_dwpc:
# (compound_id, disease_id, metapath, PC, w, DWPC). The previous header
# listed 'w' before 'PC', which mislabelled those two columns.
header = ['compound_id', 'disease_id', 'metapath', 'PC', 'w', 'DWPC']

# Context managers make sure the pool is shut down and the gzip stream is
# closed even if a worker raises partway through.
with multiprocessing.Pool(14) as pool, gzip.open(path, 'wt') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(header)
    params_generator = get_params_list()
    generator = pool.imap_unordered(compute_dwpc, params_generator, chunksize=50)
    for i, row in enumerate(generator):
        # Lightweight progress indicator, overwritten in place every 1000 rows.
        if i % 1000 == 0:
            print(i, end='\r')
        writer.writerow(row)

01000