# Assign positives and negatives 

In [1]:
import itertools
import random
import sys

import py2neo
import pandas

## Start up neo4j and open connections

In [4]:
# The `servers` module imported below lives in the directory containing the neo4j instances
directory = '../../construct/integrate/neo4j/'
# Put that directory first on the import path so that `import servers` resolves to it
sys.path.insert(0, directory)
import servers
# NOTE(review): later cells read instance['port'] and assign instance['py2neo'],
# so each instance is presumably a dict -- confirm against servers.get_instances
instances = servers.get_instances(directory)

In [None]:
# NOTE(review): presumably launches every neo4j server listed in `instances` --
# confirm in servers.start_all
servers.start_all(instances)

In [None]:
# Open a py2neo Graph for every instance and store it on the instance under 'py2neo'
for instance in instances:
    # Each instance is reached on localhost through its own port
    uri = 'http://localhost:{}/db/data/'.format(instance['port'])
    instance['py2neo'] = py2neo.Graph(uri)

## Compute features

In [None]:
import threading
import concurrent.futures
import bz2
import csv
import time

import pandas
import py2neo

import hetio.readwrite
import hetio.neo4j

In [None]:
# Read hetnet metagraph
# The URL embeds a commit hash, so it always refers to the same revision of metagraph.json.
# Note that `path` is reassigned further down to the DWPC output file.
path = 'https://github.com/dhimmel/integrate/raw/d68b823bf2167e7ab7f0e784d1280200c33fb3bf/data/metagraph.json'
metagraph = hetio.readwrite.read_metagraph(path)

In [29]:
# Keyword arguments forwarded to hetio.neo4j.construct_dwpc_query for every
# metapath when its Cypher DWPC query is built
dwpc_query_options = dict(
    property='identifier',
    using=True,
    unique_nodes='labeled',
)

In [30]:
## Todo: exclude metapaths with all 4 expression edges

In [31]:
# Extract compound-to-disease metapaths (max_length=4), keep only those whose
# len() exceeds 1, and order them by length and then by their string form
metapaths = sorted(
    (
        metapath
        for metapath in metagraph.extract_metapaths('compound', 'disease', max_length=4)
        if len(metapath) > 1
    ),
    key=lambda candidate: (len(candidate), str(candidate)),
)
# Pair each metapath's string form with its Cypher DWPC query
metapath_query_tuples = [
    (str(metapath), hetio.neo4j.construct_dwpc_query(metapath, **dwpc_query_options))
    for metapath in metapaths
]
len(metapaths)

1198

In [32]:
# Total number of queries: one per (metapath, part_df row) combination
# NOTE(review): part_df is not created in any of the cells shown in this
# notebook section -- it must already exist when this cell runs
total_queries = len(metapaths) * len(part_df)
total_queries

27134700

In [None]:
# Materialize the rows of part_df once, as a list of Series, so that
# generate_parameters can loop over them again for every metapath
part_series = [row_series for _row_label, row_series in part_df.iterrows()]

def generate_parameters(max_elems=None, w=0.4):
    """Generate compound, disease, metapath combinations.

    Yields one dict per (metapath, row) pair, with metapaths from
    metapath_query_tuples in the outer loop and rows from part_series in the
    inner loop. The dict keys match the keyword arguments of compute_dwpc.

    max_elems -- stop after this many dicts have been yielded; None (the
        default) yields every combination
    w -- value stored under 'w' in every dict (default 0.4); compute_dwpc
        forwards it to the Cypher query as the `w` parameter
    """
    n = 0
    for metapath, query in metapath_query_tuples:
        for series in part_series:
            if max_elems is not None and n == max_elems:
                # End the whole generator. A `break` here would only leave the
                # inner loop and the outer loop would keep walking every
                # remaining metapath without yielding anything.
                return
            yield {
                'neo': series['neo'],
                'hetnet': series['hetnet'],
                'compound_id': series['compound_id'],
                'disease_id': series['disease_id'],
                'metapath': metapath,
                'query': query,
                'w': w,
            }
            n += 1

In [None]:
def compute_dwpc(neo, hetnet, query, metapath, compound_id, disease_id, w):
    """Run one DWPC query on neo4j and append the outcome to the shared writer.

    The query is executed through `neo.cypher.execute` with `compound_id` as
    the `source` parameter, `disease_id` as `target`, and `w` as `w`. The 'PC'
    and 'DWPC' fields of the result's `.one` are written, together with the
    identifying arguments and the elapsed wall-clock time, as a single row.
    The module-level `writer_lock` is held while writing to the module-level
    `writer`. Returns None.
    """
    began = time.time()
    record = neo.cypher.execute(query, source=compound_id, target=disease_id, w=w).one
    elapsed = time.time() - began
    seconds = '{0:.4g}'.format(elapsed)
    path_count = record['PC']
    dwpc = '{0:.6g}'.format(record['DWPC'])
    row = (hetnet, compound_id, disease_id, metapath, path_count, w, dwpc, seconds)
    # Take the lock for the write only, not for the query itself
    with writer_lock:
        writer.writerow(row)

In [None]:
%%time

# Parameters
workers = 12
max_elems = None

# Prepare writer
path = 'data/all-features/dwpc.tsv.bz2'
# Lock used by compute_dwpc so that worker threads write rows one at a time
writer_lock = threading.Lock()
n_queries = 0

# Both resources are context managers so that they are released even when
# submission raises or the cell is interrupted. The executor (inner) exits
# first and waits for already-queued queries to finish; only then is the
# output file (outer) closed, so no worker writes to a closed file and the
# bz2 stream is always finalized.
# newline='' is what the csv module requires of files handed to csv.writer.
with bz2.open(path, 'wt', newline='') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(['hetnet', 'compound_id', 'disease_id', 'metapath', 'PC', 'w', 'DWPC', 'seconds'])

    # Create ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # Submit jobs
        for params in generate_parameters(max_elems):
            # Throttle submission so the pending queue stays bounded instead
            # of holding every parameter dict in memory at once.
            # NOTE(review): _work_queue is a private attribute of
            # ThreadPoolExecutor and may change between Python versions.
            while executor._work_queue.qsize() > 10000:
                print('Submitted queries: {} ({:.4%})'.format(n_queries, n_queries / total_queries), end='\r')
                time.sleep(1)
            # NOTE(review): the returned future is discarded, so an exception
            # raised inside compute_dwpc is never reported and that row is
            # silently missing from the output.
            executor.submit(compute_dwpc, **params)
            n_queries += 1

n_queries

In [None]:
# NOTE(review): presumably shuts down the neo4j servers listed in `instances` --
# confirm in servers.stop_all
servers.stop_all(instances)

In [25]:
# Display the number of queries submitted to the executor (incremented once per submit)
n_queries

13790205