# Assign positives and negatives 

In [None]:
import itertools
import random
import sys

import py2neo
import pandas

## Startup neo4j and connections

In [None]:
# server.py script in directory containing neo4j instances
directory = '../construct/integrate/neo4j/'
sys.path.insert(0, directory)
import servers
instances = servers.get_instances(directory)

In [None]:
servers.start_all(instances)

In [None]:
for instance in instances:
    uri = 'http://localhost:{}/db/data/'.format(instance['port'])
    instance['py2neo'] = py2neo.Graph(uri)

## Create partitions

In [None]:
def to_df(record_list):
    """Convert a py2neo RecordList to a dataframe"""
    return pandas.DataFrame(record_list.records, columns = record_list.columns)

In [None]:
indication_query = '''
MATCH (compound:Compound)-[rel]->(disease:Disease)
RETURN
  compound.identifier AS compound_id,
  compound.name AS compound_name,
  disease.identifier AS disease_id,
  disease.name AS disease_name,
  type(rel) AS rel_type
ORDER BY
  disease_name, rel_type, compound_name
'''

def summarize(df):
    series = pandas.Series()
    series['treats'] = sum(df.rel_type == 'TREATS_CtD')
    series['palliates'] = sum(df.rel_type == 'PALLIATES_CpD')
    return series

def partition(neo):
    """
    Extract negative and positive compound-disease pairs from a py2neo.Graph.
    """
    indication_df = to_df(neo.cypher.execute(indication_query))
    compound_df = indication_df.groupby(['compound_id', 'compound_name']).apply(summarize).reset_index().sort_values(['treats', 'palliates'], ascending=False)
    disease_df = indication_df.groupby(['disease_id', 'disease_name']).apply(summarize).reset_index().sort_values(['treats', 'palliates'], ascending=False)
    compounds = compound_df.query("treats > 0").compound_id
    diseases = disease_df.query("treats > 0").disease_id
    indication_df = indication_df.query("compound_id in @compounds and disease_id in @diseases")
    non_negatives = set(zip(indication_df.compound_id, indication_df.disease_id))
    indication_df = indication_df.query("rel_type == 'TREATS_CtD'")
    positives = set(zip(indication_df.compound_id, indication_df.disease_id))
    negatives = set(itertools.product(compounds, diseases)) - non_negatives
    negatives = random.sample(negatives, k=len(positives) * 4)
    rows = list()
    for status, pairs in (0, negatives), (1, positives):
        for drug, disease in pairs:
            rows.append((drug, disease, status))
    df = pandas.DataFrame(rows, columns=['compound_id', 'compound_name', 'disease_id', 'disease_name', 'status'])
    df = df.sort_values(['disease_id', 'status', 'compound_id'])
    return df

In [None]:
part_dfs = list()
for instance in instances:
    random.seed(0, version=2)
    part_df = partition(instance['py2neo'])
    part_df.insert(0, 'hetnet', instance['name'])
    part_df['neo'] = instance['py2neo']
    part_dfs.append(part_df)
part_df = pandas.concat(part_dfs)
part_df = part_df.sort_values(['compound_id', 'disease_id', 'hetnet'])

In [27]:
part_df.head()

Unnamed: 0,drugbank_id,doid_id,status,hetnet,neo
2033,DB00014,DOID:1024,0,hetio-ind_perm-1,<Graph uri='http://localhost:7501/db/data/'>
3352,DB00014,DOID:10283,1,hetio-ind,<Graph uri='http://localhost:7474/db/data/'>
3356,DB00014,DOID:10283,1,hetio-ind_perm-1,<Graph uri='http://localhost:7501/db/data/'>
1937,DB00014,DOID:10652,0,hetio-ind_perm-2,<Graph uri='http://localhost:7502/db/data/'>
3261,DB00014,DOID:10763,1,hetio-ind_perm-0,<Graph uri='http://localhost:7500/db/data/'>


In [None]:
part_df.iloc[:, :4].to_csv('data/all-features/partitions.tsv', sep='\t', index=False)

In [26]:
len(part_df)

22650

## Compute features

In [None]:
import threading
import concurrent.futures
import bz2
import csv
import time

import pandas
import py2neo

import hetio.readwrite
import hetio.neo4j

In [None]:
# Read hetnet metagraph
path = 'https://github.com/dhimmel/integrate/raw/d68b823bf2167e7ab7f0e784d1280200c33fb3bf/data/metagraph.json'
metagraph = hetio.readwrite.read_metagraph(path)

In [29]:
# Cypher DWPC query parameters
dwpc_query_options = {
    'property': 'identifier',
    'using': True,
    'unique_nodes': 'labeled',
}

In [30]:
## Todo: exclude metapaths with all 4 expression edges

In [31]:
# Extract metapaths
metapaths = metagraph.extract_metapaths('compound', 'disease', max_length=4)
metapaths = [metapath for metapath in metapaths if len(metapath) > 1]
metapaths.sort(key=lambda x: [len(x), str(x)])
metapath_query_tuples = [(str(metapath), hetio.neo4j.construct_dwpc_query(metapath, **dwpc_query_options)) for metapath in metapaths]
len(metapaths)

1198

In [32]:
# Total number of queries
total_queries = len(metapaths) * len(part_df)
total_queries

27134700

In [None]:
part_series = [row for i, row in part_df.iterrows()]

def generate_parameters(max_elems=None):
    """Generate compound, disease, metapath combinations"""
    n = 0
    for metapath, query in metapath_query_tuples:
        for series in part_series:
            if max_elems is not None and n == max_elems:
                break
            yield {
                'neo': series['neo'],
                'hetnet': series['hetnet'],
                'compound_id': series['compound_id'],
                'disease_id': series['disease_id'],
                'metapath': metapath,
                'query': query,
                'w': 0.4,
            }
            n += 1

In [None]:
def compute_dwpc(neo, hetnet, query, metapath, compound_id, disease_id, w):
    """Execute the neo4j query and write results to file"""
    start = time.time()
    results = neo.cypher.execute(query, source=compound_id, target=disease_id, w=w)
    record = results.one
    seconds = '{0:.4g}'.format(time.time() - start)
    row = hetnet, compound_id, disease_id, metapath, record['PC'], w, '{0:.6g}'.format(record['DWPC']), seconds
    with writer_lock:
        writer.writerow(row)

In [None]:
%%time

# Parameters
workers = 12
max_elems = None

# Prepare writer
path = 'data/all-features/dwpc.tsv.bz2'
write_file = bz2.open(path, 'wt')
writer = csv.writer(write_file, delimiter='\t')
writer.writerow(['hetnet', 'compound_id', 'disease_id', 'metapath', 'PC', 'w', 'DWPC', 'seconds'])

# Create ThreadPoolExecutor
executor = concurrent.futures.ThreadPoolExecutor(max_workers=workers)
writer_lock = threading.Lock()

# Submit jobs
n_queries = 0
for params in generate_parameters(max_elems):
    while executor._work_queue.qsize() > 10000:
        print('Submitted queries: {} ({:.4%})'.format(n_queries, n_queries / total_queries), end='\r')
        time.sleep(1)
    executor.submit(compute_dwpc, **params)
    n_queries += 1

# Shutdown and close
executor.shutdown()
write_file.close()

n_queries

In [None]:
servers.stop_all(instances)

In [25]:
n_queries

13790205