# Assign positives and negatives to compound-disease pairs

In [7]:
import itertools
import random
import sys
import json

import py2neo
import pandas

## Start up neo4j connections

In [8]:
# Load the server descriptions; each entry is used below via its 'port' key.
with open('servers.json') as read_file:
    instances = json.load(read_file)

# Attach a py2neo graph connection to every server entry under the 'py2neo' key.
for instance in instances:
    uri = 'http://localhost:{}/db/data/'.format(instance['port'])
    instance.update({'py2neo': py2neo.Graph(uri)})

## Create partitions

In [9]:
def to_df(record_list):
    """Build a pandas DataFrame from a py2neo RecordList.

    The rows come from ``record_list.records`` and the column labels from
    ``record_list.columns``.
    """
    rows = record_list.records
    header = record_list.columns
    return pandas.DataFrame(data=rows, columns=header)

In [25]:
# Cypher query returning one row per Compound-to-Disease relationship of any
# type: compound and disease identifiers and names plus the relationship type,
# ordered by disease name, relationship type, then compound name.
indication_query = '''
MATCH (compound:Compound)-[rel]->(disease:Disease)
RETURN
  compound.identifier AS compound_id,
  compound.name AS compound_name,
  disease.identifier AS disease_id,
  disease.name AS disease_name,
  type(rel) AS rel_type
ORDER BY
  disease_name, rel_type, compound_name
'''

def summarize(df, prefix=''):
    """Count the treats and palliates relationships in an indication table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``rel_type`` column.
    prefix : str
        String prepended to the two output labels.

    Returns
    -------
    pandas.Series
        Two integer counts labelled ``<prefix>treats`` (rows whose
        ``rel_type`` is ``'TREATS_CtD'``) and ``<prefix>palliates`` (rows
        whose ``rel_type`` is ``'PALLIATES_CpD'``), in that order.
    """
    rel_type = df['rel_type']
    # Build the Series in one step: creating an empty dtype-less Series and
    # growing it by assignment triggers a pandas deprecation warning, and the
    # vectorized .sum() avoids a Python-level loop over the boolean mask.
    return pandas.Series({
        prefix + 'treats': int((rel_type == 'TREATS_CtD').sum()),
        prefix + 'palliates': int((rel_type == 'PALLIATES_CpD').sum()),
    })

def partition(neo):
    """
    Extract negative and positive compound-disease pairs from a py2neo.Graph.

    Only compounds and diseases with at least one TREATS_CtD relationship are
    considered. Positives (status 1) are the TREATS_CtD pairs among them.
    Negatives (status 0) are sampled with the ``random`` module from the
    compound x disease pairs that have no Compound-to-Disease relationship of
    any type, at four negatives per positive (or every available candidate
    when fewer exist).

    Parameters
    ----------
    neo : py2neo.Graph
        Graph to query through ``neo.cypher.execute``.

    Returns
    -------
    pandas.DataFrame
        One row per selected pair with ``compound_id``, ``disease_id`` and
        ``status``, joined with the per-compound and per-disease summary
        columns, sorted by ``disease_id``, ``status``, ``compound_id``.
    """
    indication_df = to_df(neo.cypher.execute(indication_query))

    # Per-compound and per-disease counts of treats / palliates relationships.
    compound_df = (
        indication_df
        .groupby(['compound_id', 'compound_name'])
        .apply(summarize, prefix='compound_')
        .reset_index()
    )
    disease_df = (
        indication_df
        .groupby(['disease_id', 'disease_name'])
        .apply(summarize, prefix='disease_')
        .reset_index()
    )

    # Restrict to compounds and diseases that take part in a treatment.
    compounds = compound_df.query("compound_treats > 0").compound_id
    diseases = disease_df.query("disease_treats > 0").disease_id
    indication_df = indication_df.query(
        "compound_id in @compounds and disease_id in @diseases")

    # Any related pair (treats or otherwise) is excluded from the negatives.
    non_negatives = set(zip(indication_df.compound_id, indication_df.disease_id))
    treats_df = indication_df.query("rel_type == 'TREATS_CtD'")
    positives = set(zip(treats_df.compound_id, treats_df.disease_id))

    # Sort the candidates before sampling: random.sample no longer accepts a
    # set (deprecated in Python 3.9, TypeError from 3.11), and set iteration
    # order depends on string hashing, so a seeded sample would otherwise not
    # be reproducible across interpreter sessions.
    candidates = sorted(
        set(itertools.product(compounds, diseases)) - non_negatives)
    # Cap the sample size so a small candidate pool cannot raise ValueError.
    n_negatives = min(len(candidates), 4 * len(positives))
    negatives = random.sample(candidates, k=n_negatives)

    rows = [
        (compound, disease, status)
        for status, pairs in ((0, negatives), (1, positives))
        for compound, disease in pairs
    ]
    df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'status'])
    # Join explicitly on the identifier columns rather than relying on
    # implicit common-column detection.
    df = df.merge(compound_df, on='compound_id').merge(disease_df, on='disease_id')
    df = df.sort_values(['disease_id', 'status', 'compound_id'])
    return df

In [26]:
# Build one partition table per server entry and stack them into part_df.
part_dfs = []
for instance in instances:
    # Reset the RNG before each partition so every hetnet starts from the
    # same random state.
    random.seed(0, version=2)
    part_df = partition(instance['py2neo'])
    # Label the rows with the hetnet name as the first column.
    part_df.insert(0, 'hetnet', instance['name'])
    part_dfs.append(part_df)
part_df = pandas.concat(part_dfs).sort_values(
    ['compound_id', 'disease_id', 'hetnet'])

In [27]:
# Preview the first rows of the combined partition table
part_df.head()

Unnamed: 0,hetnet,compound_id,disease_id,status,compound_name,compound_treats,compound_palliates,disease_name,disease_treats,disease_palliates
1621,hetio-ind,DB00014,DOID:0050742,0,Goserelin,2,0,nicotine dependence,1,1
2017,hetio-ind_perm-1,DB00014,DOID:0050742,0,Goserelin,2,0,nicotine dependence,1,1
1809,hetio-ind,DB00014,DOID:10283,1,Goserelin,2,0,Prostate cancer,21,9
258,hetio-ind_perm-5,DB00014,DOID:10283,0,Goserelin,1,0,Prostate cancer,10,5
2995,hetio-ind_perm-2,DB00014,DOID:10652,0,Goserelin,2,0,Alzheimer's disease,4,5


In [None]:
# Save the combined table as a tab-separated file, omitting the DataFrame index
part_df.to_csv('data/partitions.tsv', sep='\t', index=False)

In [30]:
# Total number of rows across all hetnets
len(part_df)

16865

In [28]:
# Number of distinct compounds across all hetnets
part_df.compound_id.nunique()

387

In [34]:
# Number of distinct diseases across all hetnets
part_df.disease_id.nunique()

77

In [33]:
# Count rows per hetnet and status (status is 0 for negatives, 1 for positives)
pandas.crosstab(part_df.hetnet, part_df.status)

status,0,1
hetnet,Unnamed: 1_level_1,Unnamed: 2_level_1
hetio-ind,3020,755
hetio-ind_perm-1,3020,755
hetio-ind_perm-2,3020,755
hetio-ind_perm-3,20,5
hetio-ind_perm-4,3020,755
hetio-ind_perm-5,1392,348
