# Assign positives and negatives 

In [1]:
import itertools
import random
import sys
import json

import py2neo
import pandas

## Connect to the running neo4j instances

In [2]:
# Load the server descriptions; each entry is read below for its 'port' and
# 'name' keys.
with open('servers.json') as read_file:
    instances = json.load(read_file)

for instance in instances:
    # Each server is reached on localhost through its own port.
    port = instance['port']
    uri = 'http://localhost:{}/db/data/'.format(port)
    graph = py2neo.Graph(uri)
    instance['py2neo'] = graph
    # Keep a separate handle on the graph named 'rephetio-v2.0'
    # (presumably the unpermuted hetnet, judging by the variable name).
    if instance['name'] == 'rephetio-v2.0':
        neo_unperm = graph

## Create partitions

In [3]:
def to_df(record_list):
    """Build a pandas DataFrame from a py2neo RecordList.

    Rows are taken from ``record_list.records`` and column labels from
    ``record_list.columns``.
    """
    rows = record_list.records
    header = record_list.columns
    return pandas.DataFrame(data=rows, columns=header)

In [6]:
# Read the tab-separated compound and disease summary tables.
compound_df = pandas.read_csv('../summary/compounds.tsv', sep='\t')
disease_df = pandas.read_csv('../summary/diseases.tsv', sep='\t')

In [7]:
# Report the size of the full compound × disease grid. The arguments follow
# the order of the labels in the template: compounds first, then diseases.
'{} compounds × {} diseases = {} pairs'.format(
    len(compound_df), len(disease_df), len(compound_df) * len(disease_df))

'1538 compounds × 136 diseases = 209168 pairs'

In [8]:
# Every (compound_id, disease_id) combination in which both the compound and
# the disease have a positive value in their table's `treats` column.
treated_compound_ids = compound_df.query("treats > 0").compound_id
treated_disease_ids = disease_df.query("treats > 0").disease_id
nonzero_prior_pairs = set(
    itertools.product(treated_compound_ids, treated_disease_ids)
)

In [9]:
# Cypher query returning one row per Compound -> Disease relationship of any
# type: the compound identifier, the disease identifier, and the relationship
# type, ordered by compound, then relationship type, then disease.
indication_query = '''
MATCH (compound:Compound)-[rel]->(disease:Disease)
RETURN
  compound.identifier AS compound_id,
  disease.identifier AS disease_id,
  type(rel) AS rel_type
ORDER BY
  compound_id, rel_type, disease_id
'''

def summarize(df, prefix=''):
    """Count the TREATS_CtD and PALLIATES_CpD rows of a relationship table.

    ``df`` must have a ``rel_type`` column. The result is an int64
    pandas.Series with two entries, labelled ``prefix + 'treats'`` and
    ``prefix + 'palliates'`` in that order.
    """
    rel_type = df.rel_type
    # Vectorised counts; int() keeps the values plain Python integers.
    counts = {
        prefix + 'treats': int((rel_type == 'TREATS_CtD').sum()),
        prefix + 'palliates': int((rel_type == 'PALLIATES_CpD').sum()),
    }
    # Build the series in one step with an explicit dtype: a bare
    # pandas.Series() is deprecated and its default dtype differs between
    # pandas versions.
    return pandas.Series(counts, dtype='int64')

def partition(neo, negatives_per_positive=4):
    """
    Extract negative and positive compound-disease pairs from a py2neo.Graph.

    Parameters
    ----------
    neo : py2neo.Graph
        Graph on which ``indication_query`` is executed.
    negatives_per_positive : int
        Number of negatives sampled for every positive (default 4).

    Returns
    -------
    pandas.DataFrame
        Columns ``compound_id``, ``disease_id`` and ``status`` (1 for
        positives, 0 for negatives), sorted by disease, status, compound.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when fewer candidate negatives exist than
        ``len(positives) * negatives_per_positive``.

    Sampling uses the module-level ``random`` state, so seed it beforehand
    for a reproducible result.
    """
    indication_df = to_df(neo.cypher.execute(indication_query))
    # Every pair returned by the query, whatever its relationship type, is a
    # non-negative (TREATS_CtD and PALLIATES_CpD are the types of interest).
    non_negatives = set(zip(indication_df.compound_id, indication_df.disease_id))
    # Use TREATS_CtD as positives
    treats_df = indication_df.query("rel_type == 'TREATS_CtD'")
    positives = set(zip(treats_df.compound_id, treats_df.disease_id))
    # Use nonzero-prior pairs excluding non-negatives as negatives.
    # Sort before sampling: random.sample() rejects sets on Python >= 3.11,
    # and set order of strings varies between interpreter runs, which made
    # the seeded sample irreproducible.
    candidates = sorted(nonzero_prior_pairs - non_negatives)
    negatives = random.sample(
        candidates, k=len(positives) * negatives_per_positive)
    rows = [
        (compound_id, disease_id, status)
        for status, pairs in ((0, negatives), (1, positives))
        for compound_id, disease_id in pairs
    ]
    df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'status'])
    df = df.sort_values(['disease_id', 'status', 'compound_id'])
    return df

In [10]:
part_dfs = []
for instance in instances:
    # Reseed before each hetnet so every partition starts from the same
    # random state.
    random.seed(0, version=2)
    hetnet_df = partition(instance['py2neo'])
    # Label the rows with the name of the hetnet they came from.
    hetnet_df.insert(0, 'hetnet', instance['name'])
    part_dfs.append(hetnet_df)
# Stack the per-hetnet tables and order rows by compound, disease, hetnet.
part_df = pandas.concat(part_dfs).sort_values(
    ['compound_id', 'disease_id', 'hetnet'])

In [11]:
# Preview the first two rows of the combined partition table
part_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status
2687,rephetio-v2.0,DB00014,DOID:0050741,0
1759,rephetio-v2.0_perm-5,DB00014,DOID:0050741,0


In [12]:
# Write the partitions as a tab-separated file, leaving out the index column
part_df.to_csv('data/partitions.tsv', sep='\t', index=False)

In [13]:
# Row count of the combined table: one row per hetnet-compound-disease pair
part_df.shape[0]

22650

In [14]:
# Tabulate status (0 = negative, 1 = positive) counts for each hetnet
pandas.crosstab(index=part_df.hetnet, columns=part_df.status)

status,0,1
hetnet,Unnamed: 1_level_1,Unnamed: 2_level_1
rephetio-v2.0,3020,755
rephetio-v2.0_perm-1,3020,755
rephetio-v2.0_perm-2,3020,755
rephetio-v2.0_perm-3,3020,755
rephetio-v2.0_perm-4,3020,755
rephetio-v2.0_perm-5,3020,755
