# Partition compound--disease pairs

In [21]:
import itertools
import random
import gzip

import pandas

import hetio.readwrite

In [2]:
%%time
graph = hetio.readwrite.read_json('download/graph.json.gz')
metagraph = graph.metagraph

CPU times: user 2min 16s, sys: 12.6 s, total: 2min 28s
Wall time: 2min 28s


In [5]:
compound = metagraph.get_node('compound')
disease = metagraph.get_node('disease')

metanode_to_nodes = graph.get_metanode_to_nodes()
compounds = metanode_to_nodes[compound]
diseases = metanode_to_nodes[disease]

In [6]:
rows = list()
for compound, disease in itertools.product(compounds, diseases):
    
    compound_id = compound.get_id()
    disease_id = disease.get_id()
    edge_id = compound_id, disease_id, 'indication', 'both'
    indication_edge = graph.edge_dict.get(edge_id)

    row = pandas.Series()
    row['compound_id'] = compound.identifier
    row['compound_name'] = compound.name
    row['disease_id'] = disease.identifier
    row['disease_name'] = disease.name
    row['indication'] = int(bool(indication_edge))
    
    rows.append(row)

pair_df = pandas.DataFrame(rows)
pair_df = pair_df.sort(['compound_id', 'disease_id'])

In [18]:
def add_percentile(df):
    k = len(df)
    df['percentile'] = random.sample(range(k), k)
    df['percentile'] /= k
    return df

random.seed(0)
pair_df = pair_df.groupby('indication').apply(add_percentile)
pair_df = pair_df.sort(['compound_id', 'disease_id'])

In [19]:
pair_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile
75917,DB00014,Goserelin,DOID:0050156,idiopathic pulmonary fibrosis,0,0.441142
75955,DB00014,Goserelin,DOID:0050425,restless legs syndrome,0,0.555469
75919,DB00014,Goserelin,DOID:0050741,alcohol dependence,0,0.070334
75922,DB00014,Goserelin,DOID:0050742,nicotine dependence,0,0.779067
75930,DB00014,Goserelin,DOID:0060073,Lymphatic system cancer,0,0.937534


In [13]:
# Prevalence of indications
pair_df.indication.mean()

0.0065279554518774922

In [14]:
pair_df.indication.value_counts()

0    211236
1      1388
dtype: int64

In [25]:
with gzip.open('data/partition.tsv.gz', 'wt') as write_file:
    pair_df.to_csv(write_file, sep='\t', index=False, float_format='%.8g')