## hetnet machine learning

In [1]:
import itertools
import collections
import csv
import gzip

import pandas
import sklearn.metrics

import hetio.readwrite
import hetio.pathtools

## Download and load hetnet

In [2]:
# # Download network
# url = 'https://raw.githubusercontent.com/dhimmel/integrate/7ef64533f0822fb5728ab4c1d88dc39f3345dcc8/data/hetnet.json.gz'
# ! wget --no-verbose --directory-prefix download/ {url}

In [3]:
%%time
# Load the hetnet from the gzipped JSON file under download/ (the recorded run of this
# cell took roughly 2.5 minutes), then keep a handle on its metagraph for later cells.
graph = hetio.readwrite.read_json('download/hetnet.json.gz')
metagraph = graph.metagraph

CPU times: user 2min 21s, sys: 14.9 s, total: 2min 35s
Wall time: 2min 35s


In [4]:
# Extract the compound -> disease metapaths with max_length=3.
metapaths = metagraph.extract_metapaths('compound', 'disease', max_length=3)
# Discard the first extracted metapath.
# NOTE(review): presumably this is the direct compound-indication-disease metapath,
# dropped so the label is not used as a feature -- confirm the extraction order.
metapaths.pop(0)
# Displayed as the cell output: number of metapaths that remain.
len(metapaths)

261

In [5]:
# Damping exponents to evaluate: 51 values k / 50 for k = 0..50 (0.0 up to 1.0).
weights = [numerator / 50 for numerator in range(0, 51)]

## Helper functions

In [6]:
def compute_dwpcs(graph, compound_id, disease_id, metapath, damping_exponents=None):
    """
    Yield one DWPC result row per damping exponent for a compound-disease pair.

    Parameters
    ----------
    graph : hetnet graph
        Must expose `edge_dict` (edge id -> edge) and `get_node`.
    compound_id, disease_id : str
        Identifiers used to look up the ('compound', ...) and ('disease', ...) nodes.
    metapath : metapath
        Passed to `hetio.pathtools.paths_between`; its `str()` is stored in each row.
    damping_exponents : iterable of numbers, optional
        Exponents passed one at a time to `hetio.pathtools.DWPC`. When omitted,
        the notebook-level `weights` list is used, so existing four-argument
        calls behave exactly as before.

    Yields
    ------
    collections.OrderedDict
        Keys, in order: compound_id, disease_id, metapath, path_count, w, dwpc.
    """
    if damping_exponents is None:
        damping_exponents = weights

    # Compute edge exclusions: if the pair has an indication edge, exclude it
    # (and its inverse) from both the path search and the DWPC computation.
    exclude_edges = set()
    # NOTE(review): this key uses the bare identifiers, whereas get_node below takes
    # (kind, identifier) tuples. Confirm edge_dict is really keyed this way;
    # otherwise the lookup never matches and nothing is excluded.
    edge_id = compound_id, disease_id, 'indication', 'both'
    indication = graph.edge_dict.get(edge_id)
    if indication:
        exclude_edges.add(indication)
        exclude_edges.add(indication.inverse)

    compound = graph.get_node(('compound', compound_id))
    disease = graph.get_node(('disease', disease_id))

    paths = hetio.pathtools.paths_between(graph, compound, disease, metapath, exclude_edges=exclude_edges)

    # These do not depend on the damping exponent, so compute them once.
    metapath_name = str(metapath)
    path_count = len(paths)

    for w in damping_exponents:
        dwpc = hetio.pathtools.DWPC(paths, damping_exponent=w, exclude_edges=exclude_edges)

        row = collections.OrderedDict()
        row['compound_id'] = compound_id
        row['disease_id'] = disease_id
        row['metapath'] = metapath_name
        row['path_count'] = path_count
        row['w'] = w
        row['dwpc'] = dwpc

        yield row

## Read the compound–disease partition table (`part_df`)

In [7]:
# Load the partition table and keep every indication row plus the remaining rows
# whose `percentile` is at most 0.02 (presumably a ~2% sample of non-indications -- confirm).
part_df = (
    pandas.read_table('data/partition.tsv.gz')
    .query('indication == 1 or percentile <= 0.02')
)
part_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile
9,DB00014,Goserelin,DOID:10283,Prostate cancer,1,0.705331
20,DB00014,Goserelin,DOID:11119,Gilles de la Tourette syndrome,0,0.007863
54,DB00014,Goserelin,DOID:14268,sclerosing cholangitis,0,0.011764
58,DB00014,Goserelin,DOID:1612,Breast cancer,1,0.522334
82,DB00014,Goserelin,DOID:2986,IgA glomerulonephritis,0,0.013587


In [9]:
# Class balance of the filtered pairs: number of rows per `indication` value.
part_df.indication.value_counts()

0    4225
1    1388
dtype: int64

In [10]:
# Mapping returned by graph.get_metanode_to_nodes().
# NOTE(review): in the visible notebook only commented-out code reads this variable.
metanode_to_nodes = graph.get_metanode_to_nodes()

In [11]:
#compounds = metanode_to_nodes[metagraph.get_node('compound')]
#diseases = metanode_to_nodes[metagraph.get_node('disease')]
#MS = graph.get_node(('disease', 'DOID:2377'))

In [None]:
%%time
# Stream DWPC rows for every (compound, disease, metapath) combination into a
# gzipped TSV. Rows are written as they are generated, so the full table is never
# held in memory.
header = ['compound_id', 'disease_id', 'metapath', 'path_count', 'w', 'dwpc']
# `with` guarantees the gzip stream is flushed and closed even if a DWPC computation
# raises part-way through; newline='' lets the csv module control line endings itself,
# as the csv documentation recommends for files handed to a writer.
with gzip.open('data/dwpc.tsv.gz', 'wt', newline='') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=header)
    writer.writeheader()
    for _, row in part_df.iterrows():
        # end='\r' rewrites the same console line to act as a lightweight progress indicator.
        print('Computing DWPCs for {} and {}'.format(row.compound_name, row.disease_name), end='\r')
        # The pair identifiers do not change across metapaths, so look them up once per row.
        compound_id = row['compound_id']
        disease_id = row['disease_id']
        for metapath in metapaths:
            rows = compute_dwpcs(graph, compound_id, disease_id, metapath)
            writer.writerows(rows)

## Performance

In [None]:
# Reload the DWPC table from data/dwpc.tsv.gz and preview its last rows.
dwpc_df = pandas.read_table('data/dwpc.tsv.gz')
dwpc_df.tail()

In [None]:
def compute_metrics(df):
    """
    Summarise how well the `dwpc` column ranks the `indication` labels in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an `indication` column (labels) and a `dwpc` column (scores).

    Returns
    -------
    pandas.Series
        Indexed, in order, by:
        'nonzero' -- fraction of rows whose dwpc is greater than zero,
        'auroc'   -- sklearn.metrics.roc_auc_score(indication, dwpc),
        'auprc'   -- sklearn.metrics.average_precision_score(indication, dwpc).
    """
    y_true = df.indication
    y_score = df.dwpc
    # Build the Series in one call with an explicit index, not by growing an empty
    # `pandas.Series()`: the dtype-less empty constructor is deprecated and its
    # default dtype differs between pandas versions.
    return pandas.Series(
        [
            (y_score > 0).mean(),
            sklearn.metrics.roc_auc_score(y_true, y_score),
            sklearn.metrics.average_precision_score(y_true, y_score),
        ],
        index=['nonzero', 'auroc', 'auprc'],
    )

# Attach the indication labels to every DWPC row. The join keys are spelled out
# (they are the only columns the two tables share) so that a column added to either
# table later cannot silently change the join.
merged_df = part_df.merge(dwpc_df, on=['compound_id', 'disease_id'])
# One row of metrics per (metapath, damping exponent) combination.
auc_df = merged_df.groupby(['metapath', 'w']).apply(compute_metrics).reset_index()

In [None]:
# Optional ordering by AUROC, currently disabled.
# NOTE(review): DataFrame.sort was removed in later pandas releases; use
# sort_values('auroc', ascending=False) if this is re-enabled.
#auc_df = auc_df.sort('auroc', ascending=False)
# Save the per-(metapath, w) metrics as a TSV with floats rounded to three decimals,
# then preview the first rows.
auc_df.to_csv('data/auc.tsv', sep='\t', index=False, float_format='%.3f')
auc_df.head()

In [None]:
# Display the metagraph's kind -> abbreviation mapping
# (presumably the key for reading abbreviated metapath names -- confirm).
metagraph.kind_to_abbrev