## hetnet machine learning

In [7]:
import itertools
import collections
import csv
import gzip

import pandas
import sklearn.metrics

import hetio.readwrite
import hetio.pathtools

## Download and load hetnet

In [2]:
# # Download network
# url = 'https://raw.githubusercontent.com/dhimmel/integrate/f72d32ce09b8884b6ec7e000ec261c116b340198/data/graph.json.gz'
# ! wget --no-verbose --directory-prefix download/ {url}

In [3]:
%%time
# Read the gzipped JSON network from the local download directory; per the timing
# output recorded for this cell, the load takes a few minutes.
graph = hetio.readwrite.read_json('download/graph.json.gz')
# Keep a handle on the graph's metagraph; later cells use it to extract metapaths.
metagraph = graph.metagraph

CPU times: user 2min 24s, sys: 17.7 s, total: 2min 41s
Wall time: 2min 41s


In [4]:
# Extract compound-to-disease metapaths with max_length=2.
metapaths = metagraph.extract_metapaths('compound', 'disease', max_length=2)
# Discard the first extracted metapath.
# NOTE(review): presumably this is the direct compound-indication-disease metapath,
# which would leak the label being predicted -- confirm the ordering guarantee of
# extract_metapaths.
metapaths.pop(0)
len(metapaths)

18

## Helper functions

In [9]:
def compute_dwpc(graph, compound_id, disease_id, metapath):
    """
    Compute the path count and DWPC for one compound-disease pair along a
    single metapath.

    When the pair is connected by an 'indication' edge, that edge and its
    inverse are passed as exclusions to both the path search and the DWPC
    computation (presumably so the known indication cannot contribute to its
    own score -- confirm against hetio.pathtools).

    Returns an OrderedDict with the keys 'compound_id', 'disease_id',
    'metapath', 'path_count' and 'dwpc', in that order.
    """
    # Edges to mask: the direct indication (both directions), if one exists
    indication = graph.edge_dict.get((compound_id, disease_id, 'indication', 'both'))
    masked_edges = {indication, indication.inverse} if indication else set()

    source = graph.get_node(('compound', compound_id))
    target = graph.get_node(('disease', disease_id))

    found_paths = hetio.pathtools.paths_between(
        graph, source, target, metapath, exclude_edges=masked_edges)
    score = hetio.pathtools.DWPC(
        found_paths, damping_exponent=0.4, exclude_edges=masked_edges)

    return collections.OrderedDict([
        ('compound_id', compound_id),
        ('disease_id', disease_id),
        ('metapath', str(metapath)),
        ('path_count', len(found_paths)),
        ('dwpc', score),
    ])

## Read the compound–disease partition (part_df) and compute DWPCs

In [10]:
# Load the compound-disease partition and keep every indication plus any pair
# whose percentile is at most 0.02.
part_df = (
    pandas.read_table('data/partition.tsv.gz')
    .query('indication == 1 or percentile <= 0.02')
)
part_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile
9,DB00014,Goserelin,DOID:10283,Prostate cancer,1,0.705331
20,DB00014,Goserelin,DOID:11119,Gilles de la Tourette syndrome,0,0.007863
54,DB00014,Goserelin,DOID:14268,sclerosing cholangitis,0,0.011764
58,DB00014,Goserelin,DOID:1612,Breast cancer,1,0.522334
82,DB00014,Goserelin,DOID:2986,IgA glomerulonephritis,0,0.013587


In [11]:
# Number of compound-disease pairs remaining in part_df after the filter
len(part_df)

5613

In [12]:
# Presumably a mapping from each metanode to the nodes of that kind.
# NOTE(review): no live code visible in this notebook reads this variable; only the
# commented-out exploration cell that follows references it.
metanode_to_nodes = graph.get_metanode_to_nodes()

In [13]:
#compounds = metanode_to_nodes[metagraph.get_node('compound')]
#diseases = metanode_to_nodes[metagraph.get_node('disease')]
#MS = graph.get_node(('disease', 'DOID:2377'))

In [15]:
# Compute one DWPC row per (compound-disease pair, metapath) combination and stream
# the rows to a gzipped TSV.
header = ['compound_id', 'disease_id', 'metapath', 'path_count', 'dwpc']
# The context manager closes (and flushes) the gzip stream even if compute_dwpc
# raises part-way through, so an aborted run cannot leave a dangling file handle.
with gzip.open('data/dwpc.tsv.gz', 'wt') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=header)
    writer.writeheader()
    for pair_index, pair in part_df.iterrows():
        print('Computing DWPCs for {} and {}'.format(pair.compound_name, pair.disease_name), end='\r')
        # Read the ids once per pair. The result of compute_dwpc gets its own name
        # (dwpc_row) so it never shadows the part_df row being iterated over.
        compound_id = pair['compound_id']
        disease_id = pair['disease_id']
        for metapath in metapaths:
            dwpc_row = compute_dwpc(graph, compound_id, disease_id, metapath)
            writer.writerow(dwpc_row)



## Performance

In [16]:
# Reload the DWPC table from the gzipped TSV on disk and peek at its final rows.
dwpc_df = pandas.read_table('data/dwpc.tsv.gz')
dwpc_df.tail()

Unnamed: 0,compound_id,disease_id,metapath,path_count,dwpc
101029,DB09028,DOID:8986,CuGdD,0,0
101030,DB09028,DOID:8986,CtGvD,0,0
101031,DB09028,DOID:8986,CtGaD,0,0
101032,DB09028,DOID:8986,CtGuD,0,0
101033,DB09028,DOID:8986,CtGdD,0,0


In [28]:
def compute_metrics(df):
    """
    Summarize how well the DWPC score separates indications from non-indications.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an `indication` column (used as the true labels) and a
        `dwpc` column (used as the scores).

    Returns
    -------
    pandas.Series
        float64 series indexed by 'nonzero' (fraction of rows with dwpc > 0),
        'auroc' and 'auprc', in that order.
    """
    y_true = df.indication
    y_score = df.dwpc
    # Build the series in one shot with an explicit index and dtype. Growing a
    # dtype-less `pandas.Series()` key by key is deprecated and, on recent pandas,
    # starts from object dtype instead of float.
    return pandas.Series(
        [
            (y_score > 0).mean(),
            sklearn.metrics.roc_auc_score(y_true, y_score),
            sklearn.metrics.average_precision_score(y_true, y_score),
        ],
        index=['nonzero', 'auroc', 'auprc'],
        dtype=float,
    )

# Attach the part_df columns (including the indication label) to each DWPC row.
# No `on=` is given, so pandas joins on the columns the two frames have in common.
merged_df = part_df.merge(dwpc_df)
# One row of metrics (nonzero, auroc, auprc) per metapath.
auc_df = merged_df.groupby('metapath').apply(compute_metrics).reset_index()

In [29]:
# Rank metapaths from highest to lowest AUROC.
# `DataFrame.sort` was deprecated in pandas 0.17 and removed in 0.20;
# `sort_values` is its replacement and takes the same arguments here.
auc_df = auc_df.sort_values('auroc', ascending=False)
# Persist the ranking as a TSV with metrics rounded to three decimals.
auc_df.to_csv('data/auc.tsv', sep='\t', index=False, float_format='%.3f')
auc_df

Unnamed: 0,metapath,nonzero,auroc,auprc
9,CsCiD,0.148762,0.741553,0.732153
8,CiDsD,0.13148,0.712503,0.699054
10,CtGaD,0.149653,0.69285,0.616763
0,CbGaD,0.090682,0.630475,0.597622
14,CuGaD,0.112061,0.604035,0.489858
4,CdGaD,0.100481,0.591247,0.46522
7,CdGvD,0.024229,0.532679,0.515344
17,CuGvD,0.022448,0.529108,0.50347
16,CuGuD,0.013006,0.522472,0.585561
6,CdGuD,0.013184,0.521893,0.573895


In [30]:
# Display the metagraph's kind -> abbreviation table for reference
# (presumably the letters that make up metapath names such as 'CtGaD' -- confirm).
metagraph.kind_to_abbrev

{'anatomy': 'A',
 'association': 'a',
 'binding': 'b',
 'biological process': 'BP',
 'causation': 'c',
 'cellular component': 'CC',
 'compound': 'C',
 'disease': 'D',
 'downregulation': 'd',
 'evolution': 'e',
 'expression': 'e',
 'function': 'f',
 'gene': 'G',
 'indication': 'i',
 'interaction': 'i',
 'knockdown downregulation': 'kd',
 'knockdown upregulation': 'ku',
 'localization': 'l',
 'molecular function': 'MF',
 'overexpression downregulation': 'od',
 'overexpression upregulation': 'up',
 'participation': 'p',
 'pathway': 'PW',
 'perturbation': 'PB',
 'presense': 'p',
 'regulation': 'r',
 'side effect': 'SE',
 'similarity': 's',
 'symptom': 'S',
 'target': 't',
 'upregulation': 'u',
 'variation': 'v'}