## hetnet machine learning

In [20]:
import gzip

import pandas
import sklearn.metrics

# Performance of individual metapaths (DWPC as a predictor of indication)

In [2]:
# Read the DWPC results: a tab-separated table with one row per
# compound / disease / metapath combination. Compression is inferred by
# pandas from the .gz extension.
dwpc_df = pandas.read_csv('data/dwpc.tsv.gz', sep='\t')
# Show the last five rows as a sanity check
dwpc_df.tail(n=5)

Unnamed: 0,compound_id,disease_id,metapath,PC,w,DWPC,seconds
15224442,DB05389,DOID:9970,CbG<kdGeAlD,0,0.4,0.0,0.05497
15224443,DB05389,DOID:9970,CuGiGaDsD,0,0.4,0.0,0.05102
15224444,DB05389,DOID:9970,CbG<ouGku>GaD,0,0.4,0.0,0.05334
15224445,DB05389,DOID:9970,CuGou>GdAlD,0,0.4,0.0,0.05134
15224446,DB05389,DOID:9970,CiDaGeAlD,8792,0.4,0.031667,0.4702


In [None]:
# Read the compound-disease partition table (tab-separated, gzip-compressed).
# Later cells rely on its `indication` column as the true label.
part_df = pandas.read_csv('data/partition.tsv.gz', sep='\t')

In [7]:
def compute_metrics(df):
    """
    Summarize how well the DWPC score separates the `indication` labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose an `indication` column (used as the true labels) and a
        `DWPC` column (used as the prediction scores).

    Returns
    -------
    pandas.Series
        float64 Series indexed by, in order:
        `nonzero` (fraction of rows whose DWPC is greater than zero),
        `auroc` (`sklearn.metrics.roc_auc_score`) and
        `auprc` (`sklearn.metrics.average_precision_score`).

    Notes
    -----
    Exceptions raised by scikit-learn are not caught here and propagate to
    the caller.
    """
    y_true = df.indication
    y_score = df.DWPC
    nonzero = (y_score > 0).mean()
    auroc = sklearn.metrics.roc_auc_score(y_true, y_score)
    auprc = sklearn.metrics.average_precision_score(y_true, y_score)
    # Construct the Series in a single step with an explicit index and dtype.
    # A bare `pandas.Series()` warns on pandas 1.x and defaults to object
    # dtype on pandas 2.x; fixing the dtype keeps the metrics numeric for
    # downstream sorting and float formatting.
    return pandas.Series(
        [nonzero, auroc, auprc],
        index=['nonzero', 'auroc', 'auprc'],
        dtype=float,
    )

# Join the partition table with the DWPC rows on the columns they share,
# then score every metapath separately with compute_metrics.
merged_df = pandas.merge(part_df, dwpc_df)
metapath_groups = merged_df.groupby('metapath')
auc_df = metapath_groups.apply(compute_metrics).reset_index()

In [9]:
# Rank metapaths from highest to lowest AUROC, write the ranking to disk
# (three decimal places, tab-separated, no index column), then display it.
auc_df = auc_df.sort_values(by='auroc', ascending=False)
auc_df.to_csv(
    'data/auc.tsv',
    sep='\t',
    float_format='%.3f',
    index=False,
)
auc_df

Unnamed: 0,metapath,nonzero,auroc,auprc
1202,CiDiCsCiD,0.885740,0.957915,0.755454
1226,CiDsDiCiD,0.825426,0.923989,0.554505
1200,CiDiCiD,0.722345,0.892719,0.505435
1201,CiDiCiDsD,0.854933,0.878335,0.473675
1227,CiDsDlAlD,0.996360,0.868019,0.374375
1228,CiDsDpSpD,0.999610,0.863671,0.342040
1220,CiDsDaGaD,0.996360,0.857424,0.393347
1230,CiDsDsDsD,0.664240,0.854326,0.424807
1213,CiDlAlDsD,0.998310,0.844872,0.413463
1218,CiDpSpDsD,0.999610,0.823664,0.348059


In [11]:
# Keep only the metapaths whose abbreviation contains the substring 'BP'
has_bp = auc_df['metapath'].str.contains('BP')
auc_df[has_bp]

Unnamed: 0,metapath,nonzero,auroc,auprc
496,CbGpBPpGaD,1.0,0.580165,0.121965
1926,CuGpBPpGaD,0.693618,0.552438,0.121143
1057,CdGpBPpGaD,0.738854,0.551037,0.13566
497,CbGpBPpGdD,0.061224,0.529834,0.134823
498,CbGpBPpGuD,0.061224,0.529667,0.136112
1927,CuGpBPpGdD,0.042506,0.527317,0.167083
1928,CuGpBPpGuD,0.042506,0.527246,0.165075
1058,CdGpBPpGdD,0.045236,0.523534,0.156162
1059,CdGpBPpGuD,0.045236,0.523451,0.153893


In [36]:
# Build one wide ("spread") table per measurement:
# each row is a compound-disease pair and each column is a metapath.
pivoted = {}
for value in ('PC', 'DWPC', 'seconds'):
    spread_df = dwpc_df.pivot_table(
        values=value,
        index=['compound_id', 'disease_id'],
        columns='metapath',
    )
    # Turn the pair identifiers back into ordinary columns before joining
    # the partition information onto each pair.
    spread_df = part_df.merge(spread_df.reset_index())
    path = 'data/{}-spread.tsv.gz'.format(value)
    # Write the table as gzip-compressed, tab-separated text
    with gzip.open(path, 'wt') as write_file:
        spread_df.to_csv(write_file, sep='\t', index=False)
    pivoted[value] = spread_df

In [34]:
# Pull the spread DWPC table out of the dictionary and preview its first rows
dwpc_spread_df = pivoted['DWPC']
dwpc_spread_df.head(n=5)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,indication,percentile,CbG<kdG<kdGaD,CbG<kdG<kdGdD,CbG<kdG<kdGuD,CbG<kdG<kuGaD,...,CuGuDdGdD,CuGuDdGuD,CuGuDiCiD,CuGuDlAlD,CuGuDpSpD,CuGuDsD,CuGuDsDsD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,DB00091,Cyclosporine,DOID:0050425,restless legs syndrome,0,0.767828,0.000589,0,0,0.000424,...,0,0,0.0,0.000204,0.000996,0,0.0,0.0,0,0
1,DB00091,Cyclosporine,DOID:10283,Prostate cancer,0,0.006045,0.006579,0,0,0.006011,...,0,0,0.006902,0.002043,0.003115,0,0.001705,0.005054,0,0
2,DB00091,Cyclosporine,DOID:10652,Alzheimer's disease,0,0.406124,0.002196,0,0,0.002734,...,0,0,0.0,9.7e-05,0.000366,0,0.0,0.002312,0,0
3,DB00091,Cyclosporine,DOID:10763,hypertension,0,0.408666,0.004117,0,0,0.004145,...,0,0,0.000369,0.000664,0.000417,0,0.000948,0.003934,0,0
4,DB00091,Cyclosporine,DOID:1094,attention deficit hyperactivity disorder,0,0.478266,0.000185,0,0,0.000239,...,0,0,0.0,0.000177,0.00025,0,0.0,0.000612,0,0


In [37]:
# Number of rows in dwpc_df
dwpc_df.shape[0]

15224447