# Calculate the performance of every feature

In [1]:
import bz2
import os
import re

import pandas
import sklearn.metrics
import scipy.stats
import math

## Read features and partitions

In [2]:
# Load the tab-separated partitions table, then preview its last two rows.
part_df = pandas.read_csv('data/all-features/partitions.tsv', sep='\t')
part_df.tail(n=2)

Unnamed: 0,drugbank_id,doid_id,status,hetnet
22648,DB08906,DOID:9074,0,hetio-ind_perm-2
22649,DB08906,DOID:986,0,hetio-ind_perm-0


In [3]:
# Read the bz2-compressed, tab-separated DWPC table.
dwpc_df = pandas.read_table('data/all-features/dwpc.tsv.bz2')

# Fix `hetnet` omission in header: the hetnet values end up in the index
# (presumably because the header row is one field short), so name the index
# and move it back into a regular column. Skipped when a `hetnet` column is
# already present, so the cell keeps working if the file header gets fixed.
if 'hetnet' not in dwpc_df.columns:
    dwpc_df.index.name = 'hetnet'
    dwpc_df = dwpc_df.reset_index()

# Fix column name issue: use the same identifier column names as `part_df`
dwpc_df = dwpc_df.rename(columns={'compound_id': 'drugbank_id', 'disease_id': 'doid_id'})

In [4]:
# Join the partition rows to their DWPC rows. No keys are given, so pandas
# performs an inner join on every column name the two frames share.
dwpc_df = pandas.merge(part_df, dwpc_df)
dwpc_df.tail(n=2)

Unnamed: 0,drugbank_id,doid_id,status,hetnet,metapath,PC,w,DWPC,seconds
13785654,DB08906,DOID:986,0,hetio-ind_perm-0,CpDaGbCtD,0,0.4,0.0,0.008561
13785655,DB08906,DOID:986,0,hetio-ind_perm-0,CpDaGcGaD,0,0.4,0.0,0.02122


## Compute performance

In [5]:
def compute_metrics(df):
    """
    Summarize one group of compound-disease rows for a single feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `status` (used as the true label), `DWPC`
        (used as the prediction score), `PC` and `seconds`.

    Returns
    -------
    pandas.Series
        float64 series with the entries, in order: `nonzero` (fraction of
        rows with DWPC > 0), `auroc`, `auprc`, `mean_PC` and
        `seconds_per_query` (mean of `seconds`).

    Notes
    -----
    The scikit-learn scorers raise if they cannot compute a score, e.g.
    `roc_auc_score` when `status` contains a single class.
    """
    y_true = df['status']
    y_score = df['DWPC']
    metrics = [
        ('nonzero', (y_score > 0).mean()),
        ('auroc', sklearn.metrics.roc_auc_score(y_true, y_score)),
        ('auprc', sklearn.metrics.average_precision_score(y_true, y_score)),
        ('mean_PC', df['PC'].mean()),
        ('seconds_per_query', df['seconds'].mean()),
    ]
    names = [name for name, _ in metrics]
    values = [value for _, value in metrics]
    # Build the series in one step with an explicit float dtype. Growing an
    # empty `pandas.Series()` is deprecated and yields object dtype on recent
    # pandas, which turns the grouped result's columns into object columns.
    return pandas.Series(values, index=names, dtype=float)

# One row of metrics per (hetnet, metapath) combination
auc_df = (
    dwpc_df
    .groupby(['hetnet', 'metapath'])
    .apply(compute_metrics)
    .reset_index()
)
# Flag permuted networks: 1 when the hetnet name contains '_perm', else 0
auc_df['permuted'] = auc_df['hetnet'].str.contains('_perm').astype(int)

In [6]:
# Preview the first two rows of the per-hetnet metrics
auc_df.iloc[:2]

Unnamed: 0,hetnet,metapath,nonzero,auroc,auprc,mean_PC,seconds_per_query,permuted
0,hetio-ind,CbG<rG<rGaD,0.868609,0.65169,0.353662,566.712053,0.05702,0
1,hetio-ind,CbG<rG<rGdD,0.416159,0.582549,0.261258,862.403709,0.076651,0


In [7]:
def compare_permutation(df):
    """
    Compare a feature's unpermuted performance against its permuted runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one feature with the columns `permuted` (0 or 1), `nonzero`,
        `seconds_per_query` and `auroc`. The first row with `permuted == 0`
        is taken as the unpermuted result; an IndexError is raised when no
        such row exists.

    Returns
    -------
    pandas.Series
        float64 series with the entries, in order: `nonzero`,
        `seconds_per_query`, `auroc`, `auroc_permuted` (mean AUROC over the
        permuted rows), `delta_auroc` (unpermuted minus permuted mean) and
        `pval_auroc`.
    """
    unperm = df.query("permuted == 0").iloc[0, :]
    perm_auroc = df.query("permuted == 1")['auroc']

    auroc = unperm['auroc']
    auroc_permuted = perm_auroc.mean()

    # One tailed p-value testing whether the unpermuted AUROC
    # is greater than the permuted AUROCs. The two-sided p-value is halved
    # when the permuted mean falls below the unpermuted AUROC (negative
    # statistic); otherwise the complementary tail is reported.
    ttest = scipy.stats.ttest_1samp(perm_auroc, auroc)
    if ttest.statistic < 0:
        pvalue = ttest.pvalue / 2
    else:
        pvalue = 1 - ttest.pvalue / 2

    names = [
        'nonzero',
        'seconds_per_query',
        'auroc',
        'auroc_permuted',
        'delta_auroc',
        'pval_auroc',
    ]
    values = [
        unperm['nonzero'],
        unperm['seconds_per_query'],
        auroc,
        auroc_permuted,
        auroc - auroc_permuted,
        pvalue,
    ]
    # Build the series in one step with an explicit float dtype. Growing an
    # empty `pandas.Series()` is deprecated and yields object dtype on recent
    # pandas, which turns the grouped result's columns into object columns.
    return pandas.Series(values, index=names, dtype=float)

# One comparison row per metapath, pooling its unpermuted and permuted hetnets
compare_df = (
    auc_df
    .groupby('metapath')
    .apply(compare_permutation)
    .reset_index()
)
# Splitting the abbreviation on runs of lowercase letters and '<'/'>' leaves
# the remaining pieces; the length is the number of pieces minus one.
compare_df['length'] = [
    len(re.split(r'[a-z<>]+', metapath)) - 1
    for metapath in compare_df['metapath']
]
compare_df = compare_df.sort_values(by=['length', 'metapath'])

In [8]:
# Preview the first three rows (shortest metapaths first after sorting)
compare_df.iloc[:3]

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
27,CbGaD,0.311523,0.014537,0.715374,0.580433,0.134941,3e-06,2
113,CbGdD,0.149139,0.013583,0.512119,0.515437,-0.003318,0.921172,2
237,CbGuD,0.13457,0.013051,0.517345,0.514276,0.003068,0.135583,2


In [9]:
# Number of rows in the comparison table (one per metapath)
compare_df.shape[0]

609

In [10]:
# Five rows with the smallest AUROC p-values
compare_df.sort_values(by='pval_auroc').head(n=5)

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
54,CbGbCrCtD,0.743576,0.021744,0.881191,0.702353,0.178838,1.885407e-08,4
559,CrCrCtD,0.155232,0.011457,0.754342,0.588369,0.165973,2.697382e-08,3
576,CtDrDrD,0.29457,0.011415,0.684383,0.508899,0.175484,2.909682e-08,3
561,CrCtDrD,0.136424,0.011195,0.64796,0.502565,0.145395,9.377069e-08,3
571,CtDlAlD,0.753642,0.012912,0.592044,0.47725,0.114794,4.605347e-07,3


In [11]:
# Write both summary tables as tab-separated text without the index,
# formatting floats to three significant digits.
auc_df.to_csv(
    'data/all-features/auc.tsv',
    index=False,
    sep='\t',
    float_format='%.3g',
)
compare_df.to_csv(
    'data/all-features/auroc.tsv',
    index=False,
    sep='\t',
    float_format='%.3g',
)

In [12]:
# Five smallest-p-value rows among metapaths containing 'CpD' or 'DpC'
(
    compare_df
    .loc[compare_df['metapath'].str.contains('CpD|DpC')]
    .sort_values(by='pval_auroc')
    .head(n=5)
)

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
541,CpDpCtD,0.036291,0.010954,0.567074,0.521675,0.045399,1.8e-05,3
543,CpDrD,0.01404,0.011533,0.511921,0.501189,0.010732,5.8e-05,2
572,CtDpCpD,0.067285,0.011678,0.577253,0.531694,0.045559,0.000117,3
52,CbGbCpDrD,0.619338,0.015703,0.535126,0.49583,0.039296,0.000176,4
229,CbGuCpDrD,0.065695,0.011037,0.518041,0.499678,0.018363,0.000604,4


## Create matrix

In [13]:
# Create spread dataframes
# compound-disease pairs as rows, metapaths as columns
#
# For every hetnet, each of the PC, DWPC and seconds columns is pivoted into
# a wide table, written to data/all-features/<hetnet>/<value>-spread.tsv.bz2
# and kept in `pivoted` under the key (hetnet, value).

## TODO: saves PC columns as floats rather than ints
## (pivot_table aggregates with its default aggfunc, the mean, which yields floats)
pivoted = {}
for hetnet in dwpc_df.hetnet.unique():
    # Subset and create the output directory once per hetnet rather than
    # once per value column.
    hetnet_df = dwpc_df[dwpc_df.hetnet == hetnet]
    directory = 'data/all-features/{}'.format(hetnet)
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir
    os.makedirs(directory, exist_ok=True)
    for value in 'PC', 'DWPC', 'seconds':
        print(hetnet, value)
        df = pandas.pivot_table(
            hetnet_df,
            values=value,
            index=['drugbank_id', 'doid_id', 'status'],
            columns='metapath',
        )
        # Turn the (drugbank_id, doid_id, status) index back into columns
        df = df.reset_index()
        filename = '{}-spread.tsv.bz2'.format(value)
        path = os.path.join(directory, filename)
        with bz2.open(path, 'wt') as wf:
            df.to_csv(wf, index=False, sep='\t')
        pivoted[(hetnet, value)] = df

hetio-ind_perm-1 PC
hetio-ind_perm-1 DWPC
hetio-ind_perm-1 seconds
hetio-ind PC
hetio-ind DWPC
hetio-ind seconds
hetio-ind_perm-2 PC
hetio-ind_perm-2 DWPC
hetio-ind_perm-2 seconds
hetio-ind_perm-0 PC
hetio-ind_perm-0 DWPC
hetio-ind_perm-0 seconds
hetio-ind_perm-3 PC
hetio-ind_perm-3 DWPC
hetio-ind_perm-3 seconds
hetio-ind_perm-4 PC
hetio-ind_perm-4 DWPC
hetio-ind_perm-4 seconds
