# Calculate the performance of every feature

In [1]:
import bz2
import os
import re

import pandas
import sklearn.metrics
import scipy.stats
import math

## Read features and partitions

In [2]:
# Load the tab-separated partition table and show its last two rows
part_df = pandas.read_csv('data/partitions.tsv', sep='\t')
part_df.tail(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,status
18873,hetio-ind_perm-5,DB09028,DOID:585,0
18874,hetio-ind_perm-1,DB09028,DOID:9074,0


In [3]:
# Load the bz2-compressed, tab-separated DWPC table
dwpc_df = pandas.read_csv('data/dwpc.tsv.bz2', sep='\t')

In [4]:
# Join the partition rows (left) with the DWPC rows on their shared columns
dwpc_df = pandas.merge(part_df, dwpc_df)
dwpc_df.tail(n=2)

Unnamed: 0,hetnet,compound_id,disease_id,status,metapath,PC,w,DWPC,seconds
22933123,hetio-ind_perm-1,DB09028,DOID:9074,0,CuGeAeGaD,0,0.4,0.0,0.001614
22933124,hetio-ind_perm-1,DB09028,DOID:9074,0,CdGeAeGaD,0,0.4,0.0,0.002203


## Compute performance

In [5]:
def compute_metrics(df):
    """
    Summarize how well the DWPC values in `df` rank the `status` labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `status` (passed to scikit-learn as the
        true labels), `DWPC` (passed as the scores), `PC` and `seconds`.

    Returns
    -------
    pandas.Series
        float64 series with, in order: `nonzero` (fraction of rows with
        DWPC > 0), `auroc`, `auprc`, `mean_PC` and `seconds_per_query`.

    Notes
    -----
    Any exception raised by the scikit-learn scorers propagates unchanged.
    NOTE(review): roc_auc_score is expected to reject a `status` column with
    a single class; confirm against the installed scikit-learn version.
    """
    y_true = df['status']
    y_score = df['DWPC']
    # Build the result in one step with an explicit dtype: an empty
    # pandas.Series() created without a dtype is deprecated and defaults to
    # object dtype in recent pandas releases.
    return pandas.Series(
        {
            'nonzero': (y_score > 0).mean(),
            'auroc': sklearn.metrics.roc_auc_score(y_true, y_score),
            'auprc': sklearn.metrics.average_precision_score(y_true, y_score),
            'mean_PC': df['PC'].mean(),
            'seconds_per_query': df['seconds'].mean(),
        },
        dtype=float,
    )

# One row of metrics per (hetnet, metapath) pair
auc_df = (
    dwpc_df
    .groupby(['hetnet', 'metapath'])
    .apply(compute_metrics)
    .reset_index()
)
# 1 when the hetnet name contains '_perm', otherwise 0
auc_df['permuted'] = auc_df['hetnet'].str.contains('_perm').astype(int)

In [6]:
# Preview the first two rows of the per-(hetnet, metapath) metrics
auc_df.head(2)

Unnamed: 0,hetnet,metapath,nonzero,auroc,auprc,mean_PC,seconds_per_query,permuted
0,hetio-ind,CbG<rG<rGaD,0.624636,0.785146,0.494498,203.20106,0.217884,0
1,hetio-ind,CbG<rG<rGdD,0.250596,0.663977,0.399786,52.062781,0.127356,0


In [7]:
def compare_permutation(df):
    """
    Compare the unpermuted AUROC in `df` against its permuted AUROCs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `permuted`, `nonzero`, `seconds_per_query`
        and `auroc`. The first row with `permuted == 0` is the reference;
        all rows with `permuted == 1` form the comparison sample.

    Returns
    -------
    pandas.Series
        float64 series with, in order: `nonzero`, `seconds_per_query` and
        `auroc` (all taken from the reference row), `auroc_permuted` (mean
        AUROC of the permuted rows), `delta_auroc` (reference minus permuted
        mean) and `pval_auroc`.

    Raises
    ------
    IndexError
        If `df` has no row with `permuted == 0`.
    """
    unperm = df.query("permuted == 0").iloc[0, :]
    perm_df = df.query("permuted == 1")
    auroc = unperm['auroc']
    auroc_permuted = perm_df['auroc'].mean()

    # One tailed p-value testing whether the unpermuted AUROC
    # is greater than the permuted AUROCs. ttest_1samp is two-sided, so the
    # p-value is halved when the permuted mean lies below the unpermuted
    # AUROC (negative statistic) and reflected otherwise.
    ttest = scipy.stats.ttest_1samp(perm_df['auroc'], auroc)
    if ttest.statistic < 0:
        pvalue = ttest.pvalue / 2
    else:
        pvalue = 1 - ttest.pvalue / 2

    # Build the result in one step with an explicit dtype: an empty
    # pandas.Series() created without a dtype is deprecated and defaults to
    # object dtype in recent pandas releases.
    return pandas.Series(
        {
            'nonzero': unperm['nonzero'],
            'seconds_per_query': unperm['seconds_per_query'],
            'auroc': auroc,
            'auroc_permuted': auroc_permuted,
            'delta_auroc': auroc - auroc_permuted,
            'pval_auroc': pvalue,
        },
        dtype=float,
    )

# One comparison row per metapath
compare_df = (
    auc_df
    .groupby('metapath')
    .apply(compare_permutation)
    .reset_index()
)
# Count the maximal runs of lowercase letters / '<' / '>' in each metapath;
# this equals the number of pieces produced by splitting on them, minus one.
compare_df['length'] = compare_df['metapath'].str.count(r'[a-z<>]+')
compare_df = compare_df.sort_values(by=['length', 'metapath'])

In [8]:
# Preview the first three rows of the sorted comparison table
compare_df.head(3)

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
27,CbGaD,0.233113,0.010829,0.751725,0.642282,0.109443,4e-06,2
113,CbGdD,0.02755,0.010916,0.523197,0.522938,0.000259,0.473782,2
237,CbGuD,0.02543,0.010817,0.530414,0.520551,0.009863,0.006674,2


In [9]:
# Number of rows in the comparison table
len(compare_df)

1215

In [10]:
# Five rows with the smallest pval_auroc
compare_df.sort_values('pval_auroc', ascending=True).head(5)

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
769,CrCrCtDrD,0.179868,0.012836,0.663283,0.474523,0.18876,7.718345e-07,4
773,CrCtD,0.113377,0.011495,0.754664,0.532202,0.222463,8.633755e-07,2
528,CiPCiCdGaD,0.175099,0.012468,0.686687,0.62253,0.064158,1.963615e-06,4
200,CbGr>Gr>GaD,0.706225,0.246924,0.776548,0.734824,0.041724,1.984732e-06,4
685,CrCbGaD,0.338013,0.012536,0.684673,0.60866,0.076014,3.002558e-06,3


In [22]:
# Write both result tables as tab-separated files with 5 significant digits
for table, destination in (
    (auc_df, 'data/auc.tsv'),
    (compare_df, 'data/auroc.tsv'),
):
    table.to_csv(destination, sep='\t', index=False, float_format='%.5g')

In [12]:
# Among metapaths whose name contains 'CiPC' or 'PCiC',
# show the five rows with the smallest pval_auroc
compare_df[compare_df.metapath.str.contains('CiPC|PCiC')].sort_values('pval_auroc', ascending=True).head(5)

Unnamed: 0,metapath,nonzero,seconds_per_query,auroc,auroc_permuted,delta_auroc,pval_auroc,length
528,CiPCiCdGaD,0.175099,0.012468,0.686687,0.62253,0.064158,2e-06,4
739,CrCiPCiCtD,0.114967,0.015172,0.735578,0.589265,0.146313,4e-06,4
535,CiPCiCtD,0.111788,0.01137,0.764842,0.542268,0.222574,5e-06,3
536,CiPCiCtDrD,0.12106,0.010579,0.712278,0.520669,0.191609,6e-06,4
534,CiPCiCrCtD,0.108609,0.011366,0.733578,0.608648,0.12493,7e-06,4


## Create matrix

In [19]:
# Load the compound and disease summaries, keeping the first three columns
# of each and giving the two `treats` columns distinct names
compound_df = (
    pandas.read_csv('../summary/compounds.tsv', sep='\t')
    .iloc[:, :3]
    .rename(columns={'treats': 'compound_treats'})
)
disease_df = (
    pandas.read_csv('../summary/diseases.tsv', sep='\t')
    .iloc[:, :3]
    .rename(columns={'treats': 'disease_treats'})
)

In [21]:
# Create spread dataframes
# compound-disease pairs as rows, metapaths as columns
# One bz2-compressed TSV is written per (hetnet, value) pair under
# data/matrix/<hetnet>/, and the same dataframe is kept in `pivoted`.

## TODO: saves PC columns as floats rather than ints
## (pivot_table aggregates with the mean by default, which yields floats)
pivoted = dict()
for hetnet in dwpc_df.hetnet.unique():
    # Select this hetnet's rows once instead of once per value column
    hetnet_df = dwpc_df.query("hetnet == @hetnet")
    directory = 'data/matrix/{}'.format(hetnet)
    # Creates missing parent directories too and tolerates an existing one
    os.makedirs(directory, exist_ok=True)
    for value in 'PC', 'DWPC', 'seconds':
        print(hetnet, value)
        df = pandas.pivot_table(
            hetnet_df,
            values=value,
            index=['compound_id', 'disease_id', 'status'],
            columns='metapath',
        )
        df = df.reset_index()
        # Attach the disease columns, then the compound columns, joining on
        # whatever columns the frames share
        df = compound_df.merge(disease_df.merge(df))
        filename = '{}-spread.tsv.bz2'.format(value)
        path = os.path.join(directory, filename)
        with bz2.open(path, 'wt') as wf:
            df.to_csv(wf, index=False, sep='\t')
        pivoted[(hetnet, value)] = df

hetio-ind_perm-5 PC
hetio-ind_perm-5 DWPC
hetio-ind_perm-5 seconds
hetio-ind PC
hetio-ind DWPC
hetio-ind seconds
hetio-ind_perm-3 PC
hetio-ind_perm-3 DWPC
hetio-ind_perm-3 seconds
hetio-ind_perm-4 PC
hetio-ind_perm-4 DWPC
hetio-ind_perm-4 seconds
hetio-ind_perm-1 PC
hetio-ind_perm-1 DWPC
hetio-ind_perm-1 seconds
