# Assess the marginal performance of every feature

In [1]:
import math

import pandas
import sklearn.metrics
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Load the feature-type annotations and the feature matrix (tab-separated files).
feature_type_df = pandas.read_csv('data/matrix/feature-type.tsv', sep='\t')
feature_df = pandas.read_csv('data/matrix/features.tsv.bz2', sep='\t')
# Feature names, in the order they appear in the annotation table.
features = feature_type_df['feature'].tolist()

In [3]:
# Preview the first two rows of the feature matrix.
feature_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,disease_name,compound_name,prior_prob,CbG,CcSE,CdG,...,CuGuDpCpD,CuGuDpCtD,CuGuDpSpD,CuGuDrD,CuGuDrDrD,CuGuDtCpD,CuGuDtCtD,CuGuDuGaD,CuGuDuGdD,CuGuDuGuD
0,rephetio-v2.0,DB00014,DOID:0050741,0,alcohol dependence,Goserelin,3.8e-05,2,249,0,...,0.0,0.0,0.000413,0.0,0.0,0.0,0.0,0.0,0.000212,0.00116
1,rephetio-v2.0_perm-5,DB00014,DOID:0050741,0,alcohol dependence,Goserelin,3.8e-05,2,249,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001268,0.003163


In [4]:
def compute_metrics(y_true, y_score):
    """
    Summarize how well a single score column separates the labels.

    Parameters
    ----------
    y_true : array-like
        Labels, passed unchanged to the scikit-learn metric functions.
    y_score : pandas.Series or numpy.ndarray
        Scores for the same observations. Must support elementwise ``> 0``
        yielding an object with a ``.mean()`` method.

    Returns
    -------
    pandas.Series
        float64 Series indexed by ``nonzero`` (fraction of scores greater
        than zero), ``auroc`` (``sklearn.metrics.roc_auc_score``) and
        ``auprc`` (``sklearn.metrics.average_precision_score``).
    """
    # Build the result in a single call with an explicit index and dtype.
    # A bare `pandas.Series()` relies on a deprecated default dtype, and
    # growing it key-by-key reallocates on every assignment.
    return pandas.Series(
        [
            (y_score > 0).mean(),
            sklearn.metrics.roc_auc_score(y_true, y_score),
            sklearn.metrics.average_precision_score(y_true, y_score),
        ],
        index=['nonzero', 'auroc', 'auprc'],
        dtype=float,
    )

def columnar_performance(df):
    """
    Score every feature column of `df` against its ``status`` column.

    Each column named in the module-level `features` list is passed to
    `compute_metrics` together with ``df['status']``. The result has one row
    per feature: a ``feature`` column followed by one column per metric.
    """
    labels = df['status']

    def score_column(column):
        return compute_metrics(labels, column)

    # `apply` yields metrics as rows and features as columns; transpose so
    # that each feature becomes a row.
    by_feature = df[features].apply(score_column, axis='index').T
    by_feature.index.name = 'feature'
    return by_feature.reset_index()

# Score every feature separately within each hetnet, then attach each
# feature's type annotation.
perf_df = (
    feature_df
    .groupby('hetnet')
    .apply(columnar_performance)
    .reset_index(level='hetnet')
    .merge(feature_type_df)
)
# 1 for rows whose hetnet name contains '_perm', 0 otherwise.
perf_df['permuted'] = perf_df['hetnet'].str.contains('_perm').astype(int)

In [5]:
# Preview the per-hetnet performance table. `perf_df` already carries the
# feature-type annotation from the merge above, so merging again is redundant.
perf_df.head(3)

Unnamed: 0,hetnet,feature,nonzero,auroc,auprc,feature_type,permuted
0,rephetio-v2.0,prior_prob,1.0,0.847956,0.595182,prior,0
1,rephetio-v2.0_perm-1,prior_prob,1.0,0.830787,0.602504,prior,1
2,rephetio-v2.0_perm-2,prior_prob,1.0,0.838287,0.629588,prior,1


In [6]:
def compare_permutation(df):
    """
    Compare one feature's unpermuted AUROC with its permuted AUROCs.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with columns ``permuted`` (0 or 1), ``nonzero`` and ``auroc``.
        The first row with ``permuted == 0`` is the unpermuted reference.

    Returns
    -------
    pandas.Series
        float64 Series with ``nonzero`` and ``auroc`` of the unpermuted row,
        ``auroc_permuted`` (mean AUROC over the permuted rows),
        ``delta_auroc`` (unpermuted minus permuted mean) and ``pval_auroc``
        (p-value of a one-sample t-test of the permuted AUROCs against the
        unpermuted AUROC).

    Raises
    ------
    IndexError
        If `df` has no row with ``permuted == 0``.
    """
    unperm = df.query("permuted == 0").iloc[0, :]
    perm_auroc = df.query("permuted == 1")['auroc']

    auroc = unperm['auroc']
    auroc_permuted = perm_auroc.mean()
    pvalue = scipy.stats.ttest_1samp(perm_auroc, auroc).pvalue

    # Build the result in a single call with an explicit index and dtype.
    # A bare `pandas.Series()` relies on a deprecated default dtype.
    return pandas.Series(
        [unperm['nonzero'], auroc, auroc_permuted, auroc - auroc_permuted, pvalue],
        index=['nonzero', 'auroc', 'auroc_permuted', 'delta_auroc', 'pval_auroc'],
        dtype=float,
    )

# One comparison row per (feature_type, feature) pair.
grouped_by_feature = perf_df.groupby(['feature_type', 'feature'])
compare_df = grouped_by_feature.apply(compare_permutation).reset_index()
# Adjust the AUROC p-values with the 'fdr_bh' method; only the adjusted
# p-values are stored as a column, the other return values stay as globals.
reject, fdr_pvals, alphacSidak, alphacBonf = multipletests(
    pvals=compare_df.pval_auroc, method='fdr_bh')
compare_df['fdr_pval_auroc'] = fdr_pvals
compare_df = compare_df.sort_values(['feature_type', 'feature'])

In [7]:
# Preview the first three rows of the permutation comparison table.
compare_df.head(3)

Unnamed: 0,feature_type,feature,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
0,degree,CbG,0.988079,0.547779,0.549037,-0.001257,0.54022,0.64624
1,degree,CcSE,0.954702,0.56452,0.568342,-0.003822,0.044426,0.091959
2,degree,CdG,0.671788,0.585467,0.586055,-0.000588,0.799381,0.844642


In [8]:
# Save datasets
# Both tables share the same writer options: tab-separated, no index column,
# floats formatted to five significant digits.
tsv_kwargs = {'sep': '\t', 'index': False, 'float_format': '%.5g'}
perf_df.to_csv('data/auc.tsv', **tsv_kwargs)
compare_df.to_csv('data/auroc.tsv', **tsv_kwargs)