# Calculate the performance of every feature

In [1]:
import bz2
import os
import re
import math

import pandas
import sklearn.metrics
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests

## Read features and partitions

In [2]:
# Load the tab-separated partition table and preview its last two rows
part_df = pandas.read_csv('data/partitions.tsv', sep='\t')
part_df.iloc[-2:]

Unnamed: 0,hetnet,compound_id,disease_id,status
18873,hetio-ind_perm-5,DB09028,DOID:585,0
18874,hetio-ind_perm-1,DB09028,DOID:9074,0


In [3]:
# Load the DWPC results and relabel the metapath/DWPC columns to the
# generic feature/value names used for every feature type below
dwpc_df = pandas.read_csv('data/dwpc.tsv.bz2', sep='\t').rename(
    columns={'metapath': 'feature', 'DWPC': 'value'})
dwpc_df['feature_type'] = 'DWPC'
dwpc_df.iloc[-2:]

Unnamed: 0,hetnet,compound_id,disease_id,feature,PC,w,value,seconds,feature_type
22933123,hetio-ind_perm-5,DB08912,DOID:9352,CdGeAeGaD,1599374,0.4,0.163162,61.52,DWPC
22933124,hetio-ind_perm-4,DB09020,DOID:1324,CdGeAeGaD,1451285,0.4,0.154356,44.43,DWPC


In [4]:
#     if df.feature_type.iloc[0] == 'DWPC':
#         series['mean_PC'] = df['PC'].mean()
#         series['seconds_per_query'] = df['seconds'].mean()
#compare_df['length'] = compare_df.feature.map(lambda x: len(re.split(r'[a-z<>]+', x)) - 1)

In [5]:
# Load the compound and disease summary tables. Only the first three
# columns of each are kept, and the shared `treats` column is renamed so
# the two tables can be merged without a name clash.
# NOTE(review): presumably the three columns are id, name, treats -- confirm
compound_df = (
    pandas.read_csv('../summary/compounds.tsv', sep='\t')
    .iloc[:, :3]
    .rename(columns={'treats': 'compound_treats'})
)
disease_df = (
    pandas.read_csv('../summary/diseases.tsv', sep='\t')
    .iloc[:, :3]
    .rename(columns={'treats': 'disease_treats'})
)

In [6]:
# Attach both treatment degrees to every partitioned pair, then reshape to
# long format: one row per (pair, degree feature) with the count in `value`
pair_id_columns = ['hetnet', 'compound_id', 'disease_id']
degree_columns = ['compound_treats', 'disease_treats']
degree_df = pandas.melt(
    part_df.merge(compound_df).merge(disease_df),
    id_vars=pair_id_columns,
    value_vars=degree_columns,
    var_name='feature',
)
degree_df['feature_type'] = 'degree'
degree_df.iloc[-2:]

Unnamed: 0,hetnet,compound_id,disease_id,feature,value,feature_type
37748,hetio-ind_perm-3,DB09015,DOID:11555,disease_treats,0,degree
37749,hetio-ind_perm-1,DB09018,DOID:11555,disease_treats,0,degree


In [7]:
# Stack the degree and DWPC features into one long table, then join it to
# the partitions (merging on their shared columns) and preview two rows
feature_df = pandas.merge(part_df, pandas.concat([degree_df, dwpc_df]))
feature_df.iloc[:2]

Unnamed: 0,hetnet,compound_id,disease_id,status,PC,feature,feature_type,seconds,value,w
0,hetio-ind_perm-5,DB00014,DOID:0060073,0,,compound_treats,degree,,2.0,
1,hetio-ind_perm-5,DB00014,DOID:0060073,0,,disease_treats,degree,,9.0,


## Compute performance

In [8]:
def compute_metrics(df):
    """Score a single feature as a predictor of `status`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one group; must contain a `status` column (the true
        labels) and a `value` column (the feature used as the score).

    Returns
    -------
    pandas.Series
        Float series indexed by `nonzero` (fraction of rows whose value is
        greater than zero), `auroc` and `auprc`.

    Notes
    -----
    Errors raised by the scikit-learn metric functions propagate unchanged.
    NOTE(review): roc_auc_score presumably fails when `status` holds a
    single class -- confirm that every group contains both classes.
    """
    y_true = df['status']
    y_score = df['value']
    metrics = {
        'nonzero': (y_score > 0).mean(),
        'auroc': sklearn.metrics.roc_auc_score(y_true, y_score),
        'auprc': sklearn.metrics.average_precision_score(y_true, y_score),
    }
    # Build the series in one step with an explicit dtype: filling an empty
    # `pandas.Series()` relies on its default dtype, which is deprecated and
    # differs between pandas versions.
    return pandas.Series(
        metrics, index=['nonzero', 'auroc', 'auprc'], dtype=float)

# Compute the metrics once per (hetnet, feature type, feature) group
by_hetnet_and_feature = feature_df.groupby(['hetnet', 'feature_type', 'feature'])
auc_df = by_hetnet_and_feature.apply(compute_metrics).reset_index()
# Flag permuted hetnets (their name contains '_perm') with 1, others with 0
auc_df['permuted'] = auc_df['hetnet'].str.contains('_perm').astype(int)

In [9]:
# Preview the first two rows of the per-hetnet metrics
auc_df.iloc[:2]

Unnamed: 0,hetnet,feature_type,feature,nonzero,auroc,auprc,permuted
0,hetio-ind,DWPC,CbG<rG<rGaD,0.624636,0.785146,0.494498,0
1,hetio-ind,DWPC,CbG<rG<rGdD,0.250596,0.663977,0.399786,0


In [10]:
def compare_permutation(df):
    """Compare a feature's unpermuted AUROC against its permuted AUROCs.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one feature; must contain `permuted` (0 for the
        unpermuted hetnet, 1 for permuted ones), `nonzero` and `auroc`.
        Only the first unpermuted row is used.

    Returns
    -------
    pandas.Series
        Float series indexed by `nonzero` and `auroc` (both taken from the
        unpermuted row), `auroc_permuted` (mean AUROC over permuted rows),
        `delta_auroc` (unpermuted minus permuted mean) and `pval_auroc`
        (p-value of a one-sample t-test of the permuted AUROCs against the
        unpermuted AUROC).

    Raises
    ------
    IndexError
        If `df` has no row with `permuted == 0`.
    """
    unperm = df[df['permuted'] == 0].iloc[0]
    perm_auroc = df.loc[df['permuted'] == 1, 'auroc']
    auroc = unperm['auroc']
    auroc_permuted = perm_auroc.mean()
    # The unpermuted AUROC plays the role of the hypothesised mean
    ttest = scipy.stats.ttest_1samp(perm_auroc, auroc)
    summary = {
        'nonzero': unperm['nonzero'],
        'auroc': auroc,
        'auroc_permuted': auroc_permuted,
        'delta_auroc': auroc - auroc_permuted,
        'pval_auroc': ttest.pvalue,
    }
    # Explicit index and dtype keep the column order and float type stable
    # across pandas versions (an empty `pandas.Series()` has no fixed dtype).
    columns = ['nonzero', 'auroc', 'auroc_permuted', 'delta_auroc', 'pval_auroc']
    return pandas.Series(summary, index=columns, dtype=float)

# Summarise each feature across its unpermuted and permuted hetnets
by_feature = auc_df.groupby(['feature_type', 'feature'])
compare_df = by_feature.apply(compare_permutation).reset_index()
# Benjamini-Hochberg correction of the AUROC p-values; only the adjusted
# p-values (second returned element) are stored on the table
reject, fdr_pvals, alphacSidak, alphacBonf = multipletests(
    pvals=compare_df['pval_auroc'], method='fdr_bh')
compare_df['fdr_pval_auroc'] = fdr_pvals
compare_df = compare_df.sort_values(by='feature')

In [11]:
# Preview the last three rows of the comparison table
compare_df.iloc[-3:]

Unnamed: 0,feature_type,feature,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
1214,DWPC,CuGuDuGuD,0.157881,0.638287,0.642103,-0.003816,0.131198,0.215186
1215,degree,compound_treats,0.399735,0.926446,0.925125,0.001321,0.377265,0.482081
1216,degree,disease_treats,0.646623,0.858216,0.864443,-0.006227,0.018287,0.052848


In [12]:
# Number of features compared (one row per feature)
compare_df.shape[0]

1217

In [13]:
# Show the five features with the smallest AUROC p-values
compare_df.sort_values(by='pval_auroc').head()

Unnamed: 0,feature_type,feature,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
769,DWPC,CrCrCtDrD,0.179868,0.663283,0.474523,0.18876,2e-06,0.001051
773,DWPC,CrCtD,0.113377,0.754664,0.532202,0.222463,2e-06,0.001051
528,DWPC,CiPCiCdGaD,0.175099,0.686687,0.62253,0.064158,4e-06,0.001208
200,DWPC,CbGr>Gr>GaD,0.706225,0.776548,0.734824,0.041724,4e-06,0.001208
685,DWPC,CrCbGaD,0.338013,0.684673,0.60866,0.076014,6e-06,0.001462


In [14]:
# Write both result tables as tab-separated files, without the index and
# with floats limited to five significant digits
tsv_options = dict(sep='\t', index=False, float_format='%.5g')
auc_df.to_csv('data/auc.tsv', **tsv_options)
compare_df.to_csv('data/auroc.tsv', **tsv_options)

## Create matrix

In [16]:
# Preview the first five rows of the DWPC table
dwpc_df.head(5)

Unnamed: 0,hetnet,compound_id,disease_id,feature,PC,w,value,seconds,feature_type
0,hetio-ind_perm-5,DB00014,DOID:0060073,CpDpCpD,0,0.4,0.0,1.016,DWPC
1,hetio-ind,DB00014,DOID:1612,CpDpCpD,0,0.4,0.0,1.067,DWPC
2,hetio-ind,DB00014,DOID:10283,CpDpCpD,0,0.4,0.0,1.077,DWPC
3,hetio-ind_perm-5,DB00014,DOID:2994,CpDpCpD,0,0.4,0.0,0.01554,DWPC
4,hetio-ind_perm-5,DB00014,DOID:2998,CpDpCpD,0,0.4,0.0,0.01528,DWPC


In [19]:
# Create spread dataframes: one wide table per hetnet with
# compound-disease pairs as rows and metapaths (features) as columns.
# Each table is written to data/matrix/<hetnet>/DWPC-spread.tsv.bz2 and
# also kept in `pivoted`, keyed by hetnet name.
# NOTE: `pivoted` was previously keyed by `(hetnet, value)`, but `value`
# was never defined in this notebook, so a clean run raised NameError.
pivoted = dict()
for hetnet in dwpc_df.hetnet.unique():
    print(hetnet)
    df = part_df.merge(dwpc_df.query("hetnet == @hetnet"))
    df = pandas.pivot_table(
        df,
        values='value',
        index=['compound_id', 'disease_id', 'status'],
        columns='feature',
    )
    df = df.reset_index()
    # Prepend the compound and disease summary columns to each pair
    df = compound_df.merge(disease_df.merge(df))
    directory = 'data/matrix/{}'.format(hetnet)
    # makedirs also creates a missing `data/matrix` parent and tolerates
    # a directory that already exists
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, 'DWPC-spread.tsv.bz2')
    with bz2.open(path, 'wt') as wf:
        df.to_csv(wf, index=False, sep='\t')
    pivoted[hetnet] = df

hetio-ind_perm-5
hetio-ind
hetio-ind_perm-1
hetio-ind_perm-4
hetio-ind_perm-3
