# Calculate the performance of every feature

In [1]:
import bz2
import os
import re
import math

import pandas
import sklearn.metrics
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests

## Read features and partitions

In [2]:
# Load the partition table: one row per (hetnet, compound, disease) with its status
partition_path = 'data/partitions.tsv'
part_df = pandas.read_table(partition_path)
part_df.tail(2)

Unnamed: 0,hetnet,compound_id,disease_id,status
18873,hetio-ind_perm-5,DB09028,DOID:585,0
18874,hetio-ind_perm-1,DB09028,DOID:9074,0


In [3]:
# Load the DWPC results and put them in the common long layout used below:
# the metapath becomes `feature`, the DWPC becomes `value`, and every row
# is tagged with feature_type 'DWPC'.
dwpc_df = (
    pandas.read_table('data/dwpc.tsv.bz2')
    .rename(columns={'metapath': 'feature', 'DWPC': 'value'})
    .assign(feature_type='DWPC')
)
dwpc_df.tail(2)

Unnamed: 0,hetnet,compound_id,disease_id,feature,PC,w,value,seconds,feature_type
22933123,hetio-ind_perm-5,DB08912,DOID:9352,CdGeAeGaD,1599374,0.4,0.163162,61.52,DWPC
22933124,hetio-ind_perm-4,DB09020,DOID:1324,CdGeAeGaD,1451285,0.4,0.154356,44.43,DWPC


In [4]:
# Commit of dhimmel/integrate that both download URLs below are pinned to
commit = '8f7c90a413b883911db0d8afb3c37775d1370087'

# Node degree tables, one worksheet per node type. The id column is renamed to
# match the key used in part_df and the node name column is dropped.
# NOTE: `sheet_name` replaces the `sheetname` keyword, which pandas deprecated
# in 0.21 and later removed.
url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/degrees.xlsx'.format(commit)
disease_degree_df = pandas.read_excel(url, sheet_name='Disease')
disease_degree_df = disease_degree_df.rename(columns={'node_id': 'disease_id'}).drop('node_name', axis='columns')
compound_degree_df = pandas.read_excel(url, sheet_name='Compound')
compound_degree_df = compound_degree_df.rename(columns={'node_id': 'compound_id'}).drop('node_name', axis='columns')

# Lookup from metaedge to its abbreviation
url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/metaedge-styles.tsv'.format(commit)
metaedge_style_df = pandas.read_table(url)
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

In [5]:
# Attach the compound and disease degree columns to every partition row
pair_degree_df = part_df.merge(compound_degree_df).merge(disease_degree_df)

# Every column after the id column of each degree table is a degree variable
degree_vars = compound_degree_df.columns[1:].tolist() + disease_degree_df.columns[1:].tolist()

# Reshape to long form: one row per (hetnet, compound, disease, degree feature)
degree_df = pandas.melt(
    pair_degree_df,
    id_vars=['hetnet', 'compound_id', 'disease_id'],
    value_vars=degree_vars,
    var_name='feature',
)

# Replace the melted column names by their metaedge abbreviations
degree_df['feature'] = degree_df['feature'].map(metaedge_to_abbreviation)
degree_df['feature_type'] = 'degree'
degree_df.tail(2)

Unnamed: 0,hetnet,compound_id,disease_id,feature,value,feature_type
301998,hetio-ind_perm-3,DB09015,DOID:11555,DuG,0,degree
301999,hetio-ind_perm-1,DB09018,DOID:11555,DuG,0,degree


In [6]:
# Stack degree and DWPC features, then join them back onto the partition rows
stacked_df = pandas.concat([degree_df, dwpc_df])
feature_df = part_df.merge(stacked_df)
feature_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status,PC,feature,feature_type,seconds,value,w
0,hetio-ind_perm-5,DB00014,DOID:0060073,0,,CbG,degree,,2.0,
1,hetio-ind_perm-5,DB00014,DOID:0060073,0,,CcSE,degree,,249.0,


## Compute performance

In [7]:
def compute_metrics(df):
    """Score how well a single feature ranks the `status` labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one group; must contain a `status` column (the labels) and a
        `value` column (the feature value used as the score).

    Returns
    -------
    pandas.Series
        Float series with three entries:
        `nonzero` (fraction of rows whose value is greater than zero),
        `auroc` (sklearn.metrics.roc_auc_score) and
        `auprc` (sklearn.metrics.average_precision_score).
    """
    y_true = df['status']
    y_score = df['value']
    # Build the result in one call with an explicit float dtype. Filling an
    # empty `pandas.Series()` item by item relies on the default dtype of an
    # empty Series, which warns on pandas 1.x and is `object` on pandas 2.x.
    return pandas.Series(
        [
            (y_score > 0).mean(),
            sklearn.metrics.roc_auc_score(y_true, y_score),
            sklearn.metrics.average_precision_score(y_true, y_score),
        ],
        index=['nonzero', 'auroc', 'auprc'],
        dtype=float,
    )

# One row of metrics per (hetnet, feature_type, feature) combination
metric_keys = ['hetnet', 'feature_type', 'feature']
auc_df = feature_df.groupby(metric_keys).apply(compute_metrics).reset_index()

# Flag hetnets whose name contains '_perm' with 1, all others with 0
auc_df['permuted'] = auc_df['hetnet'].str.contains('_perm').astype(int)

In [8]:
# Preview the first two rows of the per-hetnet, per-feature metrics
auc_df.head(2)

Unnamed: 0,hetnet,feature_type,feature,nonzero,auroc,auprc,permuted
0,hetio-ind,DWPC,CbG<rG<rGaD,0.624636,0.785146,0.494498,0
1,hetio-ind,DWPC,CbG<rG<rGdD,0.250596,0.663977,0.399786,0


In [9]:
def compare_permutation(df):
    """Compare a feature's unpermuted AUROC against its permuted AUROCs.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one feature with the columns `permuted`, `nonzero` and
        `auroc`. The first row with `permuted == 0` is taken as the
        unpermuted reference; all rows with `permuted == 1` form the
        permuted sample.

    Returns
    -------
    pandas.Series
        Float series with, in this order:
        `nonzero` and `auroc` of the unpermuted row, `auroc_permuted` (mean
        AUROC of the permuted rows), `delta_auroc` (unpermuted minus permuted
        mean) and `pval_auroc` (p-value of a one-sample t-test of the permuted
        AUROCs against the unpermuted AUROC).

    Raises
    ------
    IndexError
        If `df` contains no row with `permuted == 0`.
    """
    unperm = df.query("permuted == 0").iloc[0, :]
    perm_df = df.query("permuted == 1")
    auroc = unperm['auroc']
    auroc_permuted = perm_df['auroc'].mean()
    ttest = scipy.stats.ttest_1samp(perm_df['auroc'], auroc)
    # Build the result in one call with an explicit float dtype. Filling an
    # empty `pandas.Series()` item by item relies on the default dtype of an
    # empty Series, which warns on pandas 1.x and is `object` on pandas 2.x.
    return pandas.Series(
        [
            unperm['nonzero'],
            auroc,
            auroc_permuted,
            auroc - auroc_permuted,
            ttest.pvalue,
        ],
        index=['nonzero', 'auroc', 'auroc_permuted', 'delta_auroc', 'pval_auroc'],
        dtype=float,
    )

# One comparison row per (feature_type, feature)
comparison_keys = ['feature_type', 'feature']
compare_df = auc_df.groupby(comparison_keys).apply(compare_permutation).reset_index()

# Adjust the AUROC p-values for multiple testing with the 'fdr_bh' method;
# only the adjusted p-values (second return value) are stored.
reject, fdr_pvals, alphacSidak, alphacBonf = multipletests(
    pvals=compare_df.pval_auroc, method='fdr_bh')
compare_df['fdr_pval_auroc'] = fdr_pvals

compare_df = compare_df.sort_values('feature')

In [10]:
# Show the last three rows of the comparison table (sorted by feature)
compare_df.tail(3)

Unnamed: 0,feature_type,feature,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
1228,degree,DrD,0.953642,0.463837,0.456494,0.007343,0.009585,0.035966
1229,degree,DtC,0.646623,0.858216,0.864443,-0.006227,0.018287,0.053079
1230,degree,DuG,0.388874,0.658492,0.651686,0.006806,0.035766,0.085825


In [11]:
# Number of (feature_type, feature) rows in the comparison table
len(compare_df)

1231

In [12]:
# Five features with the smallest AUROC p-values (ascending is the default order)
compare_df.sort_values('pval_auroc').head(5)

Unnamed: 0,feature_type,feature,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
769,DWPC,CrCrCtDrD,0.179868,0.663283,0.474523,0.18876,2e-06,0.001063
773,DWPC,CrCtD,0.113377,0.754664,0.532202,0.222463,2e-06,0.001063
528,DWPC,CiPCiCdGaD,0.175099,0.686687,0.62253,0.064158,4e-06,0.001222
200,DWPC,CbGr>Gr>GaD,0.706225,0.776548,0.734824,0.041724,4e-06,0.001222
685,DWPC,CrCbGaD,0.338013,0.684673,0.60866,0.076014,6e-06,0.001478


In [13]:
# Write both result tables as tab-separated files without the index;
# floats are formatted with '%.5g'.
tsv_options = dict(sep='\t', index=False, float_format='%.5g')
auc_df.to_csv('data/auc.tsv', **tsv_options)
compare_df.to_csv('data/auroc.tsv', **tsv_options)

## Create matrix

In [14]:
# Read the compound and disease summary tables, keeping only the first two
# columns of each (these are what get merged onto the feature matrix below)
compound_df = pandas.read_table('../summary/compounds.tsv').iloc[:, :2]
disease_df = pandas.read_table('../summary/diseases.tsv').iloc[:, :2]

In [16]:
# Create spread dataframes
# compound-disease pairs as rows, metapaths as columns

# The set of degree features does not depend on the hetnet, so look it up once
degree_features = list(compare_df.query("feature_type == 'degree'").feature)

for hetnet in feature_df.hetnet.unique():
    print(hetnet)
    # Restrict to this hetnet and pivot the long table to one column per feature
    df = part_df.merge(feature_df.query("hetnet == @hetnet"))
    df = pandas.pivot_table(df, values='value', index=['compound_id', 'disease_id', 'status'], columns='feature')
    df = df.reset_index()
    # Prepend the compound and disease columns read from the summary tables
    df = compound_df.merge(disease_df.merge(df))
    # Degree columns come out of the pivot as floats; store them as integers
    df = df.astype({feature: int for feature in degree_features})
    directory = 'data/matrix/{}'.format(hetnet)
    # makedirs also creates a missing 'data/matrix' parent and, with
    # exist_ok=True, does not fail when the directory is already there
    os.makedirs(directory, exist_ok=True)
    filename = 'features.tsv.bz2'
    path = os.path.join(directory, filename)
    with bz2.open(path, 'wt') as wf:
        df.to_csv(wf, index=False, sep='\t')

hetio-ind_perm-5
hetio-ind
hetio-ind_perm-3
hetio-ind_perm-4
hetio-ind_perm-1


In [18]:
# `df` is the last value assigned by the export loop above, i.e. the feature
# matrix of the final hetnet that was processed
df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,status,CbG,CbG<rG<rGaD,CbG<rG<rGdD,CbG<rG<rGuD,CbG<rGaD,...,CuGuDuGdD,CuGuDuGuD,DaG,DdG,DlA,DpC,DpS,DrD,DtC,DuG
0,DB01048,Abacavir,DOID:1319,brain cancer,0,3,0.002806,0.0,0.0,0.001683,...,0.0,0.0,111,0,66,4,88,8,7,0
1,DB01048,Abacavir,DOID:10941,intracranial aneurysm,0,3,0.000952,0.0,0.0,0.001578,...,0.0,0.0,23,0,49,0,50,2,0,0
