# Prepare the selected metapaths for feature computation for all compound-disease pairs

In [1]:
import json
import itertools
import bz2
import configparser

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

## Read node info

In [2]:
# Load the compound and disease summary tables, then show how many rows each has
compound_df, disease_df = map(
    pandas.read_table,
    ('../summary/compounds.tsv', '../summary/diseases.tsv'),
)
compound_df.shape[0], disease_df.shape[0]

(1538, 136)

In [3]:
# Look up the PharmacotherapyDB commit recorded in the project configuration
config = configparser.ConfigParser()
config.read('../config.ini')
commit = config['hetnet']['pharmacotherapydb_commit']

# Download the indication catalog at that commit, rename the identifier
# columns to the names used by the node tables, and keep only the join keys
# plus the indication category.
url = 'https://github.com/dhimmel/indications/blob/{}/catalog/indications.tsv?raw=true'.format(commit)
indication_df = (
    pandas.read_table(url)
    .rename(columns={'doid_id': 'disease_id', 'drugbank_id': 'compound_id'})
    [['compound_id', 'disease_id', 'category']]
)
indication_df.head(2)

Unnamed: 0,compound_id,disease_id,category
0,DB00843,DOID:10652,DM
1,DB00674,DOID:10652,DM


In [4]:
# Enumerate every compound-disease combination; itertools.product iterates the
# diseases fastest, so all pairs for one compound are contiguous.
pair_df = pandas.DataFrame(
    [
        (compound.compound_id, compound.compound_name, disease.disease_id, disease.disease_name)
        for compound, disease in itertools.product(compound_df.itertuples(), disease_df.itertuples())
    ],
    columns=['compound_id', 'compound_name', 'disease_id', 'disease_name'],
)
# Left join on the shared columns: pairs absent from the indication catalog are
# kept, with a missing category.
pair_df = pair_df.merge(indication_df, how='left')
# Binary label: 1 only when the indication category is 'DM', otherwise 0
# (including pairs with no catalog entry).
pair_df['status'] = pair_df['category'].eq('DM').astype(int)
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,,0


In [5]:
# Class balance of the binary label: 1 marks pairs whose category is 'DM', 0 marks every other pair
pair_df['status'].value_counts()

0    208413
1       755
Name: status, dtype: int64

In [6]:
# Total number of rows in the compound-disease pair table
len(pair_df)

209168

In [7]:
# Persist the labelled pair table as a bzip2-compressed, tab-separated file
# (header row included, DataFrame index omitted).
with bz2.open('features/compound-disease-pairs.tsv.bz2', mode='wt') as write_file:
    pair_df.to_csv(
        write_file,
        index=False,
        sep='\t',
    )

## Select metapaths

In [8]:
# Load the per-metapath feature performance table
auroc_df = pandas.read_table('../all-features/data/feature-performance/auroc.tsv')
# Adjust the delta-AUROC p-values for multiple testing with the 'fdr_bh'
# method; only the corrected p-values (second returned item) are stored.
(reject, pvals_corrected, alphacSidak, alphacBonf) = multipletests(
    auroc_df['pval_delta_auroc'],
    method='fdr_bh',
)
auroc_df = auroc_df.assign(fdr_delta_auroc=pvals_corrected)
auroc_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc
0,CbG<rG<rGaD,0.63605,0.61395,0.57445,0.84662,0.56577,0.070275,2e-05,0.000464
1,CbG<rG<rGdD,0.58014,0.57085,0.5367,0.38146,0.54954,0.0306,0.000103,0.001179


In [9]:
# A metapath is retained only when it satisfies all four criteria below.
criteria = [
    "rdwpc_auroc > 0.55",
    "delta_auroc > 0",
    "fdr_delta_auroc < 0.05",
    "pdwpc_primary_auroc > 0.5",
]
# Copy so that adding a column does not operate on a view of auroc_df
whitelist_df = auroc_df.query(" and ".join(criteria)).copy()
# Feature name is the metapath prefixed with 'dwpc_'
whitelist_df['feature'] = 'dwpc_' + whitelist_df['metapath']
whitelist_df.head(2)

Unnamed: 0,metapath,dwpc_auroc,pdwpc_auroc,rdwpc_auroc,nonzero,pdwpc_primary_auroc,delta_auroc,pval_delta_auroc,fdr_delta_auroc,feature
0,CbG<rG<rGaD,0.63605,0.61395,0.57445,0.84662,0.56577,0.070275,2e-05,0.000464,dwpc_CbG<rG<rGaD
3,CbG<rGaD,0.60329,0.60045,0.55514,0.47205,0.56593,0.037355,0.00063,0.003219,dwpc_CbG<rGaD


In [10]:
# Distinct metapaths that passed the whitelist filters, and how many there are
metapaths = set(whitelist_df['metapath'])
len(metapaths)

142

## Metaedges in chosen metapaths

In [11]:
# Load the metaedge-in-metapath table and keep only rows whose metapath is in
# the selected set.
m2m_df = (
    pandas.read_table('../all-features/data/metaedge-in-metapath.tsv')
    .query("metapath in @metapaths")
)

In [12]:
# Count rows per metaedge across the selected metapaths. No de-duplication is
# applied, so if a metaedge is listed more than once for the same metapath
# every listing is counted.
m2m_df.metaedge.value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Disease - associates - Gene,88
1,Compound - binds - Gene,80
2,Compound - treats - Disease,57
3,Compound - resembles - Compound,51
4,Compound - downregulates - Gene,39
5,Compound - upregulates - Gene,34
6,Gene > regulates > Gene,28
7,Disease - resembles - Disease,28
8,Gene - interacts - Gene,18
9,Disease - localizes - Anatomy,16


In [13]:
# Count each metaedge at most once per metapath. The de-duplication is keyed
# explicitly on the (metapath, metaedge) pair: a bare drop_duplicates() only
# removes rows identical in every column, so any additional column in the
# table would let repeats within a metapath slip through.
(
    m2m_df
    .drop_duplicates(subset=['metapath', 'metaedge'])
    .metaedge
    .value_counts()
    .reset_index()
)

Unnamed: 0,index,metaedge
0,Disease - associates - Gene,88
1,Compound - binds - Gene,80
2,Compound - treats - Disease,57
3,Compound - resembles - Compound,51
4,Compound - downregulates - Gene,39
5,Compound - upregulates - Gene,34
6,Gene > regulates > Gene,28
7,Disease - resembles - Disease,28
8,Gene - interacts - Gene,18
9,Disease - localizes - Anatomy,16


In [14]:
# Number of distinct metaedges appearing in at least one selected metapath
m2m_df.metaedge.nunique()

23

In [15]:
# Load every metapath definition, keep those whose abbreviation was selected,
# and write the subset for feature computation.
with open('../all-features/data/metapaths.json') as read_file:
    metapath_obj = json.load(read_file)
metapath_obj = [entry for entry in metapath_obj if entry['abbreviation'] in metapaths]
with open('features/metapaths.json', 'wt') as write_file:
    # json.dump returns None, so its result must not be assigned back to
    # metapath_obj; otherwise the filtered list is lost after writing.
    json.dump(metapath_obj, write_file, indent=2, sort_keys=True)