# Prepare the selected metapaths for feature computation for all compound-disease pairs

In [1]:
import json
import itertools
import bz2
import configparser

import pandas

## Read node info

In [2]:
# Load the compound and disease summary tables
compound_path = '../summary/compounds.tsv'
disease_path = '../summary/diseases.tsv'
compound_df, disease_df = map(pandas.read_table, (compound_path, disease_path))
# Show how many rows each table has
compound_df.shape[0], disease_df.shape[0]

(1538, 136)

In [3]:
# Look up the pinned PharmacotherapyDB commit from the project configuration.
# ConfigParser.read() silently skips files it cannot open, which would only
# surface later as a confusing KeyError on 'hetnet'; read_file() on an
# explicitly opened file fails immediately when the config is missing.
config = configparser.ConfigParser()
with open('../config.ini') as config_file:
    config.read_file(config_file)
commit = config['hetnet']['pharmacotherapydb_commit']
# Download the indication catalog exactly as it was at that commit
url = 'https://github.com/dhimmel/indications/blob/{}/catalog/indications.tsv?raw=true'.format(commit)
indication_df = pandas.read_table(url)
# Rename the identifier columns to compound_id / disease_id and keep only
# the identifiers plus the indication category
indication_df = indication_df.rename(columns={'doid_id': 'disease_id', 'drugbank_id': 'compound_id'})[['compound_id', 'disease_id', 'category']]
indication_df.head(2)

Unnamed: 0,compound_id,disease_id,category
0,DB00843,DOID:10652,DM
1,DB00674,DOID:10652,DM


In [4]:
# Enumerate every compound-disease combination (full Cartesian product)
pair_columns = ['compound_id', 'compound_name', 'disease_id', 'disease_name']
rows = [
    (compound.compound_id, compound.compound_name, disease.disease_id, disease.disease_name)
    for compound, disease in itertools.product(compound_df.itertuples(), disease_df.itertuples())
]
pair_df = pandas.DataFrame(rows, columns=pair_columns)
# Attach the indication category where one exists; the left join keeps
# every pair, leaving category missing for pairs without an indication
pair_df = pair_df.merge(indication_df, how='left')
# status is 1 only for pairs whose category is 'DM', otherwise 0
pair_df['status'] = pair_df['category'].eq('DM').astype(int)
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,,0


In [5]:
# Tally the pairs by status label
pair_df.status.value_counts()

0    208413
1       755
Name: status, dtype: int64

In [6]:
# Total number of compound-disease pairs
pair_df.shape[0]

209168

In [7]:
# Write the pair table as a bzip2-compressed TSV. The encoding is given
# explicitly so the bytes written do not depend on the platform's default
# locale encoding.
with bz2.open('features/compound-disease-pairs.tsv.bz2', 'wt', encoding='utf-8') as write_file:
    pair_df.to_csv(write_file, sep='\t', index=False)

## Select metapaths

In [8]:
# Load the feature sweep table, drop the intercept row, and keep only the
# features whose perm_affected value is at least 0.5
path = '../all-features/selection/sweep-features.tsv'
feature_df = (
    pandas.read_table(path)
    .query("feature != 'intercept'")
    .query("perm_affected >= 0.5")
)

In [9]:
# Split each feature name once on '_' into (type, remainder) and keep the
# remainder -- the metapath -- for features whose type is 'dwpc'
feature_parts = feature_df.feature.str.split('_', n=1)
metapaths = [metapath for feature_type, metapath in feature_parts if feature_type == 'dwpc']

In [10]:
# Save the selected features, formatting floats with 3 significant digits
selected_path = 'features/selected-features.tsv'
feature_df.to_csv(selected_path, index=False, sep='\t', float_format='%.3g')
# Number of selected features, and number of DWPC metapaths among them
feature_df.shape[0], len(metapaths)

(107, 97)

## Metaedges in chosen metapaths

In [11]:
# Load the metaedge-in-metapath table and restrict it to the selected metapaths
m2m_path = '../all-features/data/metaedge-in-metapath.tsv'
m2m_df = pandas.read_table(m2m_path).query("metapath in @metapaths")

In [12]:
# Rows per metaedge; a metaedge listed more than once for the same
# metapath is counted every time it appears
metaedge_counts = m2m_df['metaedge'].value_counts()
metaedge_counts.reset_index()

Unnamed: 0,index,metaedge
0,Compound - treats - Disease,71
1,Compound - binds - Gene,47
2,Disease - associates - Gene,34
3,Disease - resembles - Disease,27
4,Compound - resembles - Compound,26
5,Compound - palliates - Disease,25
6,Compound - upregulates - Gene,20
7,Compound - downregulates - Gene,18
8,Disease - downregulates - Gene,16
9,Disease - upregulates - Gene,15


In [13]:
# Rows per metaedge after removing fully duplicated rows, so a metaedge is
# counted once per metapath (assumes each row is a metaedge-metapath pair --
# TODO confirm against the table's columns)
unique_m2m_df = m2m_df.drop_duplicates()
unique_m2m_df['metaedge'].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Compound - treats - Disease,71
1,Compound - binds - Gene,47
2,Disease - associates - Gene,34
3,Disease - resembles - Disease,27
4,Compound - resembles - Compound,26
5,Compound - palliates - Disease,25
6,Compound - upregulates - Gene,20
7,Compound - downregulates - Gene,18
8,Disease - downregulates - Gene,16
9,Disease - upregulates - Gene,15


In [14]:
# Count of distinct metaedges appearing in the selected metapaths
m2m_df['metaedge'].nunique()

21

In [15]:
# Copy the definitions of the selected metapaths into features/metapaths.json
with open('../all-features/data/metapaths.json') as read_file:
    metapath_obj = json.load(read_file)
# Keep only the entries whose abbreviation is one of the selected metapaths
metapath_obj = [x for x in metapath_obj if x['abbreviation'] in metapaths]
with open('features/metapaths.json', 'wt') as write_file:
    # json.dump returns None, so its result must not be assigned back to
    # metapath_obj; the filtered list stays available after writing
    json.dump(metapath_obj, write_file, indent=2, sort_keys=True)