# Prepare the selected metapaths for feature computation for all compound-disease pairs

In [1]:
import json
import itertools
import configparser
import bz2

import pandas
import py2neo

In [2]:
# Load the project configuration. The file is opened explicitly so that a
# missing or misplaced config.ini fails immediately with FileNotFoundError;
# ConfigParser.read() silently skips files it cannot open, which would only
# surface later as a confusing KeyError on the section lookup.
config = configparser.ConfigParser()
with open('../config.ini') as config_file:
    config.read_file(config_file)
# Commit of the integrate repository; used further down to build download URLs
commit = config['hetnet']['integrate_commit']

## Read indications

In [3]:
# Look up the Neo4j server named 'hetio-ind' in the server list and connect
with open('../all-features/servers.json') as read_file:
    instances = json.load(read_file)

for instance in instances:
    if instance['name'] == 'hetio-ind':
        uri = 'http://localhost:{}/db/data/'.format(instance['port'])
        neo = py2neo.Graph(uri)
        break
else:
    # Without this branch a missing entry would leave `neo` unbound and the
    # cell would fail below with an unhelpful NameError.
    raise LookupError("no instance named 'hetio-ind' in ../all-features/servers.json")
neo

<Graph uri='http://localhost:7500/db/data/'>

In [4]:
def to_df(record_list):
    """Build a pandas DataFrame from a py2neo RecordList.

    Rows are taken from ``record_list.records`` and column labels from
    ``record_list.columns``.
    """
    rows = record_list.records
    header = record_list.columns
    return pandas.DataFrame(rows, columns=header)

# Cypher query: one row per Compound -> Disease relationship with its type,
# ordered by compound, relationship type, then disease
indication_query = '''
MATCH (compound:Compound)-[rel]->(disease:Disease)
RETURN
  compound.identifier AS compound_id,
  disease.identifier AS disease_id,
  type(rel) AS rel_type
ORDER BY
  compound_id, rel_type, disease_id
'''
# Run the query, then convert the returned records to a dataframe
indication_records = neo.cypher.execute(indication_query)
indication_df = to_df(indication_records)
indication_df.head(2)

Unnamed: 0,compound_id,disease_id,rel_type
0,DB00014,DOID:10283,TREATS_CtD
1,DB00014,DOID:1612,TREATS_CtD


## Read node info

In [5]:
# Read the compound and disease summary tables, keeping only the first two
# columns of each
compound_df = pandas.read_table('../summary/compounds.tsv').iloc[:, :2]
disease_df = pandas.read_table('../summary/diseases.tsv').iloc[:, :2]

In [22]:
# Join the indications to the disease columns, then to the compound columns,
# and save the result as a tab-separated file
(
    compound_df
    .merge(disease_df.merge(indication_df))
    .to_csv('../summary/indications.tsv', sep='\t', index=False)
)

In [6]:
# Number of compounds and number of diseases (row counts of each table)
compound_df.shape[0], disease_df.shape[0]

(1538, 136)

In [7]:
# Enumerate every (compound, disease) combination
rows = list(itertools.product(compound_df['compound_id'], disease_df['disease_id']))
# Left-join the indications so pairs without a relationship are kept with a
# missing rel_type
pair_df = (
    pandas.DataFrame(rows, columns=['compound_id', 'disease_id'])
    .merge(indication_df, how='left')
)
# Attach the disease columns, then the compound columns
pair_df = compound_df.merge(disease_df.merge(pair_df))
pair_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,
1,DB01048,Abacavir,DOID:9206,Barrett's esophagus,


In [8]:
# Total number of compound-disease pairs
pair_df.shape[0]

209168

In [9]:
# Save the pair table as a bzip2-compressed, tab-separated text file
with bz2.open('features/compound-disease-pairs.tsv.bz2', mode='wt') as write_file:
    pair_df.to_csv(write_file, index=False, sep='\t')

## Compute degree features

In [10]:
# Download the node-degree workbook once and parse both sheets from it.
# The sheet name is passed positionally because the keyword was renamed from
# `sheetname` to `sheet_name` across pandas releases; the positional form
# works with either spelling.
url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/degrees.xlsx'.format(commit)
degree_workbook = pandas.ExcelFile(url)
disease_degree_df = degree_workbook.parse('Disease')
disease_degree_df = disease_degree_df.rename(columns={'node_id': 'disease_id'}).drop('node_name', axis='columns')
compound_degree_df = degree_workbook.parse('Compound')
compound_degree_df = compound_degree_df.rename(columns={'node_id': 'compound_id'}).drop('node_name', axis='columns')

# Map each metaedge to its abbreviation; used to rename the degree columns
url = 'https://github.com/dhimmel/integrate/raw/{}/data/summary/metaedge-styles.tsv'.format(commit)
metaedge_style_df = pandas.read_table(url)
metaedge_to_abbreviation = dict(zip(metaedge_style_df.metaedge, metaedge_style_df.abbreviation))

In [11]:
# For every compound-disease pair, attach the compound's degrees and the
# disease's degrees, then rename the columns via metaedge_to_abbreviation
degree_df = (
    pair_df[['compound_id', 'disease_id']]
    .merge(compound_degree_df)
    .merge(disease_degree_df)
    .rename(columns=metaedge_to_abbreviation)
)
degree_df.tail(2)

Unnamed: 0,compound_id,disease_id,CbG,CcSE,CdG,CiPC,CpD,CrC,CtD,CuG,DaG,DdG,DlA,DpC,DpS,DrD,DtC,DuG
209166,DB01198,DOID:1245,14,259,4,0,2,1,0,4,6,0,14,0,8,13,0,0
209167,DB01624,DOID:1245,9,189,0,0,0,23,0,3,6,0,14,0,8,13,0,0


In [12]:
# Save the degree features as a bzip2-compressed, tab-separated text file
with bz2.open('features/degrees.tsv.bz2', mode='wt') as write_file:
    degree_df.to_csv(write_file, index=False, sep='\t')

## Select metapaths

In [25]:
# Keep the features whose perm_affected count is at least 40% of n_runs,
# save them, and separate out the DWPC features as the selected metapaths
n_runs = 50
path = '../all-features/selection/sweep-features.tsv'
feature_df = pandas.read_table(path).query("perm_affected / @n_runs >= 0.4")
feature_df.to_csv('features/selected-features.tsv', sep='\t', index=False, float_format='%.3g')
metapath_df = feature_df.query("feature_type == 'DWPC'")
# Counts of selected features and of selected DWPC features
len(feature_df), len(metapath_df)

(42, 37)

In [14]:
# Set of selected metapaths, taken from the `feature` column of the DWPC rows
metapaths = set(metapath_df['feature'])
metapath_df.head(2)

Unnamed: 0,feature,all_features,perm_affected,total,feature_type,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
0,CbGaD,50,50,100,DWPC,0.23311,0.75173,0.64228,0.10944,8e-06,0.001514
3,CbGbCtD,50,50,100,DWPC,0.4249,0.92877,0.82564,0.10313,0.000195,0.008901


## Metaedges in chosen metapaths

In [15]:
# Metaedge-in-metapath table, restricted to the selected metapaths
m2m_df = (
    pandas.read_table('../all-features/data/metaedge-in-metapath.tsv')
    .query("metapath in @metapaths")
)

In [16]:
# Metaedge frequencies over all rows, so a metaedge repeated within the same
# metapath is counted each time
m2m_df['metaedge'].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Compound - treats - Disease,29
1,Compound - resembles - Compound,15
2,Compound - palliates - Disease,14
3,Compound - binds - Gene,12
4,Disease - associates - Gene,11
5,Disease - resembles - Disease,7
6,Compound - causes - Side Effect,6
7,Disease - downregulates - Gene,6
8,Pharmacologic Class - includes - Compound,6
9,Compound - downregulates - Gene,6


In [17]:
# Metaedge frequencies after dropping duplicate rows, so each metaedge is
# counted at most once per metapath
m2m_df.drop_duplicates()['metaedge'].value_counts().reset_index()

Unnamed: 0,index,metaedge
0,Compound - treats - Disease,23
1,Compound - resembles - Compound,12
2,Disease - associates - Gene,10
3,Compound - palliates - Disease,10
4,Compound - binds - Gene,10
5,Disease - resembles - Disease,6
6,Disease - downregulates - Gene,5
7,Compound - downregulates - Gene,5
8,Compound - upregulates - Gene,4
9,Compound - causes - Side Effect,3


In [18]:
# Count of distinct metaedges appearing in the selected metapaths
m2m_df['metaedge'].nunique()

18

In [19]:
# Restrict the metapath definitions to the selected metapaths and save them
with open('../all-features/data/metapaths.json') as read_file:
    metapath_obj = json.load(read_file)
metapath_obj = [x for x in metapath_obj if x['abbreviation'] in metapaths]
with open('features/metapaths.json', 'wt') as write_file:
    # json.dump returns None, so its result must not be assigned back to
    # metapath_obj; doing so would replace the filtered list with None.
    json.dump(metapath_obj, write_file, indent=2)