# Extract edges from hetionet connecting drugs to diseases

In [1]:
import configparser
import numpy as np
import pandas as pd
import psycopg2

import hetio.readwrite
import hetio.pathtools

In [2]:
# hetionet v1.0 in json format: full graph (bz2-compressed) and its metagraph,
# both fetched from the same fixed commit of dhimmel/hetionet
graph_url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
meta_url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0-metagraph.json'

# load the graph first, then the metagraph
graph = hetio.readwrite.read_graph(graph_url)
metagraph = hetio.readwrite.read_metagraph(meta_url)

### Pre-process circadian treatments extracted from Ruben et al.

In [3]:
# Ruben et al treatment data: keep only rows whose effect is 'more eff' and
# that carry both a DrugBank ID and a DOID
treatment = pd.read_csv('data/HumCircMed2018v2_mapped.tsv', sep = '\t')
keep_row = (
    (treatment['effect'] == 'more eff')
    & treatment['drug.trtmnt.DrugBankID'].notna()
    & treatment['therapeutic.area.DOID'].notna()
)
treatment = treatment[keep_row]

# collect drug~disease pairs (with the drug half-life) as three parallel lists
rb_pair_drug = []
rb_pair_disease = []
rb_pair_hl = []
treatment_rows = zip(treatment['drug.trtmnt.DrugBankID'].values,
                     treatment['therapeutic.area.DOID'].values,
                     treatment['halflife.hrs'].values)
for drug_id, disease_id, half_life in treatment_rows:
    # skip treatments that list more than one drug (comma-separated IDs)
    if len(drug_id.split(',')) != 1:
        continue
    # skip pairs whose drug or disease is not a node in hetionet
    if ('Compound', drug_id) not in graph.node_dict:
        continue
    if ('Disease', disease_id) not in graph.node_dict:
        continue
    rb_pair_drug.append(drug_id)
    rb_pair_disease.append(disease_id)
    rb_pair_hl.append(half_life)

# number of drug~disease pairs in the dataset
len(rb_pair_drug)

24

### Pre-process shorter-half-life drugs extracted from DrugBank

In [4]:
# read in half-life info of drugs
half_life_df = pd.read_csv('https://github.com/dhimmel/drugbank/raw/6b9ae386d6ba4a0eca2d66d4b0337a6e90fe81f4/data/drugbank_subset_halflife_curated.tsv'
                           ,header = 0, sep = '\t')

# filter out drugs in Ruben et al
# (vectorised membership test; keeps exactly the rows whose drugbank_id is not in rb_pair_drug)
half_life_df = half_life_df[~half_life_df.drugbank_id.isin(rb_pair_drug)]

# half-life < 24 hours
# select the rows once so the ID list and the half-life list are guaranteed to stay aligned
short24_df = half_life_df[half_life_df.half_life_hours_curated < 24]
short24_drugs = list(short24_df.drugbank_id)
short24_hl = list(short24_df.half_life_hours_curated)

In [5]:
# Get the diseases that each drug treats using hetionet
# (one drug~disease pair per 'CtD' path that starts at the drug)
db_pair_drug = []
db_pair_disease = []
db_pair_hl = []
metapath = metagraph.metapath_from_abbrev('CtD')
for drug_id, drug_hl in zip(short24_drugs, short24_hl):
    source_id = 'Compound', drug_id
    # drugs that are not nodes in hetionet contribute no pairs
    if source_id not in graph.node_dict:
        continue
    treat_paths = hetio.pathtools.paths_from(graph = graph, source = source_id, metapath = metapath)
    for treat_path in treat_paths:
        db_pair_drug.append(drug_id)
        db_pair_hl.append(drug_hl)
        # second node of the path is the disease; strip the 'Disease::' prefix from its string form
        disease_node = treat_path.get_nodes()[1]
        db_pair_disease.append(str(disease_node).split('Disease::')[1])

# number of drug~disease pairs in the dataset
len(db_pair_drug)

387

### Extract drug-disease edges from hetionet

In [6]:
## define metapath classes
# Each class is the full cross-product of edge-type letters for one path shape;
# the leftmost varying edge changes slowest, the rightmost fastest.
# compound -> gene -> disease
g_class = ['C%sG%sD' % (e1, e2) for e1 in 'bdu' for e2 in 'adu']
# compound -> gene -> compound -> disease
gc_class = ['C%sG%sC%sD' % (e1, e2, e3) for e1 in 'bdu' for e2 in 'bdu' for e3 in 'pt']
# compound -> gene -> disease -> disease
gd_class = ['C%sG%sDrD' % (e1, e2) for e1 in 'bdu' for e2 in 'adu']
# compound -> compound -> gene -> disease
cg_class = ['CrC%sG%sD' % (e1, e2) for e1 in 'bdu' for e2 in 'adu']
# compound -> disease -> gene -> disease
dg_class = ['C%sD%sG%sD' % (e1, e2, e3) for e1 in 'pt' for e2 in 'adu' for e3 in 'adu']
# class name -> list of metapath abbreviations in that class
m_classes = {
    'G': g_class,
    'GC': gc_class,
    'GD': gd_class,
    'CG': cg_class,
    'DG': dg_class,
}

In [7]:
# function to extract drug-disease edges from hetionet
def extract_drug_disease_edges(drug_list, disease_list, hl_list, graph, metagraph, meta_class_dict):
    '''
    Enumerate the hetionet paths that connect each drug to its paired disease.

    drug_list, disease_list, hl_list: lists that contain DrugBank ID of drug, DOID of disease, and half-life of drug, 
    respectively; entry i of each list describes the same drug~disease pair
    graph, metagraph: read-in json format graph, metagraph
    meta_class_dict: dictionary that separates metapaths into general classes
    (class name -> list of metapath abbreviations)

    Returns a list with one ';'-separated string per path found:
        pair ID;half-life;class name;metapath abbreviation;node;node;...
    The pair ID counts (from 1) the pairs that have at least one path; a pair
    with no path under any metapath yields no rows and does not use up an ID.
    '''

    edge_list = []
    pair_count = 0
    # Resolve each abbreviation once, on first use, instead of once per
    # drug~disease pair (assumes metapath_from_abbrev returns an equivalent
    # metapath every time it is given the same abbreviation).
    metapath_cache = {}
    for i in range(0, len(drug_list)):
        drug_id = drug_list[i]
        drug_hl = hl_list[i]
        disease_id = disease_list[i]
        source_id = 'Compound', drug_id
        target_id = 'Disease', disease_id
        # becomes True once any path has been found for this pair
        pair_has_path = False
        # iterate each class of metapath
        for class_name, class_type in meta_class_dict.items():
            # iterate each metapath in the class
            for mt in class_type:
                if mt not in metapath_cache:
                    metapath_cache[mt] = metagraph.metapath_from_abbrev(mt)
                t_paths = hetio.pathtools.paths_between(graph = graph, source = source_id, target = target_id, metapath = metapath_cache[mt])
                if len(t_paths) == 0:
                    continue
                # the first path found for a pair allocates its unique ID
                if not pair_has_path:
                    pair_count = pair_count + 1
                    pair_has_path = True
                # unique drug~disease pair ID, drug half-life, metapath class name, metapath name
                # (identical for every path of this metapath, so built once)
                path_prefix = ';'.join(str(pi) for pi in (pair_count, drug_hl, class_name, mt))
                for t_path in t_paths:
                    # append the string form of every node along the path
                    node_chars = ';'.join(str(n) for n in t_path.get_nodes())
                    edge_list.append(path_prefix + ';' + node_chars)

    return edge_list

In [8]:
# extract drug-disease edges from hetionet for Ruben et al data
rb_edges = extract_drug_disease_edges(rb_pair_drug, rb_pair_disease, rb_pair_hl, graph, metagraph, m_classes)

# output: one path per line
# (the context manager closes the file even if a write raises)
with open('data/ruben_hetionet_edges.txt','w') as f:
    for edge in rb_edges:
        f.write('%s\n' % edge)

# number of paths
len(rb_edges)

9857

In [9]:
# preview the first five path strings to show the output format
rb_edges[:5]

['1;2.5;G;CbGaD;Compound::DB00635;Gene::213;Disease::DOID:2841',
 '1;2.5;G;CbGaD;Compound::DB00635;Gene::2908;Disease::DOID:2841',
 '1;2.5;GC;CbGbCpD;Compound::DB00635;Gene::1576;Compound::DB00668;Disease::DOID:2841',
 '1;2.5;GC;CbGbCtD;Compound::DB00635;Gene::1557;Compound::DB01274;Disease::DOID:2841',
 '1;2.5;GC;CbGbCtD;Compound::DB00635;Gene::1557;Compound::DB01234;Disease::DOID:2841']

In [10]:
# extract drug-disease edges from hetionet for drugbank data
db_edges = extract_drug_disease_edges(db_pair_drug, db_pair_disease, db_pair_hl, graph, metagraph, m_classes)

# output: one path per line
# (the context manager closes the file even if a write raises)
with open('data/drugbank_hetionet_edges.txt','w') as f:
    for edge in db_edges:
        f.write('%s\n' % edge)

# number of paths
len(db_edges)

129435