# Setting Up Environment

In [1]:
import os
import json
import math
import logging
import pandas as pd
from collections import Counter, defaultdict
from tqdm import tqdm
from itertools import product
from typing import Mapping, List
import networkx as nx
import numpy as np
from tqdm import tqdm
DATA_DIR = '../../data'
KG_DATA_PATH = os.path.join(DATA_DIR, 'kg')

# Helper Functions

In [85]:
sim_scores = {'cn': 'Common Neighbors', 'cos': 'Cosine Similiarity', 'ji': 'Jaccard index', 'si': 'Sorensen index',
             'hpi': 'Hub Promoted Index', 'hdi': 'Hub Depressed Index', 'lhn': 'Leicht–Holme–Newman Index', 'pa':'Preferential Attachment',
             'aa': 'Adamic-Adar', 'ra': 'Resource Allocation Index', 'lp': 'Local Path Index', 'kg': 'Katz global path indicator',
             'act': 'Average Commute Time'}
def create_graph_from_df(
    graph_df
) -> nx.DiGraph:
    """Create fully connected graph from dataframe."""
    graph = nx.DiGraph()

    for sub_name, obj_name, relation in graph_df.values:
        # Store edge in the graph
        graph.add_edge(
            sub_name,
            obj_name,
            polarity=relation,
        )


    connected_components_subgraph = [
        component
        for component in sorted(
            nx.connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph


def shared_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A & khop_B)), len(khop_A), len(khop_B)

def shared_khop_list(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return list(khop_A & khop_B)

def total_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A | khop_B)), len(khop_A), len(khop_B)

# This function will change once the KG and clinical pairs are updated
def get_accuracy(dict_actual, dict_predicted): 
    pos,total = 0,0
    for key in dict_predicted.keys():
        if key in dict_actual:
            for drug in dict_predicted[key]:
                if drug in dict_actual[key]:
                    pos+=1
                total+=1
            
    return pos/total


def get_dict_df(diseases, drugs, undirected_graph, similarity_type = 'cn'):
    df = pd.DataFrame(columns=[
        'Source',
        'Target',
        sim_scores[similarity_type]
    ])
    disease_drug_dict = {}
    # loop through each disease in the set
    for disease in tqdm(diseases, desc='Main iterator', total=len(diseases)):
        cn = []
        # for each disease, find the similarity score with for each drug and append to list
        for drug in drugs:
            if similarity_type == 'cn':
                similarity,_,_ = shared_khop(disease, drug,undirected_graph , 1)
            elif similarity_type == 'cos':
                shared, nodeA_neighbor, nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (math.sqrt(nodeA_neighbor * nodeB_neighbor))
            elif similarity_type == 'ji':
                shared,_,_ = shared_khop(disease, drug,undirected_graph , 1)
                total,_,_ = total_khop(disease, drug,undirected_graph , 1)
                similarity = shared/total
            elif similarity_type == 'si':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = (shared * 2) / (nodeA_neighbor + nodeB_neighbor)
            elif similarity_type == 'hpi':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / min(nodeA_neighbor, nodeB_neighbor) 
            elif similarity_type == 'hdi':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / max(nodeA_neighbor, nodeB_neighbor) 
            elif similarity_type == 'lhn':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (nodeA_neighbor * nodeB_neighbor) 
            elif similarity_type == 'pa':
                _,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = nodeA_neighbor * nodeB_neighbor
            elif similarity_type == 'aa':
                shared_list = shared_khop_list(disease, drug,undirected_graph , 1)
                similarity = 0
                for n in shared_list:
                    neighbors_list = [i for i in undirected_graph.neighbors(n)]
                    similarity += 1/math.log10(len(neighbors_list))
            elif similarity_type == 'ra':
                shared_list = shared_khop_list(disease, drug,undirected_graph , 1)
                similarity = 0
                for n in shared_list:
                    neighbors_list = [i for i in undirected_graph.neighbors(n)]
                    similarity += 1/len(neighbors_list)
            cn.append(similarity)
        # if list is full of 0's (i.e sum == 0), then there are no shared neighbors 
        if np.sum(cn) == 0:
            disease_drug_dict[disease] = 'none'
        else:
            index = np.where(cn == np.amax(cn))
            # add list of drugs with max # of shared nodes to specified key
            disease_drug_dict[disease] = np.array(list(drugs))[index]
            # create a df
            for i in index:
                for j in i:
                    df = df.append({'Source': disease, 'Target': list(drugs)[j], sim_scores[similarity_type]: cn[j]}, ignore_index=True)
    return disease_drug_dict, df

# Load Graph and Gold Standard

In [3]:
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t'
)
openbiolink_df.rename(columns={'relation': 'polarity'}, inplace=True)

custom_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t'
)
custom_df.rename(columns={'relation': 'polarity'}, inplace=True)

In [4]:
custom_df.head()

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:667468,ncbigene:147,-1
2,pubchem.compound:4011,ncbigene:1133,-1
3,pubchem.compound:4636,ncbigene:148,1
4,pubchem.compound:2083,ncbigene:154,1


In [5]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'clinical_pairs.json')) as file:
    clinical_pair_dict = json.load(file).keys()

In [6]:
gold_standard_dict = {}
for i in clinical_pair_dict:
    split = i.split("_")
    if split[1] in gold_standard_dict:
        gold_standard_dict[split[1]].append(split[0])
    else:
        gold_standard_dict[split[1]] = [split[0]]

In [7]:
graph_openbio = create_graph_from_df(openbiolink_df)
graph_custom = create_graph_from_df(custom_df)

In [8]:
len(graph_custom)

8610

In the cell below, create a set of diseases and drugs in each graph

In [9]:
disease_openbio = []
drug_openbio = []
for u,v in graph_openbio.to_undirected().edges():
    if v.startswith('mondo'):
        disease_openbio.append(v)
    if u.startswith('mondo'):
        disease_openbio.append(u)
    if v.startswith('pubchem.'):
        drug_openbio.append(v)
    if u.startswith('pubchem.'):
        drug_openbio.append(u)
        
# sets of diseases/drugs for openbio      
disease_openbio = set(disease_openbio)
drug_openbio = set(drug_openbio)


disease_custom = []
drug_custom = []

for u,v in graph_custom.to_undirected().edges():
    if v.startswith('mondo'):
        disease_custom.append(v)
    if u.startswith('mondo'):
        disease_custom.append(u)
    if u.startswith('pubchem.'):
        drug_custom.append(u)
    if v.startswith('pubchem.'):
        drug_custom.append(v)
        
# sets of diseases/drugs for custom KG      
disease_custom = set(disease_custom)
drug_custom = set(drug_custom)

# Common Neighbors (CN)

In [10]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected())

Main iterator: 100%|██████████| 72/72 [00:02<00:00, 26.73it/s]


In [11]:
df_openbio.sort_values(by= sim_scores['cn'], ascending=False).head()

Unnamed: 0,Source,Target,Common Neighbors
573,mondo:0007254,pubchem.compound:449459,3
569,mondo:0007254,pubchem.compound:702,3
85,mondo:0007256,pubchem.compound:3973,3
567,mondo:0007254,pubchem.compound:5757,3
568,mondo:0007254,pubchem.compound:54454,3


In [12]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.09534883720930233

In [13]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected())

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 104.01it/s]


In [14]:
df_custom.sort_values(by=sim_scores['cn'], ascending=False).head()

Unnamed: 0,Source,Target,Common Neighbors
159,mondo:0004985,pubchem.compound:60795,3
158,mondo:0004985,pubchem.compound:2170,3
101,mondo:0018305,pubchem.compound:5360696,3
162,mondo:0011996,pubchem.compound:5328940,2
95,mondo:0011719,pubchem.compound:9829523,2


In [15]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.11214953271028037

# Cosine Similarity 

In [16]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 86.67it/s]


In [17]:
df_openbio.sort_values(by=sim_scores['cos']).head(5)

Unnamed: 0,Source,Target,Cosine Similiarity
106,mondo:0007926,pubchem.compound:445154,0.082199
6,mondo:0005190,pubchem.compound:445154,0.082199
109,mondo:0004985,pubchem.compound:3386,0.121268
17,mondo:0010726,pubchem.compound:446220,0.13484
44,mondo:0005021,pubchem.compound:6741,0.144338


In [18]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [19]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 159.62it/s]


In [20]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.0547945205479452

In [21]:
df_custom.head()

Unnamed: 0,Source,Target,Cosine Similiarity
0,mondo:0007256,pubchem.compound:11626560,0.25
1,mondo:0018177,pubchem.compound:9915743,1.0
2,mondo:0018177,pubchem.compound:123631,1.0
3,mondo:0019154,pubchem.compound:6010,1.0
4,mondo:0019154,pubchem.compound:2375,1.0


# Jaccard Index

In [22]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 39.05it/s]


In [23]:
df_openbio.sort_values(by=sim_scores['ji'])

Unnamed: 0,Source,Target,Jaccard index
106,mondo:0007926,pubchem.compound:445154,0.006757
6,mondo:0005190,pubchem.compound:445154,0.006757
17,mondo:0010726,pubchem.compound:446220,0.018182
109,mondo:0004985,pubchem.compound:3386,0.028571
52,mondo:0007739,pubchem.compound:5711,0.055556
...,...,...,...
81,mondo:0005192,pubchem.compound:302576,1.000000
82,mondo:0005192,pubchem.compound:4817,1.000000
28,mondo:0019154,pubchem.compound:5995,1.000000
27,mondo:0019154,pubchem.compound:753704,1.000000


In [24]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [25]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 114.92it/s]


In [26]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0007256,pubchem.compound:11626560,0.111111
1,mondo:0018177,pubchem.compound:9915743,1.0
2,mondo:0018177,pubchem.compound:123631,1.0
3,mondo:0019154,pubchem.compound:6010,1.0
4,mondo:0019154,pubchem.compound:2375,1.0


In [27]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.04580152671755725

# Sorensen Index 

In [28]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'si')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 58.83it/s]


In [29]:
df_openbio.head(10)

Unnamed: 0,Source,Target,Sorensen index
0,mondo:0006639,pubchem.compound:10022508,1.0
1,mondo:0006639,pubchem.compound:4764,1.0
2,mondo:0006639,pubchem.compound:10172943,1.0
3,mondo:0006639,pubchem.compound:302576,1.0
4,mondo:0006639,pubchem.compound:4817,1.0
5,mondo:0006639,pubchem.compound:216345,1.0
6,mondo:0005190,pubchem.compound:445154,0.013423
7,mondo:0007256,pubchem.compound:10022508,0.333333
8,mondo:0007256,pubchem.compound:4764,0.333333
9,mondo:0007256,pubchem.compound:10172943,0.333333


In [30]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [31]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 103.98it/s]


In [32]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0007256,pubchem.compound:11626560,0.111111
1,mondo:0018177,pubchem.compound:9915743,1.0
2,mondo:0018177,pubchem.compound:123631,1.0
3,mondo:0019154,pubchem.compound:6010,1.0
4,mondo:0019154,pubchem.compound:2375,1.0


In [33]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.04580152671755725

# Hub Promoted Index

In [35]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'hpi')

Main iterator: 100%|██████████| 72/72 [00:03<00:00, 22.61it/s]


In [41]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.08401084010840108

In [42]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'hpi')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 164.39it/s]


In [43]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.06535947712418301

# Hub Depressed Index

In [45]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'hdi')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 58.97it/s]


In [46]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05084745762711865

In [49]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'hdi')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 124.99it/s]


In [50]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.109375

# Leicht–Holme–Newman Index

In [56]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'lhn')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 72.63it/s]


In [57]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [58]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'lhn')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 199.35it/s]


In [59]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05517241379310345

# Preferential Attachment

In [63]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'pa')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 78.15it/s]


In [64]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.022222222222222223

In [65]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'pa')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 143.73it/s]


In [66]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.0

# Adamic-Adar

In [79]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'aa')

Main iterator: 100%|██████████| 72/72 [00:03<00:00, 23.98it/s]


In [81]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.09090909090909091

In [83]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'aa')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 137.82it/s]


In [84]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.0736196319018405

# Resource Allocation Index

In [86]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ra')

Main iterator: 100%|██████████| 72/72 [00:03<00:00, 23.43it/s]


In [87]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.09319899244332494

In [88]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ra')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 161.81it/s]


In [89]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.0736196319018405