# Setting Up Environment

In [2]:
import os
import json
import math
import logging
import pandas as pd
from collections import Counter, defaultdict
from tqdm import tqdm
from itertools import product
from typing import Mapping, List
import networkx as nx
import numpy as np
from tqdm import tqdm
DATA_DIR = '../../data'
KG_DATA_PATH = os.path.join(DATA_DIR, 'kg')

# Helper Functions

In [546]:
sim_scores = {'cn': 'Common Neighbors', 'cos': 'Cosine Similiarity', 'ji': 'Jaccard index', 'si': 'Sorensen index',
             'hpi': 'Hub Promoted Index', 'hdi': 'Hub Depressed Index', 'lhn': 'Leicht–Holme–Newman Index', 'pa':'Preferential Attachment',
             'aa': 'Adamic-Adar', 'ra': 'Resource Allocation Index', 'lp': 'Local Path Index', 'kg': 'Katz global path indicator',
             'act': 'Average Commute Time'}
score_actual = {}
def create_graph_from_df(
    graph_df
) -> nx.DiGraph:
    """Create fully connected graph from dataframe."""
    graph = nx.DiGraph()

    for sub_name, obj_name, relation in graph_df.values:
        # Store edge in the graph
        graph.add_edge(
            sub_name,
            obj_name,
            polarity=relation,
        )


    connected_components_subgraph = [
        component
        for component in sorted(
            nx.connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph


def shared_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A & khop_B)), len(khop_A), len(khop_B)

def shared_khop_list(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return list(khop_A & khop_B)

def total_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A | khop_B)), len(khop_A), len(khop_B)

notin=[]
# This function will change once the KG and clinical pairs are updated
def get_accuracy(dict_actual, dict_predicted): 
    pos,total = 0,0
    for disease in dict_predicted.keys():
        for drug in dict_predicted[disease]:
                if disease not in dict_actual:
                    notin.append(disease)
                    continue
                if drug in dict_actual[disease]:
                    pos+=1
                total+=1
    print('{pos}/{total}'.format(pos=pos,total=total))
    return round(((pos/total) * 100), 2)


def get_dict_df(diseases, drugs, undirected_graph, similarity_type = 'cn'):
    df = pd.DataFrame(columns=[
        'Source',
        'Target',
        sim_scores[similarity_type]
    ])
    disease_drug_dict = {}
    # loop through each disease in the set
    for disease in tqdm(diseases, desc='Main iterator', total=len(diseases)):
        cn = []
        # for each disease, find the similarity score with for each drug and append to list
        for drug in drugs:
            if similarity_type == 'cn':
                similarity,_,_ = shared_khop(disease, drug,undirected_graph , 1)
            elif similarity_type == 'cos':
                shared, nodeA_neighbor, nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (math.sqrt(nodeA_neighbor * nodeB_neighbor))
            elif similarity_type == 'ji':
                shared,_,_ = shared_khop(disease, drug,undirected_graph , 1)
                total,_,_ = total_khop(disease, drug,undirected_graph , 1)
                similarity = shared/total
            elif similarity_type == 'si':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = (shared * 2) / (nodeA_neighbor + nodeB_neighbor)
            elif similarity_type == 'hpi':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / min(nodeA_neighbor, nodeB_neighbor) 
            elif similarity_type == 'hdi':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / max(nodeA_neighbor, nodeB_neighbor) 
            elif similarity_type == 'lhn':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (nodeA_neighbor * nodeB_neighbor) 
            elif similarity_type == 'pa':
                _,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = nodeA_neighbor * nodeB_neighbor
            elif similarity_type == 'aa':
                shared_list = shared_khop_list(disease, drug,undirected_graph , 1)
                similarity = 0
                for n in shared_list:
                    neighbors_list = [i for i in undirected_graph.neighbors(n)]
                    similarity += 1/math.log10(len(neighbors_list))
            elif similarity_type == 'ra':
                shared_list = shared_khop_list(disease, drug,undirected_graph , 1)
                similarity = 0
                for n in shared_list:
                    neighbors_list = [i for i in undirected_graph.neighbors(n)]
                    similarity += 1/len(neighbors_list)
            cn.append(similarity)
        # if list is full of 0's (i.e sum == 0), then there are no shared neighbors 
        if np.sum(cn) == 0:
            disease_drug_dict[disease] = 'none'
        else:
            index = np.where(cn == np.amax(cn))
            # add list of drugs with max # of shared nodes to specified key
            disease_drug_dict[disease] = np.array(list(drugs))[index]
            # create a df
            for i in index:
                for j in i:
                    df = df.append({'Source': disease, 'Target': list(drugs)[j], sim_scores[similarity_type]: cn[j]}, ignore_index=True)
    return disease_drug_dict, df

# Load Graph and Gold Standard

In [547]:
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t'
)
openbiolink_df.rename(columns={'relation': 'polarity'}, inplace=True)

custom_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t'
)
custom_df.rename(columns={'relation': 'polarity'}, inplace=True)

In [548]:
custom_df.head()

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:667468,ncbigene:147,-1
2,pubchem.compound:4011,ncbigene:1133,-1
3,pubchem.compound:4636,ncbigene:148,1
4,pubchem.compound:2083,ncbigene:154,1


In [549]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')) as file:
    clinical_pair_dict = json.load(file).keys()

In [550]:
gold_standard_dict = {}
for i in clinical_pair_dict:
    split = i.split("_")
    if split[1] in gold_standard_dict:
        gold_standard_dict[split[1]].append(split[0])
    else:
        gold_standard_dict[split[1]] = [split[0]]

In [551]:
graph_openbio = create_graph_from_df(openbiolink_df)
graph_custom = create_graph_from_df(custom_df)

In [552]:
len(graph_custom)

8610

In the cell below, create a set of diseases and drugs in each graph

In [553]:
disease_openbio = []
drug_openbio = []
for u,v in graph_openbio.to_undirected().edges():
    if v.startswith('mondo'):
        disease_openbio.append(v)
    if u.startswith('mondo'):
        disease_openbio.append(u)
    if v.startswith('pubchem.'):
        drug_openbio.append(v)
    if u.startswith('pubchem.'):
        drug_openbio.append(u)
        
# sets of diseases/drugs for openbio      
disease_openbio = set(disease_openbio)
drug_openbio = set(drug_openbio)


disease_custom = []
drug_custom = []

for u,v in graph_custom.to_undirected().edges():
    if v.startswith('mondo'):
        disease_custom.append(v)
    if u.startswith('mondo'):
        disease_custom.append(u)
    if u.startswith('pubchem.'):
        drug_custom.append(u)
    if v.startswith('pubchem.'):
        drug_custom.append(v)
        
# sets of diseases/drugs for custom KG      
disease_custom = set(disease_custom)
drug_custom = set(drug_custom)

# Common Neighbors (CN)

In [554]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected())

Main iterator: 100%|██████████| 72/72 [00:02<00:00, 25.28it/s]


In [555]:
df_openbio

Unnamed: 0,Source,Target,Common Neighbors
0,mondo:0005029,pubchem.compound:9903786,1
1,mondo:0005029,pubchem.compound:5757,1
2,mondo:0005029,pubchem.compound:5957,1
3,mondo:0005029,pubchem.compound:2162,1
4,mondo:0005029,pubchem.compound:123631,1
...,...,...,...
903,mondo:0004989,pubchem.compound:5083,1
904,mondo:0004989,pubchem.compound:36462,1
905,mondo:0004989,pubchem.compound:327653,1
906,mondo:0004989,pubchem.compound:2019,1


In [556]:
# Accuracy for openbiolink KG
cn_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Common Neighbors'] = [cn_acc1]
cn_acc1

41/430


9.53

In [557]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected())

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 69.81it/s] 


In [558]:
df_custom

Unnamed: 0,Source,Target,Common Neighbors
0,mondo:0019154,pubchem.compound:5833,1
1,mondo:0019154,pubchem.compound:6013,1
2,mondo:0019154,pubchem.compound:4632,1
3,mondo:0019154,pubchem.compound:2375,1
4,mondo:0019154,pubchem.compound:3314,1
...,...,...,...
158,mondo:0004989,pubchem.compound:13791,1
159,mondo:0004989,pubchem.compound:5994,1
160,mondo:0004989,pubchem.compound:10184653,1
161,mondo:0004989,pubchem.compound:28417,1


In [559]:
# Accuracy for custom KG
cn_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Common Neighbors'].append(cn_acc2)
cn_acc2

24/214


11.21

# Cosine Similarity 

In [560]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 48.66it/s]


In [561]:
df_openbio

Unnamed: 0,Source,Target,Cosine Similiarity
0,mondo:0005029,pubchem.compound:16722836,0.577350
1,mondo:0010150,pubchem.compound:3712,0.188982
2,mondo:0005072,pubchem.compound:16038120,0.707107
3,mondo:0019154,pubchem.compound:6010,1.000000
4,mondo:0019154,pubchem.compound:5995,1.000000
...,...,...,...
106,mondo:0004989,pubchem.compound:10172943,1.000000
107,mondo:0004989,pubchem.compound:216345,1.000000
108,mondo:0004989,pubchem.compound:10022508,1.000000
109,mondo:0004989,pubchem.compound:4764,1.000000


In [562]:
# Accuracy for openbiolink KG
cs_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Cosine Similiarity'] = [cs_acc1]
cs_acc1

5/86


5.81

In [563]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 140.75it/s]


In [564]:
# Accuracy for custom KG
cs_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Cosine Similiarity'].append(cs_acc2)
cs_acc2

8/146


5.48

In [565]:
df_custom

Unnamed: 0,Source,Target,Cosine Similiarity
0,mondo:0019154,pubchem.compound:2375,1.000000
1,mondo:0019154,pubchem.compound:3314,1.000000
2,mondo:0019154,pubchem.compound:6010,1.000000
3,mondo:0019154,pubchem.compound:15951529,1.000000
4,mondo:0007256,pubchem.compound:11626560,0.250000
...,...,...,...
56,mondo:0004979,pubchem.compound:44093,0.158114
57,mondo:0004979,pubchem.compound:5743,0.158114
58,mondo:0018874,pubchem.compound:16722836,0.280056
59,mondo:0004989,pubchem.compound:667476,0.204124


# Jaccard Index

In [566]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 37.19it/s]


In [567]:
df_openbio.sort_values(by=sim_scores['ji'])

Unnamed: 0,Source,Target,Jaccard index
72,mondo:0005190,pubchem.compound:445154,0.006757
9,mondo:0007926,pubchem.compound:445154,0.006757
88,mondo:0010726,pubchem.compound:446220,0.018182
38,mondo:0004985,pubchem.compound:3386,0.028571
58,mondo:0007739,pubchem.compound:5711,0.055556
...,...,...,...
41,mondo:0009831,pubchem.compound:10172943,1.000000
40,mondo:0009831,pubchem.compound:302576,1.000000
48,mondo:0018305,pubchem.compound:3194,1.000000
64,mondo:0005192,pubchem.compound:10022508,1.000000


In [568]:
# Accuracy for openbiolink KG
ji_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Jaccard Index'] = [ji_acc1]
ji_acc1

5/86


5.81

In [569]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 105.03it/s]


In [570]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0019154,pubchem.compound:2375,1.0
1,mondo:0019154,pubchem.compound:3314,1.0
2,mondo:0019154,pubchem.compound:6010,1.0
3,mondo:0019154,pubchem.compound:15951529,1.0
4,mondo:0007256,pubchem.compound:11626560,0.111111


In [571]:
df_custom.shape

(46, 3)

In [572]:
# Accuracy for custom KG
ji_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Jaccard Index'].append(ji_acc2)
ji_acc2

6/131


4.58

# Sorensen Index 

In [573]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'si')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 41.28it/s]


In [574]:
df_openbio

Unnamed: 0,Source,Target,Sorensen index
0,mondo:0005029,pubchem.compound:16722836,0.500000
1,mondo:0010150,pubchem.compound:3712,0.125000
2,mondo:0005072,pubchem.compound:16038120,0.666667
3,mondo:0019154,pubchem.compound:6010,1.000000
4,mondo:0019154,pubchem.compound:5995,1.000000
...,...,...,...
106,mondo:0004989,pubchem.compound:10172943,1.000000
107,mondo:0004989,pubchem.compound:216345,1.000000
108,mondo:0004989,pubchem.compound:10022508,1.000000
109,mondo:0004989,pubchem.compound:4764,1.000000


In [575]:
# Accuracy for openbiolink KG
si_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Sorensen Index'] = [si_acc1]
si_acc1

5/86


5.81

In [576]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 93.70it/s] 


In [577]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0019154,pubchem.compound:2375,1.0
1,mondo:0019154,pubchem.compound:3314,1.0
2,mondo:0019154,pubchem.compound:6010,1.0
3,mondo:0019154,pubchem.compound:15951529,1.0
4,mondo:0007256,pubchem.compound:11626560,0.111111


In [578]:
df_custom.shape

(46, 3)

In [579]:
# Accuracy for custom KG
si_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Sorensen Index'].append(si_acc2)
si_acc2

6/131


4.58

# Hub Promoted Index

In [580]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'hpi')

Main iterator: 100%|██████████| 72/72 [00:03<00:00, 19.85it/s]


In [581]:
df_openbio

Unnamed: 0,Source,Target,Hub Promoted Index
0,mondo:0005029,pubchem.compound:16722836,1.0
1,mondo:0010150,pubchem.compound:3712,0.5
2,mondo:0010150,pubchem.compound:444795,0.5
3,mondo:0010150,pubchem.compound:5311,0.5
4,mondo:0010150,pubchem.compound:5757,0.5
...,...,...,...
840,mondo:0004989,pubchem.compound:5083,1.0
841,mondo:0004989,pubchem.compound:36462,1.0
842,mondo:0004989,pubchem.compound:327653,1.0
843,mondo:0004989,pubchem.compound:2019,1.0


In [582]:
# Accuracy for openbiolink KG
hpi_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Promoted Index'] = [hpi_acc1]

31/369


In [583]:
hpi_acc1

8.4

In [584]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'hpi')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 112.53it/s]


In [585]:
df_custom

Unnamed: 0,Source,Target,Hub Promoted Index
0,mondo:0019154,pubchem.compound:5833,1.000000
1,mondo:0019154,pubchem.compound:6013,1.000000
2,mondo:0019154,pubchem.compound:4632,1.000000
3,mondo:0019154,pubchem.compound:2375,1.000000
4,mondo:0019154,pubchem.compound:3314,1.000000
...,...,...,...
80,mondo:0004979,pubchem.compound:44093,0.250000
81,mondo:0004979,pubchem.compound:5743,0.250000
82,mondo:0018874,pubchem.compound:16722836,0.666667
83,mondo:0004989,pubchem.compound:667476,1.000000


In [586]:
# Accuracy for custom KG
hpi_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Promoted Index'].append(hpi_acc2)
hpi_acc2

10/153


6.54

# Hub Depressed Index

In [587]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'hdi')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 50.87it/s]


In [588]:
df_openbio

Unnamed: 0,Source,Target,Hub Depressed Index
0,mondo:0005029,pubchem.compound:16722836,0.333333
1,mondo:0010150,pubchem.compound:3712,0.071429
2,mondo:0005072,pubchem.compound:16038120,0.500000
3,mondo:0019154,pubchem.compound:6010,1.000000
4,mondo:0019154,pubchem.compound:5995,1.000000
...,...,...,...
133,mondo:0004989,pubchem.compound:10172943,1.000000
134,mondo:0004989,pubchem.compound:216345,1.000000
135,mondo:0004989,pubchem.compound:10022508,1.000000
136,mondo:0004989,pubchem.compound:4764,1.000000


In [589]:
# Accuracy for openbiolink KG
hdi_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Depressed Index'] = [hdi_acc1]
hdi_acc1

6/118


5.08

In [590]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'hdi')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 80.71it/s] 


In [591]:
df_custom

Unnamed: 0,Source,Target,Hub Depressed Index
0,mondo:0019154,pubchem.compound:2375,1.000000
1,mondo:0019154,pubchem.compound:3314,1.000000
2,mondo:0019154,pubchem.compound:6010,1.000000
3,mondo:0019154,pubchem.compound:15951529,1.000000
4,mondo:0007256,pubchem.compound:11626560,0.125000
...,...,...,...
125,mondo:0004989,pubchem.compound:13791,0.041667
126,mondo:0004989,pubchem.compound:5994,0.041667
127,mondo:0004989,pubchem.compound:10184653,0.041667
128,mondo:0004989,pubchem.compound:28417,0.041667


In [592]:
# Accuracy for custom KG
hdi_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Depressed Index'].append(hdi_acc2)
hdi_acc2

21/192


10.94

# Leicht–Holme–Newman Index

In [593]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'lhn')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 55.01it/s]


In [594]:
df_openbio

Unnamed: 0,Source,Target,Leicht–Holme–Newman Index
0,mondo:0005029,pubchem.compound:16722836,0.333333
1,mondo:0010150,pubchem.compound:3712,0.035714
2,mondo:0005072,pubchem.compound:16038120,0.500000
3,mondo:0019154,pubchem.compound:6010,1.000000
4,mondo:0019154,pubchem.compound:5995,1.000000
...,...,...,...
106,mondo:0004989,pubchem.compound:10172943,1.000000
107,mondo:0004989,pubchem.compound:216345,1.000000
108,mondo:0004989,pubchem.compound:10022508,1.000000
109,mondo:0004989,pubchem.compound:4764,1.000000


In [595]:
# Accuracy for openbiolink KG
lhn_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Leicht–Holme–Newman Index'] = [lhn_acc1]
lhn_acc1

5/86


5.81

In [596]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'lhn')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 121.79it/s]


In [597]:
df_custom

Unnamed: 0,Source,Target,Leicht–Holme–Newman Index
0,mondo:0019154,pubchem.compound:2375,1.000000
1,mondo:0019154,pubchem.compound:3314,1.000000
2,mondo:0019154,pubchem.compound:6010,1.000000
3,mondo:0019154,pubchem.compound:15951529,1.000000
4,mondo:0007256,pubchem.compound:11626560,0.062500
...,...,...,...
64,mondo:0004979,pubchem.compound:44093,0.025000
65,mondo:0004979,pubchem.compound:5743,0.025000
66,mondo:0018874,pubchem.compound:16722836,0.039216
67,mondo:0004989,pubchem.compound:667476,0.041667


In [598]:
# Accuracy for custom KG
lhn_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Leicht–Holme–Newman Index'].append(lhn_acc2)
lhn_acc2

8/145


5.52

# Preferential Attachment

In [599]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'pa')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 53.93it/s]


In [600]:
df_openbio

Unnamed: 0,Source,Target,Preferential Attachment
0,mondo:0010421,pubchem.compound:448537,246
1,mondo:0005029,pubchem.compound:448537,738
2,mondo:0010150,pubchem.compound:448537,492
3,mondo:0005072,pubchem.compound:448537,492
4,mondo:0002429,pubchem.compound:448537,246
...,...,...,...
67,mondo:0011962,pubchem.compound:448537,492
68,mondo:0002447,pubchem.compound:448537,492
69,mondo:0009267,pubchem.compound:448537,246
70,mondo:0018874,pubchem.compound:448537,984


In [601]:
# Accuracy for openbiolink KG
pa_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Preferential Attachment'] = [pa_acc1]
pa_acc1

1/45


2.22

In [602]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'pa')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 119.07it/s]


In [603]:
df_custom.head()

Unnamed: 0,Source,Target,Preferential Attachment
0,mondo:0010421,pubchem.compound:2170,26
1,mondo:0010150,pubchem.compound:2170,26
2,mondo:0005090,pubchem.compound:2170,26
3,mondo:0019154,pubchem.compound:2170,26
4,mondo:0010679,pubchem.compound:2170,26


In [604]:
df_custom.shape

(59, 3)

In [605]:
# Accuracy for custom KG
pa_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Preferential Attachment'].append(pa_acc2)
pa_acc2

0/44


0.0

# Adamic-Adar

In [606]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'aa')

Main iterator: 100%|██████████| 72/72 [00:04<00:00, 17.78it/s]


In [607]:
df_openbio

Unnamed: 0,Source,Target,Adamic-Adar
0,mondo:0005029,pubchem.compound:9903786,0.551598
1,mondo:0005029,pubchem.compound:5757,0.551598
2,mondo:0005029,pubchem.compound:5957,0.551598
3,mondo:0005029,pubchem.compound:2162,0.551598
4,mondo:0005029,pubchem.compound:123631,0.551598
...,...,...,...
859,mondo:0004989,pubchem.compound:5083,0.433773
860,mondo:0004989,pubchem.compound:36462,0.433773
861,mondo:0004989,pubchem.compound:327653,0.433773
862,mondo:0004989,pubchem.compound:2019,0.433773


In [608]:
# Accuracy for openbiolink KG
aa_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Adamic-Adar'] = [aa_acc1]
aa_acc1

36/396


9.09

In [609]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'aa')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 106.62it/s]


In [610]:
df_custom

Unnamed: 0,Source,Target,Adamic-Adar
0,mondo:0019154,pubchem.compound:5833,0.521084
1,mondo:0019154,pubchem.compound:6013,0.521084
2,mondo:0019154,pubchem.compound:4632,0.521084
3,mondo:0019154,pubchem.compound:2375,0.521084
4,mondo:0019154,pubchem.compound:3314,0.521084
...,...,...,...
79,mondo:0018874,pubchem.compound:9829523,1.036894
80,mondo:0018874,pubchem.compound:216239,1.036894
81,mondo:0018874,pubchem.compound:24826799,1.036894
82,mondo:0004989,pubchem.compound:216239,0.526974


In [611]:
# Accuracy for custom KG
aa_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Adamic-Adar'].append(aa_acc2)

12/163


In [612]:
aa_acc2

7.36

# Resource Allocation Index

In [613]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ra')

Main iterator: 100%|██████████| 72/72 [00:03<00:00, 18.04it/s]


In [614]:
df_openbio

Unnamed: 0,Source,Target,Resource Allocation Index
0,mondo:0005029,pubchem.compound:9903786,0.015385
1,mondo:0005029,pubchem.compound:5757,0.015385
2,mondo:0005029,pubchem.compound:5957,0.015385
3,mondo:0005029,pubchem.compound:2162,0.015385
4,mondo:0005029,pubchem.compound:123631,0.015385
...,...,...,...
860,mondo:0004989,pubchem.compound:5083,0.004950
861,mondo:0004989,pubchem.compound:36462,0.004950
862,mondo:0004989,pubchem.compound:327653,0.004950
863,mondo:0004989,pubchem.compound:2019,0.004950


In [615]:
# Accuracy for openbiolink KG
ra_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Resource Allocation Index'] = [ra_acc1]
ra_acc1

37/397


9.32

In [616]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ra')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 104.06it/s]


In [617]:
df_custom

Unnamed: 0,Source,Target,Resource Allocation Index
0,mondo:0019154,pubchem.compound:5833,0.012048
1,mondo:0019154,pubchem.compound:6013,0.012048
2,mondo:0019154,pubchem.compound:4632,0.012048
3,mondo:0019154,pubchem.compound:2375,0.012048
4,mondo:0019154,pubchem.compound:3314,0.012048
...,...,...,...
79,mondo:0018874,pubchem.compound:9829523,0.023625
80,mondo:0018874,pubchem.compound:216239,0.023625
81,mondo:0018874,pubchem.compound:24826799,0.023625
82,mondo:0004989,pubchem.compound:216239,0.012658


In [618]:
# Accuracy for custom KG
ra_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Resource Allocation Index'].append(ra_acc2)
ra_acc2

12/163


7.36

In [619]:
# Diseases in KG that are not in Gold-Standard
set(notin)

{'mondo:0001056',
 'mondo:0002046',
 'mondo:0002429',
 'mondo:0002447',
 'mondo:0004989',
 'mondo:0005005',
 'mondo:0005011',
 'mondo:0005061',
 'mondo:0005075',
 'mondo:0005086',
 'mondo:0005159',
 'mondo:0005190',
 'mondo:0005192',
 'mondo:0006486',
 'mondo:0007254',
 'mondo:0008903',
 'mondo:0009265',
 'mondo:0009267',
 'mondo:0009623',
 'mondo:0009831',
 'mondo:0010150',
 'mondo:0010311',
 'mondo:0010421',
 'mondo:0010626',
 'mondo:0011776',
 'mondo:0011962',
 'mondo:0015917',
 'mondo:0019154'}

In [620]:
# final precisions
score_actual

{'Common Neighbors': [9.53, 11.21],
 'Cosine Similiarity': [5.81, 5.48],
 'Jaccard Index': [5.81, 4.58],
 'Sorensen Index': [5.81, 4.58],
 'Hub Promoted Index': [8.4, 6.54],
 'Hub Depressed Index': [5.08, 10.94],
 'Leicht–Holme–Newman Index': [5.81, 5.52],
 'Preferential Attachment': [2.22, 0.0],
 'Adamic-Adar': [9.09, 7.36],
 'Resource Allocation Index': [9.32, 7.36]}