# Setting Up Environment

In [213]:
import os
import json
import math
import logging
import pandas as pd
from collections import Counter, defaultdict
from tqdm import tqdm
from itertools import product
from typing import Mapping, List
import networkx as nx
import numpy as np
from tqdm import tqdm
DATA_DIR = '../../data'
KG_DATA_PATH = os.path.join(DATA_DIR, 'kg')

# Helper Functions

In [214]:
sim_scores = {'cn': 'Common Neighbors', 'cos': 'Cosine Similiarity', 'ji': 'Jaccard index', 'si': 'Sorensen index',
             'hpi': 'Hub Promoted Index', 'hdi': 'Hub Depressed Index', 'lhn': 'Leicht–Holme–Newman Index', 'pa':'Preferential Attachment',
             'aa': 'Adamic-Adar', 'ra': 'Resource Allocation Index', 'lp': 'Local Path Index', 'kg': 'Katz global path indicator',
             'act': 'Average Commute Time'}
score_actual = {}
kg_dfs = {}
def create_graph_from_df(
    graph_df
) -> nx.DiGraph:
    """Create fully connected graph from dataframe."""
    graph = nx.DiGraph()

    for sub_name, obj_name, relation in graph_df.values:
        # Store edge in the graph
        graph.add_edge(
            sub_name,
            obj_name,
            polarity=relation,
        )


    connected_components_subgraph = [
        component
        for component in sorted(
            nx.connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph


def shared_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A & khop_B)), len(khop_A), len(khop_B)

def shared_khop_list(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return list(khop_A & khop_B)

def total_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A | khop_B)), len(khop_A), len(khop_B)

notin=[]
# This function will change once the KG and clinical pairs are updated
def get_accuracy(dict_actual, dict_predicted): 
    pos,total = 0,0
    for disease in dict_predicted.keys():
        for drug in dict_predicted[disease]:
                if disease not in dict_actual:
                    notin.append(disease)
                    continue
                if drug in dict_actual[disease]:
                    pos+=1
                total+=1
    print('{pos}/{total}'.format(pos=pos,total=total))
    return round(((pos/total) * 100), 2)


def get_dict_df(diseases, drugs, undirected_graph, similarity_type = 'cn'):
    df = pd.DataFrame(columns=[
        'Source',
        'Target',
        sim_scores[similarity_type]
    ])
    disease_drug_dict = {}
    # loop through each disease in the set
    for disease in tqdm(diseases, desc='Main iterator', total=len(diseases)):
        cn = []
        # for each disease, find the similarity score with for each drug and append to list
        for drug in drugs:
            if similarity_type == 'cn':
                similarity,_,_ = shared_khop(disease, drug,undirected_graph , 1)
            elif similarity_type == 'cos':
                shared, nodeA_neighbor, nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (math.sqrt(nodeA_neighbor * nodeB_neighbor))
            elif similarity_type == 'ji':
                shared,_,_ = shared_khop(disease, drug,undirected_graph , 1)
                total,_,_ = total_khop(disease, drug,undirected_graph , 1)
                similarity = shared/total
            elif similarity_type == 'si':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = (shared * 2) / (nodeA_neighbor + nodeB_neighbor)
            elif similarity_type == 'hpi':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / min(nodeA_neighbor, nodeB_neighbor) 
            elif similarity_type == 'hdi':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / max(nodeA_neighbor, nodeB_neighbor) 
            elif similarity_type == 'lhn':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (nodeA_neighbor * nodeB_neighbor) 
            elif similarity_type == 'pa':
                _,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = nodeA_neighbor * nodeB_neighbor
            elif similarity_type == 'aa':
                shared_list = shared_khop_list(disease, drug,undirected_graph , 1)
                similarity = 0
                for n in shared_list:
                    neighbors_list = [i for i in undirected_graph.neighbors(n)]
                    similarity += 1/math.log10(len(neighbors_list))
            elif similarity_type == 'ra':
                shared_list = shared_khop_list(disease, drug,undirected_graph , 1)
                similarity = 0
                for n in shared_list:
                    neighbors_list = [i for i in undirected_graph.neighbors(n)]
                    similarity += 1/len(neighbors_list)
            cn.append(similarity)
        index = np.where(cn == np.amax(cn))
        # if list is full of 0's (i.e sum == 0), then there are no shared neighbors 
        if np.sum(cn) == 0 or len(np.array(cn)[index]) > 1:
            disease_drug_dict[disease] = 'none'          
        else:
            # add list of drugs with max # of shared nodes to specified key
            disease_drug_dict[disease] = np.array(list(drugs))[index]
            # create a df
            for i in index:
                for j in i:
                    df = df.append({'Source': disease, 'Target': list(drugs)[j], sim_scores[similarity_type]: cn[j]}, ignore_index=True)
    return disease_drug_dict, df

# Load Graph and Gold Standard

In [215]:
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t'
)
openbiolink_df.rename(columns={'relation': 'polarity'}, inplace=True)

custom_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t'
)
custom_df.rename(columns={'relation': 'polarity'}, inplace=True)

In [216]:
custom_df.head()

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:667468,ncbigene:147,-1
2,pubchem.compound:4011,ncbigene:1133,-1
3,pubchem.compound:4636,ncbigene:148,1
4,pubchem.compound:2083,ncbigene:154,1


In [217]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')) as file:
    clinical_pair_dict = json.load(file).keys()

In [218]:
gold_standard_dict = {}
for i in clinical_pair_dict:
    split = i.split("_")
    if split[1] in gold_standard_dict:
        gold_standard_dict[split[1]].append(split[0])
    else:
        gold_standard_dict[split[1]] = [split[0]]

In [219]:
graph_openbio = create_graph_from_df(openbiolink_df)
graph_custom = create_graph_from_df(custom_df)

In [220]:
len(graph_custom)

8610

In the cell below, create a set of diseases and drugs in each graph

In [221]:
disease_openbio = []
drug_openbio = []
for u,v in graph_openbio.to_undirected().edges():
    if v.startswith('mondo'):
        disease_openbio.append(v)
    if u.startswith('mondo'):
        disease_openbio.append(u)
    if v.startswith('pubchem.'):
        drug_openbio.append(v)
    if u.startswith('pubchem.'):
        drug_openbio.append(u)
        
# sets of diseases/drugs for openbio      
disease_openbio = set(disease_openbio)
drug_openbio = set(drug_openbio)


disease_custom = []
drug_custom = []

for u,v in graph_custom.to_undirected().edges():
    if v.startswith('mondo'):
        disease_custom.append(v)
    if u.startswith('mondo'):
        disease_custom.append(u)
    if u.startswith('pubchem.'):
        drug_custom.append(u)
    if v.startswith('pubchem.'):
        drug_custom.append(v)
        
# sets of diseases/drugs for custom KG      
disease_custom = set(disease_custom)
drug_custom = set(drug_custom)

# Common Neighbors (CN)

In [222]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected())

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 114.65it/s]


In [223]:
df_openbio

Unnamed: 0,Source,Target,Common Neighbors
0,mondo:0005190,pubchem.compound:445154,1
1,mondo:0010726,pubchem.compound:446220,1
2,mondo:0017610,pubchem.compound:5282379,1
3,mondo:0005101,pubchem.compound:65399,1
4,mondo:0009265,pubchem.compound:2132,1
5,mondo:0007926,pubchem.compound:445154,1
6,mondo:0001347,pubchem.compound:145068,2
7,mondo:0007959,pubchem.compound:5757,2
8,mondo:0005011,pubchem.compound:65399,1
9,mondo:0020121,pubchem.compound:145068,2


In [224]:
# Accuracy for openbiolink KG
cn_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Common Neighbors'] = [cn_acc1]
cn_acc1

0/156


0.0

In [225]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected())

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 358.61it/s]


In [226]:
df_custom

Unnamed: 0,Source,Target,Common Neighbors
0,mondo:0005068,pubchem.compound:44093,2
1,mondo:0008678,pubchem.compound:44462760,1
2,mondo:0007959,pubchem.compound:24775005,1
3,mondo:0018305,pubchem.compound:5360696,3
4,mondo:0007256,pubchem.compound:11626560,1


In [227]:
# Accuracy for custom KG
cn_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Common Neighbors'].append(cn_acc2)
kg_dfs['Common Neighbors'] = [df_openbio,df_custom]
cn_acc2

0/161


0.0

# Cosine Similarity 

In [228]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 73.14it/s]


In [229]:
df_openbio

Unnamed: 0,Source,Target,Cosine Similiarity
0,mondo:0005148,pubchem.compound:5280704,0.408248
1,mondo:0011122,pubchem.compound:4308,0.707107
2,mondo:0019004,pubchem.compound:5382,0.353553
3,mondo:0010150,pubchem.compound:3712,0.188982
4,mondo:0011776,pubchem.compound:5388961,0.316228
5,mondo:0010679,pubchem.compound:6741,0.353553
6,mondo:0005015,pubchem.compound:5280704,0.408248
7,mondo:0007947,pubchem.compound:2471,0.166667
8,mondo:0011719,pubchem.compound:2048,0.707107
9,mondo:0005029,pubchem.compound:16722836,0.57735


In [230]:
# Accuracy for openbiolink KG
cs_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Cosine Similiarity'] = [cs_acc1]
cs_acc1

4/84


4.76

In [231]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 290.31it/s]


In [232]:
# Accuracy for custom KG
cs_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Cosine Similiarity'].append(cs_acc2)
kg_dfs['Cosine Similiarity'] = [df_openbio,df_custom]
cs_acc2

1/146


0.68

In [233]:
df_custom

Unnamed: 0,Source,Target,Cosine Similiarity
0,mondo:0011719,pubchem.compound:9829523,0.408248
1,mondo:0005105,pubchem.compound:11707110,0.353553
2,mondo:0018874,pubchem.compound:16722836,0.280056
3,mondo:0005575,pubchem.compound:42611257,0.242536
4,mondo:0008170,pubchem.compound:42611257,0.218218
5,mondo:0008678,pubchem.compound:44462760,0.25
6,mondo:0007959,pubchem.compound:24775005,0.447214
7,mondo:0018305,pubchem.compound:5360696,0.387298
8,mondo:0007254,pubchem.compound:6013,0.235702
9,mondo:0009061,pubchem.compound:2471,0.408248


# Jaccard Index

In [234]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 39.61it/s]


In [235]:
df_openbio.sort_values(by=sim_scores['ji'])

Unnamed: 0,Source,Target,Jaccard index
22,mondo:0007926,pubchem.compound:445154,0.006757
10,mondo:0005190,pubchem.compound:445154,0.006757
12,mondo:0010726,pubchem.compound:446220,0.018182
29,mondo:0004985,pubchem.compound:3386,0.028571
32,mondo:0007739,pubchem.compound:5711,0.055556
3,mondo:0010150,pubchem.compound:3712,0.066667
21,mondo:0002447,pubchem.compound:3712,0.066667
40,mondo:0008315,pubchem.compound:3712,0.066667
41,mondo:0011962,pubchem.compound:3712,0.066667
24,mondo:0005159,pubchem.compound:3712,0.071429


In [236]:
# Accuracy for openbiolink KG
ji_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Jaccard Index'] = [ji_acc1]
ji_acc1

4/84


4.76

In [237]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 164.92it/s]


In [238]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0011719,pubchem.compound:9829523,0.25
1,mondo:0005105,pubchem.compound:11707110,0.125
2,mondo:0018874,pubchem.compound:16722836,0.111111
3,mondo:0005068,pubchem.compound:44093,0.2
4,mondo:0005575,pubchem.compound:42611257,0.058824


In [239]:
df_custom.shape

(14, 3)

In [240]:
# Accuracy for custom KG
ji_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Jaccard Index'].append(ji_acc2)
kg_dfs['Jaccard Index'] = [df_openbio,df_custom]
ji_acc2

1/137


0.73

# Sorensen Index 

In [241]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'si')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 73.26it/s]


In [242]:
df_openbio

Unnamed: 0,Source,Target,Sorensen index
0,mondo:0005148,pubchem.compound:5280704,0.285714
1,mondo:0011122,pubchem.compound:4308,0.666667
2,mondo:0019004,pubchem.compound:5382,0.333333
3,mondo:0010150,pubchem.compound:3712,0.125
4,mondo:0011776,pubchem.compound:5388961,0.181818
5,mondo:0010679,pubchem.compound:6741,0.222222
6,mondo:0005015,pubchem.compound:5280704,0.285714
7,mondo:0007947,pubchem.compound:2471,0.133333
8,mondo:0011719,pubchem.compound:2048,0.666667
9,mondo:0005029,pubchem.compound:16722836,0.5


In [243]:
# Accuracy for openbiolink KG
si_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Sorensen Index'] = [si_acc1]
si_acc1

4/84


4.76

In [244]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 121.14it/s]


In [245]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0011719,pubchem.compound:9829523,0.25
1,mondo:0005105,pubchem.compound:11707110,0.125
2,mondo:0018874,pubchem.compound:16722836,0.111111
3,mondo:0005068,pubchem.compound:44093,0.2
4,mondo:0005575,pubchem.compound:42611257,0.058824


In [246]:
df_custom.shape

(14, 3)

In [247]:
# Accuracy for custom KG
si_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Sorensen Index'].append(si_acc2)
kg_dfs['Sorensen Index'] = [df_openbio,df_custom]
si_acc2

1/137


0.73

# Hub Promoted Index

In [248]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'hpi')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 61.57it/s]


In [249]:
df_openbio

Unnamed: 0,Source,Target,Hub Promoted Index
0,mondo:0005148,pubchem.compound:5280704,1.0
1,mondo:0011122,pubchem.compound:4308,1.0
2,mondo:0005015,pubchem.compound:5280704,1.0
3,mondo:0005029,pubchem.compound:16722836,1.0
4,mondo:0005190,pubchem.compound:445154,1.0
5,mondo:0010726,pubchem.compound:446220,1.0
6,mondo:0017610,pubchem.compound:5282379,0.5
7,mondo:0005101,pubchem.compound:65399,1.0
8,mondo:0009265,pubchem.compound:2132,1.0
9,mondo:0005180,pubchem.compound:2132,0.5


In [250]:
# Accuracy for openbiolink KG
hpi_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Promoted Index'] = [hpi_acc1]

1/138


In [251]:
hpi_acc1

0.72

In [252]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'hpi')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 319.22it/s]


In [253]:
df_custom

Unnamed: 0,Source,Target,Hub Promoted Index
0,mondo:0018874,pubchem.compound:16722836,0.666667
1,mondo:0005068,pubchem.compound:5362129,1.0
2,mondo:0005575,pubchem.compound:42611257,1.0
3,mondo:0008170,pubchem.compound:42611257,1.0
4,mondo:0008678,pubchem.compound:44462760,0.25
5,mondo:0007959,pubchem.compound:24775005,1.0
6,mondo:0018305,pubchem.compound:5360696,1.0
7,mondo:0009061,pubchem.compound:2471,0.5
8,mondo:0007256,pubchem.compound:11626560,0.5


In [254]:
# Accuracy for custom KG
hpi_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Promoted Index'].append(hpi_acc2)
kg_dfs['Hub Promoted Index'] = [df_openbio,df_custom]
hpi_acc2

1/149


0.67

# Hub Depressed Index

In [255]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'hdi')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 87.75it/s]


In [256]:
df_openbio

Unnamed: 0,Source,Target,Hub Depressed Index
0,mondo:0011122,pubchem.compound:4308,0.5
1,mondo:0019004,pubchem.compound:5382,0.25
2,mondo:0010150,pubchem.compound:3712,0.071429
3,mondo:0011776,pubchem.compound:5388961,0.1
4,mondo:0010679,pubchem.compound:6741,0.125
5,mondo:0007947,pubchem.compound:2471,0.083333
6,mondo:0005029,pubchem.compound:16722836,0.333333
7,mondo:0005190,pubchem.compound:445154,0.006757
8,mondo:0005061,pubchem.compound:44462760,0.142857
9,mondo:0010726,pubchem.compound:446220,0.018182


In [257]:
# Accuracy for openbiolink KG
hdi_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Depressed Index'] = [hdi_acc1]
hdi_acc1

4/96


4.17

In [258]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'hdi')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 296.45it/s]


In [259]:
df_custom

Unnamed: 0,Source,Target,Hub Depressed Index
0,mondo:0011719,pubchem.compound:9829523,0.333333
1,mondo:0005068,pubchem.compound:44093,0.25
2,mondo:0005180,pubchem.compound:146570,0.133333
3,mondo:0008678,pubchem.compound:44462760,0.25
4,mondo:0007959,pubchem.compound:24775005,0.2
5,mondo:0018305,pubchem.compound:5360696,0.15
6,mondo:0011996,pubchem.compound:644241,0.5
7,mondo:0007256,pubchem.compound:11626560,0.125


In [260]:
# Accuracy for custom KG
hdi_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Hub Depressed Index'].append(hdi_acc2)
kg_dfs['Hub Depressed Index'] = [df_openbio,df_custom]
hdi_acc2

0/152


0.0

# Leicht–Holme–Newman Index

In [261]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'lhn')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 58.33it/s]


In [262]:
df_openbio

Unnamed: 0,Source,Target,Leicht–Holme–Newman Index
0,mondo:0005148,pubchem.compound:5280704,0.166667
1,mondo:0011122,pubchem.compound:4308,0.5
2,mondo:0019004,pubchem.compound:5382,0.125
3,mondo:0010150,pubchem.compound:3712,0.035714
4,mondo:0011776,pubchem.compound:5388961,0.1
5,mondo:0010679,pubchem.compound:6741,0.125
6,mondo:0005015,pubchem.compound:5280704,0.166667
7,mondo:0007947,pubchem.compound:2471,0.027778
8,mondo:0011719,pubchem.compound:2048,0.5
9,mondo:0005029,pubchem.compound:16722836,0.333333


In [263]:
# Accuracy for openbiolink KG
lhn_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Leicht–Holme–Newman Index'] = [lhn_acc1]
lhn_acc1

4/84


4.76

In [264]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'lhn')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 202.49it/s]


In [265]:
df_custom

Unnamed: 0,Source,Target,Leicht–Holme–Newman Index
0,mondo:0011719,pubchem.compound:644241,0.125
1,mondo:0018874,pubchem.compound:16722836,0.039216
2,mondo:0005068,pubchem.compound:5362129,0.125
3,mondo:0005575,pubchem.compound:42611257,0.058824
4,mondo:0008170,pubchem.compound:42611257,0.047619
5,mondo:0008678,pubchem.compound:44462760,0.0625
6,mondo:0007959,pubchem.compound:24775005,0.2
7,mondo:0018305,pubchem.compound:5360696,0.05
8,mondo:0011996,pubchem.compound:644241,0.25
9,mondo:0009061,pubchem.compound:2471,0.166667


In [266]:
# Accuracy for custom KG
lhn_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Leicht–Holme–Newman Index'].append(lhn_acc2)
kg_dfs['Leicht–Holme–Newman Index'] = [df_openbio,df_custom]
lhn_acc2

1/143


0.7

# Preferential Attachment

In [267]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'pa')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 58.48it/s]


In [268]:
df_openbio

Unnamed: 0,Source,Target,Preferential Attachment
0,mondo:0019154,pubchem.compound:448537,246
1,mondo:0005148,pubchem.compound:448537,1476
2,mondo:0011122,pubchem.compound:448537,492
3,mondo:0011382,pubchem.compound:448537,246
4,mondo:0019004,pubchem.compound:448537,492
...,...,...,...
67,mondo:0008903,pubchem.compound:448537,246
68,mondo:0009061,pubchem.compound:448537,246
69,mondo:0010311,pubchem.compound:448537,246
70,mondo:0005439,pubchem.compound:448537,246


In [269]:
# Accuracy for openbiolink KG
pa_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Preferential Attachment'] = [pa_acc1]
pa_acc1

1/45


2.22

In [270]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'pa')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 159.93it/s]


In [271]:
df_custom.head()

Unnamed: 0,Source,Target,Preferential Attachment
0,mondo:0019154,pubchem.compound:2170,26
1,mondo:0011122,pubchem.compound:2170,390
2,mondo:0011382,pubchem.compound:2170,26
3,mondo:0019004,pubchem.compound:2170,130
4,mondo:0010150,pubchem.compound:2170,26


In [272]:
df_custom.shape

(59, 3)

In [273]:
# Accuracy for custom KG
pa_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Preferential Attachment'].append(pa_acc2)
kg_dfs['Preferential Attachment'] = [df_openbio,df_custom]
pa_acc2

0/44


0.0

# Adamic-Adar

In [274]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'aa')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 94.61it/s] 


In [275]:
df_openbio

Unnamed: 0,Source,Target,Adamic-Adar
0,mondo:0005148,pubchem.compound:451668,3.091641
1,mondo:0005015,pubchem.compound:451668,3.091641
2,mondo:0007947,pubchem.compound:5284616,1.430677
3,mondo:0005190,pubchem.compound:445154,0.79664
4,mondo:0010726,pubchem.compound:446220,3.321928
5,mondo:0017610,pubchem.compound:5282379,3.321928
6,mondo:0005101,pubchem.compound:65399,2.095903
7,mondo:0009265,pubchem.compound:2132,2.095903
8,mondo:0007926,pubchem.compound:445154,0.79664
9,mondo:0008170,pubchem.compound:3385,3.869551


In [276]:
# Accuracy for openbiolink KG
aa_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Adamic-Adar'] = [aa_acc1]
aa_acc1

1/144


0.69

In [277]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'aa')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 166.76it/s]


In [278]:
df_custom

Unnamed: 0,Source,Target,Adamic-Adar
0,mondo:0005068,pubchem.compound:44093,1.816293
1,mondo:0005575,pubchem.compound:20279,1.660964
2,mondo:0008678,pubchem.compound:44462760,0.715338
3,mondo:0007959,pubchem.compound:24775005,0.642549
4,mondo:0004979,pubchem.compound:44093,0.632998
5,mondo:0018305,pubchem.compound:5360696,2.391836
6,mondo:0007254,pubchem.compound:24826799,1.038681
7,mondo:0011962,pubchem.compound:24826799,0.511707
8,mondo:0008903,pubchem.compound:11626560,1.261382
9,mondo:0007256,pubchem.compound:11626560,0.49276


In [279]:
# Accuracy for custom KG
aa_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Adamic-Adar'].append(aa_acc2)
kg_dfs['Adamic-Adar'] = [df_openbio,df_custom]

0/155


In [280]:
aa_acc2

0.0

# Resource Allocation Index

In [281]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ra')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 91.70it/s]


In [282]:
df_openbio

Unnamed: 0,Source,Target,Resource Allocation Index
0,mondo:0005148,pubchem.compound:451668,0.45
1,mondo:0005015,pubchem.compound:451668,0.45
2,mondo:0007947,pubchem.compound:5284616,0.2
3,mondo:0005190,pubchem.compound:445154,0.055556
4,mondo:0010726,pubchem.compound:446220,0.5
5,mondo:0017610,pubchem.compound:5282379,0.5
6,mondo:0005101,pubchem.compound:65399,0.333333
7,mondo:0009265,pubchem.compound:2132,0.333333
8,mondo:0007926,pubchem.compound:445154,0.055556
9,mondo:0008170,pubchem.compound:3385,0.514925


In [283]:
# Accuracy for openbiolink KG
ra_acc1 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Resource Allocation Index'] = [ra_acc1]
ra_acc1

1/147


0.68

In [284]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ra')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 321.35it/s]


In [285]:
df_custom

Unnamed: 0,Source,Target,Resource Allocation Index
0,mondo:0005068,pubchem.compound:44093,0.169173
1,mondo:0005575,pubchem.compound:20279,0.25
2,mondo:0008678,pubchem.compound:44462760,0.04
3,mondo:0007959,pubchem.compound:24775005,0.027778
4,mondo:0004979,pubchem.compound:44093,0.026316
5,mondo:0018305,pubchem.compound:5360696,0.167677
6,mondo:0007254,pubchem.compound:9931954,0.03125
7,mondo:0011962,pubchem.compound:24826799,0.011111
8,mondo:0008903,pubchem.compound:11626560,0.059346
9,mondo:0007256,pubchem.compound:11626560,0.009346


In [286]:
# Accuracy for custom KG
ra_acc2 = get_accuracy(gold_standard_dict, disease_drug_dict)
score_actual['Resource Allocation Index'].append(ra_acc2)
kg_dfs['Resource Allocation Index'] = [df_openbio,df_custom]
ra_acc2

0/155


0.0

In [287]:
# Diseases in KG that are not in Gold-Standard
set(notin)

{'mondo:0001056',
 'mondo:0002046',
 'mondo:0002429',
 'mondo:0002447',
 'mondo:0004989',
 'mondo:0005005',
 'mondo:0005011',
 'mondo:0005061',
 'mondo:0005075',
 'mondo:0005086',
 'mondo:0005159',
 'mondo:0005190',
 'mondo:0005192',
 'mondo:0006486',
 'mondo:0007254',
 'mondo:0008903',
 'mondo:0009265',
 'mondo:0009267',
 'mondo:0009623',
 'mondo:0009831',
 'mondo:0010150',
 'mondo:0010311',
 'mondo:0010421',
 'mondo:0010626',
 'mondo:0011776',
 'mondo:0011962',
 'mondo:0015917',
 'mondo:0019154'}

In [288]:
# final precisions
score_actual

{'Common Neighbors': [0.0, 0.0],
 'Cosine Similiarity': [4.76, 0.68],
 'Jaccard Index': [4.76, 0.73],
 'Sorensen Index': [4.76, 0.73],
 'Hub Promoted Index': [0.72, 0.67],
 'Hub Depressed Index': [4.17, 0.0],
 'Leicht–Holme–Newman Index': [4.76, 0.7],
 'Preferential Attachment': [2.22, 0.0],
 'Adamic-Adar': [0.69, 0.0],
 'Resource Allocation Index': [0.68, 0.0]}

In [289]:
openbio_pairs = [kg_dfs[i][0].shape[0] 
 for i in kg_dfs.keys()]
custom_pairs = [kg_dfs[i][1].shape[0] 
 for i in kg_dfs.keys()]

In [290]:
data = {'Openbio Shape': openbio_pairs, 'Custom KG Shape': custom_pairs}
frame = pd.DataFrame(index = score_actual.keys(), columns=['Openbio Shape', 'Custom KG Shape'], data=data)

In [291]:
frame

Unnamed: 0,Openbio Shape,Custom KG Shape
Common Neighbors,12,5
Cosine Similiarity,45,11
Jaccard Index,45,14
Sorensen Index,45,14
Hub Promoted Index,18,9
Hub Depressed Index,42,8
Leicht–Holme–Newman Index,45,11
Preferential Attachment,72,59
Adamic-Adar,16,10
Resource Allocation Index,15,10


In [292]:
print('Total openbio pairs:',np.sum(openbio_pairs))
print('Total custom KG pairs:',np.sum(custom_pairs))

Total openbio pairs: 355
Total custom KG pairs: 151
