# Setting Up Environment

In [3]:
import os
import json
import math
import logging
import pandas as pd
from collections import Counter, defaultdict
from tqdm import tqdm
from itertools import product
from typing import Mapping, List
import networkx as nx
import numpy as np
from tqdm import tqdm
DATA_DIR = '../../data'
KG_DATA_PATH = os.path.join(DATA_DIR, 'kg')

# Helper Functions

In [109]:
sim_scores = {'cn': 'Common Neighbors', 'cos': 'Cosine Similiarity', 'ji': 'Jaccard index', 'si': 'Sorensen index',
             'hpi': 'Hub Promoted Index', 'hdi': 'Hub Depressed Index', 'lhn': 'Leicht–Holme–Newman Index', 'pa':'Preferential Attachment',
             'aa': 'Adamic-Adar', 'ra': 'Resource Allocation Index', 'lp': 'Local Path Index', 'kg': 'Katz global path indicator',
             'act': 'Average Commute Time'}
def create_graph_from_df(
    graph_df
) -> nx.DiGraph:
    """Create fully connected graph from dataframe."""
    graph = nx.DiGraph()

    for sub_name, obj_name, relation in graph_df.values:
        # Store edge in the graph
        graph.add_edge(
            sub_name,
            obj_name,
            polarity=relation,
        )


    connected_components_subgraph = [
        component
        for component in sorted(
            nx.connected_components(
                graph.to_undirected()
            ),
            key=len,
            reverse=True
        )
    ]

    final_subgraph = graph.subgraph(connected_components_subgraph[0])

    return final_subgraph


def shared_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A & khop_B)), len(khop_A), len(khop_B)

def total_khop(nodeA: str, nodeB: str, graph: nx.Graph, k: int) -> int:
    # find nodes within distance k
    traversalA = nx.bfs_edges(graph, source=nodeA, depth_limit=k)
    traversalB = nx.bfs_edges(graph, source=nodeB, depth_limit=k)
    khop_A = set([v for u, v in traversalA])
    khop_B = set([v for u, v in traversalB])
    # return shared number of k-neighbors
    return len(list(khop_A | khop_B)), len(khop_A), len(khop_B)

# This function will change once the KG and clinical pairs are updated
def get_accuracy(dict_actual, dict_predicted): 
    pos,total = 0,0
    for key in dict_predicted.keys():
        if key in dict_actual:
            for drug in dict_predicted[key]:
                if drug in dict_actual[key]:
                    pos+=1
                total+=1
            
    return pos/total


def get_dict_df(diseases, drugs, undirected_graph, similarity_type = 'cn'):
    df = pd.DataFrame(columns=[
        'Source',
        'Target',
        sim_scores[similarity_type]
    ])
    disease_drug_dict = {}
    # loop through each disease in the set
    for disease in tqdm(diseases, desc='Main iterator', total=len(diseases)):
        cn = []
        # for each disease, find the similarity score with for each drug and append to list
        for drug in drugs:
            if similarity_type == 'cn':
                similarity,_,_ = shared_khop(disease, drug,undirected_graph , 1)
            elif similarity_type == 'cos':
                shared, nodeA_neighbor, nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = shared / (math.sqrt(nodeA_neighbor * nodeB_neighbor))
            elif similarity_type == 'ji':
                shared,_,_ = shared_khop(disease, drug,undirected_graph , 1)
                total,_,_ = total_khop(disease, drug,undirected_graph , 1)
                similarity = shared/total
            elif similarity_type == 'si':
                shared,nodeA_neighbor,nodeB_neighbor = shared_khop(disease, drug,undirected_graph , 1)
                similarity = (shared * 2) / (nodeA_neighbor + nodeB_neighbor)
            cn.append(similarity)
        # if list is full of 0's (i.e sum == 0), then there are no shared neighbors 
        if np.sum(cn) == 0:
            disease_drug_dict[disease] = 'none'
        else:
            index = np.where(cn == np.amax(cn))
            # add list of drugs with max # of shared nodes to specified key
            disease_drug_dict[disease] = np.array(list(drugs))[index]
            # create a df
            for i in index:
                for j in i:
                    df = df.append({'Source': disease, 'Target': list(drugs)[j], sim_scores[similarity_type]: cn[j]}, ignore_index=True)
    return disease_drug_dict, df

# Load Graph and Gold Standard

In [83]:
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t'
)
openbiolink_df.rename(columns={'relation': 'polarity'}, inplace=True)

custom_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t'
)
custom_df.rename(columns={'relation': 'polarity'}, inplace=True)

In [84]:
custom_df.head()

Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:667468,ncbigene:147,-1
2,pubchem.compound:4011,ncbigene:1133,-1
3,pubchem.compound:4636,ncbigene:148,1
4,pubchem.compound:2083,ncbigene:154,1


In [85]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'clinical_pairs.json')) as file:
    clinical_pair_dict = json.load(file).keys()

In [86]:
gold_standard_dict = {}
for i in clinical_pair_dict:
    split = i.split("_")
    if split[1] in gold_standard_dict:
        gold_standard_dict[split[1]].append(split[0])
    else:
        gold_standard_dict[split[1]] = [split[0]]

In [87]:
graph_openbio = create_graph_from_df(openbiolink_df)
graph_custom = create_graph_from_df(custom_df)

In [88]:
len(graph_custom)

8610

In the cell below, create a set of diseases and drugs in each graph

In [89]:
disease_openbio = []
drug_openbio = []
for u,v in graph_openbio.to_undirected().edges():
    if v.startswith('mondo'):
        disease_openbio.append(v)
    if u.startswith('mondo'):
        disease_openbio.append(u)
    if v.startswith('pubchem.'):
        drug_openbio.append(v)
    if u.startswith('pubchem.'):
        drug_openbio.append(u)
        
# sets of diseases/drugs for openbio      
disease_openbio = set(disease_openbio)
drug_openbio = set(drug_openbio)


disease_custom = []
drug_custom = []

for u,v in graph_custom.to_undirected().edges():
    if v.startswith('mondo'):
        disease_custom.append(v)
    if u.startswith('mondo'):
        disease_custom.append(u)
    if u.startswith('pubchem.'):
        drug_custom.append(u)
    if v.startswith('pubchem.'):
        drug_custom.append(v)
        
# sets of diseases/drugs for custom KG      
disease_custom = set(disease_custom)
drug_custom = set(drug_custom)

# Common Neighbors (CN)

In [90]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected())

Main iterator: 100%|██████████| 72/72 [00:02<00:00, 25.14it/s]


In [91]:
df_openbio.sort_values(by= sim_scores['cn'], ascending=False).head()

Unnamed: 0,Source,Target,Common Neighbors
98,mondo:0007254,pubchem.compound:702,3
99,mondo:0007254,pubchem.compound:54454,3
100,mondo:0007254,pubchem.compound:5757,3
101,mondo:0007254,pubchem.compound:448537,3
102,mondo:0007254,pubchem.compound:449459,3


In [92]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.09534883720930233

In [93]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected())

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 98.57it/s] 


In [94]:
df_custom.sort_values(by=sim_scores['cn'], ascending=False).head()

Unnamed: 0,Source,Target,Common Neighbors
0,mondo:0004985,pubchem.compound:60795,3
125,mondo:0018305,pubchem.compound:5360696,3
1,mondo:0004985,pubchem.compound:2170,3
40,mondo:0011719,pubchem.compound:24826799,2
71,mondo:0011996,pubchem.compound:5328940,2


In [95]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.11214953271028037

# Cosine Similarity 

In [96]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 71.89it/s]


In [97]:
df_openbio.sort_values(by=sim_scores['cos']).head(5)

Unnamed: 0,Source,Target,Cosine Similiarity
37,mondo:0005190,pubchem.compound:445154,0.082199
93,mondo:0007926,pubchem.compound:445154,0.082199
8,mondo:0004985,pubchem.compound:3386,0.121268
47,mondo:0010726,pubchem.compound:446220,0.13484
84,mondo:0005021,pubchem.compound:6741,0.144338


In [98]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [99]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'cos')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 151.90it/s]


In [100]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.0547945205479452

In [101]:
df_custom.head()

Unnamed: 0,Source,Target,Cosine Similiarity
0,mondo:0004985,pubchem.compound:32798,0.267261
1,mondo:0004985,pubchem.compound:31307,0.267261
2,mondo:0004985,pubchem.compound:5311067,0.267261
3,mondo:0004985,pubchem.compound:6714002,0.267261
4,mondo:0004985,pubchem.compound:247839,0.267261


# Jaccard Index

In [102]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 72/72 [00:01<00:00, 43.37it/s]


In [103]:
df_openbio.sort_values(by=sim_scores['ji'])

Unnamed: 0,Source,Target,Jaccard index
37,mondo:0005190,pubchem.compound:445154,0.006757
93,mondo:0007926,pubchem.compound:445154,0.006757
47,mondo:0010726,pubchem.compound:446220,0.018182
8,mondo:0004985,pubchem.compound:3386,0.028571
31,mondo:0007739,pubchem.compound:5711,0.055556
...,...,...,...
50,mondo:0004989,pubchem.compound:4764,1.000000
49,mondo:0004989,pubchem.compound:4817,1.000000
48,mondo:0004989,pubchem.compound:302576,1.000000
54,mondo:0006639,pubchem.compound:302576,1.000000


In [104]:
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [105]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 111.13it/s]


In [107]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0004985,pubchem.compound:5311507,0.105263
1,mondo:0007254,pubchem.compound:6013,0.055556
2,mondo:0005068,pubchem.compound:44093,0.2
3,mondo:0005133,pubchem.compound:3902,0.333333
4,mondo:0005133,pubchem.compound:13109,0.333333


In [108]:
get_accuracy(gold_standard_dict, disease_drug_dict)

0.04580152671755725

# Sorensen Index 

In [114]:
disease_drug_dict, df_openbio = get_dict_df(disease_openbio, drug_openbio, graph_openbio.to_undirected(), similarity_type = 'si')

Main iterator: 100%|██████████| 72/72 [00:00<00:00, 78.65it/s]


In [122]:
df_openbio.head(10)

Unnamed: 0,Source,Target,Sorensen index
0,mondo:0009831,pubchem.compound:302576,1.0
1,mondo:0009831,pubchem.compound:4817,1.0
2,mondo:0009831,pubchem.compound:4764,1.0
3,mondo:0009831,pubchem.compound:216345,1.0
4,mondo:0009831,pubchem.compound:10172943,1.0
5,mondo:0009831,pubchem.compound:10022508,1.0
6,mondo:0015517,pubchem.compound:9949093,0.4
7,mondo:0005101,pubchem.compound:65399,0.4
8,mondo:0004985,pubchem.compound:3386,0.055556
9,mondo:0005005,pubchem.compound:42642645,0.5


In [123]:
# Accuracy for openbiolink KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.05813953488372093

In [124]:
disease_drug_dict, df_custom = get_dict_df(disease_custom, drug_custom, graph_custom.to_undirected(), similarity_type = 'ji')

Main iterator: 100%|██████████| 59/59 [00:00<00:00, 103.22it/s]


In [125]:
df_custom.head()

Unnamed: 0,Source,Target,Jaccard index
0,mondo:0004985,pubchem.compound:5311507,0.105263
1,mondo:0007254,pubchem.compound:6013,0.055556
2,mondo:0005068,pubchem.compound:44093,0.2
3,mondo:0005133,pubchem.compound:3902,0.333333
4,mondo:0005133,pubchem.compound:13109,0.333333


In [126]:
# Accuracy for custom KG
get_accuracy(gold_standard_dict, disease_drug_dict)

0.04580152671755725