# Benchmark analysis

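This notebook compares several benchmark link-prediction methods on two knowledge graphs (KGs), OpenBioLink and the custom KG, by checking each method's top drug-disease predictions against clinical-trial pairs.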

# Imports

In [1]:
import os
import json
import math
import logging
import networkx as nx
import numpy as np
import pandas as pd

from collections import Counter, defaultdict
from itertools import product
from typing import Mapping, List
from tqdm import tqdm

from utils import DATA_DIR, KG_DATA_PATH, create_graph_from_df

# Load Graph and Gold Standard

In [2]:
# Both filtered KGs are stored as tab-separated files under KG_DATA_PATH.
openbiolink_path = os.path.join(KG_DATA_PATH, 'openbiolink_filtered_kg.tsv')
custom_path = os.path.join(KG_DATA_PATH, 'custom_filtered_kg.tsv')

openbiolink_df = pd.read_csv(openbiolink_path, sep='\t')
custom_df = pd.read_csv(custom_path, sep='\t')


# Load clinical data

In [3]:
# Only the keys of the gold-standard JSON are kept; later cells test
# '<drug>_<disease>' strings for membership in this keys view.
clinical_trial_path = os.path.join(DATA_DIR, 'gold-standard', 'clinical-trial.json')

with open(clinical_trial_path) as file:
    clinical_pair_dict = json.load(file).keys()

# Create KG 

In [4]:
# Build one graph per KG data frame: OpenBioLink first, then the custom KG.
graph_openbio, graph_custom = (
    create_graph_from_df(kg_df) for kg_df in (openbiolink_df, custom_df)
)

Report on the number of relations: {-1: 12477, 1: 29022}
Report on the number of relations: {1: 43578, -1: 8045}


# Create drug and disease node sets specific to each KG

In [5]:
# Nodes are classified by identifier prefix: 'mondo' marks a disease and
# 'pubchem' marks a drug; any other node is ignored.
disease_openbio = {node for node in graph_openbio.nodes() if node.startswith('mondo')}
drug_openbio = {node for node in graph_openbio.nodes() if node.startswith('pubchem')}

disease_custom = {node for node in graph_custom.nodes() if node.startswith('mondo')}
drug_custom = {node for node in graph_custom.nodes() if node.startswith('pubchem')}

# Helper Functions

In [6]:
# Filled by the scoring loop: algorithm display name -> precision summary.
score_actual = dict()
# NOTE(review): never populated in the cells visible here.
kg_dfs = dict()

In [7]:
def khop(
    nodeA: str,
    nodeB: str,
    graph: nx.Graph,
    total: bool
) -> tuple:
    """Combine the direct neighbours of two nodes.

    :param nodeA: first node; must be present in ``graph``
    :param nodeB: second node; must be present in ``graph``
    :param graph: graph whose ``neighbors`` method supplies the adjacency
    :param total: when truthy return the union of both neighbour sets,
        otherwise their intersection (the shared neighbours)
    :return: ``(combined, neighbours_of_A, neighbours_of_B)`` where
        ``combined`` is a list and the other two items are sets
    """
    neighbours_a = set(graph.neighbors(nodeA))
    neighbours_b = set(graph.neighbors(nodeB))

    combined = (
        neighbours_a.union(neighbours_b)
        if total
        else neighbours_a.intersection(neighbours_b)
    )
    return list(combined), neighbours_a, neighbours_b

In [8]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _pair_similarity(drug, disease, undirected_kg_graph, di_kg_graph, similarity_type):
    """Score one drug-disease pair with the requested method.

    Returns ``None`` when the pair cannot be scored, which only happens for
    'sp' when the directed graph has no path from the drug to the disease.
    """
    if similarity_type == 'sp':
        try:
            # Length of the node list, i.e. number of hops + 1.
            return len(nx.shortest_path(di_kg_graph, source=drug, target=disease))
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            return None

    shared_nodes, drug_neighbors, disease_neighbors = khop(
        nodeA=drug,
        nodeB=disease,
        graph=undirected_kg_graph,
        total=False,
    )
    n_shared = len(shared_nodes)
    n_drug = len(drug_neighbors)
    n_disease = len(disease_neighbors)

    if similarity_type == 'cn':
        return n_shared
    if similarity_type == 'cos':
        return _safe_ratio(n_shared, math.sqrt(n_drug * n_disease))
    if similarity_type == 'ji':
        return _safe_ratio(n_shared, len(drug_neighbors | disease_neighbors))
    if similarity_type == 'si':
        return _safe_ratio(2 * n_shared, n_drug + n_disease)
    if similarity_type == 'hpi':
        return _safe_ratio(n_shared, min(n_drug, n_disease))
    if similarity_type == 'hdi':
        return _safe_ratio(n_shared, max(n_drug, n_disease))
    if similarity_type == 'lhn':
        return _safe_ratio(n_shared, n_drug * n_disease)
    if similarity_type == 'pa':
        return n_drug * n_disease
    if similarity_type == 'aa':
        score = 0
        for node in shared_nodes:
            degree = len(set(undirected_kg_graph.neighbors(node)))
            # log10(1) == 0 would divide by zero; such a node contributes nothing.
            if degree > 1:
                score += 1 / math.log10(degree)
        return score
    if similarity_type == 'ra':
        score = 0
        for node in shared_nodes:
            degree = len(set(undirected_kg_graph.neighbors(node)))
            score += _safe_ratio(1, degree)
        return score

    raise ValueError(f'Unsupported similarity type: {similarity_type!r}')


def get_dict_df(
    diseases,
    drugs,
    undirected_kg_graph,
    di_kg_graph,
    similarity_type
):
    """Predict, for each disease, the single best-scoring drug.

    For every disease present in ``undirected_kg_graph`` each drug that is
    also present is scored with the method selected by ``similarity_type``
    (a key of the module-level ``sim_scores`` mapping). A disease yields one
    row only if exactly one drug achieves the best score; diseases whose
    scores are all zero, are tied for the best score, or have no scorable
    drug are skipped.

    :param diseases: iterable of disease node identifiers
    :param drugs: iterable of drug node identifiers
    :param undirected_kg_graph: undirected graph used by the neighbour-based scores
    :param di_kg_graph: graph used for the 'sp' (shortest path) score
    :param similarity_type: key of ``sim_scores`` naming the method
    :return: DataFrame with columns 'source' (drug), 'target' (disease) and
        the display name of the method (the score)
    :raises KeyError: if ``similarity_type`` is not a key of ``sim_scores``
    :raises ValueError: if ``similarity_type`` has no scoring rule
    """
    score_column = sim_scores[similarity_type]
    # 'sp' is a distance (lower is better); every other method is a similarity.
    pick_best = min if similarity_type == 'sp' else max

    kg_nodes = undirected_kg_graph.nodes()
    # Filter drugs once instead of re-checking graph membership per disease.
    known_drugs = [drug for drug in drugs if drug in kg_nodes]

    records = []

    for disease in diseases:
        # Skip diseases that are not present in the graph.
        if disease not in kg_nodes:
            continue

        # Keep the scored drugs next to their scores so a score index always
        # maps back to the drug it was computed for, even when drugs are skipped.
        candidates = []
        scores = []

        for drug in known_drugs:
            score = _pair_similarity(
                drug, disease, undirected_kg_graph, di_kg_graph, similarity_type
            )
            if score is None:
                continue
            candidates.append(drug)
            scores.append(score)

        # Nothing scorable, or every score is zero: no prediction.
        if not scores or sum(scores) == 0:
            continue

        best = pick_best(scores)
        winners = [position for position, value in enumerate(scores) if value == best]

        # Ambiguous (tied) best score: no prediction for this disease.
        if len(winners) != 1:
            continue

        records.append({
            'source': candidates[winners[0]],
            'target': disease,
            score_column: best,
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['source', 'target', score_column])

In [9]:
def get_precision(
    clinical_trial_dict: dict, 
    predicted: list
)-> tuple: 
    """Compute how many predicted pairs appear in the gold standard.

    :param clinical_trial_dict: container of gold-standard pair identifiers;
        anything supporting ``in`` works (a dict, its keys view, a set)
    :param predicted: predicted pair identifiers
    :return: ``(precision, hits, total)`` where ``precision`` is the
        percentage (0-100, rounded to 3 decimals) of ``predicted`` found in
        ``clinical_trial_dict``; ``(0.0, 0, 0)`` when ``predicted`` is empty
    """
    total = len(predicted)

    # No predictions at all: report 0% rather than dividing by zero.
    if total == 0:
        return 0.0, 0, 0

    pos = sum(1 for pair in predicted if pair in clinical_trial_dict)

    return round(((pos / total) * 100), 3), pos, total


# Value by chance for both KGs

In [10]:
# Count the drug-disease combinations of the OpenBioLink KG that are
# gold-standard clinical-trial pairs.
openbio_prob = 0

for disease, drug in product(disease_openbio, drug_openbio):
    trial = f'{drug}_{disease}'
    if trial in clinical_pair_dict:
        openbio_prob += 1

# Share of all possible drug-disease combinations found in the gold standard.
# NOTE(review): this is a fraction in [0, 1], whereas get_precision returns a
# percentage in [0, 100] - confirm the two are meant to be compared as-is.
total = len(disease_openbio) * len(drug_openbio)
prob = openbio_prob / total
open_val_by_chance = round(prob, 3)
open_val_by_chance

0.132

In [11]:
# Count the drug-disease combinations of the custom KG that are
# gold-standard clinical-trial pairs.
custom_prob = 0

for disease, drug in product(disease_custom, drug_custom):
    trial = f'{drug}_{disease}'
    if trial in clinical_pair_dict:
        custom_prob += 1

# Share of all possible drug-disease combinations found in the gold standard.
# NOTE(review): this is a fraction in [0, 1], whereas get_precision returns a
# percentage in [0, 100] - confirm the two are meant to be compared as-is.
total = len(disease_custom) * len(drug_custom)
prob = custom_prob / total
custom_val_by_chance = round(prob, 3)
custom_val_by_chance

0.108

# Different benchmark methods

In [12]:
# Method key -> display name. Insertion order is the order in which the
# scoring loop evaluates the methods; the display name is also used as the
# score column name in get_dict_df.
sim_scores = dict(
    cn='Common Neighbors',
    cos='Cosine Similiarity',
    ji='Jaccard index',
    si='Sorensen index',
    hpi='Hub Promoted Index',
    hdi='Hub Depressed Index',
    lhn='Leicht–Holme–Newman Index',
    pa='Preferential Attachment',
    aa='Adamic-Adar',
    ra='Resource Allocation Index',
    sp='Shortest Path',
)

In [13]:
# get_dict_df takes an undirected view of each KG for the neighbour-based
# scores, while the original graphs are passed for the shortest-path score.
undirected_openbio, undirected_custom = (
    graph_openbio.to_undirected(),
    graph_custom.to_undirected(),
)
# NOTE(review): check_df is not used in the cells visible here.
check_df = None

In [14]:
# Evaluate every method on both KGs and record precision against the
# clinical-trial gold standard next to the value expected by chance.
for algo in tqdm(sim_scores, desc='Calculating scores for algorithms'):

    algo_name = sim_scores[algo]

    # --- OpenBioLink KG: best drug per disease, then precision of those pairs
    full_df = get_dict_df(
        diseases=disease_openbio,
        drugs=drug_openbio,
        undirected_kg_graph=undirected_openbio,
        di_kg_graph=graph_openbio,
        similarity_type=algo,
    )
    # Pair identifiers use the same '<drug>_<disease>' form as the gold standard.
    full_df['pair'] = full_df['source'] + '_' + full_df['target']
    openbio_pairs = list(full_df['pair'].unique())

    openbio_precision, openbio_pos, openbio_total = get_precision(
        clinical_trial_dict=clinical_pair_dict,
        predicted=openbio_pairs,
    )

    # --- Custom KG: same procedure
    df_custom = get_dict_df(
        diseases=disease_custom,
        drugs=drug_custom,
        undirected_kg_graph=undirected_custom,
        di_kg_graph=graph_custom,
        similarity_type=algo,
    )
    df_custom['pair'] = df_custom['source'] + '_' + df_custom['target']
    custom_pairs = list(df_custom['pair'].unique())

    custom_precision, custom_pos, custom_total = get_precision(
        clinical_trial_dict=clinical_pair_dict,
        predicted=custom_pairs,
    )

    # One summary row per method, keyed by its display name.
    score_actual[algo_name] = {
        'openbio_precision': openbio_precision,
        'openbio_val_by_chance': open_val_by_chance,
        '# openbio_pairs': f'{openbio_pos}/{openbio_total}',
        'custom_precision': custom_precision,
        'custom_val_by_chance': custom_val_by_chance,
        '# custom_pairs': f'{custom_pos}/{custom_total}',
    }

Calculating scores for algorithms: 100%|██████████| 11/11 [00:03<00:00,  3.19it/s]


In [15]:
# One row per method, one column per metric.
pd.DataFrame(score_actual).T

Unnamed: 0,openbio_precision,openbio_val_by_chance,# openbio_pairs,custom_precision,custom_val_by_chance,# custom_pairs
Common Neighbors,50.0,0.132,2/4,44.444,0.108,4/9
Cosine Similiarity,43.75,0.132,7/16,42.857,0.108,6/14
Jaccard index,43.75,0.132,7/16,40.0,0.108,6/15
Sorensen index,43.75,0.132,7/16,40.0,0.108,6/15
Hub Promoted Index,60.0,0.132,3/5,46.154,0.108,6/13
Hub Depressed Index,43.75,0.132,7/16,41.667,0.108,5/12
Leicht–Holme–Newman Index,43.75,0.132,7/16,46.667,0.108,7/15
Preferential Attachment,5.0,0.132,1/20,9.302,0.108,4/43
Adamic-Adar,33.333,0.132,2/6,36.364,0.108,4/11
Resource Allocation Index,33.333,0.132,2/6,36.364,0.108,4/11
