# Dataset probabilities

This notebook analyses the clinical trial information for the chemical-disease pairs of a given dataset to estimate the probability of finding a clinically tested pair by chance.

# Imports

In [1]:
import os
import logging
import json
from collections import Counter
from tqdm import tqdm
import pandas as pd
from itertools import product

from networkx import DiGraph
from networkx.algorithms.shortest_paths.generic import has_path

from utils import KG_DATA_PATH, DATA_DIR, create_graph_from_df

In [2]:
# Logger named after this module's `__name__`.
logger = logging.getLogger(__name__)

# Load KG

In [3]:
# Locations of the two filtered knowledge graphs (tab-separated files)
openbiolinks_kg_path = os.path.join(KG_DATA_PATH, 'openbiolink_filtered_kg.tsv')
custom_kg_path = os.path.join(KG_DATA_PATH, 'custom_filtered_kg.tsv')

# Read the tables first ...
openbiolinks_df = pd.read_csv(openbiolinks_kg_path, sep='\t')
custom_df = pd.read_csv(custom_kg_path, sep='\t')

# ... then build one graph per KG, OpenBioLink first
openbio_graph = create_graph_from_df(openbiolinks_df)
custom_graph = create_graph_from_df(custom_df)

Report on the number of relations: {-1: 12477, 1: 29022}
Report on the number of relations: {1: 43578, -1: 8045}


# Loading clinical trial pairs

In [4]:
# Gold standard: clinical-trial information, read as-is from JSON
clinical_trial_path = os.path.join(DATA_DIR, 'gold-standard', 'clinical-trial.json')

with open(clinical_trial_path, 'r') as f:
    clinical_filtered = json.load(f)

# GEO, LC1000, OpenTargets and Creed

In [5]:
# Load the harmonized gene expression files; only their top-level keys are kept
transcriptomics_dir = os.path.join(DATA_DIR, 'transcriptomics')

with open(os.path.join(transcriptomics_dir, 'creed_harmonized_expression.json')) as file:
    creed_expression = json.load(file)
creed_chemicals = creed_expression.keys()

with open(os.path.join(transcriptomics_dir, 'geo_harmonized_expression.json')) as file2:
    geo_expression = json.load(file2)
geo_diseases = geo_expression.keys()

with open(os.path.join(transcriptomics_dir, 'lc1000_harmonized_expression.json')) as file3:
    lc1000_expression = json.load(file3)
lc1000_chemicals = lc1000_expression.keys()

with open(os.path.join(transcriptomics_dir, 'target_harmonized_expression.json')) as file4:
    open_target_expression = json.load(file4)
open_target_diseases = open_target_expression.keys()


# Get overlaps with subgraph

In [6]:
# Dataset name -> identifiers found in that dataset's expression file
MAP = dict(
    creed=creed_chemicals,
    target=open_target_diseases,
    geo=geo_diseases,
    lc1000=lc1000_chemicals,
)

In [7]:
def _split_chemicals_and_diseases(graph):
    """Partition the nodes of ``graph`` into a chemical set and a disease set.

    A node counts as a chemical when its identifier contains ``'pubchem'`` and,
    failing that, as a disease when it contains ``'mondo'``. Every other node
    is ignored.

    :param graph: object exposing a ``nodes()`` method that yields string identifiers
    :returns: tuple ``(chemicals, diseases)`` of sets
    """
    chemicals, diseases = set(), set()

    for node in graph.nodes():
        if 'pubchem' in node:
            chemicals.add(node)
        elif 'mondo' in node:
            diseases.add(node)

    return chemicals, diseases


def get_overlap_stats(graph_dict: dict, dataset_map=None):
    """Report which dataset identifiers are present in each knowledge graph.

    For every dataset the overlap between its identifiers and the chemical and
    disease nodes of both KGs is computed and printed. A dataset is stored as a
    disease dataset if it overlaps the disease nodes of *either* KG, otherwise
    as a chemical dataset if it overlaps the chemical nodes of either KG. (Only
    looking at the OpenBioLink overlap would drop datasets that are covered by
    the custom KG alone.)

    :param graph_dict: graphs under the keys ``'openbio'`` and ``'custom'``
    :param dataset_map: mapping of dataset name to an iterable of identifiers;
        defaults to the notebook-level ``MAP``
    :returns: ``{dataset: {'openbiolinks': set, 'custom': set}}``; the inner
        dictionary is left empty for a dataset without any overlap
    """
    if dataset_map is None:
        dataset_map = MAP

    openlink_kg_chemical, openlink_kg_disease = _split_chemicals_and_diseases(
        graph_dict['openbio']
    )
    custom_kg_chemical, custom_kg_disease = _split_chemicals_and_diseases(
        graph_dict['custom']
    )

    # Get the data present in the KGs
    info_dict = {}

    for dataset_name, identifiers in dataset_map.items():
        curies = set(identifiers)

        print(f'\n#### {dataset_name} ####')
        disease1 = openlink_kg_disease.intersection(curies)
        disease2 = custom_kg_disease.intersection(curies)

        chemical1 = openlink_kg_chemical.intersection(curies)
        chemical2 = custom_kg_chemical.intersection(curies)

        if disease1 or disease2:
            info_dict[dataset_name] = {'openbiolinks': disease1, 'custom': disease2}
        elif chemical1 or chemical2:
            info_dict[dataset_name] = {'openbiolinks': chemical1, 'custom': chemical2}
        else:
            # Nothing from this dataset is in either KG: keep an empty entry and flag it
            info_dict[dataset_name] = {}
            print('SOMETHING IS WRONG!!!')

        print('Overlaps with OpenBioLink KG chemical', len(chemical1))
        print('Overlaps with Custom KG chemical', len(chemical2))
        print('Overlaps with OpenBioLink KG disease', len(disease1))
        print('Overlaps with Custom KG disease', len(disease2))

    return info_dict

In [8]:
# Overlap of every dataset with the nodes of both knowledge graphs
info_graph = get_overlap_stats(
    graph_dict=dict(openbio=openbio_graph, custom=custom_graph),
)


#### creed ####
Overlaps with OpenBioLink KG chemical 31
Overlaps with Custom KG chemical 30
Overlaps with OpenBioLink KG disease 0
Overlaps with Custom KG disease 0

#### target ####
Overlaps with OpenBioLink KG chemical 0
Overlaps with Custom KG chemical 0
Overlaps with OpenBioLink KG disease 18
Overlaps with Custom KG disease 39

#### geo ####
Overlaps with OpenBioLink KG chemical 0
Overlaps with Custom KG chemical 0
Overlaps with OpenBioLink KG disease 10
Overlaps with Custom KG disease 17

#### lc1000 ####
Overlaps with OpenBioLink KG chemical 189
Overlaps with Custom KG chemical 198
Overlaps with OpenBioLink KG disease 0
Overlaps with Custom KG disease 0


# Get clinical pair percentage based on dataset and gold standard data

In [9]:
def _count_connected_pairs(graph, chemicals, diseases, gold_standard_dict):
    """Count the chemical-disease pairs for which ``has_path`` finds a path.

    :param graph: graph handed to ``has_path``
    :param chemicals: iterable of chemical node identifiers (path sources)
    :param diseases: iterable of disease node identifiers (path targets)
    :param gold_standard_dict: container of ``'<chemical>_<disease>'`` keys
    :returns: tuple ``(pairs_with_paths, gold_standard_hits)``
    """
    pairs_with_paths = 0
    gold_standard_hits = 0

    for chem_idx, disease_idx in product(chemicals, diseases):
        if not has_path(G=graph, source=chem_idx, target=disease_idx):
            continue

        pairs_with_paths += 1
        if chem_idx + '_' + disease_idx in gold_standard_dict:
            gold_standard_hits += 1

    return pairs_with_paths, gold_standard_hits


def get_gold_standard_stats(
    gold_standard_dict: dict,
    graph_dict: dict,
    info_dict: dict
):
    """Compute the share of path-connected pairs that are in the gold standard.

    For every chemical dataset (``creed``, ``lc1000``) / disease dataset
    (``target``, ``geo``) combination and every KG, all chemical-disease pairs
    connected by a path are counted together with how many of them appear in
    ``gold_standard_dict`` under the key ``'<chemical>_<disease>'``.

    :param gold_standard_dict: gold-standard pairs keyed by ``'<chemical>_<disease>'``
    :param graph_dict: graphs under the keys ``'openbio'`` and ``'custom'``
    :param info_dict: ``{dataset: {'openbiolinks': nodes, 'custom': nodes}}``;
        a missing KG entry is treated as "no nodes"
    :returns: tuple ``(data_df, count_dict)`` where ``data_df`` holds the
        percentage per KG (rows) and dataset combination (columns), NaN when
        no pair is connected, and ``count_dict`` the number of gold-standard
        pairs per KG and combination
    """
    chemical_datasets = ['creed', 'lc1000']
    disease_datasets = ['target', 'geo']

    # Result key -> graph; note that the OpenBioLink graph is passed in as 'openbio'
    kg_graphs = {
        'openbiolinks': graph_dict['openbio'],
        'custom': graph_dict['custom'],
    }

    count_dict = {kg_name: {} for kg_name in kg_graphs}
    total_count_dict = {kg_name: {} for kg_name in kg_graphs}

    for c, d in product(chemical_datasets, disease_datasets):
        key_name = c + '_' + d

        for kg_name, graph in kg_graphs.items():
            pairs_with_paths, gold_standard_hits = _count_connected_pairs(
                graph,
                info_dict[c].get(kg_name, ()),
                info_dict[d].get(kg_name, ()),
                gold_standard_dict,
            )

            # No. of clinical trial pairs
            count_dict[kg_name][key_name] = gold_standard_hits

            # Total no. of pairs
            total_count_dict[kg_name][key_name] = pairs_with_paths

    # Build the value-by-chance dataframe
    data_df = pd.DataFrame(
        columns=[c + '_' + d for c, d in product(chemical_datasets, disease_datasets)],
        index=list(kg_graphs),
    )

    for kg_name, totals in total_count_dict.items():
        for key_name, total in totals.items():
            if total == 0:
                # Without connected pairs the percentage is undefined, not an error
                data_df.loc[kg_name, key_name] = float('nan')
            else:
                data_df.loc[kg_name, key_name] = count_dict[kg_name][key_name] / total * 100

    return data_df, count_dict

# Clinical data - Value by chance

In [10]:
# Percentage table and raw gold-standard counts for both knowledge graphs
df, counts = get_gold_standard_stats(
    gold_standard_dict=clinical_filtered,
    graph_dict=dict(openbio=openbio_graph, custom=custom_graph),
    info_dict=info_graph,
)

In [11]:
# Share (in %) of path-connected chemical-disease pairs that are in the clinical-trial
# gold standard, per KG (rows) and chemical/disease dataset combination (columns).
df

Unnamed: 0,creed_target,creed_geo,lc1000_target,lc1000_geo
openbiolinks,32.6648,41.1538,15.012,17.4214
custom,24.4032,34.0771,9.61701,13.7363


In [12]:
# Number of path-connected chemical-disease pairs found in the gold standard,
# per KG and dataset combination.
counts

{'openbiolinks': {'creed_target': 146,
  'creed_geo': 114,
  'lc1000_target': 385,
  'lc1000_geo': 296},
 'custom': {'creed_target': 285,
  'creed_geo': 171,
  'lc1000_target': 713,
  'lc1000_geo': 432}}