# Clinical trial analysis

This notebook analyses the clinical trial information for chemical-disease pairs across each combination of chemical and disease datasets.

# Imports

In [1]:
import os
from tqdm import tqdm
import json
import pandas as pd
from itertools import product
from networkx.algorithms.shortest_paths.generic import has_path
from utils import create_graph_from_df

from utils import get_disease_map, KG_DATA_PATH, DATA_DIR, create_venn_diagram

# Load subgraph

In [2]:
# Both normalized knowledge graphs are tab-separated files in the same folder.
normalized_kg_dir = os.path.join(KG_DATA_PATH, 'normalized')

openbiolinks_kg_path = os.path.join(normalized_kg_dir, 'openbiolink_kg_normalized.tsv')
openbiolinks_df = pd.read_csv(openbiolinks_kg_path, sep='\t')

custom_kg_path = os.path.join(normalized_kg_dir, 'custom_kg_normalized.tsv')
custom_df = pd.read_csv(custom_kg_path, sep='\t')

# Loading clinical trial pairs

In [3]:
# Gold standard: filtered clinical-trial pairs, parsed from JSON.
clinical_pairs_path = os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')
with open(clinical_pairs_path, 'r') as f:
    clinical_trial_dict = json.load(f)

In [4]:
# Gold standard: filtered indications, parsed from JSON.
indications_path = os.path.join(DATA_DIR, 'gold-standard', 'filtered-indications.json')
with open(indications_path, 'r') as f:
    indication_trial_dict = json.load(f)

# GEO, LC1000, OpenTargets and Creed

In [5]:
# Load gene expression files.
#
# Only the top-level keys (the chemical / disease identifiers) are needed later.
# A `dict.keys()` view keeps the whole parsed expression dictionary alive, so the
# keys are copied into a set and the parsed values can be garbage collected.

with open(os.path.join(DATA_DIR, 'creeds', 'normalized', 'chemical_expression.json')) as file:
    creed_chemicals = set(json.load(file))

with open(os.path.join(DATA_DIR, 'geo', 'normalized', 'disease_expression.json')) as file2:
    geo_diseases = set(json.load(file2))

with open(os.path.join(DATA_DIR, 'l1000', 'normalized', 'chemical_expression.json')) as file3:
    lc1000_chemicals = set(json.load(file3))

with open(os.path.join(DATA_DIR, 'open_targets', 'normalized', 'disease_expression.json')) as file4:
    open_target_diseases = set(json.load(file4))


# Get overlaps with subgraph

In [6]:
# Short dataset label -> identifiers loaded from that dataset's expression file.
MAP = dict(
    creed=creed_chemicals,
    target=open_target_diseases,
    geo=geo_diseases,
    lc1000=lc1000_chemicals,
)

In [7]:
# Chemical and disease nodes per knowledge graph.
#
# Chemicals are taken from the `source` column and diseases from the `target`
# column. `regex=False` matches the prefix literally (the '.' in
# 'pubchem.compound' would otherwise act as a regex wildcard) and `na=False`
# keeps missing values out of the boolean mask instead of failing on them.

# For the OpenBioLink network
openlink_kg_chemical = set(
    openbiolinks_df.loc[
        openbiolinks_df['source'].str.contains('pubchem.compound', regex=False, na=False),
        'source'
    ]
)
openlink_kg_disease = set(
    openbiolinks_df.loc[
        openbiolinks_df['target'].str.contains('mondo', regex=False, na=False),
        'target'
    ]
)

# For the custom network
custom_kg_chemical = set(
    custom_df.loc[
        custom_df['source'].str.contains('pubchem.compound', regex=False, na=False),
        'source'
    ]
)
custom_kg_disease = set(
    custom_df.loc[
        custom_df['target'].str.contains('mondo', regex=False, na=False),
        'target'
    ]
)


In [8]:
# For every dataset, record which of its identifiers also occur as nodes in
# each knowledge graph, and report the overlap sizes.
info_dict = {}

for dataset_name, dataset_keys in MAP.items():
    curies = set(dataset_keys)

    print(f'\n#### {dataset_name} ####')

    obl_diseases = openlink_kg_disease & curies
    custom_diseases = custom_kg_disease & curies
    obl_chemicals = openlink_kg_chemical & curies
    custom_chemicals = custom_kg_chemical & curies

    # The OpenBioLink overlap decides how the dataset is stored: its disease
    # overlap is kept if there is one, otherwise its chemical overlap.
    if obl_diseases:
        info_dict[dataset_name] = {
            'openbiolinks': obl_diseases,
            'custom': custom_diseases,
        }
    elif obl_chemicals:
        info_dict[dataset_name] = {
            'openbiolinks': obl_chemicals,
            'custom': custom_chemicals,
        }
    else:
        # No overlap with the OpenBioLink KG at all: keep an empty entry and warn.
        info_dict[dataset_name] = {}
        print('SOMETHING IS WRONG!!!')

    print('Overlaps with OpenBioLink KG chemical', len(obl_chemicals))
    print('Overlaps with Custom KG chemical', len(custom_chemicals))
    print('Overlaps with OpenBioLink KG disease', len(obl_diseases))
    print('Overlaps with Custom KG disease', len(custom_diseases))
    


#### creed ####
Overlaps with OpenBioLink KG chemical 96
Overlaps with Custom KG chemical 55
Overlaps with OpenBioLink KG disease 0
Overlaps with Custom KG disease 0

#### target ####
Overlaps with OpenBioLink KG chemical 0
Overlaps with Custom KG chemical 0
Overlaps with OpenBioLink KG disease 47
Overlaps with Custom KG disease 35

#### geo ####
Overlaps with OpenBioLink KG chemical 0
Overlaps with Custom KG chemical 0
Overlaps with OpenBioLink KG disease 18
Overlaps with Custom KG disease 17

#### lc1000 ####
Overlaps with OpenBioLink KG chemical 804
Overlaps with Custom KG chemical 292
Overlaps with OpenBioLink KG disease 0
Overlaps with Custom KG disease 0


# Get clinical pair percentage based on dataset and gold standard data

In [9]:
def _connected_pairs(graph, chemicals, diseases):
    """Return the (chemical, disease) pairs from the two collections joined by a path in ``graph``."""
    # `has_path` raises for nodes missing from the graph, so drop those first.
    present_chemicals = [chem for chem in chemicals if chem in graph.nodes]
    present_diseases = [dis for dis in diseases if dis in graph.nodes]

    return [
        (chem, dis)
        for chem, dis in product(present_chemicals, present_diseases)
        if has_path(graph, chem, dis)
    ]


def get_gold_standard_stats(gold_standard_dict: dict):
    """Compute the percentage of chemical-disease pairs found in a gold standard.

    For each knowledge graph (OpenBioLink and custom) and each combination of a
    chemical dataset ('creed', 'lc1000') with a disease dataset ('target', 'geo'),
    the chemicals and diseases recorded in the notebook-level ``info_dict`` are
    reduced to those taking part in at least one pair connected by a path in
    the graph. The reported value is::

        100 * (# connected pairs whose "<chemical>_<disease>" key is in
               ``gold_standard_dict``)
            / (# retained chemicals * # retained diseases)

    Relies on the notebook-level ``info_dict``, ``openbiolinks_df`` and
    ``custom_df``.

    :param gold_standard_dict: mapping whose keys have the form
        "<chemical>_<disease>"; only key membership is checked.
    :return: DataFrame indexed by knowledge graph ('openbiolinks', 'custom')
        with one column per "<chemical dataset>_<disease dataset>" combination.
        A cell is NaN when no pair of that combination is connected.
    """
    chemical_datasets = ['creed', 'lc1000']
    disease_datasets = ['target', 'geo']

    # The graphs are never modified, so build each of them once instead of
    # once per dataset combination.
    graphs = {
        'openbiolinks': create_graph_from_df(openbiolinks_df),
        'custom': create_graph_from_df(custom_df),
    }

    count_dict = {kg_name: {} for kg_name in graphs}
    total_count_dict = {kg_name: {} for kg_name in graphs}

    for c, d in product(chemical_datasets, disease_datasets):
        print(c, d)
        key_name = c + '_' + d

        for kg_name, graph in graphs.items():
            # Path lookups dominate the runtime; do them once per pair and
            # reuse the result for both the numerator and the denominator.
            pairs = _connected_pairs(graph, info_dict[c][kg_name], info_dict[d][kg_name])

            connected_chemicals = {chem for chem, _ in pairs}
            connected_diseases = {dis for _, dis in pairs}

            count_dict[kg_name][key_name] = sum(
                (chem + '_' + dis) in gold_standard_dict for chem, dis in pairs
            )
            # Size of the cartesian product of the retained chemicals and diseases.
            total_count_dict[kg_name][key_name] = len(connected_chemicals) * len(connected_diseases)

    # Collect the percentages in a KG x dataset-combination table.
    data_df = pd.DataFrame(
        columns=[chem + '_' + dis for chem, dis in product(chemical_datasets, disease_datasets)],
        index=list(graphs),
    )

    for kg_name, totals in total_count_dict.items():
        for el, total in totals.items():
            if total == 0:
                # No connected pair for this combination: percentage is undefined.
                data_df.loc[kg_name, el] = float('nan')
                continue
            val = count_dict[kg_name][el] / total
            data_df.loc[kg_name, el] = val * 100

    return data_df

In [10]:
# Percentage of chemical-disease pairs (per KG and dataset combination) whose key
# is present in the clinical-trial pairs loaded from 'filtered-clinical-pairs.json'.
get_gold_standard_stats(clinical_trial_dict)

Report on the number of relations: {-1: 16133, 1: 32745}


creed target


Report on the number of relations: {1: 43810, -1: 8372}
Report on the number of relations: {-1: 16133, 1: 32745}


creed geo


Report on the number of relations: {1: 43810, -1: 8372}


lc1000 target


Report on the number of relations: {-1: 16133, 1: 32745}
Report on the number of relations: {1: 43810, -1: 8372}
Report on the number of relations: {-1: 16133, 1: 32745}


lc1000 geo


Report on the number of relations: {1: 43810, -1: 8372}


Unnamed: 0,creed_target,creed_geo,lc1000_target,lc1000_geo
openbiolinks,4.44805,9.36533,1.73482,3.20311
custom,10.0,15.9502,5.04384,7.85035


In [11]:
# Same statistic, evaluated against the indications loaded from
# 'filtered-indications.json'.
get_gold_standard_stats(indication_trial_dict)

Report on the number of relations: {-1: 16133, 1: 32745}


creed target


Report on the number of relations: {1: 43810, -1: 8372}
Report on the number of relations: {-1: 16133, 1: 32745}


creed geo


Report on the number of relations: {1: 43810, -1: 8372}
Report on the number of relations: {-1: 16133, 1: 32745}


lc1000 target


Report on the number of relations: {1: 43810, -1: 8372}
Report on the number of relations: {-1: 16133, 1: 32745}


lc1000 geo


Report on the number of relations: {1: 43810, -1: 8372}


Unnamed: 0,creed_target,creed_geo,lc1000_target,lc1000_geo
openbiolinks,0.551948,1.23839,0.165221,0.21004
custom,1.31868,2.60181,0.866426,1.44896
