# Clinical trial analysis

This notebook analyses the clinical trial information available for chemical-disease pairs formed by combining the chemical and disease datasets.

# Imports

In [1]:
import os
from tqdm import tqdm
import json
import pandas as pd
from itertools import product

from utils import get_disease_map, KG_DATA_PATH, DATA_DIR

# Load mapping file

In [2]:
# Lookup used further down (via `.get`) to translate `mesh:` condition CURIEs
# into the identifier that is stored in the `mondo_id` column.
mapping_dict = get_disease_map()

# Load subgraph

In [3]:
# Both normalized knowledge graphs are tab-separated files in the same folder.
normalized_kg_dir = os.path.join(KG_DATA_PATH, 'normalized')

openbiolinks_df = pd.read_csv(os.path.join(normalized_kg_dir, 'openbiolink_kg_normalized.tsv'), sep='\t')
custom_df = pd.read_csv(os.path.join(normalized_kg_dir, 'custom_kg_normalized.tsv'), sep='\t')

# Loading and saving clinical trial info

In [4]:
# Compound / condition trial table fetched from the drug2ways results repository.
trial_counts_url = (
    'https://raw.githubusercontent.com/drug2ways/results/master/validation/data/PubChem-MeSH-slim-counts.tsv'
)
clinical_df = pd.read_csv(trial_counts_url, sep='\t')

# Prefix the raw identifiers so they become CURIEs.
clinical_df['cid_id'] = clinical_df['cid_id'].map('pubchem.compound:{}'.format)
clinical_df['condition'] = clinical_df['condition'].map('mesh:{}'.format)

# Look every condition up in the disease mapping; conditions without an entry become None.
clinical_df['mondo_id'] = [
    mapping_dict.get(condition)
    for condition in tqdm(clinical_df['condition'], desc='Normalizing diseases')
]
clinical_df

Normalizing diseases: 100%|██████████| 57458/57458 [00:00<00:00, 1492579.22it/s]


Unnamed: 0,cid_id,condition,n_trials,mondo_id
0,pubchem.compound:10026,mesh:D004931,1,mondo:0008698
1,pubchem.compound:10071196,mesh:D000374,2,mesh:D000374
2,pubchem.compound:10071196,mesh:D000544,3,mondo:0004975
3,pubchem.compound:10071196,mesh:D001523,9,mondo:0005084
4,pubchem.compound:10071196,mesh:D003704,1,mondo:0001627
...,...,...,...,...
57453,pubchem.compound:999,mesh:D016543,1,mondo:0002714
57454,pubchem.compound:999,mesh:D017093,2,mesh:D017093
57455,pubchem.compound:999,mesh:D017114,2,mondo:0019542
57456,pubchem.compound:999,mesh:D022124,1,mesh:D022124


In [5]:
# Creating dict with condition-chemical pair
# Keys have the form '<compound CURIE>_<mapped disease id>'; the value is the
# trial count taken from the same row.
clinical_trial_dict = {}
unmapped_rows = 0

for pubchem_idx, _, n_trial, mondo_idx in tqdm(clinical_df.values, desc='Creating dict'):
    # A condition that has no entry in the disease mapping cannot be turned
    # into a key (string concatenation with a missing value would raise).
    if pd.isna(mondo_idx):
        unmapped_rows += 1
        continue

    # NOTE(review): when several conditions of one compound map to the same
    # disease id, the last row wins (unchanged behaviour) - confirm whether the
    # counts should be summed instead.
    clinical_trial_dict[pubchem_idx + '_' + mondo_idx] = n_trial

if unmapped_rows:
    print(f'Skipped {unmapped_rows} rows without a mapped disease identifier')

Creating dict: 100%|██████████| 57458/57458 [00:00<00:00, 553274.28it/s]


In [6]:
# Save pair dictionary for future use
gold_standard_dir = os.path.join(DATA_DIR, 'gold-standard')

# makedirs(exist_ok=True) also creates missing parent folders and removes the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs(gold_standard_dir, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform default.
with open(os.path.join(gold_standard_dir, 'clinical_pairs.json'), 'w', encoding='utf-8') as f:
    json.dump(clinical_trial_dict, f, ensure_ascii=False, indent=2)

# GEO, L1000, OpenTargets and CREEDS

In [7]:
# Load gene expression files

def _expression_keys(dataset, file_name):
    """Return the top-level keys of DATA_DIR/<dataset>/normalized/<file_name>."""
    with open(os.path.join(DATA_DIR, dataset, 'normalized', file_name)) as handle:
        return json.load(handle).keys()


# Only the identifiers (the JSON keys) are needed for the overlap analysis.
creed_chemicals = _expression_keys('creeds', 'chemical_expression.json')
geo_diseases = _expression_keys('geo', 'disease_expression.json')
lc1000_chemicals = _expression_keys('l1000', 'chemical_expression.json')
open_target_diseases = _expression_keys('open_targets', 'disease_expression.json')


# Get overlaps with subgraph

In [8]:
# Identifier collection of every expression dataset, keyed by the short label
# that is used in the overlap report and in the pairwise analysis.
# Insertion order is the order in which the datasets are processed.
MAP = dict(
    creed_chem=creed_chemicals,
    target_dis=open_target_diseases,
    geo_dis=geo_diseases,
    lc1000=lc1000_chemicals,
)

In [9]:
def _curies_in_column(kg_df, column, prefix):
    """Return the distinct values of ``kg_df[column]`` that contain ``prefix``.

    The prefix is matched literally (``regex=False``), so the dot in
    'pubchem.compound' is not interpreted as a regex wildcard, and missing
    values are treated as non-matching (``na=False``) instead of breaking the
    boolean mask.
    """
    values = kg_df[column]
    return set(values[values.str.contains(prefix, regex=False, na=False)])


# Chemicals are collected from the `source` column and diseases from the
# `target` column of each KG.

# For the OpenBioLink KG
openlink_kg_chemical = _curies_in_column(openbiolinks_df, 'source', 'pubchem.compound')
openlink_kg_disease = _curies_in_column(openbiolinks_df, 'target', 'mondo')

# For custom networks
custom_kg_chemical = _curies_in_column(custom_df, 'source', 'pubchem.compound')
custom_kg_disease = _curies_in_column(custom_df, 'target', 'mondo')


In [10]:
# For every expression dataset, record which of its identifiers occur in each
# KG. A dataset is treated as a disease dataset when it overlaps with the
# OpenBioLink disease nodes, otherwise as a chemical dataset when it overlaps
# with the OpenBioLink chemical nodes.
info_dict = {}

for dataset_name in MAP:
    curies = set(MAP[dataset_name])

    print(f'\n#### {dataset_name} ####')
    disease1 = openlink_kg_disease.intersection(curies)
    disease2 = custom_kg_disease.intersection(curies)

    chemical1 = openlink_kg_chemical.intersection(curies)
    chemical2 = custom_kg_chemical.intersection(curies)

    if len(disease1) > 0:
        info_dict[dataset_name] = {'openbiolinks': disease1, 'custom': disease2}
    elif len(chemical1) > 0:
        info_dict[dataset_name] = {'openbiolinks': chemical1, 'custom': chemical2}
    else:
        print('SOMETHING IS WRONG!!!')
        # Keep both keys present so that later lookups of
        # info_dict[...]['openbiolinks'] / ['custom'] see an empty node set
        # instead of failing with a KeyError.
        info_dict[dataset_name] = {'openbiolinks': set(), 'custom': set()}

    print('Overlaps with OpenBioLink KG chemical', len(chemical1))
    print('Overlaps with Custom KG chemical', len(chemical2))
    print('Overlaps with OpenBioLink KG disease', len(disease1))
    print('Overlaps with Custom KG disease', len(disease2))
    


#### creed_chem ####
Overlaps with OpenBioLink KG chemical 96
Overlaps with Custom KG chemical 55
Overlaps with OpenBioLink KG disease 0
Overlaps with Custom KG disease 0

#### target_dis ####
Overlaps with OpenBioLink KG chemical 0
Overlaps with Custom KG chemical 0
Overlaps with OpenBioLink KG disease 47
Overlaps with Custom KG disease 35

#### geo_dis ####
Overlaps with OpenBioLink KG chemical 0
Overlaps with Custom KG chemical 0
Overlaps with OpenBioLink KG disease 18
Overlaps with Custom KG disease 17

#### lc1000 ####
Overlaps with OpenBioLink KG chemical 804
Overlaps with Custom KG chemical 292
Overlaps with OpenBioLink KG disease 0
Overlaps with Custom KG disease 0


# Filter KG based on possible chemical-disease paths

In [11]:
from networkx.algorithms.shortest_paths.generic import has_path
from utils import create_graph_from_df

# Get clinical pair percentage based on dataset

In [12]:
def _connected_pairs(graph, chemicals, diseases):
    """Return the (chemical, disease) pairs that are joined by a path in ``graph``.

    Pairs in which either node is missing from the graph are skipped before
    ``has_path`` is called.
    """
    return {
        (chem_idx, disease_idx)
        for chem_idx, disease_idx in product(chemicals, diseases)
        if chem_idx in graph.nodes
        and disease_idx in graph.nodes
        and has_path(graph, chem_idx, disease_idx)
    }


# The graphs do not depend on the dataset combination, so they are built once
# instead of being rebuilt (and copied) on every loop iteration.
openbio_graph = create_graph_from_df(openbiolinks_df)
custom_graph = create_graph_from_df(custom_df)
kg_graphs = {'openbiolinks': openbio_graph, 'custom': custom_graph}

# Per KG: pair name -> 'yes' for every connected pair that has a clinical trial.
clinical_evidence = {'openbiolinks': {}, 'custom': {}}
# Per KG: '<chemical dataset>_<disease dataset>' -> number of such pairs.
count_dict = {'openbiolinks': {}, 'custom': {}}
# Per KG: '<chemical dataset>_<disease dataset>' -> number of candidate pairs.
total_count_dict = {'openbiolinks': {}, 'custom': {}}

for c, d in product(['creed_chem', 'lc1000'], ['target_dis', 'geo_dis']):
    print(c, d)
    key_name = c + '_' + d
    c_set = info_dict[c]
    d_set = info_dict[d]

    # Path search is done once per KG; the result is reused both for filtering
    # the node sets and for counting the clinical trial pairs.
    connected = {
        kg_name: _connected_pairs(graph, c_set[kg_name], d_set[kg_name])
        for kg_name, graph in kg_graphs.items()
    }

    # Remove nodes that do not have a path
    new_c_set = {
        kg_name: {chem_idx for chem_idx, _ in pairs}
        for kg_name, pairs in connected.items()
    }
    new_d_set = {
        kg_name: {disease_idx for _, disease_idx in pairs}
        for kg_name, pairs in connected.items()
    }
    print(
        f"Openbio - chemicals: {len(new_c_set['openbiolinks'])}, diseases - {len(new_d_set['openbiolinks'])} \n"
        f"Custom - chemicals: {len(new_c_set['custom'])}, diseases - {len(new_d_set['custom'])} \n"
    )

    # Same procedure for both KGs (previously the custom KG iterated over the
    # unfiltered sets without the node-membership guard).
    for kg_name, pairs in connected.items():
        trial_pair_count = 0

        for chem_idx, disease_idx in pairs:
            name = chem_idx + '_' + disease_idx
            if name in clinical_trial_dict:
                clinical_evidence[kg_name][name] = 'yes'
                trial_pair_count += 1

        count_dict[kg_name][key_name] = trial_pair_count
        # NOTE(review): the total is the full product of the filtered node
        # sets (as before), which also contains pairs without a path, while the
        # count above only considers connected pairs - confirm this is intended.
        total_count_dict[kg_name][key_name] = len(new_c_set[kg_name]) * len(new_d_set[kg_name])

Report on the number of relations: {-1: 16133, 1: 32745}


creed_chem target_dis


Report on the number of relations: {1: 43810, -1: 8372}


Openbio - chemicals: 77, diseases - 40 
Custom - chemicals: 52, diseases - 35 

creed_chem geo_dis


Report on the number of relations: {-1: 16133, 1: 32745}
Report on the number of relations: {1: 43810, -1: 8372}


Openbio - chemicals: 76, diseases - 17 
Custom - chemicals: 52, diseases - 17 

lc1000 target_dis


Report on the number of relations: {-1: 16133, 1: 32745}
Report on the number of relations: {1: 43810, -1: 8372}


Openbio - chemicals: 538, diseases - 45 
Custom - chemicals: 277, diseases - 35 



Report on the number of relations: {-1: 16133, 1: 32745}


lc1000 geo_dis


Report on the number of relations: {1: 43810, -1: 8372}


Openbio - chemicals: 529, diseases - 18 
Custom - chemicals: 272, diseases - 17 



In [13]:
# Clinical trial pair count
# Number of path-connected chemical-disease pairs that are present in
# `clinical_trial_dict`, per KG and per dataset combination.
count_dict

{'openbiolinks': {'creed_chem_target_dis': 137,
  'creed_chem_geo_dis': 121,
  'lc1000_target_dis': 420,
  'lc1000_geo_dis': 305},
 'custom': {'creed_chem_target_dis': 182,
  'creed_chem_geo_dis': 141,
  'lc1000_target_dis': 489,
  'lc1000_geo_dis': 363}}

In [14]:
# Total number of pairs
# Size of the chemical x disease product of the path-filtered node sets,
# per KG and per dataset combination.
total_count_dict

{'openbiolinks': {'creed_chem_target_dis': 3080,
  'creed_chem_geo_dis': 1292,
  'lc1000_target_dis': 24210,
  'lc1000_geo_dis': 9522},
 'custom': {'creed_chem_target_dis': 1820,
  'creed_chem_geo_dis': 884,
  'lc1000_target_dis': 9695,
  'lc1000_geo_dis': 4624}}

In [15]:
# Percentage of candidate pairs that have clinical trial evidence, per KG and
# per dataset combination.
for kg_name in total_count_dict:
    print(f'### {kg_name} KG ###')
    for el, total in total_count_dict[kg_name].items():
        # Without candidate pairs the ratio is undefined; report NaN instead of
        # aborting the whole report with a ZeroDivisionError.
        val = count_dict[kg_name][el] / total if total else float('nan')
        print(el, val*100)
    print('\n')

### openbiolinks KG ###
creed_chem_target_dis 4.4480519480519485
creed_chem_geo_dis 9.365325077399381
lc1000_target_dis 1.734820322180917
lc1000_geo_dis 3.20310859063222


### custom KG ###
creed_chem_target_dis 10.0
creed_chem_geo_dis 15.95022624434389
lc1000_target_dis 5.043837029396596
lc1000_geo_dis 7.850346020761245


