# Notebook to extract and curate ChEMBL data for the Leeson data set (drug-target interactions)

### Authors: Barbara Zdrazil, Lina Heinzke
### 10/2022

**This notebook extracts data from ChEMBL and performs some curation steps in order to retrieve a data set for drug-target, and clinical candidate-target associations including comparator compounds.**

**The notebook is based on initial work by Anne Hersey, Patrica Bento, Emma Manners, Paul Leeson, and Andrew Leach.**

**More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes**


In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

#### notebook settings
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

# Get data from ChEMBL

In [2]:
# # @Barbara: uncomment and modify this to your preferred paths
# path_results = "/Users/bzdrazil/Dropbox/ChEMBL/NP/data/"
# path_sqlite3_database = <your sqlite database location>

chembl_version = "26"
base_path = "/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/"
path_results = base_path+"results/"
path_sqlite3_database = base_path+"data/chembl_"+chembl_version+"/chembl_"+chembl_version+"_sqlite/chembl_"+chembl_version+".db"

In [3]:
# # @Barbara: Accessing ChEMBL using Oracle
# # If you want to use this option, you have to change the sql statements to the commented line:
# # from docs -> from chembl_31.docs

# import cx_Oracle

# #cx_Oracle.init_oracle_client(lib_dir="/Users/bzdrazil/Downloads/instantclient_19_8")  #https://www.oracle.com/in/database/technologies/instant-client/macos-intel-x86-downloads.html; https://stackoverflow.com/questions/56119490/cx-oracle-error-dpi-1047-cannot-locate-a-64-bit-oracle-client-library
# cx_Oracle.clientversion() 

# import pandas as pd
# import sqlalchemy as sa
# import requests

# workdir = '/Users/bzdrazil/Desktop/'

# chemdev2 = 'oracle://{}:{}@ora-dlvm-103.ebi.ac.uk:1521/?service_name=chemdev2'.format('user', 'pw') # insert your username and password
# engine_ch = sa.create_engine(chemdev2)

In [4]:
import sqlite3

engine_ch = sqlite3.connect(path_sqlite3_database)

In [5]:
sql = '''

select mh.parent_molregno, 
docs.year, docs.journal, 
act.pchembl_value, act.standard_type, 
ass.assay_type, ass.tid, 
md.chembl_id as compound_chembl_id, md.pref_name as compound_pref_name, md.max_phase, 
md.first_approval, md.prodrug, md.oral, md.parenteral, md.topical, md.black_box_warning, 
td.pref_name as Target_pref_name, td.target_type, td.organism, td.chembl_id as Target_chembl_id
from docs, activities act, molecule_hierarchy mh, assays ass, target_dictionary td, molecule_dictionary md
-- from chembl_31.docs, chembl_31.activities act, chembl_31.molecule_hierarchy mh, chembl_31.assays ass, chembl_31.target_dictionary td, chembl_31.molecule_dictionary md
where mh.molregno=act.molregno
and act.pchembl_value is not null
-- and ass.assay_type ='B'
and act.assay_id=ass.assay_id
and act.doc_id = docs.doc_id
and ass.tid=td.tid
and md.molregno=mh.parent_molregno
and act.potential_duplicate =0
and data_validity_comment is null
and act.standard_relation ='='
and td.tid <>22226   ----exclude unchecked targets
and td.target_type like '%PROTEIN%'

'''

df_mols = pd.read_sql_query(sql, con=engine_ch)
df_mols

Unnamed: 0,parent_molregno,year,journal,pchembl_value,standard_type,assay_type,tid,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,Target_pref_name,target_type,organism,Target_chembl_id
0,252199,2004.0,Bioorg. Med. Chem. Lett.,5.40,IC50,B,10483,CHEMBL357278,,0,,-1,0,0,0,0,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
1,253534,2004.0,Bioorg. Med. Chem. Lett.,4.77,IC50,B,10483,CHEMBL357119,,0,,-1,0,0,0,0,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
2,253199,2004.0,Bioorg. Med. Chem. Lett.,6.75,IC50,B,10483,CHEMBL152968,,0,,-1,0,0,0,0,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
3,253199,2004.0,Bioorg. Med. Chem. Lett.,5.22,IC50,A,12594,CHEMBL152968,,0,,-1,0,0,0,0,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356
4,253199,2004.0,Bioorg. Med. Chem. Lett.,4.43,IC50,A,17045,CHEMBL152968,,0,,-1,0,0,0,0,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475921,2317531,2018.0,J Med Chem,8.10,Ki,B,11522,CHEMBL4278080,,0,,-1,0,0,0,0,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298
2475922,2334349,2018.0,J Med Chem,9.22,Kd,B,134,CHEMBL4294901,,0,,-1,0,0,0,0,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889
2475923,2329285,2018.0,J Med Chem,8.42,Ki,B,134,CHEMBL4289837,,0,,-1,0,0,0,0,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889
2475924,2317951,2018.0,J Med Chem,7.89,Ki,B,134,CHEMBL4278500,,0,,-1,0,0,0,0,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889


In [6]:
df_mols = df_mols.astype({
    'year': 'Int64',
    'first_approval': 'Int64'
})

In [7]:
# df_mols.to_csv(path_results+"ChEMBL31_initial_query.csv")

In [8]:
############### TESTING: load comparison data set ###############
df_comparison = pd.read_csv(base_path+"data/DTI_2021_final.dsv", sep = "\t")
df_comparison[['TID_wo_ending', 'ending']] = df_comparison['TID'].str.split('-', 1, expand=True)
df_comparison['PAIRS'] = df_comparison.agg('{0[PARENT_MOLREGNO]}_{0[TID_wo_ending]}'.format, axis=1)
df_comparison = df_comparison.astype({'TID_wo_ending': 'Int64', 'PARENT_MOLREGNO': 'Int64'})

comp_mols = set(df_comparison["PARENT_MOLREGNO"]) 
comp_drugs = set(df_comparison[df_comparison['DTI'] == 'D_DT']["PARENT_MOLREGNO"])
comp_targets = set(df_comparison["TID_wo_ending"]) 
comp_pairs = set(df_comparison["PAIRS"])
comp_drug_pairs = set(df_comparison[df_comparison['DTI'] == 'D_DT']["PAIRS"])

all_intersect_comp = [["comparison", len(comp_mols), len(comp_drugs), len(comp_targets), len(comp_pairs), len(comp_drug_pairs)]]
all_length_comp = [["comparison", len(comp_mols), len(comp_drugs), len(comp_targets), len(comp_pairs), len(comp_drug_pairs)]]

min_nof_cpds = 100
comparator_cpds_100 = df_comparison[df_comparison['MAX_PHASE'] < 4].groupby(['TID'])['PARENT_MOLREGNO'].count()
targets_enough_cpds = comparator_cpds_100[comparator_cpds_100 >= min_nof_cpds].index.tolist()
df_comparison_100 = df_comparison.query('TID in @targets_enough_cpds')

comp_mols_100 = set(df_comparison_100["PARENT_MOLREGNO"]) 
comp_drugs_100 = set(df_comparison_100[df_comparison_100['DTI'] == 'D_DT']["PARENT_MOLREGNO"])
comp_targets_100 = set(df_comparison_100["TID_wo_ending"]) 
comp_pairs_100 = set(df_comparison_100["PAIRS"])
comp_drug_pairs_100 = set(df_comparison_100[df_comparison_100['DTI'] == 'D_DT']["PAIRS"])

all_intersect_comp_100 = [["comparison_100", len(comp_mols_100), len(comp_drugs_100), len(comp_targets_100), len(comp_pairs_100), len(comp_drug_pairs_100)]]
all_length_comp.append(["comparison_100", len(comp_mols_100), len(comp_drugs_100), len(comp_targets_100), len(comp_pairs_100), len(comp_drug_pairs_100)])

In [9]:
############### TESTING: method to print comparison to original dataset ###############
def print_comparison(now, comp, intersect):
    print("{:15}".format("current:"), len(now))
    print("{:15}".format("comparison:"), len(comp))
    print("{:15}".format("intersection:"), len(intersect))



def print_mol_comparison(now_mols, comp_mols=comp_mols, output=False):
    intersect = now_mols.intersection(comp_mols)
    if output:
        print("------")
        print("Mols (molregno)")
        print_comparison(now_mols, comp_mols, intersect)
    return len(intersect)

def print_target_comparison(now_targets, comp_targets=comp_targets, output=False):
    intersect = now_targets.intersection(comp_targets)
    if output:
        print("------")
        print("Targets (tid)")
        print_comparison(now_targets, comp_targets, intersect)
    return len(intersect)

def print_pair_comparison(now_pairs, comp_pairs=comp_pairs, output=False):
    intersect = now_pairs.intersection(comp_pairs)
    if output:
        print("------")
        print("Compound-target pairs")
        print_comparison(now_pairs, comp_pairs, intersect)
    return len(intersect)

def print_drug_comparison(now_drugs, comp_drugs=comp_drugs, output=False):
    intersect = now_drugs.intersection(comp_drugs)
    if output:
        print("------")
        print("Drugs (molregno)")
        print_comparison(now_drugs, comp_drugs, intersect)
    return len(intersect)

def print_drug_pair_comparison(now_drug_pairs, comp_drug_pairs=comp_drug_pairs, output=False):
    intersect = now_drug_pairs.intersection(comp_drug_pairs)
    if output:
        print("------")
        print("Drug-target pairs")
        print_comparison(now_drug_pairs, comp_drug_pairs, intersect)
    return len(intersect)



def print_all_comparisons(now_mols, now_targets, now_pairs, now_drugs, now_drug_pairs, output=False):
    intersections = []
    intersections.append(print_mol_comparison(now_mols, output=output))
    intersections.append(print_drug_comparison(now_drugs, output=output))
    intersections.append(print_target_comparison(now_targets, output=output))
    intersections.append(print_pair_comparison(now_pairs, output=output))
    intersections.append(print_drug_pair_comparison(now_drug_pairs, output=output))
    return intersections

def print_all_comparisons_100(now_mols, now_targets, now_pairs, now_drugs, now_drug_pairs, output=False):
    intersections = []
    intersections.append(print_mol_comparison(now_mols, comp_mols_100, output))
    intersections.append(print_drug_comparison(now_drugs, comp_drugs_100, output))
    intersections.append(print_target_comparison(now_targets, comp_targets_100, output))
    intersections.append(print_pair_comparison(now_pairs, comp_pairs_100, output))
    intersections.append(print_drug_pair_comparison(now_drug_pairs, comp_drug_pairs_100, output))
    return intersections



def add_intersections(data, label, output=False):
    now_mols = set(data["parent_molregno"]) 
    now_drugs = set(data[data["max_phase"] == 4]["parent_molregno"]) 
    now_targets = set(data["tid"]) 
    now_pairs = set(data.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1))
    now_drug_pairs = set(data[data["max_phase"] == 4].agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1))
    
    comp = print_all_comparisons(now_mols, now_targets, now_pairs, now_drugs, now_drug_pairs, output)
    all_intersect_comp.append([label] + comp)
    all_length_comp.append([label, len(now_mols), len(now_drugs), len(now_targets), len(now_pairs), len(now_drug_pairs)])

    if output:
        print("..................... >= 100 cpds .....................")
    comp_100 = print_all_comparisons_100(now_mols, now_targets, now_pairs, now_drugs, now_drug_pairs, output)
    all_intersect_comp_100.append([label] + comp_100)

In [10]:
############### TESTING: initial query comparison ###############
add_intersections(df_mols, "init", True)

------
Mols (molregno)
current:        923862
comparison:     378661
intersection:   378661
------
Drugs (molregno)
current:        1637
comparison:     686
intersection:   616
------
Targets (tid)
current:        6093
comparison:     908
intersection:   908
------
Compound-target pairs
current:        2088251
comparison:     561452
intersection:   561452
------
Drug-target pairs
current:        21164
comparison:     1221
intersection:   1122
..................... >= 100 cpds .....................
------
Mols (molregno)
current:        923862
comparison:     372275
intersection:   372275
------
Drugs (molregno)
current:        1637
comparison:     642
intersection:   572
------
Targets (tid)
current:        6093
comparison:     519
intersection:   519
------
Compound-target pairs
current:        2088251
comparison:     549498
intersection:   549498
------
Drug-target pairs
current:        21164
comparison:     1102
intersection:   1007


# Add single proteins for protein families, protein complexes, protein complex groups

## Query protein mappings

In [11]:
sql_dti = '''
select tr.tid, tr.relationship, tr.related_tid, 
td1.pref_name as pref_name_1, td1.target_type as target_type_1, td1.organism as organism_1, 
td2.pref_name as pref_name_2, td2.target_type as target_type_2, td2.organism as organism_2, td2.chembl_id as Target_chembl_id_2 
from target_relations tr
inner join target_dictionary td1
    on tr.tid = td1.tid
inner join target_dictionary td2
    on tr.related_tid = td2.tid
'''

df_related_targets = pd.read_sql_query(sql_dti, con=engine_ch)
df_related_targets

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2,Target_chembl_id_2
0,10193,SUBSET OF,104764,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,Carbonic anhydrase,PROTEIN FAMILY,Homo sapiens,CHEMBL2095180
1,12071,SUBSET OF,109746,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase,PROTEIN FAMILY,Homo sapiens,CHEMBL3559691
2,12071,SUBSET OF,104709,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase 1/cyclin B,PROTEIN COMPLEX,Homo sapiens,CHEMBL2094127
3,12071,SUBSET OF,107893,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,CDK1/Cyclin A,PROTEIN COMPLEX,Homo sapiens,CHEMBL3038467
4,12071,SUBSET OF,117095,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase 1/G1/S-specific cyclin-D1,PROTEIN COMPLEX,Homo sapiens,CHEMBL3885551
...,...,...,...,...,...,...,...,...,...,...
9573,117017,OVERLAPS WITH,117016,Aurora kinase A/Targeting protein for Xklp2,PROTEIN COMPLEX,Homo sapiens,Aurora kinase A/B,PROTEIN FAMILY,Homo sapiens,CHEMBL3883303
9574,117017,SUPERSET OF,100970,Aurora kinase A/Targeting protein for Xklp2,PROTEIN COMPLEX,Homo sapiens,Targeting protein for Xklp2,SINGLE PROTEIN,Homo sapiens,CHEMBL5389
9575,117017,SUPERSET OF,20014,Aurora kinase A/Targeting protein for Xklp2,PROTEIN COMPLEX,Homo sapiens,Serine/threonine-protein kinase Aurora-A,SINGLE PROTEIN,Homo sapiens,CHEMBL4722
9576,117026,OVERLAPS WITH,117045,Perilipin-1/ABHD5,PROTEIN-PROTEIN INTERACTION,Homo sapiens,1-acylglycerol-3-phosphate O-acyltransferase ABHD5/Perilipin-5,PROTEIN-PROTEIN INTERACTION,Homo sapiens,CHEMBL3885501


In [12]:
protein_complex_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN COMPLEX") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

protein_family_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN FAMILY") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

protein_complex_group_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN COMPLEX GROUP") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

# TODO: should these be included? which direction (how to avoid duplications)?
single_protein_mapping = df_related_targets[(df_related_targets["target_type_1"] == "SINGLE PROTEIN") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "EQUIVALENT TO")]

# TODO: should these be included?
chimeric_protein_mapping = df_related_targets[(df_related_targets["target_type_1"] == "CHIMERIC PROTEIN") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

# TODO: should these be included?
ppi_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN-PROTEIN INTERACTION") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

In [13]:
relevant_mappings = pd.concat([protein_complex_mapping, 
                               protein_family_mapping, 
                               protein_complex_group_mapping,
                               #single_protein_mapping, 
                               #chimeric_protein_mapping, 
                               #ppi_mapping, 
                               ])
relevant_mappings['tid'] = relevant_mappings['tid'].astype('Int64')
relevant_mappings.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2,Target_chembl_id_2
249,104282,SUPERSET OF,10819,Acetylcholine receptor; alpha1/beta1/delta/gamma,PROTEIN COMPLEX,Homo sapiens,Acetylcholine receptor protein alpha chain,SINGLE PROTEIN,Homo sapiens,CHEMBL4808
251,104282,SUPERSET OF,12715,Acetylcholine receptor; alpha1/beta1/delta/gamma,PROTEIN COMPLEX,Homo sapiens,Acetylcholine receptor protein delta chain,SINGLE PROTEIN,Homo sapiens,CHEMBL3011
260,104284,SUPERSET OF,10023,Neuronal acetylcholine receptor; alpha2/beta4,PROTEIN COMPLEX,Rattus norvegicus,Neuronal acetylcholine receptor protein alpha-2 subunit,SINGLE PROTEIN,Rattus norvegicus,CHEMBL2584
261,104284,SUPERSET OF,12717,Neuronal acetylcholine receptor; alpha2/beta4,PROTEIN COMPLEX,Rattus norvegicus,Neuronal acetylcholine receptor subunit beta-4,SINGLE PROTEIN,Rattus norvegicus,CHEMBL2658
275,104685,SUPERSET OF,49,Integrin alpha-IIb/beta-3,PROTEIN COMPLEX,Homo sapiens,Integrin alpha-IIb,SINGLE PROTEIN,Homo sapiens,CHEMBL212


## Combine dataset with mapped single proteins

In [14]:
# TODO: I thing this is not necessary; but why was it not done?

In [15]:
# # Compile dataset with the mapped single proteins instead of the protein families / complexes
# df_mols_add_targets = df_mols.merge(relevant_mappings, on = 'tid', how = 'inner')
# df_mols_add_targets = df_mols_add_targets.drop(columns=['tid', 
#                                 'Target_pref_name', 'target_type', 'organism', 'Target_chembl_id',
#                                 'pref_name_1', 'target_type_1', 'organism_1', 'relationship'])
# df_mols_add_targets = df_mols_add_targets.rename(columns={"related_tid": "tid", 
#                                     "pref_name_2": "Target_pref_name", 
#                                     "target_type_2": "target_type", 
#                                     "organism_2": "organism", 
#                                     "Target_chembl_id_2": "Target_chembl_id"})
# df_mols_add_targets.head()

In [16]:
# # Combined original dataset with mapped dataset
# df_mols = pd.concat([df_mols, df_mols_add_targets])

In [17]:
# ############### TESTING: mapped target ids comparison ###############
# add_intersections(df_mols, "mapped")

# Calculate mean, median, and max pChEMBL values for each compound-target pair

In [18]:
# summarise pchembl values into mean, max, median and year into first publication
df_mols['pchembl_value_mean'] = df_mols.groupby(['parent_molregno', 'tid'])['pchembl_value'].transform('mean')
df_mols['pchembl_value_max'] = df_mols.groupby(['parent_molregno', 'tid'])['pchembl_value'].transform('max')
df_mols['pchembl_value_median'] = df_mols.groupby(['parent_molregno', 'tid'])['pchembl_value'].transform('median')
df_mols['first_publication_target_cmpd_pair'] = df_mols.groupby(['parent_molregno', 'tid'])['year'].transform('min')

In [19]:
# repeat based on the data of only the binding assays
df_mols_binding = df_mols[df_mols['assay_type'] == 'B'].copy()
df_mols_binding['pchembl_value_mean'] = df_mols_binding.groupby(['parent_molregno', 'tid'])['pchembl_value'].transform('mean')
df_mols_binding['pchembl_value_max'] = df_mols_binding.groupby(['parent_molregno', 'tid'])['pchembl_value'].transform('max')
df_mols_binding['pchembl_value_median'] = df_mols_binding.groupby(['parent_molregno', 'tid'])['pchembl_value'].transform('median')
df_mols_binding['first_publication_target_cmpd_pair'] = df_mols_binding.groupby(['parent_molregno', 'tid'])['year'].transform('min')

In [20]:
# table that has rows for pchembl_mean, max, median based on all assay data (only_binding = False)
# and rows for values based on only binding assays (only_binding = True)
df_mols['only_binding'] = False
df_mols_binding['only_binding'] = True
df_mols_limited = pd.concat([df_mols, df_mols_binding])
df_mols_limited = df_mols_limited.drop(columns=['year', 'journal', 'pchembl_value', 'standard_type', 'assay_type'])
df_mols_limited = df_mols_limited.drop_duplicates()

# Add compound properties

In [21]:
sql_cpd_props = '''

select mh.parent_molregno, 
cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, 
struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
from compound_properties cp, molecule_hierarchy mh, compound_structures struct
-- from chembL_31.compound_properties cp,chembl_31.molecule_hierarchy mh, CHEMBL_31.compound_structures struct
where cp.molregno=mh.parent_molregno
and struct.molregno=mh.parent_molregno

'''

df_cpd_props = pd.read_sql_query(sql_cpd_props, con=engine_ch)
df_cpd_props.head()

Unnamed: 0,parent_molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,1,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.48,,3.63,2.69,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-11(21-17(24)20-15(22)9-19-21)6-7-12(10)16(23)13-4-2-3-5-14(13)18/h...,OWRSAHYFSSNENM-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,2,332.32,1.33,6.0,1.0,108.61,3.0,N,0.0,6.33,,2.88,1.82,ACID,332.32,3.0,25.0,0.73,332.0909,C18H12N4O3,7.0,1.0,0.0,InChI=1S/C18H12N4O3/c1-11-8-14(22-18(25)21-16(23)10-20-22)6-7-15(11)17(24)13-4-2-12(9-19)3-5-13/...,ZJYUMURGSZQFMH-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,3,357.8,2.27,5.0,2.0,87.98,3.0,N,0.0,6.33,,3.7,2.64,ACID,357.8,3.0,25.0,0.75,357.088,C18H16ClN3O3,6.0,2.0,0.0,InChI=1S/C18H16ClN3O3/c1-10-7-14(22-18(25)21-15(23)9-20-22)8-11(2)16(10)17(24)12-3-5-13(19)6-4-1...,YOMWDCALSDWFSV-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,4,307.31,1.46,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.02,1.97,ACID,307.31,3.0,23.0,0.74,307.0957,C17H13N3O3,6.0,1.0,0.0,InChI=1S/C17H13N3O3/c1-11-2-4-12(5-3-11)16(22)13-6-8-14(9-7-13)20-17(23)19-15(21)10-18-20/h2-10H...,PSOPUAQFGCRDIP-UHFFFAOYSA-N,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,5,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.63,2.57,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-13(21-17(24)20-15(22)9-19-21)6-7-14(10)16(23)11-2-4-12(18)5-3-11/h...,KEZNSCMBVRNOHO-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


In [22]:
df_cpd_props_unique = df_cpd_props.drop_duplicates()

In [23]:
# df_cpd_props_unique.to_csv(path_results+"ChEMBL31_cmpd_info.csv")

## Combine initial query with compound properties

In [24]:
df_combined = df_mols_limited.merge(df_cpd_props_unique, on = 'parent_molregno', how = 'inner')

In [25]:
# df_combined.to_csv(path_results+"ChEMBL31_with_cmpd.csv", sep = ';')

In [26]:
############### TESTING ###############
print("All mols:", df_mols_limited.shape)
print("Mols with available properties:", df_combined.shape)
print("Difference:", len(df_mols_limited) - len(df_combined))

All mols: (3149043, 20)
Mols with available properties: (3145693, 45)
Difference: 3350


In [27]:
############### TESTING: mapped target ids comparison ###############
add_intersections(df_mols_limited, "cpd props")

# Calculate LE metrics

Ligand Efficiency (LE):

$\text{LE} = \frac{\Delta\text{G}}{\text{HA}}$
where $ \Delta\text{G} = − RT \ln(K_d)$, $− RT\ln(K_i)$, or $− RT\ln(IC_{50})$

$\text{LE}=\frac{(2.303 \cdot 298 \cdot 0.00199 \cdot \text{pchembl_value})} {\text{heavy_atoms}}$


$\text{BEI}=\frac{\text{pchembl_mean} \cdot 1000} {\text{mw_freebase}}$

$\text{SEI}=\frac{\text{pchembl_mean} \cdot 100} {\text{PSA}}$

$\text{LLE}=\text{pchembl_mean}-\text{ALOGP}$

In [28]:
df_combined['LE'] = df_combined['pchembl_value_mean']/df_combined['heavy_atoms']*(2.303*298*0.00199)
df_combined['BEI'] = df_combined['pchembl_value_mean']*1000/df_combined["mw_freebase"]
df_combined['SEI'] = df_combined['pchembl_value_mean']*100/df_combined["psa"]
df_combined['LLE'] = df_combined['pchembl_value_mean']-df_combined["alogp"]

# Extract drug-target interactions with disease relevance from drug_mechanism table

In [29]:
sql_dti = '''
select distinct mh.parent_molregno, dm.tid, dm.disease_efficacy
from drug_mechanism dm, molecule_hierarchy mh, molecule_dictionary md
where mh.molregno=dm.molregno
and md.molregno=mh.parent_molregno
'''

df_dti = pd.read_sql_query(sql_dti, con=engine_ch)
# drop rows that don't have a tid
df_dti = df_dti.dropna(subset = ['tid'])
df_dti['tid'] = df_dti['tid'].astype('Int64')
df_dti

Unnamed: 0,parent_molregno,tid,disease_efficacy
0,1124,11060,1
1,675068,10193,1
2,1125,10193,1
3,1085,10193,1
4,1124,10193,1
...,...,...,...
4756,1304559,101019,1
4757,1304559,100417,1
4758,2336099,11540,1
4759,2146132,100097,1


In [30]:
# disease_efficacy NUMBER
# Flag to show whether the target assigned is believed to play a role in the efficacy of the drug in the indication(s)
# for which it is approved (1 = yes, 0 = no)
df_dti = df_dti[df_dti['disease_efficacy'] == 1].copy()

In [31]:
# DT-interactions and targets based on drug_mechanisms table
DTIs_original = set(df_dti.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1))
dti_tids_original = set(df_dti['tid'])

# DT-interactions and targets with other target IDs based on mapping
df_dti_add_targets = df_dti.merge(relevant_mappings, on = 'tid', how = 'inner')
DTIs_mapped = set(df_dti_add_targets.agg('{0[parent_molregno]}_{0[related_tid]}'.format, axis=1))
dti_tids_mapped = set(df_dti_add_targets['related_tid'].astype("int"))

# combined DT-interactions and targets
DTIs_set = DTIs_original.union(DTIs_mapped)
dti_tids_set = dti_tids_original.union(dti_tids_mapped)

In [32]:
############### TESTING: drug_mechanisms available data comparison ###############
dti_mols = set(df_dti['parent_molregno'])
print_drug_comparison(dti_mols, output=True)

print('................. Before mapping target IDs .................')
dti_targets = dti_tids_original
print_target_comparison(dti_targets, output=True)

dti_pairs = DTIs_original 
print_drug_pair_comparison(dti_pairs, output=True)

print('................. Including mapping target IDs .................')
dti_mapped_target = dti_tids_set
print_target_comparison(dti_mapped_target, output=True)

dti_mapped_pairs = DTIs_set 
print_drug_pair_comparison(dti_mapped_pairs, output=True)

------
Drugs (molregno)
current:        3528
comparison:     686
intersection:   629
................. Before mapping target IDs .................
------
Targets (tid)
current:        1080
comparison:     908
intersection:   654
------
Drug-target pairs
current:        4535
comparison:     1221
intersection:   653
................. Including mapping target IDs .................
------
Targets (tid)
current:        1369
comparison:     908
intersection:   895
------
Drug-target pairs
current:        8579
comparison:     1221
intersection:   1113


1113

# DTI classification

Identify which TIDs are drug targets from the drug_mechanism table and add field called "defined_DTI": 
    Value: "True" if it is a drug with a curated drug_meachnism and "False" if not.

Map again to the drug mechanism table via "tid" to identify therapeutic targets and add column "therapeutic_target":
    Value: "True" | "False"

Use this logic to define per compound/target pair whether it corresponds to:
    
    drug and its therapeutic target "DTI": "D_DT";
        
    a drug target but not a drug "DTI": "DT";
        
    not a drug and not a drug target "DTI": "NDT";
        
(done by first creating a new column "DT_assoc" in both the master and the mapping table which reflects the molregno-tid association and then mapping it)

In [33]:
drugs_set = set(df_combined[df_combined['max_phase'] == 4]["parent_molregno"])

In [34]:
df_combined['therapeutic_target'] = df_combined['tid'].isin(dti_tids_set)
df_combined['is_drug'] = df_combined['parent_molregno'].isin(drugs_set)
df_combined['DT_assoc'] = df_combined.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1)

In [35]:
df_combined['DTI'] = "Nan"
df_combined.loc[df_combined['DT_assoc'].isin(DTIs_set), 'DTI'] = "D_DT"
df_combined.loc[(df_combined['therapeutic_target']== True) & (df_combined['is_drug']== False), 'DTI'] = "DT"
df_combined.loc[(df_combined['therapeutic_target']== False) & (df_combined['is_drug']== False), 'DTI'] = "NDT"

In [36]:
############### TESTING: before reducing to D_DT and DT ###############
add_intersections(df_combined, "Pre DTI")

In [37]:
# keep only D_DT and DT
df_combined = df_combined[(df_combined['DTI'].isin(['D_DT', 'DT']))]

In [38]:
############### TESTING ###############
# This was supposedly changed to include single proteins for protein families / complexes
df_combined[(df_combined['compound_pref_name'] == "PRAZOSIN") 
            &(df_combined['Target_pref_name'].str.contains("drenergic receptor"))  
            & (df_combined['DTI'] == 'D_DT')
            & (df_combined['only_binding'] == True)][['tid', 'pchembl_value_mean', 'pchembl_value_median', 'pchembl_value_max', 'target_type']]

Unnamed: 0,tid,pchembl_value_mean,pchembl_value_median,pchembl_value_max,target_type
856,125,9.51875,9.49,10.4,SINGLE PROTEIN
857,128,9.283636,9.39,9.68,SINGLE PROTEIN
858,103,9.43,9.485,9.7,SINGLE PROTEIN
859,104723,9.502,9.7,10.27,PROTEIN FAMILY


In [39]:
############### TESTING ###############
# This potentially doesn't make sense: the max pcheml should be the same as the protein family?
df_comparison[(df_comparison['CMPD_PREF_NAME'] == "PRAZOSIN") 
            &(df_comparison['TARGET_PREF_NAME'].str.contains("drenergic receptor"))  
            & (df_comparison['DTI'] == 'D_DT')][['TID', 'PCHEMBL_MEAN', 'PCHEMBL_MEDIAN', 'PCHEMBL_MAX', 'TARGET_TYPE']]

Unnamed: 0,TID,PCHEMBL_MEAN,PCHEMBL_MEDIAN,PCHEMBL_MAX,TARGET_TYPE
65993,128,9.28,9.39,9.68,SINGLE PROTEIN
250309,103,9.43,9.49,9.7,SINGLE PROTEIN
280827,125,9.52,9.49,10.4,SINGLE PROTEIN
493849,104723,9.5,9.7,10.27,PROTEIN FAMILY


# Add scaffold smiles

In [40]:
from tqdm import tqdm

# note: this takes a few minutes to calculate for all molecules
def calculate_scaffolds(smiles_set):
    scaffolds_dict = dict()
    scaffolds_no_stereo_dict = dict()
    for smiles in tqdm(smiles_set):
        mol = Chem.MolFromSmiles(smiles)
        if Chem.rdMolDescriptors.CalcNumRings(mol) == 0:
            continue

        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_dict[smiles] = Chem.MolToSmiles(scaffold)
        
        # repeat after removing stereochemistry
        Chem.RemoveStereochemistry(mol)
        scaffold_no_stereo = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_no_stereo_dict[smiles] = Chem.MolToSmiles(scaffold_no_stereo)
        
    return scaffolds_dict, scaffolds_no_stereo_dict

In [41]:
smiles_set = set(df_combined["canonical_smiles"])
scaffolds_dict, scaffolds_no_stereo_dict = calculate_scaffolds(smiles_set)

100%|██████████████████████████████████████████████████████████████| 479088/479088 [03:27<00:00, 2308.47it/s]


In [42]:
df_combined['scaffold_wo_stereo'] = df_combined['canonical_smiles'].map(scaffolds_no_stereo_dict)
df_combined["scaffold_w_stereo"] = df_combined['canonical_smiles'].map(scaffolds_dict)

# Filter for targets (all assay types) with at least 100 comparator compounds

In [43]:
# consider all assay types
df_combined_all_assays = df_combined[(df_combined['only_binding'] == False)]

In [44]:
min_nof_cpds = 100

df_rest = df_combined_all_assays[df_combined_all_assays['max_phase'] < 4]
comparator_counts = df_rest.groupby(['tid'])['parent_molregno'].count()
targets_w_enough_cpds = comparator_counts[comparator_counts >= min_nof_cpds].index.tolist()
df_filtered_targets = df_combined_all_assays.query('tid in @targets_w_enough_cpds')

In [45]:
# df_filtered_targets.to_csv(path_results+"ChEMBL31_DTI_filtered_targets.csv", sep = ";")

In [46]:
# ############### TESTING ###############
# print("Targets with >= 100 comparators:", len(targets_w_enough_cpds), "\n")  

# for phase in range(1, 5):
#     print("Phase: ", phase)
#     df_phase = df_filtered_targets[df_filtered_targets['max_phase'] == phase]
#     print(df_phase.DTI.value_counts())
#     print("Unique targets with annotated cpds in phase " + str(phase) + ":", df_phase.tid.nunique())
#     print()

In [47]:
############### TESTING: all assay types ###############
add_intersections(df_combined_all_assays, "all assays")
add_intersections(df_filtered_targets, "all, >= 100")

# Filter for targets (only binding assays) with at least 100 comparator compounds

In [48]:
# consider only binding assays and therapeutic targets
df_comb_B = df_combined[(df_combined['only_binding'] == True)]

In [49]:
min_nof_cpds = 100

df_rest_B = df_comb_B[df_comb_B['max_phase'] < 4]
comparator_counts_B = df_rest_B.groupby(['tid'])['parent_molregno'].count()
targets_w_enough_cpds_B = comparator_counts_B[comparator_counts_B >= min_nof_cpds].index.tolist()
df_filtered_targets_B = df_comb_B.query('tid in @targets_w_enough_cpds_B')

In [50]:
# ############### TESTING ###############
# print("Targets with >= 100 comparators:", len(targets_w_enough_cpds_B), "\n")  

# for phase in range(1, 5):
#     print("Phase: ", phase)
#     df_phase = df_filtered_targets_B[df_filtered_targets_B['max_phase'] == phase]
#     print(df_phase.DTI.value_counts())
#     print("unique targets with annotated cpds in phase " + str(phase) + ":", df_phase.tid.nunique())
#     print()

In [51]:
############### TESTING: binding assays ###############
add_intersections(df_comb_B, "binding")
add_intersections(df_filtered_targets_B, "b, >= 100")

In [52]:
############### TESTING: deleopment of intersection(curr_data, old_dataset) ###############
pd.DataFrame(all_intersect_comp,
                   columns=['type', 'mols', 'drugs', 'targets', 'cpd_target', 'drug_target'])

Unnamed: 0,type,mols,drugs,targets,cpd_target,drug_target
0,comparison,378661,686,908,561452,1221
1,init,378661,616,908,561452,1122
2,cpd props,378661,616,908,561452,1122
3,Pre DTI,378661,616,908,561452,1122
4,all assays,375142,595,890,552523,1077
5,"all, >= 100",370294,558,549,543608,989
6,binding,375116,595,890,552523,1077
7,"b, >= 100",369260,555,522,541949,978


In [53]:
############### TESTING: deleopment of intersection(curr_data, old_dataset) limited to >= 100 cpds ###############
pd.DataFrame(all_intersect_comp_100,
                   columns=['type', 'mols', 'drugs', 'targets', 'cpd_target', 'drug_target'])

Unnamed: 0,type,mols,drugs,targets,cpd_target,drug_target
0,comparison_100,372275,642,519,549498,1102
1,init,372275,572,519,549498,1007
2,cpd props,372275,572,519,549498,1007
3,Pre DTI,372275,572,519,549498,1007
4,all assays,368956,555,514,541349,973
5,"all, >= 100",368944,555,514,541349,973
6,binding,368931,555,514,541349,973
7,"b, >= 100",368919,555,514,541349,973


In [54]:
############### TESTING: deleopment of size(curr_data) vs. size(old_dataset) ###############
pd.DataFrame(all_length_comp,
                   columns=['type', 'mols', 'drugs', 'targets', 'cpd_target', 'drug_target'])

Unnamed: 0,type,mols,drugs,targets,cpd_target,drug_target
0,comparison,378661,686,908,561452,1221
1,comparison_100,372275,642,519,549498,1102
2,init,923862,1637,6093,2088251,21164
3,cpd props,923862,1637,6093,2088251,21164
4,Pre DTI,922987,1623,6087,2085831,21056
5,all assays,479110,691,916,716212,1225
6,"all, >= 100",473191,650,551,705522,1124
7,binding,383867,656,902,566068,1172
8,"b, >= 100",377462,613,522,554684,1068
