# Notebook to extract and curate ChEMBL data for the Leeson data set (drug-target interactions)

### Author: Barbara Zdrazil
### 02/09/2022

##### This notebook extracts data from ChEMBL and performs some curation steps in order to retrieve a data set for drug-target, and clinical candidate-target associations including comparator compounds.
##### The notebook is based on initial work by Anne Hersey, Patrica Bento, Emma Manners, Paul Leeson, and Andrew Leach..
##### More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes


In [1]:
import pandas as pd
import numpy as np
import re

#### notebook settings
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

### Get data from ChEMBL

In [2]:
# # @Barbara: uncomment this and modify this to your preferred paths
# path_results = "/Users/bzdrazil/Dropbox/ChEMBL/NP/data/"
# path_sqlite3_database = <your sqlite database location>

base_path = "/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/"
path_results = base_path+"results/"
path_sqlite3_database = base_path+"data/chembl_31/chembl_31_sqlite/chembl_31.db"

In [3]:
# # @Barbara: Accessing ChEMBL using Oracle
# # If you want to use this option, you have to change the sql statements to the commented line:
# # from docs -> from chembl_31.docs

# import cx_Oracle

# #cx_Oracle.init_oracle_client(lib_dir="/Users/bzdrazil/Downloads/instantclient_19_8")  #https://www.oracle.com/in/database/technologies/instant-client/macos-intel-x86-downloads.html; https://stackoverflow.com/questions/56119490/cx-oracle-error-dpi-1047-cannot-locate-a-64-bit-oracle-client-library
# cx_Oracle.clientversion() 

# import pandas as pd
# import sqlalchemy as sa
# import requests

# workdir = '/Users/bzdrazil/Desktop/'

# chemdev2 = 'oracle://{}:{}@ora-dlvm-103.ebi.ac.uk:1521/?service_name=chemdev2'.format('user', 'pw') # insert your username and password
# engine_ch = sa.create_engine(chemdev2)

In [4]:
import sqlite3

engine_ch = sqlite3.connect(path_sqlite3_database)

In [5]:
sql = '''

select distinct mh.molregno, docs.year, docs.journal, act.pchembl_value,act.standard_type, ass.assay_type, md.chembl_id as compound_chembl_id, md.pref_name as compound_pref_name ,md.max_phase, md.first_approval, md.prodrug, md.oral, md.parenteral, md.topical, md.black_box_warning, ass.tid, td.pref_name as Target_pref_name, td.target_type, td.organism, td.chembl_id as Target_chembl_id
from docs, activities act, molecule_hierarchy mh, assays ass, target_dictionary td, molecule_dictionary md
-- from chembl_31.docs, chembl_31.activities act,chembl_31.molecule_hierarchy mh, chembl_31.assays ass,chembl_31.target_dictionary td,chembl_31.molecule_dictionary md
where mh.molregno=act.molregno
and act.pchembl_value is not null
-- and ass.assay_type ='B'
and act.assay_id=ass.assay_id
and act.doc_id = docs.doc_id
and ass.tid=td.tid
and md.molregno=mh.parent_molregno
and act.potential_duplicate =0
and data_validity_comment is null
and act.standard_relation ='='
and td.tid <>22226   ----exclude unchecked targets
and td.target_type like '%PROTEIN%'

'''

data = pd.read_sql_query(sql, con=engine_ch)
data

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id
0,252199,2004.0,Bioorg. Med. Chem. Lett.,5.40,IC50,B,CHEMBL357278,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
1,253534,2004.0,Bioorg. Med. Chem. Lett.,4.77,IC50,B,CHEMBL357119,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
2,253199,2004.0,Bioorg. Med. Chem. Lett.,6.75,IC50,B,CHEMBL152968,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
3,253199,2004.0,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356
4,253199,2004.0,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595319,2408605,2021.0,,7.26,Kd,B,CHEMBL4538174,ERKi,0,,-1,0,0,0,0,101602,Serine/threonine-protein kinase LATS2,SINGLE PROTEIN,Homo sapiens,CHEMBL5907
2595320,2408605,2021.0,,7.01,Kd,B,CHEMBL4538174,ERKi,0,,-1,0,0,0,0,10811,Rho-associated protein kinase 1,SINGLE PROTEIN,Homo sapiens,CHEMBL3231
2595321,2408605,2021.0,,7.09,Kd,B,CHEMBL4538174,ERKi,0,,-1,0,0,0,0,11149,Rho-associated protein kinase 2,SINGLE PROTEIN,Homo sapiens,CHEMBL2973
2595322,2408605,2021.0,,7.27,Kd,B,CHEMBL4538174,ERKi,0,,-1,0,0,0,0,100075,TGF-beta receptor type II,SINGLE PROTEIN,Homo sapiens,CHEMBL4267


In [6]:
df = pd.DataFrame(data)

In [7]:
print("columns:")
print(df.columns)

print("\nshape:")
print(df.shape)

print("\ninfo:")
print(df.info())

columns:
Index(['molregno', 'year', 'journal', 'pchembl_value', 'standard_type',
       'assay_type', 'compound_chembl_id', 'compound_pref_name', 'max_phase',
       'first_approval', 'prodrug', 'oral', 'parenteral', 'topical',
       'organism', 'Target_chembl_id'],
      dtype='object')

shape:
(2595324, 20)

info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2595324 entries, 0 to 2595323
Data columns (total 20 columns):
 #   Column              Dtype  
---  ------              -----  
 0   molregno            int64  
 1   year                float64
 2   journal             object 
 3   pchembl_value       float64
 4   standard_type       object 
 5   assay_type          object 
 6   compound_chembl_id  object 
 7   compound_pref_name  object 
 8   max_phase           int64  
 9   first_approval      float64
 10  prodrug             int64  
 11  oral                int64  
 12  parenteral          int64  
 13  topical             int64  
 15  tid                 int64  
 16  Ta

In [8]:
df = df.astype({
    'year': 'Int64',
    'first_approval': 'Int64'
})

In [9]:
df.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id
0,252199,2004,Bioorg. Med. Chem. Lett.,5.4,IC50,B,CHEMBL357278,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
1,253534,2004,Bioorg. Med. Chem. Lett.,4.77,IC50,B,CHEMBL357119,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
2,253199,2004,Bioorg. Med. Chem. Lett.,6.75,IC50,B,CHEMBL152968,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632
3,253199,2004,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356
4,253199,2004,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340


In [10]:
df.to_csv(path_results+"ChEMBL31_initial_query.csv")

### Compound properties

In [11]:
sql_cpd_props = '''

select cp.*, struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
from compound_properties cp, molecule_hierarchy mh, compound_structures struct
-- from chembL_31.compound_properties cp,chembl_31.molecule_hierarchy mh, CHEMBL_31.compound_structures struct
where cp.molregno=mh.parent_molregno
and struct.molregno=mh.parent_molregno

'''

data_cpd = pd.read_sql_query(sql_cpd_props, con=engine_ch)
data_cpd

Unnamed: 0,molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,1,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.48,,3.63,2.69,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-11(21-17(24)20-15(22)9-19-21)6-7-12(10)16(23)13-4-2-3-5-14(13)18/h...,OWRSAHYFSSNENM-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,2,332.32,1.33,6.0,1.0,108.61,3.0,N,0.0,6.33,,2.88,1.82,ACID,332.32,3.0,25.0,0.73,332.0909,C18H12N4O3,7.0,1.0,0.0,InChI=1S/C18H12N4O3/c1-11-8-14(22-18(25)21-16(23)10-20-22)6-7-15(11)17(24)13-4-2-12(9-19)3-5-13/...,ZJYUMURGSZQFMH-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,3,357.80,2.27,5.0,2.0,87.98,3.0,N,0.0,6.33,,3.70,2.64,ACID,357.80,3.0,25.0,0.75,357.0880,C18H16ClN3O3,6.0,2.0,0.0,InChI=1S/C18H16ClN3O3/c1-10-7-14(22-18(25)21-15(23)9-20-22)8-11(2)16(10)17(24)12-3-5-13(19)6-4-1...,YOMWDCALSDWFSV-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,4,307.31,1.46,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.02,1.97,ACID,307.31,3.0,23.0,0.74,307.0957,C17H13N3O3,6.0,1.0,0.0,InChI=1S/C17H13N3O3/c1-11-2-4-12(5-3-11)16(22)13-6-8-14(9-7-13)20-17(23)19-15(21)10-18-20/h2-10H...,PSOPUAQFGCRDIP-UHFFFAOYSA-N,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,5,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.63,2.57,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-13(21-17(24)20-15(22)9-19-21)6-7-14(10)16(23)11-2-4-12(18)5-3-11/h...,KEZNSCMBVRNOHO-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220527,2713769,232.28,2.27,2.0,1.0,41.57,1.0,Y,0.0,13.52,,1.79,1.79,NEUTRAL,232.28,1.0,17.0,0.81,232.1212,C13H16N2O2,4.0,1.0,0.0,"InChI=1S/C13H16N2O2/c1-3-15-12(16)14-10-8-13(15,2)17-11-7-5-4-6-9(10)11/h4-7,10H,3,8H2,1-2H3,(H,...",IXPLPFXPVINMGZ-UHFFFAOYSA-N,CCN1C(=O)NC2CC1(C)Oc1ccccc12
2220528,2713770,251.37,4.56,1.0,1.0,12.03,3.0,N,0.0,,4.45,4.46,4.46,NEUTRAL,251.37,2.0,19.0,0.84,251.1674,C18H21N,1.0,1.0,0.0,"InChI=1S/C18H21N/c1-3-18(2,14-9-5-4-6-10-14)16-13-19-17-12-8-7-11-15(16)17/h4-12,16,19H,3,13H2,1...",ZARYWWBNZPSIGT-UHFFFAOYSA-N,CCC(C)(c1ccccc1)C1CNc2ccccc21
2220529,2713771,534.81,6.31,5.0,2.0,80.68,19.0,N,2.0,,2.76,0.79,0.79,NEUTRAL,534.81,1.0,38.0,0.16,534.4265,C31H56N3O4+,7.0,2.0,2.0,"InChI=1S/C31H55N3O4/c1-7-8-9-10-11-12-13-14-15-16-17-19-29(36)32-20-18-21-34(5,6)22-28-26(23-35)...",GFPIKIGKCBEABU-UHFFFAOYSA-O,CCCCCCCCCCCCCC(=O)NCCC[N+](C)(C)Cc1nc(C)c2c(c1CO)COC(C)(C)O2
2220530,2713772,309.33,4.40,2.0,1.0,35.25,4.0,N,0.0,,8.76,4.71,3.34,BASE,309.33,2.0,22.0,0.91,309.1340,C17H18F3NO,2.0,2.0,0.0,"InChI=1S/C17H18F3NO/c1-11-4-3-5-12(2)16(11)22-10-15(21)13-6-8-14(9-7-13)17(18,19)20/h3-9,15H,10,...",GDTGLBUCEJWYCV-UHFFFAOYSA-N,Cc1cccc(C)c1OCC(N)c1ccc(C(F)(F)F)cc1


In [12]:
df_cpd = pd.DataFrame(data_cpd)

In [13]:
df_cpd.molregno.nunique()

2194281

In [14]:
df_cpd_unique = df_cpd.drop_duplicates()

In [15]:
print(df_cpd.shape)
print(df_cpd_unique.shape)

(2220532, 26)
(2194281, 26)


In [16]:
df_cpd_unique.to_csv(path_results+"ChEMBL31_cmpd_info.csv")

In [17]:
df_comb = df.merge(df_cpd_unique, on = 'molregno', how = 'inner')

In [18]:
df_comb.to_csv(path_results+"ChEMBL31_with_cmpd.csv", sep = ';')

In [19]:
print(df.shape)
print(df_comb.shape)
df_comb.head()

(2595324, 20)
(2483167, 45)


Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,252199,2004,Bioorg. Med. Chem. Lett.,5.4,IC50,B,CHEMBL357278,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,543.01,4.27,7.0,2.0,77.93,8.0,N,1.0,13.21,6.75,3.71,3.62,NEUTRAL,543.01,3.0,36.0,0.44,542.1366,C24H26ClF3N4O3S,7.0,2.0,1.0,InChI=1S/C24H26ClF3N4O3S/c1-15-29-21-11-18(3-5-22(21)36-15)35-14-17(33)12-31-6-8-32(9-7-31)13-23...,LFSFDIDSIQKNDC-QGZVFWFLSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(F)(F)F)c4)CC3)ccc2s1
1,253534,2004,Bioorg. Med. Chem. Lett.,4.77,IC50,B,CHEMBL357119,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,468.62,2.32,7.0,2.0,77.93,10.0,N,0.0,,6.82,2.22,2.12,NEUTRAL,468.62,3.0,33.0,0.48,468.2195,C25H32N4O3S,7.0,2.0,0.0,InChI=1S/C25H32N4O3S/c1-19-27-23-15-22(7-8-24(23)33-19)32-18-21(30)16-28-11-13-29(14-12-28)17-25...,LCCPHXWRAPZDSN-OAQYLSRUSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC3)ccc2s1
2,253199,2004,Bioorg. Med. Chem. Lett.,6.75,IC50,B,CHEMBL152968,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,516.67,4.27,7.0,2.0,77.93,9.0,N,1.0,13.29,6.77,3.87,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1
3,253199,2004,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,516.67,4.27,7.0,2.0,77.93,9.0,N,1.0,13.29,6.77,3.87,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1
4,253199,2004,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,516.67,4.27,7.0,2.0,77.93,9.0,N,1.0,13.29,6.77,3.87,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1


### Calculate mean, median, and max pChEMBL values for each compound-target pair

In [20]:
df_comb['pchembl_value_mean'] = df_comb.groupby(['molregno', 'tid'])['pchembl_value'].transform('mean')
df_comb['pchembl_value_max'] = df_comb.groupby(['molregno', 'tid'])['pchembl_value'].transform('max')
df_comb['pchembl_value_median'] = df_comb.groupby(['molregno', 'tid'])['pchembl_value'].transform('median')

In [21]:
df_comb['first_publication_target_cmpd_pair'] = df_comb.groupby(['molregno', 'tid'])['year'].transform('min')

In [22]:
df_comb.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair
0,252199,2004,Bioorg. Med. Chem. Lett.,5.4,IC50,B,CHEMBL357278,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,543.01,4.27,7.0,2.0,77.93,8.0,N,1.0,13.21,6.75,3.71,3.62,NEUTRAL,543.01,3.0,36.0,0.44,542.1366,C24H26ClF3N4O3S,7.0,2.0,1.0,InChI=1S/C24H26ClF3N4O3S/c1-15-29-21-11-18(3-5-22(21)36-15)35-14-17(33)12-31-6-8-32(9-7-31)13-23...,LFSFDIDSIQKNDC-QGZVFWFLSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(F)(F)F)c4)CC3)ccc2s1,5.4,5.4,5.4,2004
1,253534,2004,Bioorg. Med. Chem. Lett.,4.77,IC50,B,CHEMBL357119,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,468.62,2.32,7.0,2.0,77.93,10.0,N,0.0,,6.82,2.22,2.12,NEUTRAL,468.62,3.0,33.0,0.48,468.2195,C25H32N4O3S,7.0,2.0,0.0,InChI=1S/C25H32N4O3S/c1-19-27-23-15-22(7-8-24(23)33-19)32-18-21(30)16-28-11-13-29(14-12-28)17-25...,LCCPHXWRAPZDSN-OAQYLSRUSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC3)ccc2s1,4.77,4.77,4.77,2004
2,253199,2004,Bioorg. Med. Chem. Lett.,6.75,IC50,B,CHEMBL152968,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,516.67,4.27,7.0,2.0,77.93,9.0,N,1.0,13.29,6.77,3.87,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,6.75,6.75,6.75,2004
3,253199,2004,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,516.67,4.27,7.0,2.0,77.93,9.0,N,1.0,13.29,6.77,3.87,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,5.22,5.22,5.22,2004
4,253199,2004,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,516.67,4.27,7.0,2.0,77.93,9.0,N,1.0,13.29,6.77,3.87,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,4.43,4.43,4.43,2004


### Calculate LE metrics

Ligand Efficiency (LE):

$\text{LE} = \frac{\Delta\text{G}}{\text{HA}}$
where $ \Delta\text{G} = − RT \ln(K_d)$, $− RT\ln(K_i)$, or $− RT\ln(IC_{50})$

$\text{LE}=\frac{(2.303 \cdot 298 \cdot 0.00199 \cdot \text{pchembl_value})} {\text{heavy_atoms}}$


$\text{BEI}=\frac{\text{pchembl_mean} \cdot 1000} {\text{mw_freebase}}$

$\text{SEI}=\frac{\text{pchembl_mean} \cdot 100} {\text{PSA}}$

$\text{LLE}=\text{pchembl_mean}-\text{ALOGP}$

In [23]:
df_comb['LE'] = df_comb['pchembl_value_mean']/df_comb['heavy_atoms']*(2.303*298*0.00199)
df_comb['BEI'] = df_comb['pchembl_value_mean']*1000/df_comb["mw_freebase"]
df_comb['SEI'] = df_comb['pchembl_value_mean']*100/df_comb["psa"]
df_comb['LLE'] = df_comb['pchembl_value_mean']-df_comb["alogp"]

### Extract drug-target interactions with disease relevance from drug_mechanism table

In [24]:
sql_dti = '''
select distinct molregno, mechanism_of_action, tid, disease_efficacy 
-- from chembl_31.drug_mechanism
from drug_mechanism
'''

data_dti = pd.read_sql_query(sql_dti, con=engine_ch)
data_dti

Unnamed: 0,molregno,mechanism_of_action,tid,disease_efficacy
0,1124,Carbonic anhydrase VII inhibitor,11060.0,1
1,675068,Carbonic anhydrase I inhibitor,10193.0,1
2,674765,Carbonic anhydrase I inhibitor,10193.0,1
3,1085,Carbonic anhydrase I inhibitor,10193.0,1
4,1125,Carbonic anhydrase I inhibitor,10193.0,1
...,...,...,...,...
6635,2486701,IL36 receptor antagonist,120084.0,1
6636,2335784,CBP/beta catenin inhibitor,120086.0,1
6637,2336055,T cell surface glycoprotein CD3 binding agent,106147.0,1
6638,2197740,Casein kinase I epsilon inhibitor,10967.0,1


In [25]:
data_dti_all_tids = data_dti[data_dti['disease_efficacy'] == 1].copy()

In [26]:
data_dti_all_tids['tid'] = data_dti_all_tids['tid'].astype('Int64')

In [27]:
print("data_dti: ", len(data_dti))

print("\ndata_dti_all_tids: ") 
print(len(data_dti_all_tids))
print(data_dti_all_tids.dtypes)

data_dti:  6640

data_dti_all_tids: 
6638
molregno                int64
mechanism_of_action    object
tid                     Int64
disease_efficacy        int64
dtype: object


In [28]:
list_dti_tid = data_dti_all_tids['tid'].to_list()
list_dti = data_dti['molregno'].to_list()

Identify which TIDs are drug targets from the drug_mechanism table and add field called "defined_DTI": 
    Value: "True" if it is a drug with a curated drug_meachnism and "False" if not.

Map again to the drug mechanism table via "tid" to identify therapeutic targets and add column "Therapeutic_Target":
    Value: "True" | "False"

Use this logic to define per compound/target pair whether it corresponds to:
    
    drug and its therapeutic target "DTI": "D_DT";
        
    a drug target but not a drug "DTI": "DT";
        
    not a drug and not a drug target "DTI": "NDT";
        
(done by first creating a new column "DT_assoc" in both the master and the mapping table which reflects the molregno-tid association and then mapping it)

In [29]:
df_comb['Therapeutic_Target'] = df_comb['tid'].isin(list_dti_tid)

In [30]:
df_comb['defined_DTI'] = df_comb['molregno'].isin(list_dti)

In [31]:
data_dti_all_tids['DT_assoc'] = data_dti_all_tids.agg('{0[molregno]}_{0[tid]}'.format, axis=1)
data_dti_all_tids

Unnamed: 0,molregno,mechanism_of_action,tid,disease_efficacy,DT_assoc
0,1124,Carbonic anhydrase VII inhibitor,11060,1,1124_11060
1,675068,Carbonic anhydrase I inhibitor,10193,1,675068_10193
2,674765,Carbonic anhydrase I inhibitor,10193,1,674765_10193
3,1085,Carbonic anhydrase I inhibitor,10193,1,1085_10193
4,1125,Carbonic anhydrase I inhibitor,10193,1,1125_10193
...,...,...,...,...,...
6635,2486701,IL36 receptor antagonist,120084,1,2486701_120084
6636,2335784,CBP/beta catenin inhibitor,120086,1,2335784_120086
6637,2336055,T cell surface glycoprotein CD3 binding agent,106147,1,2336055_106147
6638,2197740,Casein kinase I epsilon inhibitor,10967,1,2197740_10967


In [32]:
DTI_list = data_dti_all_tids['DT_assoc'].to_list()

In [33]:
df_comb['DT_assoc'] = df_comb.agg('{0[molregno]}_{0[tid]}'.format, axis=1)

In [34]:
df_comb.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc
0,252199,2004,Bioorg. Med. Chem. Lett.,5.4,IC50,B,CHEMBL357278,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,543.01,4.27,7.0,2.0,77.93,...,3.62,NEUTRAL,543.01,3.0,36.0,0.44,542.1366,C24H26ClF3N4O3S,7.0,2.0,1.0,InChI=1S/C24H26ClF3N4O3S/c1-15-29-21-11-18(3-5-22(21)36-15)35-14-17(33)12-31-6-8-32(9-7-31)13-23...,LFSFDIDSIQKNDC-QGZVFWFLSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(F)(F)F)c4)CC3)ccc2s1,5.4,5.4,5.4,2004,0.204859,9.944568,6.929296,1.13,False,False,252199_10483
1,253534,2004,Bioorg. Med. Chem. Lett.,4.77,IC50,B,CHEMBL357119,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,468.62,2.32,7.0,2.0,77.93,...,2.12,NEUTRAL,468.62,3.0,33.0,0.48,468.2195,C25H32N4O3S,7.0,2.0,0.0,InChI=1S/C25H32N4O3S/c1-19-27-23-15-22(7-8-24(23)33-19)32-18-21(30)16-28-11-13-29(14-12-28)17-25...,LCCPHXWRAPZDSN-OAQYLSRUSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC3)ccc2s1,4.77,4.77,4.77,2004,0.197409,10.178823,6.120878,2.45,False,False,253534_10483
2,253199,2004,Bioorg. Med. Chem. Lett.,6.75,IC50,B,CHEMBL152968,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,516.67,4.27,7.0,2.0,77.93,...,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,6.75,6.75,6.75,2004,0.249153,13.064432,8.661619,2.48,False,False,253199_10483
3,253199,2004,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,516.67,4.27,7.0,2.0,77.93,...,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,5.22,5.22,5.22,2004,0.192678,10.103161,6.698319,0.95,False,False,253199_12594
4,253199,2004,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,516.67,4.27,7.0,2.0,77.93,...,3.78,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,4.43,4.43,4.43,2004,0.163518,8.574138,5.684589,0.16,True,False,253199_17045


In [35]:
df_comb['DTI'] = "Nan"
df_comb.loc[df_comb['DT_assoc'].isin(DTI_list), 'DTI'] = "D_DT"
df_comb.loc[(df_comb['Therapeutic_Target']== True) & (df_comb['defined_DTI']== False), 'DTI'] = "DT"
df_comb.loc[(df_comb['Therapeutic_Target']== False) & (df_comb['defined_DTI']== False), 'DTI'] = "NDT"

In [36]:
df_comb.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
0,252199,2004,Bioorg. Med. Chem. Lett.,5.4,IC50,B,CHEMBL357278,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,543.01,4.27,7.0,2.0,77.93,...,NEUTRAL,543.01,3.0,36.0,0.44,542.1366,C24H26ClF3N4O3S,7.0,2.0,1.0,InChI=1S/C24H26ClF3N4O3S/c1-15-29-21-11-18(3-5-22(21)36-15)35-14-17(33)12-31-6-8-32(9-7-31)13-23...,LFSFDIDSIQKNDC-QGZVFWFLSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(F)(F)F)c4)CC3)ccc2s1,5.4,5.4,5.4,2004,0.204859,9.944568,6.929296,1.13,False,False,252199_10483,NDT
1,253534,2004,Bioorg. Med. Chem. Lett.,4.77,IC50,B,CHEMBL357119,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,468.62,2.32,7.0,2.0,77.93,...,NEUTRAL,468.62,3.0,33.0,0.48,468.2195,C25H32N4O3S,7.0,2.0,0.0,InChI=1S/C25H32N4O3S/c1-19-27-23-15-22(7-8-24(23)33-19)32-18-21(30)16-28-11-13-29(14-12-28)17-25...,LCCPHXWRAPZDSN-OAQYLSRUSA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC3)ccc2s1,4.77,4.77,4.77,2004,0.197409,10.178823,6.120878,2.45,False,False,253534_10483,NDT
2,253199,2004,Bioorg. Med. Chem. Lett.,6.75,IC50,B,CHEMBL152968,,0,,-1,0,0,0,0,10483,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,516.67,4.27,7.0,2.0,77.93,...,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,6.75,6.75,6.75,2004,0.249153,13.064432,8.661619,2.48,False,False,253199_10483,NDT
3,253199,2004,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,516.67,4.27,7.0,2.0,77.93,...,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,5.22,5.22,5.22,2004,0.192678,10.103161,6.698319,0.95,False,False,253199_12594,NDT
4,253199,2004,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,516.67,4.27,7.0,2.0,77.93,...,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,4.43,4.43,4.43,2004,0.163518,8.574138,5.684589,0.16,True,False,253199_17045,DT


In [37]:
df_comb['DTI'].value_counts()

NDT     1351251
DT      1086498
Nan       37134
D_DT       8284
Name: DTI, dtype: int64

In [38]:
df_comb.max_phase.value_counts()

0    2409005
4      46479
2      11555
3       9936
1       6192
Name: max_phase, dtype: int64

In [39]:
df_4 = df_comb[df_comb['max_phase'] == 4]

In [40]:
df_4.DTI.value_counts()

Nan     22667
DT       9743
NDT      9188
D_DT     4881
Name: DTI, dtype: int64

In [41]:
df_4_D_DT = df_4[df_4['DTI'] == 'D_DT']

In [42]:
df_4_D_DT.tid.nunique()  #218 unique targets with annotated drugs

226

In [43]:
df_3 = df_comb[df_comb['max_phase'] == 3]

In [44]:
df_3.DTI.value_counts()

Nan     5170
NDT     1815
DT      1645
D_DT    1306
Name: DTI, dtype: int64

In [45]:
df_3_D_DT = df_3[df_3['DTI'] == 'D_DT']

In [46]:
df_3_D_DT.tid.nunique()   #176 unique targets with clincal candidates in phase 3

181

In [47]:
df_2 = df_comb[df_comb['max_phase'] == 2]

In [48]:
df_2.DTI.value_counts()   

Nan     6031
NDT     2160
DT      1707
D_DT    1657
Name: DTI, dtype: int64

In [49]:
df_2_D_DT = df_2[df_2['DTI'] == 'D_DT']

In [50]:
df_2_D_DT.tid.nunique()   #221 unique targets with clincal candidates in phase 2

229

In [51]:
df_1 = df_comb[df_comb['max_phase'] == 1]

In [52]:
df_1.DTI.value_counts()

Nan     3177
NDT     1335
DT      1271
D_DT     409
Name: DTI, dtype: int64

In [53]:
df_1_D_DT = df_1[df_1['DTI'] == 'D_DT']

In [54]:
df_1_D_DT.tid.nunique()     #71 unique targets with clincal candidates in phase 1

72

In [55]:
df_comb.tid.nunique()  # No of targets all together

6710

### Filter for targets with at least 100 comparator compounds

In [56]:
df_rest = df_comb[df_comb['max_phase'] == 0]

In [57]:
df_rest.tid.nunique()

6456

In [58]:
df_drugs_clin_cand = df_comb[df_comb['max_phase'] != 0]

In [59]:
df_drugs_clin_cand.tid.nunique()

3340

In [60]:
s = df_rest.groupby(['tid'])['molregno'].count()

In [61]:
s

tid
1          137
2           52
3         1872
4          442
6         1161
          ... 
120523      91
120525       9
120526       9
120529       5
120530       1
Name: molregno, Length: 6456, dtype: int64

In [62]:
len(s)

6456

In [63]:
list = s[s >= 100].index.tolist()

In [64]:
len(list)  # No of targets with at least 100 comparator compounds

1689

In [65]:
#df.to_csv(path_results+"ChEMBL31_DTI.tsv")

In [66]:
df_filtered_targets = df_comb.query('tid in @list')

In [67]:
len(df_filtered_targets)

2378379

In [68]:
df_filtered_targets.tid.nunique()

1689

In [69]:
df_filtered_targets.max_phase.value_counts()

0    2314685
4      40500
2       9734
3       8407
1       5053
Name: max_phase, dtype: int64

In [70]:
df_filtered_targets.to_csv(path_results+"ChEMBL31_DTI_filtered_targets.csv", sep = ";")

In [71]:
df_filtered_targets_4 = df_filtered_targets[df_filtered_targets['max_phase'] == 4]
df_filtered_targets_4_D_DT = df_filtered_targets_4[df_filtered_targets_4['DTI'] == 'D_DT']
df_filtered_targets_4_D_DT.tid.nunique() 

193

In [72]:
df_filtered_targets_3 = df_filtered_targets[df_filtered_targets['max_phase'] == 3]
df_filtered_targets_3_D_DT = df_filtered_targets_3[df_filtered_targets_3['DTI'] == 'D_DT']
df_filtered_targets_3_D_DT.tid.nunique() 

164

In [73]:
df_filtered_targets_2 = df_filtered_targets[df_filtered_targets['max_phase'] == 2]
df_filtered_targets_2_D_DT = df_filtered_targets_2[df_filtered_targets_2['DTI'] == 'D_DT']
df_filtered_targets_2_D_DT.tid.nunique() 

220

In [74]:
df_filtered_targets_1 = df_filtered_targets[df_filtered_targets['max_phase'] == 1]
df_filtered_targets_1_D_DT = df_filtered_targets_1[df_filtered_targets_1['DTI'] == 'D_DT']
df_filtered_targets_1_D_DT.tid.nunique() 

70

In [75]:
df_filtered_targets.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
3,253199,2004,Bioorg. Med. Chem. Lett.,5.22,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12594,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,516.67,4.27,7.0,2.0,77.93,...,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,5.22,5.22,5.22,2004,0.192678,10.103161,6.698319,0.95,False,False,253199_12594,NDT
4,253199,2004,Bioorg. Med. Chem. Lett.,4.43,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,17045,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,516.67,4.27,7.0,2.0,77.93,...,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,4.43,4.43,4.43,2004,0.163518,8.574138,5.684589,0.16,True,False,253199_17045,DT
5,253199,2004,Bioorg. Med. Chem. Lett.,4.62,IC50,A,CHEMBL152968,,0,,-1,0,0,0,0,12911,Cytochrome P450 2C9,SINGLE PROTEIN,Homo sapiens,CHEMBL3397,516.67,4.27,7.0,2.0,77.93,...,NEUTRAL,516.67,4.0,37.0,0.35,516.2195,C29H32N4O3S,7.0,2.0,1.0,InChI=1S/C29H32N4O3S/c1-21-30-27-17-26(10-11-28(27)37-21)36-20-25(34)18-32-12-14-33(15-13-32)19-...,SZRMSHBRXJWBFG-RUZDIDTESA-N,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccccc5)c4)CC3)ccc2s1,4.62,4.62,4.62,2004,0.170531,8.941878,5.928397,0.35,False,False,253199_12911,NDT
7,933,2003,J. Med. Chem.,8.19,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,10193,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,CHEMBL261,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.34,8.22,7.49,2003,0.501221,21.88497,4.64175,7.77,True,False,933_10193,DT
8,933,2004,J. Med. Chem.,8.69,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,15,Carbonic anhydrase II,SINGLE PROTEIN,Homo sapiens,CHEMBL205,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,8.1625,8.7,8.305,2004,0.557387,24.337339,5.161892,8.5925,True,False,933_15,DT


In [76]:
df_filtered_targets.assay_type.value_counts()

B    1224799
F    1113524
A      39931
T         63
U         62
Name: assay_type, dtype: int64

## Only B assays: filter for at least 100 comparator compounds

In [77]:
df_comb_B = df_comb[df_comb['assay_type'] == 'B']

In [78]:
df_rest_B = df_comb_B[df_comb_B['max_phase'] < 4]

In [79]:
s = df_rest_B.groupby(['tid'])['molregno'].count()

In [80]:
len(s)

6224

In [81]:
list = s[s >= 100].index.tolist()

In [82]:
len(list)

1463

In [83]:
df_filtered_targets_B = df_comb_B.query('tid in @list')

In [84]:
len(df_filtered_targets_B)

1216863

In [85]:
df_filtered_targets_B.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
7,933,2003,J. Med. Chem.,8.19,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,10193,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,CHEMBL261,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.34,8.22,7.49,2003,0.501221,21.88497,4.64175,7.77,True,False,933_10193,DT
8,933,2004,J. Med. Chem.,8.69,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,15,Carbonic anhydrase II,SINGLE PROTEIN,Homo sapiens,CHEMBL205,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,8.1625,8.7,8.305,2004,0.557387,24.337339,5.161892,8.5925,True,False,933_15,DT
9,933,2004,J. Med. Chem.,8.29,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,11035,Carbonic anhydrase IV,SINGLE PROTEIN,Bos taurus,CHEMBL281,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,8.29,8.29,8.29,2004,0.566093,24.717493,5.242522,8.72,False,False,933_11035,NDT
10,933,2005,Bioorg. Med. Chem. Lett.,7.89,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,11063,Carbonic anhydrase XIV,SINGLE PROTEIN,Homo sapiens,CHEMBL3510,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.89,7.89,7.89,2005,0.538779,23.524852,4.989566,8.32,False,False,933_11063,NDT
11,933,2005,Bioorg. Med. Chem. Lett.,7.07,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,12209,Carbonic anhydrase XII,SINGLE PROTEIN,Homo sapiens,CHEMBL3242,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.07,7.07,7.07,2005,0.482784,21.079937,4.471005,7.5,True,False,933_12209,DT


In [86]:
df_filtered_targets_B_4 = df_filtered_targets_B[df_filtered_targets_B['max_phase'] == 4]
df_filtered_targets_B_4_D_DT = df_filtered_targets_B_4[df_filtered_targets_B_4['DTI'] == 'D_DT']
df_filtered_targets_B_4_D_DT.tid.nunique() 

178

In [87]:
targets_B_with_drug_MoA = df_filtered_targets_B_4_D_DT.tid.to_list()

In [88]:
#¢df_filtered_targets_B_max_phase = df_filtered_targets_B.groupby('tid', as_index=False)['max_phase'].max()

In [89]:
#df_filtered_targets_B_max_phase.tid.nunique()

In [90]:
#df_filtered_targets_B_max_phase.max_phase.value_counts()

In [91]:
#df_filtered_targets_B_max_phase_final = df_filtered_targets_B_max_phase[df_filtered_targets_B_max_phase['max_phase'] > 3]

In [92]:
#targets_B_with_drug = df_filtered_targets_B_max_phase_final.tid.to_list()

In [93]:
data_final_targets_B_with_drug = df_filtered_targets_B.query('tid in @targets_B_with_drug_MoA')

In [94]:
len(data_final_targets_B_with_drug)

360640

In [95]:
data_final_targets_B_with_drug.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,Target_pref_name,target_type,organism,Target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
7,933,2003,J. Med. Chem.,8.19,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,10193,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,CHEMBL261,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.34,8.22,7.49,2003,0.501221,21.88497,4.64175,7.77,True,False,933_10193,DT
8,933,2004,J. Med. Chem.,8.69,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,15,Carbonic anhydrase II,SINGLE PROTEIN,Homo sapiens,CHEMBL205,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,8.1625,8.7,8.305,2004,0.557387,24.337339,5.161892,8.5925,True,False,933_15,DT
11,933,2005,Bioorg. Med. Chem. Lett.,7.07,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,12209,Carbonic anhydrase XII,SINGLE PROTEIN,Homo sapiens,CHEMBL3242,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.07,7.07,7.07,2005,0.482784,21.079937,4.471005,7.5,True,False,933_12209,DT
28,933,2013,Bioorg. Med. Chem.,6.79,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,10193,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,CHEMBL261,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,7.34,8.22,7.49,2003,0.501221,21.88497,4.64175,7.77,True,False,933_10193,DT
29,933,2013,Bioorg. Med. Chem.,7.34,Ki,B,CHEMBL268439,,0,,-1,0,0,0,0,15,Carbonic anhydrase II,SINGLE PROTEIN,Homo sapiens,CHEMBL205,335.39,-0.43,8.0,3.0,158.13,...,ACID,335.39,2.0,20.0,0.64,334.9817,C8H9N5O4S3,9.0,5.0,0.0,"InChI=1S/C8H9N5O4S3/c9-5-1-3-6(4-2-5)20(16,17)13-7-11-12-8(18-7)19(10,14)15/h1-4H,9H2,(H,11,13)(...",BDLSLORLEPSOGW-UHFFFAOYSA-N,Nc1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,8.1625,8.7,8.305,2004,0.557387,24.337339,5.161892,8.5925,True,False,933_15,DT


In [96]:
data_final_targets_B_with_drug.DTI.value_counts()

DT      348471
Nan       6350
D_DT      5819
Name: DTI, dtype: int64

In [97]:
data_final_targets_B_with_drug.tid.nunique()

178