# Notebook to extract and curate ChEMBL data for the Leeson data set (drug-target interactions)

### Author: Barbara Zdrazil
### 02/09/2022

##### This notebook extracts data from ChEMBL and performs some curation steps in order to retrieve a data set for drug-target, and clinical candidate-target associations including comparator compounds.
##### The notebook is based on initial work by Anne Hersey, Patrica Bento, Emma Manners, Paul Leeson, and Andrew Leach..
##### More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes


In [8]:
import pandas as pd
import numpy as np
import re

#### notebook settings
pd.options.display.max_rows= 100
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

In [63]:
import cx_Oracle

#cx_Oracle.init_oracle_client(lib_dir="/Users/bzdrazil/Downloads/instantclient_19_8")  #https://www.oracle.com/in/database/technologies/instant-client/macos-intel-x86-downloads.html; https://stackoverflow.com/questions/56119490/cx-oracle-error-dpi-1047-cannot-locate-a-64-bit-oracle-client-library
cx_Oracle.clientversion() 

import pandas as pd
import sqlalchemy as sa
import requests

workdir = '/Users/bzdrazil/Desktop/'

chemdev2 = 'oracle://{}:{}@ora-dlvm-103.ebi.ac.uk:1521/?service_name=chemdev2'.format('user', 'pw') # insert your username and password
engine_ch = sa.create_engine(chemdev2)

In [33]:
sql = '''

select distinct mh.molregno, docs.year, docs.journal, act.pchembl_value,act.standard_type, ass.assay_type, md.chembl_id as compound_chembl_id, md.pref_name as compound_pref_name ,md.max_phase, md.first_approval, md.prodrug, md.oral, md.parenteral, md.topical, md.black_box_warning, ass.tid, td.pref_name as Target_pref_name, td.target_type, td.organism, td.chembl_id as Target_chembl_id
from chembl_31.docs, chembl_31.activities act,chembl_31.molecule_hierarchy mh, chembl_31.assays ass,chembl_31.target_dictionary td,chembl_31.molecule_dictionary md
where mh.molregno=act.molregno
and act.pchembl_value is not null
----and ass.assay_type ='B'
and act.assay_id=ass.assay_id
and act.doc_id = docs.doc_id
and ass.tid=td.tid
and md.molregno=mh.parent_molregno
and act.potential_duplicate =0
and data_validity_comment is null
and act.standard_relation ='='
and td.tid <>22226   ----exclude unchecked targets
and td.target_type like '%PROTEIN%'

'''

data = pd.read_sql_query(sql, con=engine_ch)
data

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id
0,587795,,,5.09,AC50,B,CHEMBL1197632,,0,,-1,0,0,0,0,103527,CAAX prenyl protease 2,SINGLE PROTEIN,Saccharomyces cerevisiae S288c,CHEMBL1250413
1,1678859,,,4.65,Potency,F,CHEMBL3303987,,0,,-1,0,0,0,0,103668,Prelamin-A/C,SINGLE PROTEIN,Homo sapiens,CHEMBL1293235
2,1569521,2013.0,Bioorg. Med. Chem. Lett.,5.23,Ki,B,CHEMBL3039672,,0,,-1,0,0,0,0,10627,Serotonin 6 (5-HT6) receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL3371
3,826507,,,4.85,Potency,F,CHEMBL1619667,,0,,-1,0,0,0,0,11130,Aldehyde dehydrogenase 1A1,SINGLE PROTEIN,Homo sapiens,CHEMBL3577
4,127054,2002.0,Bioorg. Med. Chem. Lett.,6.60,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595319,2439939,2016.0,Bioorg Med Chem Lett,7.43,IC50,B,CHEMBL4569508,,0,,-1,0,0,0,0,10434,Tyrosine-protein kinase SRC,SINGLE PROTEIN,Homo sapiens,CHEMBL267
2595320,2381789,2019.0,Eur J Med Chem,4.85,EC50,B,CHEMBL4474783,,0,,-1,0,0,0,0,100486,Toll-like receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL4163
2595321,1885873,2021.0,,4.14,IC50,F,CHEMBL3469799,,0,,-1,0,0,0,0,120046,Histidine--tRNA ligase,SINGLE PROTEIN,Leishmania infantum,CHEMBL4662927
2595322,2550446,2021.0,J Med Chem,8.82,Ki,B,CHEMBL4856675,,0,,-1,0,0,0,0,104803,"Hepatitis C virus serine protease, NS3/NS4A",PROTEIN COMPLEX,Hepatitis C virus,CHEMBL2095231


In [34]:
df = pd.DataFrame(data)

In [35]:
df.columns

Index(['molregno', 'year', 'journal', 'pchembl_value', 'standard_type',
       'assay_type', 'compound_chembl_id', 'compound_pref_name', 'max_phase',
       'first_approval', 'prodrug', 'oral', 'parenteral', 'topical',
       'organism', 'target_chembl_id'],
      dtype='object')

In [43]:
df.shape

(2595324, 20)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2595324 entries, 0 to 2595323
Data columns (total 20 columns):
 #   Column              Dtype  
---  ------              -----  
 0   molregno            int64  
 1   year                float64
 2   journal             object 
 3   pchembl_value       float64
 4   standard_type       object 
 5   assay_type          object 
 6   compound_chembl_id  object 
 7   compound_pref_name  object 
 8   max_phase           int64  
 9   first_approval      float64
 10  prodrug             int64  
 11  oral                int64  
 12  parenteral          int64  
 13  topical             int64  
 15  tid                 int64  
 16  target_pref_name    object 
 17  target_type         object 
 18  organism            object 
 19  target_chembl_id    object 
dtypes: float64(3), int64(8), object(9)
memory usage: 396.0+ MB


In [38]:
df = df.astype({
    'year': 'Int64',
    'first_approval': 'Int64'
})

In [39]:
df.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id
0,587795,,,5.09,AC50,B,CHEMBL1197632,,0,,-1,0,0,0,0,103527,CAAX prenyl protease 2,SINGLE PROTEIN,Saccharomyces cerevisiae S288c,CHEMBL1250413
1,1678859,,,4.65,Potency,F,CHEMBL3303987,,0,,-1,0,0,0,0,103668,Prelamin-A/C,SINGLE PROTEIN,Homo sapiens,CHEMBL1293235
2,1569521,2013.0,Bioorg. Med. Chem. Lett.,5.23,Ki,B,CHEMBL3039672,,0,,-1,0,0,0,0,10627,Serotonin 6 (5-HT6) receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL3371
3,826507,,,4.85,Potency,F,CHEMBL1619667,,0,,-1,0,0,0,0,11130,Aldehyde dehydrogenase 1A1,SINGLE PROTEIN,Homo sapiens,CHEMBL3577
4,127054,2002.0,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253


In [40]:
df.to_csv("/Users/bzdrazil/Dropbox/ChEMBL/NP/data/ChEMBL31_initial_query.csv")

In [24]:
sql_2 = '''

select cp.*, struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
from chembL_31.compound_properties cp,chembl_31.molecule_hierarchy mh, CHEMBL_31.compound_structures struct
where cp.molregno=mh.parent_molregno
and struct.molregno=mh.parent_molregno

'''

data_cpd = pd.read_sql_query(sql_2, con=engine_ch)
data_cpd

Exception during reset or similar
Traceback (most recent call last):
  File "/Users/bzdrazil/opt/anaconda3/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 682, in _finalize_fairy
    fairy._reset(pool)
  File "/Users/bzdrazil/opt/anaconda3/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 887, in _reset
    pool._dialect.do_rollback(self)
  File "/Users/bzdrazil/opt/anaconda3/lib/python3.9/site-packages/sqlalchemy/engine/default.py", line 667, in do_rollback
    dbapi_connection.rollback()
cx_Oracle.DatabaseError: DPI-1010: not connected


Unnamed: 0,molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,385336,2598.16,,,,,,,,,,,,,2598.16,,,,2596.0401,C124H154ClN21O39,,,,InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(170)132-59(5)107(165)146-98(70-34-45-84(155)78(125)5...,UYSXXKGACMHPIM-KFGDMSGDSA-N,CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC(=O)[C@@H]([C@@H](C)O)NC(=O)[C@H](c2ccc(O[C@H]3O[C...
1,385336,2598.16,,,,,,,,,,,,,2598.16,,,,2596.0401,C124H154ClN21O39,,,,InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(170)132-59(5)107(165)146-98(70-34-45-84(155)78(125)5...,UYSXXKGACMHPIM-KFGDMSGDSA-N,CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC(=O)[C@@H]([C@@H](C)O)NC(=O)[C@H](c2ccc(O[C@H]3O[C...
2,501019,233.22,2.04,3.0,2.0,79.39,3.0,N,0.0,3.63,,2.01,-1.32,ACID,233.22,2.0,17.0,0.79,233.0688,C12H11NO4,5.0,2.0,0.0,"InChI=1S/C12H11NO4/c1-2-17-12(16)10-6-8-5-7(11(14)15)3-4-9(8)13-10/h3-6,13H,2H2,1H3,(H,14,15)",CAVYPAYXEMVXMS-UHFFFAOYSA-N,CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1
3,504967,1023.22,,,,,,,,,,,,,1023.22,,,,1022.5662,C50H86O21,,,,InChI=1S/C50H86O21/c1-9-12-18-21-30-22-19-16-14-13-15-17-20-23-32(52)66-43-40(69-47-38(58)37(57)...,FKTZIXGUMCNOPB-MEGPWIHJSA-N,CCCCC[C@H]1CCCCCCCCCC(=O)O[C@@H]2[C@@H](O[C@@H]3O[C@H](C)[C@@H](OC(=O)C(C)CC)[C@H](O)[C@H]3O)[C@...
4,257456,263.03,1.41,2.0,2.0,49.33,1.0,Y,0.0,9.04,,1.75,1.74,NEUTRAL,263.03,1.0,11.0,0.46,262.9443,C7H6INO2,3.0,2.0,0.0,"InChI=1S/C7H6INO2/c8-6-3-1-5(2-4-6)7(10)9-11/h1-4,11H,(H,9,10)",HXIKIQLHKIBCOH-UHFFFAOYSA-N,O=C(NO)c1ccc(I)cc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220527,2334754,379.46,4.46,5.0,0.0,51.91,5.0,N,0.0,,8.31,3.44,2.48,NEUTRAL,379.46,3.0,28.0,0.65,379.1784,C23H25NO4,5.0,0.0,0.0,InChI=1S/C23H25NO4/c1-26-22-12-18-19(25)13-20(28-21(18)14-23(22)27-2)17-8-6-16(7-9-17)15-24-10-4...,LOYKCNXJSFGKJT-UHFFFAOYSA-N,COc1cc2oc(-c3ccc(CN4CCCCC4)cc3)cc(=O)c2cc1OC
2220528,2335601,3728.09,,,,,,,,,,,,,3728.09,,,,3725.8213,C167H252N42O55,,,,InChI=1S/C167H252N42O55/c1-13-14-15-16-17-18-19-20-21-22-23-24-31-47-125(219)186-109(165(263)264...,YEKUUBPJRPXMBM-PTCFZACGSA-N,CCCCCCCCCCCCCCCC(=O)N[C@@H](CCC(=O)NCCCC[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO)NC(=O)[C@@H](NC...
2220529,2335886,177.18,0.26,4.0,2.0,79.54,2.0,N,0.0,7.93,,0.01,-0.04,NEUTRAL,177.18,1.0,11.0,0.63,177.0096,C5H7NO4S,5.0,2.0,0.0,"InChI=1S/C5H7NO4S/c1-4-2-3-5(10-4)11(8,9)6-7/h2-3,6-7H,1H3",LIXKIXWSKOENAB-UHFFFAOYSA-N,Cc1ccc(S(=O)(=O)NO)o1
2220530,2334995,322.36,0.90,4.0,3.0,103.68,3.0,N,0.0,9.48,,0.45,0.45,NEUTRAL,322.36,1.0,23.0,0.57,322.1641,C15H22N4O4,8.0,3.0,0.0,"InChI=1S/C15H22N4O4/c1-19(15(22)17-11-6-3-2-4-7-11)18-14(21)13(20)16-10-12-8-5-9-23-12/h5,8-9,11...",RGCGYXFEKWXCJI-UHFFFAOYSA-N,CN(NC(=O)C(=O)NCc1ccco1)C(=O)NC1CCCCC1


In [25]:
df_cpd = pd.DataFrame(data_cpd)

In [47]:
df_cpd.molregno.nunique()

2194281

In [48]:
df_cpd_unique = df_cpd.drop_duplicates()

In [26]:
df_cpd.shape

(2220532, 26)

In [49]:
len(df_cpd_unique)

2194281

In [50]:
df_cpd_unique.to_csv("/Users/bzdrazil/Dropbox/ChEMBL/NP/data/ChEMBL31_cmpd_info.csv")

In [51]:
df_comb = df.merge(df_cpd_unique, on = 'molregno', how = 'inner')

In [52]:
df_comb.shape

(2483167, 45)

In [55]:
df_comb.to_csv("/Users/bzdrazil/Dropbox/ChEMBL/NP/data/ChEMBL31_with_cmpd.csv", sep = ';')

In [56]:
df_comb.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,127054,2002,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253,488.07,4.89,5.0,1.0,55.73,6.0,N,0.0,,6.82,4.48,4.38,NEUTRAL,488.07,2.0,34.0,0.63,487.2602,C27H38ClN3O3,6.0,1.0,0.0,"InChI=1S/C27H38ClN3O3/c1-17-22(24(32)29-25-26(2,3)18-6-7-27(25,4)16-18)20-14-19(28)15-21(33-5)23...",BOXMLADZYBZPKU-DYDBGPTBSA-N,COc1cc(Cl)cc2c(C(=O)N[C@@H]3C4(C)CCC(C4)C3(C)C)c(C)n(CCN3CCOCC3)c12
1,330600,2005,Bioorg. Med. Chem. Lett.,7.24,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10495,Cathepsin K,SINGLE PROTEIN,Homo sapiens,CHEMBL268,498.02,5.39,5.0,3.0,92.6,10.0,N,1.0,13.09,5.45,4.44,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1
2,330600,2005,Bioorg. Med. Chem. Lett.,8.03,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,11534,Cathepsin S,SINGLE PROTEIN,Homo sapiens,CHEMBL2954,498.02,5.39,5.0,3.0,92.6,10.0,N,1.0,13.09,5.45,4.44,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1
3,330600,2005,Bioorg. Med. Chem. Lett.,7.17,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10498,Cathepsin L,SINGLE PROTEIN,Homo sapiens,CHEMBL3837,498.02,5.39,5.0,3.0,92.6,10.0,N,1.0,13.09,5.45,4.44,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,4.0,N,1.0,11.49,4.96,4.43,4.43,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1


### Calculate mean, median, and max pChEMBL values for each compound-target pair

In [57]:
df_comb['pchembl_value_mean'] = df_comb.groupby(['molregno', 'tid'])['pchembl_value'].transform('mean')
df_comb['pchembl_value_max'] = df_comb.groupby(['molregno', 'tid'])['pchembl_value'].transform('max')
df_comb['pchembl_value_median'] = df_comb.groupby(['molregno', 'tid'])['pchembl_value'].transform('median')

In [58]:
df_comb['first_publication_target_cmpd_pair'] = df_comb.groupby(['molregno', 'tid'])['year'].transform('min')

In [60]:
df_comb.head(20)

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair
0,127054,2002,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253,488.07,4.89,5.0,1.0,55.73,6.0,N,0.0,,6.82,4.48,4.38,NEUTRAL,488.07,2.0,34.0,0.63,487.2602,C27H38ClN3O3,6.0,1.0,0.0,"InChI=1S/C27H38ClN3O3/c1-17-22(24(32)29-25-26(2,3)18-6-7-27(25,4)16-18)20-14-19(28)15-21(33-5)23...",BOXMLADZYBZPKU-DYDBGPTBSA-N,COc1cc(Cl)cc2c(C(=O)N[C@@H]3C4(C)CCC(C4)C3(C)C)c(C)n(CCN3CCOCC3)c12,6.6,6.6,6.6,2002
1,330600,2005,Bioorg. Med. Chem. Lett.,7.24,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10495,Cathepsin K,SINGLE PROTEIN,Homo sapiens,CHEMBL268,498.02,5.39,5.0,3.0,92.6,10.0,N,1.0,13.09,5.45,4.44,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.24,7.24,7.24,2005
2,330600,2005,Bioorg. Med. Chem. Lett.,8.03,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,11534,Cathepsin S,SINGLE PROTEIN,Homo sapiens,CHEMBL2954,498.02,5.39,5.0,3.0,92.6,10.0,N,1.0,13.09,5.45,4.44,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,8.03,8.03,8.03,2005
3,330600,2005,Bioorg. Med. Chem. Lett.,7.17,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10498,Cathepsin L,SINGLE PROTEIN,Homo sapiens,CHEMBL3837,498.02,5.39,5.0,3.0,92.6,10.0,N,1.0,13.09,5.45,4.44,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.17,7.17,7.17,2005
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,4.0,N,1.0,11.49,4.96,4.43,4.43,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007
5,383868,2007,Bioorg. Med. Chem. Lett.,9.0,IC50,F,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,4.0,N,1.0,11.49,4.96,4.43,4.43,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007
6,229816,2000,J. Med. Chem.,9.22,IC50,B,CHEMBL342365,,0,,-1,0,0,0,0,191,Human immunodeficiency virus type 1 protease,SINGLE PROTEIN,Human immunodeficiency virus 1,CHEMBL243,634.64,5.2,5.0,4.0,107.89,10.0,N,2.0,11.41,,5.76,5.76,NEUTRAL,634.64,3.0,45.0,0.14,634.2466,C33H35F5N2O5,7.0,4.0,2.0,"InChI=1S/C33H35F5N2O5/c1-33(2,3)45-32(44)39-22(13-17-9-5-4-6-10-17)23(41)16-19(14-21-25(34)27(36...",LUXWCCSCNDRTSF-PWSDHMJJSA-N,CC(C)(C)OC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)C[C@@H](Cc1c(F)c(F)c(F)c(F)c1F)C(=O)NC1c2ccccc2C[C@@H]1O,9.22,9.22,9.22,2000
7,416145,2007,Bioorg. Med. Chem. Lett.,9.1,IC50,B,CHEMBL250035,,0,,-1,0,0,0,0,12967,Serine/threonine-protein kinase Chk1,SINGLE PROTEIN,Homo sapiens,CHEMBL4630,428.92,5.28,3.0,3.0,87.56,5.0,N,1.0,12.21,10.04,4.16,1.66,BASE,428.92,5.0,31.0,0.33,428.1404,C25H21ClN4O,5.0,4.0,1.0,InChI=1S/C25H21ClN4O/c26-23-6-2-1-5-19(23)22-14-28-25(31)24-20-10-15(17-12-29-30-13-17)7-8-18(20...,ZOLFLTIRMGPHQN-UHFFFAOYSA-N,NCCCc1cc2c(-c3ccccc3Cl)c[nH]c(=O)c2c2cc(-c3cn[nH]c3)ccc12,9.1,9.1,9.1,2007
8,270216,1996,Bioorg. Med. Chem. Lett.,6.67,IC50,B,CHEMBL351243,,0,,-1,0,0,0,0,191,Human immunodeficiency virus type 1 protease,SINGLE PROTEIN,Human immunodeficiency virus 1,CHEMBL243,543.73,7.39,5.0,1.0,66.84,11.0,N,2.0,3.23,,7.42,3.98,ACID,543.73,3.0,39.0,0.27,543.2443,C33H37NO4S,5.0,1.0,2.0,"InChI=1S/C33H37NO4S/c1-24(2)27-17-10-11-18-29(27)39-31-28(35)23-33(38-32(31)37,26-15-8-5-9-16-26...",OPKZYZCPYWWXHX-UHFFFAOYSA-N,CC(C)c1ccccc1SC1=C(O)OC(CCCC(=O)N(C)CCc2ccccc2)(c2ccccc2)CC1=O,6.67,6.67,6.67,1996
9,18809,2000,J. Med. Chem.,7.62,Ki,F,CHEMBL275605,,0,,-1,0,0,0,0,252,Adenosine A2a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL251,475.55,3.07,7.0,2.0,111.01,10.0,N,0.0,7.75,1.78,3.44,3.3,NEUTRAL,475.55,4.0,35.0,0.37,475.222,C26H29N5O4,9.0,2.0,0.0,InChI=1S/C26H29N5O4/c1-3-14-30-24-22(25(33)31(15-4-2)26(30)34)28-23(29-24)19-10-12-20(13-11-19)3...,BZAIRJUILWPFJR-UHFFFAOYSA-N,CCCn1c(=O)c2nc(-c3ccc(OCC(=O)NCc4ccccc4)cc3)[nH]c2n(CCC)c1=O,7.66,7.7,7.66,2000


### Calculate LE metrics

Ligand Efficiency (LE):

LE=ΔG/HA
where ΔG = − RTlnKd, − RTlnKi, or − RTln(IC50)

LE=(2.303*298*0.00199*pchembl_value)/heavy_atoms


BEI=pchembl_mean*1000/mw_freebase

SEI=pchembl_mean*100/PSA

LLE=pchembl_mean-ALOGP

In [61]:
df_comb['LE'] = df_comb['pchembl_value_mean']/df_comb['heavy_atoms']*(2.303*298*0.00199)
df_comb['BEI'] = df_comb['pchembl_value_mean']*1000/df_comb["mw_freebase"]
df_comb['SEI'] = df_comb['pchembl_value_mean']*100/df_comb["psa"]
df_comb['LLE'] = df_comb['pchembl_value_mean']-df_comb["alogp"]

### Extract drug-target interactions with disease relevance from drug_mechanism table

In [64]:
sql_dti = '''
select distinct molregno, mechanism_of_action, tid, disease_efficacy  from chembl_31.drug_mechanism
'''

data_dti = pd.read_sql_query(sql_dti, con=engine_ch)
data_dti

Unnamed: 0,molregno,mechanism_of_action,tid,disease_efficacy
0,1926938,Inhibitor of nuclear factor kappa B kinase beta subunit inhibitor,10752.0,1
1,1927181,Neurotrophic tyrosine kinase receptor inhibitor,109739.0,1
2,699450,Muscle-type nicotinic acetylcholine receptor antagonist,105709.0,1
3,1926930,Dual specificity mitogen-activated protein kinase kinase 2 inhibitor,11046.0,1
4,1927096,PI3-kinase p110-delta subunit inhibitor,11177.0,1
...,...,...,...,...
6635,1927119,Metabotropic glutamate receptor 3 agonist,10167.0,1
6636,2096381,Alpha-synuclein inhibitor,102780.0,1
6637,2089491,Tyrosine-protein kinase receptor FLT3 inhibitor,12670.0,1
6638,10840,Glutathione reductase inhibitor,12642.0,1


In [65]:
len(data_dti)

6640

In [66]:
data_dti_all_tids = data_dti[data_dti['disease_efficacy'] == 1]

In [67]:
len(data_dti_all_tids)

6638

In [68]:
data_dti_all_tids.dtypes

molregno                 int64
mechanism_of_action     object
tid                    float64
disease_efficacy         int64
dtype: object

In [69]:
data_dti_all_tids['tid'] = data_dti_all_tids['tid'].astype('Int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dti_all_tids['tid'] = data_dti_all_tids['tid'].astype('Int64')


In [70]:
data_dti_all_tids.dtypes

molregno                int64
mechanism_of_action    object
tid                     Int64
disease_efficacy        int64
dtype: object

In [71]:
list_dti_tid = data_dti_all_tids['tid'].to_list()
list_dti = data_dti['molregno'].to_list()

Identify which TIDs are drug targets from the drug_mechanism table and add field called "defined_DTI": 
    Value: "True" if it is a drug with a curated drug_meachnism and "False" if not.

Map again to the drug mechanism table via "tid" to identify therapeutic targets and add column "Therapeutic_Target":
    Value: "True" | "False"

Use this logic to define per compound/target pair whether it corresponds to:
    
    drug and its therapeutic target "DTI": "D_DT";
        
    a drug target but not a drug "DTI": "DT";
        
    not a drug and not a drug target "DTI": "NDT";
        
(done by first creating a new column "DT_assoc" in both the master and the mapping table which reflects the molregno-tid association and then mapping it)

In [72]:
df_comb['Therapeutic_Target'] = df_comb['tid'].isin(list_dti_tid)

In [73]:
df_comb['defined_DTI'] = df_comb['molregno'].isin(list_dti)

In [74]:
data_dti_all_tids['DT_assoc'] = data_dti_all_tids.agg('{0[molregno]}_{0[tid]}'.format, axis=1)
data_dti_all_tids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dti_all_tids['DT_assoc'] = data_dti_all_tids.agg('{0[molregno]}_{0[tid]}'.format, axis=1)


Unnamed: 0,molregno,mechanism_of_action,tid,disease_efficacy,DT_assoc
0,1926938,Inhibitor of nuclear factor kappa B kinase beta subunit inhibitor,10752,1,1926938_10752
1,1927181,Neurotrophic tyrosine kinase receptor inhibitor,109739,1,1927181_109739
2,699450,Muscle-type nicotinic acetylcholine receptor antagonist,105709,1,699450_105709
3,1926930,Dual specificity mitogen-activated protein kinase kinase 2 inhibitor,11046,1,1926930_11046
4,1927096,PI3-kinase p110-delta subunit inhibitor,11177,1,1927096_11177
...,...,...,...,...,...
6635,1927119,Metabotropic glutamate receptor 3 agonist,10167,1,1927119_10167
6636,2096381,Alpha-synuclein inhibitor,102780,1,2096381_102780
6637,2089491,Tyrosine-protein kinase receptor FLT3 inhibitor,12670,1,2089491_12670
6638,10840,Glutathione reductase inhibitor,12642,1,10840_12642


In [75]:
DTI_list = data_dti_all_tids['DT_assoc'].to_list()

In [76]:
df_comb['DT_assoc'] = df_comb.agg('{0[molregno]}_{0[tid]}'.format, axis=1)
df_comb.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc
0,127054,2002,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253,488.07,4.89,5.0,1.0,55.73,...,4.38,NEUTRAL,488.07,2.0,34.0,0.63,487.2602,C27H38ClN3O3,6.0,1.0,0.0,"InChI=1S/C27H38ClN3O3/c1-17-22(24(32)29-25-26(2,3)18-6-7-27(25,4)16-18)20-14-19(28)15-21(33-5)23...",BOXMLADZYBZPKU-DYDBGPTBSA-N,COc1cc(Cl)cc2c(C(=O)N[C@@H]3C4(C)CCC(C4)C3(C)C)c(C)n(CCN3CCOCC3)c12,6.6,6.6,6.6,2002,0.265111,13.52265,11.842814,1.71,True,False,127054_259
1,330600,2005,Bioorg. Med. Chem. Lett.,7.24,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10495,Cathepsin K,SINGLE PROTEIN,Homo sapiens,CHEMBL268,498.02,5.39,5.0,3.0,92.6,...,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.24,7.24,7.24,2005,0.28251,14.537569,7.818575,1.85,True,False,330600_10495
2,330600,2005,Bioorg. Med. Chem. Lett.,8.03,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,11534,Cathepsin S,SINGLE PROTEIN,Homo sapiens,CHEMBL2954,498.02,5.39,5.0,3.0,92.6,...,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,8.03,8.03,8.03,2005,0.313336,16.12385,8.671706,2.64,True,False,330600_11534
3,330600,2005,Bioorg. Med. Chem. Lett.,7.17,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10498,Cathepsin L,SINGLE PROTEIN,Homo sapiens,CHEMBL3837,498.02,5.39,5.0,3.0,92.6,...,4.43,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.17,7.17,7.17,2005,0.279779,14.397012,7.742981,1.78,False,False,330600_10498
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,...,4.43,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007,0.353847,18.893357,8.079758,2.6,True,False,383868_10980


In [77]:
df_comb['DTI'] = "Nan"
df_comb['DTI'][df_comb['DT_assoc'].isin(DTI_list)] = "D_DT"
df_comb['DTI'][(df_comb['Therapeutic_Target']== True) & (df_comb['defined_DTI']== False)] = "DT"
df_comb['DTI'][(df_comb['Therapeutic_Target']== False) & (df_comb['defined_DTI']== False)] = "NDT"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comb['DTI'][df_comb['DT_assoc'].isin(DTI_list)] = "D_DT"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comb['DTI'][(df_comb['Therapeutic_Target']== True) & (df_comb['defined_DTI']== False)] = "DT"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comb['DTI'][(df_comb['Therapeutic_Target']== False) & (df_comb['defined_DTI']== False)] = "NDT"


In [78]:
df_comb.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
0,127054,2002,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253,488.07,4.89,5.0,1.0,55.73,...,NEUTRAL,488.07,2.0,34.0,0.63,487.2602,C27H38ClN3O3,6.0,1.0,0.0,"InChI=1S/C27H38ClN3O3/c1-17-22(24(32)29-25-26(2,3)18-6-7-27(25,4)16-18)20-14-19(28)15-21(33-5)23...",BOXMLADZYBZPKU-DYDBGPTBSA-N,COc1cc(Cl)cc2c(C(=O)N[C@@H]3C4(C)CCC(C4)C3(C)C)c(C)n(CCN3CCOCC3)c12,6.6,6.6,6.6,2002,0.265111,13.52265,11.842814,1.71,True,False,127054_259,DT
1,330600,2005,Bioorg. Med. Chem. Lett.,7.24,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10495,Cathepsin K,SINGLE PROTEIN,Homo sapiens,CHEMBL268,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.24,7.24,7.24,2005,0.28251,14.537569,7.818575,1.85,True,False,330600_10495,DT
2,330600,2005,Bioorg. Med. Chem. Lett.,8.03,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,11534,Cathepsin S,SINGLE PROTEIN,Homo sapiens,CHEMBL2954,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,8.03,8.03,8.03,2005,0.313336,16.12385,8.671706,2.64,True,False,330600_11534,DT
3,330600,2005,Bioorg. Med. Chem. Lett.,7.17,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10498,Cathepsin L,SINGLE PROTEIN,Homo sapiens,CHEMBL3837,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.17,7.17,7.17,2005,0.279779,14.397012,7.742981,1.78,False,False,330600_10498,NDT
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,...,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007,0.353847,18.893357,8.079758,2.6,True,False,383868_10980,DT


In [79]:
df_comb['DTI'].value_counts()

NDT     1351251
DT      1086498
Nan       37134
D_DT       8284
Name: DTI, dtype: int64

In [80]:
df_comb.max_phase.value_counts()

0    2409005
4      46479
2      11555
3       9936
1       6192
Name: max_phase, dtype: int64

In [81]:
df_4 = df_comb[df_comb['max_phase'] == 4]

In [82]:
df_4.DTI.value_counts()

Nan     22667
DT       9743
NDT      9188
D_DT     4881
Name: DTI, dtype: int64

In [83]:
df_4_D_DT = df_4[df_4['DTI'] == 'D_DT']

In [84]:
df_4_D_DT.tid.nunique()  #218 unique targets with annotated drugs

226

In [85]:
df_3 = df_comb[df_comb['max_phase'] == 3]

In [86]:
df_3.DTI.value_counts()

Nan     5170
NDT     1815
DT      1645
D_DT    1306
Name: DTI, dtype: int64

In [87]:
df_3_D_DT = df_3[df_3['DTI'] == 'D_DT']

In [88]:
df_3_D_DT.tid.nunique()   #176 unique targets with clincal candidates in phase 3

181

In [89]:
df_2 = df_comb[df_comb['max_phase'] == 2]

In [90]:
df_2.DTI.value_counts()   

Nan     6031
NDT     2160
DT      1707
D_DT    1657
Name: DTI, dtype: int64

In [91]:
df_2_D_DT = df_2[df_2['DTI'] == 'D_DT']

In [92]:
df_2_D_DT.tid.nunique()   #221 unique targets with clincal candidates in phase 2

229

In [93]:
df_1 = df_comb[df_comb['max_phase'] == 1]

In [94]:
df_1.DTI.value_counts()

Nan     3177
NDT     1335
DT      1271
D_DT     409
Name: DTI, dtype: int64

In [95]:
df_1_D_DT = df_1[df_1['DTI'] == 'D_DT']

In [96]:
df_1_D_DT.tid.nunique()     #71 unique targets with clincal candidates in phase 1

72

In [98]:
df_comb.tid.nunique()  # No of targets all together

6710

### Filter for targets with at least 100 comparator compounds

In [99]:
df_rest = df_comb[df_comb['max_phase'] == 0]

In [100]:
df_rest.tid.nunique()

6456

In [102]:
df_drugs_clin_cand = df_comb[df_comb['max_phase'] != 0]

In [103]:
df_drugs_clin_cand.tid.nunique()

3340

In [104]:
s = df_rest.groupby(['tid'])['molregno'].count()

In [105]:
s

tid
1          137
2           52
3         1872
4          442
6         1161
          ... 
120523      91
120525       9
120526       9
120529       5
120530       1
Name: molregno, Length: 6456, dtype: int64

In [106]:
len(s)

6456

In [107]:
list = s[s >= 100].index.tolist()

In [108]:
len(list)  # No of targets with at least 100 comparator compounds

1689

In [None]:
#df.to_csv("/Users/bzdrazil/Dropbox/ChEMBL/NP/data/ChEMBL31_DTI.tsv")

In [109]:
df_filtered_targets = df_comb.query('tid in @list')

In [110]:
len(df_filtered_targets)

2378379

In [111]:
df_filtered_targets.tid.nunique()

1689

In [112]:
df_filtered_targets.max_phase.value_counts()

0    2314685
4      40500
2       9734
3       8407
1       5053
Name: max_phase, dtype: int64

In [114]:
df_filtered_targets.to_csv("/Users/bzdrazil/Dropbox/ChEMBL/NP/data/ChEMBL31_DTI_filtered_targets.csv", sep = ";")

In [115]:
df_filtered_targets_4 = df_filtered_targets[df_filtered_targets['max_phase'] == 4]
df_filtered_targets_4_D_DT = df_filtered_targets_4[df_filtered_targets_4['DTI'] == 'D_DT']
df_filtered_targets_4_D_DT.tid.nunique() 

193

In [116]:
df_filtered_targets_3 = df_filtered_targets[df_filtered_targets['max_phase'] == 3]
df_filtered_targets_3_D_DT = df_filtered_targets_3[df_filtered_targets_3['DTI'] == 'D_DT']
df_filtered_targets_3_D_DT.tid.nunique() 

164

In [117]:
df_filtered_targets_2 = df_filtered_targets[df_filtered_targets['max_phase'] == 2]
df_filtered_targets_2_D_DT = df_filtered_targets_2[df_filtered_targets_2['DTI'] == 'D_DT']
df_filtered_targets_2_D_DT.tid.nunique() 

220

In [118]:
df_filtered_targets_1 = df_filtered_targets[df_filtered_targets['max_phase'] == 1]
df_filtered_targets_1_D_DT = df_filtered_targets_1[df_filtered_targets_1['DTI'] == 'D_DT']
df_filtered_targets_1_D_DT.tid.nunique() 

70

In [119]:
df_filtered_targets.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
0,127054,2002,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253,488.07,4.89,5.0,1.0,55.73,...,NEUTRAL,488.07,2.0,34.0,0.63,487.2602,C27H38ClN3O3,6.0,1.0,0.0,"InChI=1S/C27H38ClN3O3/c1-17-22(24(32)29-25-26(2,3)18-6-7-27(25,4)16-18)20-14-19(28)15-21(33-5)23...",BOXMLADZYBZPKU-DYDBGPTBSA-N,COc1cc(Cl)cc2c(C(=O)N[C@@H]3C4(C)CCC(C4)C3(C)C)c(C)n(CCN3CCOCC3)c12,6.6,6.6,6.6,2002,0.265111,13.52265,11.842814,1.71,True,False,127054_259,DT
1,330600,2005,Bioorg. Med. Chem. Lett.,7.24,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10495,Cathepsin K,SINGLE PROTEIN,Homo sapiens,CHEMBL268,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.24,7.24,7.24,2005,0.28251,14.537569,7.818575,1.85,True,False,330600_10495,DT
2,330600,2005,Bioorg. Med. Chem. Lett.,8.03,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,11534,Cathepsin S,SINGLE PROTEIN,Homo sapiens,CHEMBL2954,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,8.03,8.03,8.03,2005,0.313336,16.12385,8.671706,2.64,True,False,330600_11534,DT
3,330600,2005,Bioorg. Med. Chem. Lett.,7.17,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10498,Cathepsin L,SINGLE PROTEIN,Homo sapiens,CHEMBL3837,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.17,7.17,7.17,2005,0.279779,14.397012,7.742981,1.78,False,False,330600_10498,NDT
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,...,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007,0.353847,18.893357,8.079758,2.6,True,False,383868_10980,DT


In [124]:
df_filtered_targets.assay_type.value_counts()

B    1224799
F    1113524
A      39931
T         63
U         62
Name: assay_type, dtype: int64

## Only B assays: filter for at least 100 comparator compounds

In [141]:
df_comb_B = df_comb[df_comb['assay_type'] == 'B']

In [142]:
df_rest_B = df_comb_B[df_comb_B['max_phase'] < 4]

In [143]:
s = df_rest_B.groupby(['tid'])['molregno'].count()

In [144]:
len(s)

6224

In [145]:
list = s[s >= 100].index.tolist()

In [146]:
len(list)

1463

In [147]:
df_filtered_targets_B = df_comb_B.query('tid in @list')

In [148]:
len(df_filtered_targets_B)

1216863

In [149]:
df_filtered_targets_B.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
0,127054,2002,Bioorg. Med. Chem. Lett.,6.6,Ki,B,CHEMBL312093,,0,,-1,0,0,0,0,259,Cannabinoid CB2 receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL253,488.07,4.89,5.0,1.0,55.73,...,NEUTRAL,488.07,2.0,34.0,0.63,487.2602,C27H38ClN3O3,6.0,1.0,0.0,"InChI=1S/C27H38ClN3O3/c1-17-22(24(32)29-25-26(2,3)18-6-7-27(25,4)16-18)20-14-19(28)15-21(33-5)23...",BOXMLADZYBZPKU-DYDBGPTBSA-N,COc1cc(Cl)cc2c(C(=O)N[C@@H]3C4(C)CCC(C4)C3(C)C)c(C)n(CCN3CCOCC3)c12,6.6,6.6,6.6,2002,0.265111,13.52265,11.842814,1.71,True,False,127054_259,DT
1,330600,2005,Bioorg. Med. Chem. Lett.,7.24,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10495,Cathepsin K,SINGLE PROTEIN,Homo sapiens,CHEMBL268,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.24,7.24,7.24,2005,0.28251,14.537569,7.818575,1.85,True,False,330600_10495,DT
2,330600,2005,Bioorg. Med. Chem. Lett.,8.03,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,11534,Cathepsin S,SINGLE PROTEIN,Homo sapiens,CHEMBL2954,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,8.03,8.03,8.03,2005,0.313336,16.12385,8.671706,2.64,True,False,330600_11534,DT
3,330600,2005,Bioorg. Med. Chem. Lett.,7.17,Ki,B,CHEMBL198420,,0,,-1,0,0,0,0,10498,Cathepsin L,SINGLE PROTEIN,Homo sapiens,CHEMBL3837,498.02,5.39,5.0,3.0,92.6,...,NEUTRAL,498.02,3.0,35.0,0.33,497.2081,C27H32ClN3O4,7.0,3.0,1.0,InChI=1S/C27H32ClN3O4/c1-34-22-10-8-21(9-11-22)29-13-14-30-26(32)23(15-18-5-3-2-4-6-18)31-27(33)...,JWJMPDHKYJTJFE-QHCPKHFHSA-N,COc1ccc(NCCNC(=O)[C@H](CC2CCCCC2)NC(=O)c2cc3cc(Cl)ccc3o2)cc1,7.17,7.17,7.17,2005,0.279779,14.397012,7.742981,1.78,False,False,330600_10498,NDT
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,...,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007,0.353847,18.893357,8.079758,2.6,True,False,383868_10980,DT


In [138]:
df_filtered_targets_B_4 = df_filtered_targets_B[df_filtered_targets_B['max_phase'] == 4]
df_filtered_targets_B_4_D_DT = df_filtered_targets_B_4[df_filtered_targets_B_4['DTI'] == 'D_DT']
df_filtered_targets_B_4_D_DT.tid.nunique() 

177

In [160]:
targets_B_with_drug_MoA = df_filtered_targets_B_4_D_DT.tid.to_list()

In [150]:
#¢df_filtered_targets_B_max_phase = df_filtered_targets_B.groupby('tid', as_index=False)['max_phase'].max()

In [151]:
#df_filtered_targets_B_max_phase.tid.nunique()

1463

In [152]:
#df_filtered_targets_B_max_phase.max_phase.value_counts()

4    997
0    247
3    120
2     67
1     32
Name: max_phase, dtype: int64

In [153]:
#df_filtered_targets_B_max_phase_final = df_filtered_targets_B_max_phase[df_filtered_targets_B_max_phase['max_phase'] > 3]

In [156]:
#targets_B_with_drug = df_filtered_targets_B_max_phase_final.tid.to_list()

In [161]:
data_final_targets_B_with_drug = df_filtered_targets_B.query('tid in @targets_B_with_drug_MoA')

In [162]:
len(data_final_targets_B_with_drug)

360521

In [163]:
data_final_targets_B_with_drug.head()

Unnamed: 0,molregno,year,journal,pchembl_value,standard_type,assay_type,compound_chembl_id,compound_pref_name,max_phase,first_approval,prodrug,oral,parenteral,topical,black_box_warning,tid,target_pref_name,target_type,organism,target_chembl_id,mw_freebase,alogp,hba,hbd,psa,...,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,LE,BEI,SEI,LLE,Therapeutic_Target,defined_DTI,DT_assoc,DTI
4,383868,2007,Bioorg. Med. Chem. Lett.,8.1,IC50,B,CHEMBL231127,,0,,-1,0,0,0,0,10980,Vascular endothelial growth factor receptor 2,SINGLE PROTEIN,Homo sapiens,CHEMBL279,452.54,5.95,6.0,3.0,105.82,...,NEUTRAL,452.54,5.0,33.0,0.31,452.1419,C25H20N6OS,7.0,4.0,1.0,InChI=1S/C25H20N6OS/c1-15-3-2-4-19(9-15)31-25(32)30-18-7-5-16(6-8-18)21-13-33-23-20(12-29-24(26)...,SBXVZMIZJGLIRS-UHFFFAOYSA-N,Cc1cccc(NC(=O)Nc2ccc(-c3csc4c(-c5cncnc5)cnc(N)c34)cc2)c1,8.55,9.0,8.55,2007,0.353847,18.893357,8.079758,2.6,True,False,383868_10980,DT
6,229816,2000,J. Med. Chem.,9.22,IC50,B,CHEMBL342365,,0,,-1,0,0,0,0,191,Human immunodeficiency virus type 1 protease,SINGLE PROTEIN,Human immunodeficiency virus 1,CHEMBL243,634.64,5.2,5.0,4.0,107.89,...,NEUTRAL,634.64,3.0,45.0,0.14,634.2466,C33H35F5N2O5,7.0,4.0,2.0,"InChI=1S/C33H35F5N2O5/c1-33(2,3)45-32(44)39-22(13-17-9-5-4-6-10-17)23(41)16-19(14-21-25(34)27(36...",LUXWCCSCNDRTSF-PWSDHMJJSA-N,CC(C)(C)OC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)C[C@@H](Cc1c(F)c(F)c(F)c(F)c1F)C(=O)NC1c2ccccc2C[C@@H]1O,9.22,9.22,9.22,2000,0.279822,14.527921,8.545741,4.02,True,False,229816_191,DT
8,270216,1996,Bioorg. Med. Chem. Lett.,6.67,IC50,B,CHEMBL351243,,0,,-1,0,0,0,0,191,Human immunodeficiency virus type 1 protease,SINGLE PROTEIN,Human immunodeficiency virus 1,CHEMBL243,543.73,7.39,5.0,1.0,66.84,...,ACID,543.73,3.0,39.0,0.27,543.2443,C33H37NO4S,5.0,1.0,2.0,"InChI=1S/C33H37NO4S/c1-24(2)27-17-10-11-18-29(27)39-31-28(35)23-33(38-32(31)37,26-15-8-5-9-16-26...",OPKZYZCPYWWXHX-UHFFFAOYSA-N,CC(C)c1ccccc1SC1=C(O)OC(CCCC(=O)N(C)CCc2ccccc2)(c2ccccc2)CC1=O,6.67,6.67,6.67,1996,0.233574,12.267118,9.979054,-0.72,True,False,270216_191,DT
18,18809,2006,J. Med. Chem.,7.7,Ki,B,CHEMBL275605,,0,,-1,0,0,0,0,252,Adenosine A2a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL251,475.55,3.07,7.0,2.0,111.01,...,NEUTRAL,475.55,4.0,35.0,0.37,475.222,C26H29N5O4,9.0,2.0,0.0,InChI=1S/C26H29N5O4/c1-3-14-30-24-22(25(33)31(15-4-2)26(30)34)28-23(29-24)19-10-12-20(13-11-19)3...,BZAIRJUILWPFJR-UHFFFAOYSA-N,CCCn1c(=O)c2nc(-c3ccc(OCC(=O)NCc4ccccc4)cc3)[nH]c2n(CCC)c1=O,7.66,7.7,7.66,2000,0.298899,16.107665,6.900279,4.59,True,False,18809_252,DT
20,1595472,1989,J. Med. Chem.,4.7,IC50,B,CHEMBL3085581,,0,,-1,0,0,0,0,104770,Sodium/potassium-transporting ATPase,PROTEIN COMPLEX GROUP,Homo sapiens,CHEMBL2095186,360.54,4.55,3.0,1.0,54.37,...,NEUTRAL,360.54,0.0,26.0,0.79,360.2664,C23H36O3,3.0,1.0,0.0,"InChI=1S/C23H36O3/c1-14(24)16-7-10-21(3)17(13-16)5-6-20-19(21)8-11-22(4)18(15(2)25)9-12-23(20,22...",UOFFWLNONNOOOK-KAJOVLNSSA-N,CC(=O)[C@H]1CC[C@@]2(C)[C@@H](CC[C@@H]3[C@@H]2CC[C@]2(C)[C@@H](C(C)=O)CC[C@@]32O)C1,4.7,4.7,4.7,1989,0.246881,13.036002,8.644473,0.15,True,False,1595472_104770,DT


In [164]:
data_final_targets_B_with_drug.DTI.value_counts()

DT      348365
Nan       6350
D_DT      5806
Name: DTI, dtype: int64

In [166]:
data_final_targets_B_with_drug.tid.nunique()

177