# Retrieve ChEMBL data

The purpose of this notebook is to retrieve from ChEMBL all compounds with bioactivity data for the human Ether-à-go-go-Related Gene (hERG) potassium channel, for use in model training.

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import rdDepictor

# Custom drawing options for RDKit molecules.
# NOTE(review): `opts` is not passed to any drawing call in the cells visible
# here — confirm it is actually applied somewhere (e.g. to the drawer in use).
opts = Draw.MolDrawOptions()
# Switch to the black-and-white atom palette FIRST: useBWAtomPalette() replaces
# the whole palette, so calling it after updateAtomPalette() (as before) threw
# away the nitrogen colour set below.
opts.useBWAtomPalette()
# lighter blue for nitrogen (atomic number 7), as an RGB tuple
opts.updateAtomPalette({7: (0.4, 0.4, 1)})
# NOTE(review): the original comment here said "transparent background", but
# clearBackground = True appears to ask RDKit to clear (fill) the canvas before
# drawing; a transparent background is usually obtained with False — confirm
# which behaviour is intended. Value left unchanged.
opts.clearBackground = True
# Prefer the CoordGen library when generating 2D depiction coordinates.
rdDepictor.SetPreferCoordGen(True)

import math
from pathlib import Path
import pathlib
from zipfile import ZipFile
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm
from utils.chembl_utils import retrieve_chembl_bioactivities, cleanup_bioactivities_df, retrieve_compound_data

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version


Protein and data variables.

In [2]:
# `target` is the short label used to name the output folder and CSV file;
# `uniprot` is the UniProt accession passed to the ChEMBL bioactivity query.
target, uniprot = 'herg', 'Q12809'

In [3]:
# Working directory of the notebook and the per-target data folder beneath it.
HERE = Path.cwd()
DATA = HERE.joinpath(f"data_{target}")
# Create the data folder (and any missing parents); no error if it already exists.
DATA.mkdir(parents=True, exist_ok=True)

### Download ligands and bioactivity data from ChEMBL.

In [6]:
# Fetch the bioactivity records for the UniProt accession, restricted to
# IC50 readouts with relation '=' and assay type 'B'.
# This is a slow network call (several minutes in the recorded run).
bioactivities_df = retrieve_chembl_bioactivities(
    uniprot,
    readout='IC50',
    relation='=',
    assay_type='B',
)
# Preview the first rows of the downloaded table.
bioactivities_df.head()

The target ChEMBL ID is CHEMBL240
Retrieving bioactivities for  Q12809
Length and type of bioactivities object: 7028, <class 'chembl_webresource_client.query_set.QuerySet'>
Length and type of first element: 13, <class 'dict'>


100%|██████████| 7028/7028 [07:43<00:00, 15.17it/s]

DataFrame shape: (7028, 13)





Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,753688,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL443476,=,nM,88.0,CHEMBL240,Homo sapiens,IC50,nM,88.0
1,754652,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL53661,=,nM,137.0,CHEMBL240,Homo sapiens,IC50,nM,137.0
2,754653,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL299390,=,nM,1480.0,CHEMBL240,Homo sapiens,IC50,nM,1480.0
3,755788,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL12186,=,nM,10.0,CHEMBL240,Homo sapiens,IC50,nM,10.0
4,755790,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL556312,=,nM,23.5,CHEMBL240,Homo sapiens,IC50,nM,23.5


In [7]:
# Run the project clean-up helper on the raw download. Per the recorded output
# it keeps only nM entries and drops duplicate molecules.
bioactivities_after_cleanup = cleanup_bioactivities_df(
    bioactivities_df
)
# Preview the cleaned table.
bioactivities_after_cleanup.head()

DataFrame shape: (7028, 13)
Units in downloaded data: ['nM' 'ug.mL-1']
Number of non-nM entries:        9
Units after filtering: ['nM']
DataFrame shape after filtering: (7019, 13)
DataFrame shape after removing duplicates: (6365, 13)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bioactivities_df.drop_duplicates("molecule_chembl_id", keep="first", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bioactivities_df.rename(


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,units,IC50,target_chembl_id,target_organism,type,units.1,value
0,753688,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL443476,=,nM,88.0,CHEMBL240,Homo sapiens,IC50,nM,88.0
1,754652,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL53661,=,nM,137.0,CHEMBL240,Homo sapiens,IC50,nM,137.0
2,754653,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL299390,=,nM,1480.0,CHEMBL240,Homo sapiens,IC50,nM,1480.0
3,755788,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL12186,=,nM,10.0,CHEMBL240,Homo sapiens,IC50,nM,10.0
4,755790,CHEMBL766816,Inhibition of K+ channel activity in CHO cells...,B,CHEMBL556312,=,nM,23.5,CHEMBL240,Homo sapiens,IC50,nM,23.5


In [8]:
# Look up compound data for every molecule in the cleaned bioactivity table
# and combine it with the activities. This is another slow ChEMBL download
# (about 17 minutes in the recorded run).
bioactivities_with_compound_data = retrieve_compound_data(
    bioactivities_after_cleanup
)
# Preview the merged table.
bioactivities_with_compound_data.head()

Downloading 6365 compounds from ChEMBL.


100%|██████████| 6365/6365 [17:23<00:00,  6.10it/s]


Compounds dataframe shape: (6365, 2)
Index(['molecule_chembl_id', 'molecule_structures'], dtype='object')
Compounds dataFrame shape - nans removed: (6365, 2)
Compounds dataFrame shape - duplicates removed: (6365, 2)/n
Summary:

Total bioactivities after filtering: 6365
Total compounds after filtering: 6365
Final dataset has 6365 entries.


Unnamed: 0,molecule_chembl_id,IC50,units,units.1,smiles,pIC50
0,CHEMBL443476,88.0,nM,nM,O=C1NCCN1CCN1CCC(c2cn(-c3ccccc3)c3ccc(Cl)cc23)CC1,7.055517
1,CHEMBL53661,137.0,nM,nM,O=C1NCCN1CCN1CCC(c2cn(C3CCCCC3)c3ccc(Cl)cc23)CC1,6.863279
2,CHEMBL299390,1480.0,nM,nM,CCC(CC)c1cn(-c2ccc(F)cc2)c2ccc(Cl)cc12,5.829738
3,CHEMBL12186,10.0,nM,nM,O=C1NCCN1CCN1CC=C(c2cn(-c3ccc(F)cc3)c3ccc(Cl)c...,8.0
4,CHEMBL556312,23.5,nM,nM,O=C1NCCN1CCN1CCC(C2CN(c3ccc(F)cc3)c3ccccc32)CC1,7.628932


In [9]:
# Persist the final dataset as "<target>_compounds.csv" inside the data folder.
output_csv = DATA / f"{target}_compounds.csv"
bioactivities_with_compound_data.to_csv(output_csv)
# Show the first rows of what was written.
bioactivities_with_compound_data.head()

Unnamed: 0,molecule_chembl_id,IC50,units,units.1,smiles,pIC50
0,CHEMBL443476,88.0,nM,nM,O=C1NCCN1CCN1CCC(c2cn(-c3ccccc3)c3ccc(Cl)cc23)CC1,7.055517
1,CHEMBL53661,137.0,nM,nM,O=C1NCCN1CCN1CCC(c2cn(C3CCCCC3)c3ccc(Cl)cc23)CC1,6.863279
2,CHEMBL299390,1480.0,nM,nM,CCC(CC)c1cn(-c2ccc(F)cc2)c2ccc(Cl)cc12,5.829738
3,CHEMBL12186,10.0,nM,nM,O=C1NCCN1CCN1CC=C(c2cn(-c3ccc(F)cc3)c3ccc(Cl)c...,8.0
4,CHEMBL556312,23.5,nM,nM,O=C1NCCN1CCN1CCC(C2CN(c3ccc(F)cc3)c3ccccc32)CC1,7.628932
