# Retrieve ChEMBL data

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import rdDepictor

# Custom drawing options for RDKit molecule depictions.
# NOTE(review): `opts` is not passed to any drawing call in the cells shown
# here — confirm it is actually consumed somewhere, otherwise this setup has
# no visible effect.
opts = Draw.MolDrawOptions()
# Lighter blue for nitrogen (atomic number 7), given as an RGB tuple.
opts.updateAtomPalette({7: (0.4, 0.4, 1)})
# NOTE(review): the black-and-white palette is requested right after the
# nitrogen override; this presumably replaces the whole atom palette and so
# discards the colour set on the previous line — confirm which of the two
# is intended and drop or reorder the other.
opts.useBWAtomPalette()
# NOTE(review): this line was labelled "transparent background", but in RDKit
# clearBackground=True appears to mean the background is painted before
# drawing (the default); a transparent background would presumably need
# False — verify against the RDKit MolDrawOptions documentation.
opts.clearBackground = True
# Ask the depictor to prefer CoordGen when generating 2D coordinates.
rdDepictor.SetPreferCoordGen(True)

import math
from pathlib import Path
import pathlib
from zipfile import ZipFile
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm
from utils.chembl_utils import retrieve_chembl_bioactivities, cleanup_bioactivities_df, retrieve_compound_data

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version
  from .autonotebook import tqdm as notebook_tqdm


Protein and data variables.

In [2]:
# Short target label; used below as the prefix of the exported CSV file name.
target = "ep4"
# UniProt accession passed to the ChEMBL bioactivity query.
uniprot = "P35408"
# Structure identifier (presumably a PDB ID); used below to name the data folder.
protein_structure = "5yhl"

In [3]:
# Notebook working directory and a per-structure data folder beneath it.
HERE = Path.cwd()
DATA = HERE.joinpath(f"data_{protein_structure}")
# Create the folder (including missing parents); a pre-existing folder is fine.
DATA.mkdir(parents=True, exist_ok=True)

### Download ligands and bioactivity data from ChEMBL.

In [12]:
# Fetch ChEMBL bioactivity records for the target identified by `uniprot`,
# requesting IC50 readouts with an exact ("=") relation from assays of type "B".
# NOTE(review): the exact filtering semantics live in utils.chembl_utils —
# confirm there how `readout`, `relation` and `assay_type` are applied.
bioactivities_df = retrieve_chembl_bioactivities(uniprot, readout='IC50', relation='=', assay_type='B')
# Preview the first rows of the returned table.
bioactivities_df.head()

The target ChEMBL ID is CHEMBL1836
Retrieving bioactivities for  P35408
Length and type of bioactivities object: 195, <class 'chembl_webresource_client.query_set.QuerySet'>
Length and type of first element: 13, <class 'dict'>


100%|██████████| 195/195 [00:03<00:00, 49.16it/s]


DataFrame shape: (195, 13)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,211061,CHEMBL767812,Affinity for Prostanoid EP4 receptor expressed...,B,CHEMBL815,=,nM,4200.0,CHEMBL1836,Homo sapiens,IC50,nM,4200.0
1,468799,CHEMBL767813,In vitro binding at EP4 human prostaglandin re...,B,CHEMBL815,=,nM,4200.0,CHEMBL1836,Homo sapiens,IC50,nM,4200.0
2,1192140,CHEMBL765917,Inhibitory activity against human EP4 receptor...,B,CHEMBL548,=,nM,0.7,CHEMBL1836,Homo sapiens,IC50,nM,0.7
3,1210351,CHEMBL767814,Inhibitory activity against human EP4 receptor...,B,CHEMBL275667,=,nM,1.4,CHEMBL1836,Homo sapiens,IC50,nM,1.4
4,1964571,CHEMBL896001,Inhibition of human recombinant EP4 receptor e...,B,CHEMBL400404,=,nM,24000.0,CHEMBL1836,Homo sapiens,IC50,uM,24.0


In [13]:
# Clean the raw records with the project helper. In the recorded run its log
# reports a unit filter (only nM entries present) followed by duplicate
# removal, which reduced the table from 195 to 140 rows.
bioactivities_after_cleanup = cleanup_bioactivities_df(bioactivities_df)
# Preview the first rows of the cleaned table.
bioactivities_after_cleanup.head()

DataFrame shape: (195, 13)
Units in downloaded data: ['nM']
Number of non-nM entries:        0
Units after filtering: ['nM']
DataFrame shape after filtering: (195, 13)
DataFrame shape after removing duplicates: (140, 13)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,units,IC50,target_chembl_id,target_organism,type,units.1,value
0,211061,CHEMBL767812,Affinity for Prostanoid EP4 receptor expressed...,B,CHEMBL815,=,nM,4200.0,CHEMBL1836,Homo sapiens,IC50,nM,4200.0
1,1192140,CHEMBL765917,Inhibitory activity against human EP4 receptor...,B,CHEMBL548,=,nM,0.7,CHEMBL1836,Homo sapiens,IC50,nM,0.7
2,1210351,CHEMBL767814,Inhibitory activity against human EP4 receptor...,B,CHEMBL275667,=,nM,1.4,CHEMBL1836,Homo sapiens,IC50,nM,1.4
3,1964571,CHEMBL896001,Inhibition of human recombinant EP4 receptor e...,B,CHEMBL400404,=,nM,24000.0,CHEMBL1836,Homo sapiens,IC50,uM,24.0
4,2288874,CHEMBL960019,Displacement of radioligand from EP4 receptor,B,CHEMBL521609,=,nM,12500.0,CHEMBL1836,Homo sapiens,IC50,uM,12.5


In [14]:
# Attach compound data downloaded from ChEMBL to the cleaned bioactivities.
# The recorded output shows the result carrying `smiles` and `pIC50` columns
# and 140 entries; how pIC50 is derived is defined in utils.chembl_utils.
bioactivities_with_compound_data = retrieve_compound_data(bioactivities_after_cleanup)
# Preview the first rows of the combined table.
bioactivities_with_compound_data.head()

Downloading 140 compounds from ChEMBL.


100%|██████████| 140/140 [00:01<00:00, 138.40it/s]

Compounds dataframe shape: (140, 2)
Index(['molecule_chembl_id', 'molecule_structures'], dtype='object')
Compounds dataFrame shape - nans removed: (140, 2)
Compounds dataFrame shape - duplicates removed: (140, 2)/n
Summary:

Total bioactivities after filtering: 140
Total compounds after filtering: 140
Final dataset has 140 entries.





Unnamed: 0,molecule_chembl_id,IC50,units,units.1,smiles,pIC50
0,CHEMBL815,4200.0,nM,nM,CCCCC[C@H](O)/C=C/[C@@H]1[C@@H](C/C=C\CCCC(=O)...,5.376751
1,CHEMBL548,0.7,nM,nM,CCCCC[C@H](O)/C=C/[C@H]1[C@H](O)CC(=O)[C@@H]1C...,9.154902
2,CHEMBL275667,1.4,nM,nM,O=C1CC[C@H](/C=C/[C@@H](O)Cc2ccccc2)N1CCCCCCc1...,8.853872
3,CHEMBL400404,24000.0,nM,uM,CC(=O)Nc1cccc(-c2ccc(Cc3ocnc3C(=O)N[C@@H](Cc3c...,4.619789
4,CHEMBL521609,12500.0,nM,uM,Cn1cc(/C=C/C(=O)NS(=O)(=O)c2ccc(F)c(F)c2)c2c(O...,4.90309


In [15]:
# Persist the final table inside the per-structure data folder, then preview it.
# The DataFrame index is written as the first CSV column (pandas default).
compounds_csv = DATA / f"{target}_compounds.csv"
bioactivities_with_compound_data.to_csv(compounds_csv)
bioactivities_with_compound_data.head()

Unnamed: 0,molecule_chembl_id,IC50,units,units.1,smiles,pIC50
0,CHEMBL815,4200.0,nM,nM,CCCCC[C@H](O)/C=C/[C@@H]1[C@@H](C/C=C\CCCC(=O)...,5.376751
1,CHEMBL548,0.7,nM,nM,CCCCC[C@H](O)/C=C/[C@H]1[C@H](O)CC(=O)[C@@H]1C...,9.154902
2,CHEMBL275667,1.4,nM,nM,O=C1CC[C@H](/C=C/[C@@H](O)Cc2ccccc2)N1CCCCCCc1...,8.853872
3,CHEMBL400404,24000.0,nM,uM,CC(=O)Nc1cccc(-c2ccc(Cc3ocnc3C(=O)N[C@@H](Cc3c...,4.619789
4,CHEMBL521609,12500.0,nM,uM,Cn1cc(/C=C/C(=O)NS(=O)(=O)c2ccc(F)c(F)c2)c2c(O...,4.90309
