# **Computational Drug Discovery - Download Bioactivity Data**

This notebook follows the tutorial by Chanin Nantasenamat.

> Indented block



Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m41.0/55.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests-cache~=0.7.0 (from chembl_webresource_client)
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting attrs<22.0,>=21.2 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize<2.0,>=1.4 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3

## **Importing libraries**

In [None]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for protein of interest**

---



In [None]:
# Target search for Troponin
# Handle to the "target" resource exposed by the ChEMBL web-service client.
target = new_client.target
# Free-text search for targets matching the query string.
target_query = target.search('Troponin cardiac muscle')
# Collect the search results into a DataFrame (one row per returned target).
targets = pd.DataFrame.from_dict(target_query)
# Show the table so a target can be chosen by its row label.
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,"Troponin, cardiac muscle",48.0,False,CHEMBL2095202,"[{'accession': 'P63316', 'component_descriptio...",PROTEIN COMPLEX,9606.0
1,[],Homo sapiens,Fast skeletal troponin complex,34.0,False,CHEMBL3831282,"[{'accession': 'P48788', 'component_descriptio...",PROTEIN COMPLEX,9606.0
2,[],Homo sapiens,Serine/threonine-protein kinase TNNI3K,27.0,False,CHEMBL5260,"[{'accession': 'Q59H18', 'component_descriptio...",SINGLE PROTEIN,9606.0
3,"[{'xref_id': 'Q9ERN6', 'xref_name': None, 'xre...",Mus musculus,Ryanodine receptor 2,23.0,False,CHEMBL4745,"[{'accession': 'E9Q401', 'component_descriptio...",SINGLE PROTEIN,10090.0
4,"[{'xref_id': 'B0LPN4', 'xref_name': None, 'xre...",Rattus norvegicus,Ryanodine receptor 2,23.0,False,CHEMBL3388,"[{'accession': 'B0LPN4', 'component_descriptio...",SINGLE PROTEIN,10116.0
...,...,...,...,...,...,...,...,...,...
173,[],Homo sapiens,Phosphodiesterase 4 and 5 (PDE4 and PDE5),2.0,False,CHEMBL2111340,"[{'accession': 'O76074', 'component_descriptio...",SELECTIVITY GROUP,9606.0
174,[],Homo sapiens,Voltage-gated calcium channel,2.0,False,CHEMBL2363032,"[{'accession': 'O95180', 'component_descriptio...",PROTEIN COMPLEX GROUP,9606.0
175,[],Homo sapiens,Phosphodiesterases; PDE5 & PDE6,1.0,False,CHEMBL2095220,"[{'accession': 'O76074', 'component_descriptio...",SELECTIVITY GROUP,9606.0
176,[],Homo sapiens,Voltage-gated potassium channel,0.0,False,CHEMBL2362996,"[{'accession': 'P51787', 'component_descriptio...",PROTEIN FAMILY,9606.0


### **Select and retrieve bioactivity data for protein of choice, single protein preferred**

We will assign the first entry (index 0, which corresponds to the target protein) to the ***selected_target*** variable.

In [None]:
# Take the ChEMBL ID from the row labelled 0 of the search results table.
selected_target = targets.loc[0, "target_chembl_id"]
selected_target

'CHEMBL2095202'

Here, we will retrieve only the bioactivity data for the target protein that are reported as IC50 values.

In [None]:
# Handle to the "activity" resource exposed by the ChEMBL web-service client.
activity = new_client.activity
# Restrict to records for the selected target, then to those whose standard_type is IC50.
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [None]:
# Load the filtered activity records into a DataFrame for further processing.
df = pd.DataFrame.from_dict(res)

In [None]:
# Display the retrieved bioactivity records.
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,Active,5651917,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,0.67676
1,,inactive,5651918,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,71.472
2,,inactive,5651919,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,71.481
3,,inactive,5651920,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,12.683
4,,Active,5651921,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,2.724
5,,Active,5651922,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,1.819
6,,Active,5651923,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,0.55814
7,,Active,5651924,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,7.597
8,,Active,5651925,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,3.191
9,,Active,5651926,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,2.648


Finally, we will save the resulting bioactivity data to a CSV file, **Troponin_bioactivity_data_raw.csv**.

In [None]:
# Save the unprocessed bioactivity table; index=False leaves the row index out of the file.
df.to_csv('Troponin_bioactivity_data_raw.csv', index=False)

## **Handling missing data**
If a compound has a missing value in the **standard_value** or **canonical_smiles** column, drop it.

In [None]:
# Keep only the rows that have both a standard_value and a canonical_smiles entry.
# A single dropna(subset=...) checks both columns against the same frame. The previous
# two-step filter indexed the already-filtered df2 with a boolean mask built from the
# unfiltered df, which makes pandas reindex the mask and emit a
# "Boolean Series key will be reindexed to match DataFrame index" warning.
df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,Active,5651917,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,0.67676
1,,inactive,5651918,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,71.472
2,,inactive,5651919,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,71.481
3,,inactive,5651920,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,12.683
4,,Active,5651921,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,2.724
5,,Active,5651922,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,1.819
6,,Active,5651923,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,0.55814
7,,Active,5651924,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,7.597
8,,Active,5651925,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,3.191
9,,Active,5651926,[],CHEMBL1738244,PUBCHEM_BIOASSAY: Fluorescence-based biochemic...,B,,,BAO_0000190,...,Homo sapiens,"Troponin, cardiac muscle",9606,,,IC50,uM,UO_0000065,,2.648


In [None]:
# Number of distinct SMILES strings among the remaining rows.
# dropna=False keeps the count identical to len(unique()), which would include NaN as a value.
df2["canonical_smiles"].nunique(dropna=False)

4695

In [None]:
# De-duplicate on the SMILES string, keeping the first record encountered for each structure.
df2_nr = df2.drop_duplicates(subset='canonical_smiles', keep='first')
df2_nr

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,BAO_0000190,BAO_0000357,single protein format,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,,,CHEMBL1148382,J. Med. Chem.,2004.0,"{'bei': '19.61', 'le': '0.36', 'lle': '3.32', ...",CHEMBL133897,,CHEMBL133897,6.12,False,http://www.openphacts.org/units/Nanomolar,252547,=,1,True,=,,IC50,nM,,750.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,BAO_0000190,BAO_0000357,single protein format,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,,,CHEMBL1148382,J. Med. Chem.,2004.0,"{'bei': '18.57', 'le': '0.38', 'lle': '2.45', ...",CHEMBL336398,,CHEMBL336398,7.00,False,http://www.openphacts.org/units/Nanomolar,252533,=,1,True,=,,IC50,nM,,100.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,BAO_0000190,BAO_0000357,single protein format,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,,,CHEMBL1148382,J. Med. Chem.,2004.0,,CHEMBL131588,,CHEMBL131588,,False,http://www.openphacts.org/units/Nanomolar,252530,>,1,True,>,,IC50,nM,,50000.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,BAO_0000190,BAO_0000357,single protein format,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,,,CHEMBL1148382,J. Med. Chem.,2004.0,"{'bei': '16.11', 'le': '0.34', 'lle': '1.81', ...",CHEMBL130628,,CHEMBL130628,6.52,False,http://www.openphacts.org/units/Nanomolar,252534,=,1,True,=,,IC50,nM,,300.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,BAO_0000190,BAO_0000357,single protein format,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,,,CHEMBL1148382,J. Med. Chem.,2004.0,"{'bei': '17.60', 'le': '0.36', 'lle': '3.00', ...",CHEMBL130478,,CHEMBL130478,6.10,False,http://www.openphacts.org/units/Nanomolar,252552,=,1,True,=,,IC50,nM,,800.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019,,18798884,[],CHEMBL4274263,Inhibition of human erythrocyte AChE using ace...,B,BAO_0000190,BAO_0000357,single protein format,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3ccccc3Cl)CC...,,,CHEMBL4270597,Eur J Med Chem,2018.0,"{'bei': '10.98', 'le': '0.21', 'lle': '-1.46',...",CHEMBL4293155,,CHEMBL4293155,5.61,False,http://www.openphacts.org/units/Nanomolar,3124023,=,1,True,=,,IC50,nM,,2440.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,2.44
7020,,18798885,[],CHEMBL4274263,Inhibition of human erythrocyte AChE using ace...,B,BAO_0000190,BAO_0000357,single protein format,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3cccc(Cl)c3)...,,,CHEMBL4270597,Eur J Med Chem,2018.0,"{'bei': '10.95', 'le': '0.21', 'lle': '-1.47',...",CHEMBL4282558,,CHEMBL4282558,5.59,False,http://www.openphacts.org/units/Nanomolar,3124024,=,1,True,=,,IC50,nM,,2540.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,2.54
7021,,18798886,[],CHEMBL4274263,Inhibition of human erythrocyte AChE using ace...,B,BAO_0000190,BAO_0000357,single protein format,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3ccc(Br)cc3)...,,,CHEMBL4270597,Eur J Med Chem,2018.0,"{'bei': '9.75', 'le': '0.21', 'lle': '-1.76', ...",CHEMBL4281727,,CHEMBL4281727,5.42,False,http://www.openphacts.org/units/Nanomolar,3124025,=,1,True,=,,IC50,nM,,3810.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,3.81
7022,,18798887,[],CHEMBL4274263,Inhibition of human erythrocyte AChE using ace...,B,BAO_0000190,BAO_0000357,single protein format,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3cccc([N+](=...,,,CHEMBL4270597,Eur J Med Chem,2018.0,"{'bei': '10.47', 'le': '0.20', 'lle': '-0.87',...",CHEMBL4292349,,CHEMBL4292349,5.46,False,http://www.openphacts.org/units/Nanomolar,3124026,=,1,True,=,,IC50,nM,,3460.0,CHEMBL220,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,3.46


## **Data pre-processing of the bioactivity data**

### **Combine the 3 columns (molecule_chembl_id,canonical_smiles,standard_value) and bioactivity_class into a DataFrame**

In [None]:
# Columns carried forward: compound ID, structure (SMILES) and the standard_value column.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
# All rows, only the selected columns, in the order listed above.
df3 = df2_nr.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0
...,...,...,...
7019,CHEMBL4293155,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3ccccc3Cl)CC...,2440.0
7020,CHEMBL4282558,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3cccc(Cl)c3)...,2540.0
7021,CHEMBL4281727,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3ccc(Br)cc3)...,3810.0
7022,CHEMBL4292349,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3cccc([N+](=...,3460.0


Saves dataframe to CSV file

In [None]:
# Save the three selected columns; index=False leaves the row index out of the file.
df3.to_csv('Troponin_cardiac_muscle_02_bioactivity_data_preprocessed.csv', index=False)

### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data are IC50 values expressed in nM. Compounds with values of 1,000 nM or less are considered **active**, while those with values of 10,000 nM or more are considered **inactive**. Values in between 1,000 and 10,000 nM are referred to as **intermediate**.

In [None]:
# Read the preprocessed table back from disk. The file was written without its row index
# and no index_col is given here, so df4 gets pandas' default 0..n-1 row index.
df4 = pd.read_csv('Troponin_cardiac_muscle_02_bioactivity_data_preprocessed.csv')

In [None]:
# Label each compound from its standard_value:
#   >= 10000 -> "inactive", <= 1000 -> "active", anything else -> "intermediate".
# Values are converted with float() before comparison; the labels keep df4's row order.
bioactivity_threshold = [
    "inactive" if ic50 >= 10000 else "active" if ic50 <= 1000 else "intermediate"
    for ic50 in map(float, df4.standard_value)
]

In [None]:
# Wrap the labels in a Series named 'class' and attach it to df4 as an extra column.
bioactivity_class = pd.Series(data=bioactivity_threshold, name='class')
df5 = pd.concat(objs=[df4, bioactivity_class], axis='columns')
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,active
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,active
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,inactive
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,active
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,active
...,...,...,...,...
4690,CHEMBL4293155,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3ccccc3Cl)CC...,2440.0,intermediate
4691,CHEMBL4282558,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3cccc(Cl)c3)...,2540.0,intermediate
4692,CHEMBL4281727,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3ccc(Br)cc3)...,3810.0,intermediate
4693,CHEMBL4292349,CC(C)(C)c1cc(/C=C/C(=O)NCCC2CCN(Cc3cccc([N+](=...,3460.0,intermediate


Saves dataframe to CSV file

In [None]:
# Save the labelled table; index=False leaves the row index out of the file.
df5.to_csv('Troponin_03_bioactivity_data_curated.csv', index=False)

In [None]:
# Bundle every .csv file in the current working directory into Troponin.zip.
# NOTE(review): the *.csv glob is not limited to the files written by this notebook.
! zip Troponin.zip *.csv

In [None]:
# List the working directory in long format to check the generated files.
! ls -l

---