# Bioactivity data mining for drug discovery using ChEMBL 

Install the ChEMBL web service package to retrieve bioactivity data from ChEMBL Database

In [69]:
# install libraries
! pip install chembl_webresource_client



In [70]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

Search for target protein

In [71]:
# test case: Polyketide synthase Pks13
target = new_client.target
target_query = target.search('Pks13')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,28.0,False,CHEMBL4105939,"[{'accession': 'I6X8D2', 'component_descriptio...",SINGLE PROTEIN,83332


# Select and retrieve bioactivity data 

In [72]:
# select the target of interest from the search results
selected_target = targets.target_chembl_id[0]
selected_target

'CHEMBL4105939'

Filter results to retrieve only bioactivity data for protein of interest that are reported as IC50 values in nM units

In [73]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [74]:
df = pd.DataFrame.from_dict(res)

In [75]:
df.head(3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,18127365,[],CHEMBL4028447,Inhibition of Mycobacterium tuberculosis ATCC ...,B,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,IC50,uM,UO_0000065,,0.26
1,,19164286,[],CHEMBL4383981,Inhibition of Mycobacterium tuberculosis H37Rv...,B,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,IC50,ug ml-1,UO_0000274,,0.19
2,,22058278,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.22002298548261


In [76]:
df.to_csv('bioactivity_data_raw.csv', index=False)

# Handling missing data

Exclude any compound that has missing value for the standard_value column

In [77]:
df2 = df[df.standard_value.notna()]
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,18127365,[],CHEMBL4028447,Inhibition of Mycobacterium tuberculosis ATCC ...,B,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,IC50,uM,UO_0000065,,0.26
1,,19164286,[],CHEMBL4383981,Inhibition of Mycobacterium tuberculosis H37Rv...,B,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,IC50,ug ml-1,UO_0000274,,0.19
2,,22058278,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.22002298548261
3,,22058279,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.60248529399195
4,,22058281,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.14284041217382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,,22068933,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.0
599,,22068986,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,5.05674199241346
600,,22068987,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.89637185707977
601,,22068993,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4649965,Mycobacterium tuberculosis Polyketide synthase...,F,,,BAO_0000190,BAO_0000019,...,Mycobacterium tuberculosis (strain ATCC 25618 ...,Polyketide synthase Pks13,83332,,,pIC50,,UO_0000065,,4.62596785534237


# Data pre-processing of bioactivity data

The IC50 unit contains the bioactivity data. Compounds are classified accourding to the following parameters:
- Active:   IC50 <= 1,000 nM
- Inactive: IC50 >= 10,000 nM 
- Values in between 1,000 and 10,000 nM will be considered as intermediate.

In [78]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

Combine the three columns (molecule_chembl_id, anonical_smiles, standard_value) and bioactivity_class into a DataFrame

In [79]:
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL4063356,CCOC(=O)c1c(-c2ccccc2)oc2ccc(O)c(CN3CCC(C)CC3)c12,260.0
1,CHEMBL4443524,CNC(=O)c1c(-c2ccc(O)cc2)oc2ccc(O)c(CN3CCCCC3)c12,0.19
2,CHEMBL3442631,CNC(=O)c1cc2c(nc1NC(C)C)CCNCC2,60252.77
3,CHEMBL3442631,CNC(=O)c1cc2c(nc1NC(C)C)CCNCC2,24975.53
4,CHEMBL3442632,CNC(=O)c1cc2c(nc1NCC1CC1)CCNCC2,71971.34
...,...,...,...
598,CHEMBL3487994,Cc1cnn(CC(C)NCC(=O)NCCOc2ccccc2)c1,100000.0
599,CHEMBL3493579,Cc1nc(C)n(C2CCCN(C(=O)c3nc(C)n4ccccc34)C2)n1,8775.22
600,CHEMBL3493579,Cc1nc(C)n(C2CCCN(C(=O)c3nc(C)n4ccccc34)C2)n1,12694.87
601,CHEMBL3496335,COc1cc(/C=C/c2ncc(C(=O)O)s2)cc(Cl)c1O,23660.95


In [80]:
# adds bioactivity class to the dataframe
bioactivity_class = pd.Series(bioactivity_class, name='bioactivity_class')
df4 = pd.concat([df3, bioactivity_class], axis=1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL4063356,CCOC(=O)c1c(-c2ccccc2)oc2ccc(O)c(CN3CCC(C)CC3)c12,260.0,active
1,CHEMBL4443524,CNC(=O)c1c(-c2ccc(O)cc2)oc2ccc(O)c(CN3CCCCC3)c12,0.19,active
2,CHEMBL3442631,CNC(=O)c1cc2c(nc1NC(C)C)CCNCC2,60252.77,inactive
3,CHEMBL3442631,CNC(=O)c1cc2c(nc1NC(C)C)CCNCC2,24975.53,inactive
4,CHEMBL3442632,CNC(=O)c1cc2c(nc1NCC1CC1)CCNCC2,71971.34,inactive
...,...,...,...,...
598,CHEMBL3487994,Cc1cnn(CC(C)NCC(=O)NCCOc2ccccc2)c1,100000.0,inactive
599,CHEMBL3493579,Cc1nc(C)n(C2CCCN(C(=O)c3nc(C)n4ccccc34)C2)n1,8775.22,intermediate
600,CHEMBL3493579,Cc1nc(C)n(C2CCCN(C(=O)c3nc(C)n4ccccc34)C2)n1,12694.87,inactive
601,CHEMBL3496335,COc1cc(/C=C/c2ncc(C(=O)O)s2)cc(Cl)c1O,23660.95,inactive


In [81]:
# save dataframe to CSV file
df4.to_csv('bioactivity_preprocessed_data.csv', index=False)