In [72]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np

#### ChEMBL Database

This database contains curated bioactivity data of more than 2 million compounds. It is compiled from more than 76,000 documents, 1.2 million assays and the data spans 13,000 targets.


The ChEMBL web service package needs to bbe installed so that we can retrieve bioactivity data from the ChEMBL Database: **pip install chembl_webresource_client**

In [2]:
from chembl_webresource_client.new_client import new_client

# Data Collection

### Search for the target protein - Coronavirus

In [8]:
target_query = new_client.target.search('coronavirus')
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,14.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,5.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859


### Select and retrive the bioactivity for SARS coronavirus 3C-like proteinase

In [16]:
# selecting the protein of interest
selected_target = targets.target_chembl_id[4]

selected_target

'CHEMBL3927'

In [17]:
# retrieving only the bioactivity
activity = new_client.activity

activity

[{'activity_comment': None, 'activity_id': 31863, 'activity_properties': [], 'assay_chembl_id': 'CHEMBL663853', 'assay_description': 'Inhibitory concentration against human DNA topoisomerase II, alpha mediated relaxation of pBR322; no measurable activity', 'assay_type': 'B', 'bao_endpoint': 'BAO_0000190', 'bao_format': 'BAO_0000357', 'bao_label': 'single protein format', 'canonical_smiles': 'c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1', 'data_validity_comment': None, 'data_validity_description': None, 'document_chembl_id': 'CHEMBL1137930', 'document_journal': 'Bioorg. Med. Chem. Lett.', 'document_year': 2004, 'ligand_efficiency': None, 'molecule_chembl_id': 'CHEMBL113081', 'molecule_pref_name': None, 'parent_molecule_chembl_id': 'CHEMBL113081', 'pchembl_value': None, 'potential_duplicate': False, 'qudt_units': 'http://www.openphacts.org/units/Nanomolar', 'record_id': 206172, 'relation': '>', 'src_id': 1, 'standard_flag': True, 'standard_relation': '>', 'standard_text_value': None, 'standa

In [23]:
# filtering the activity for the protein of interest
res = activity.filter(target_chembl_id = selected_target)

res

[{'activity_comment': None, 'activity_id': 1480934, 'activity_properties': [], 'assay_chembl_id': 'CHEMBL831837', 'assay_description': 'In vitro percent inhibition against SARS coronavirus main protease (SARS CoV 3C-like protease) at 20 uM', 'assay_type': 'B', 'bao_endpoint': 'BAO_0000201', 'bao_format': 'BAO_0000357', 'bao_label': 'single protein format', 'canonical_smiles': 'Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C(N)=O)ccc21', 'data_validity_comment': None, 'data_validity_description': None, 'document_chembl_id': 'CHEMBL1139624', 'document_journal': 'Bioorg. Med. Chem. Lett.', 'document_year': 2005, 'ligand_efficiency': None, 'molecule_chembl_id': 'CHEMBL372889', 'molecule_pref_name': None, 'parent_molecule_chembl_id': 'CHEMBL372889', 'pchembl_value': None, 'potential_duplicate': False, 'qudt_units': 'http://qudt.org/vocab/unit#Percent', 'record_id': 384102, 'relation': '=', 'src_id': 1, 'standard_flag': True, 'standard_relation': '=', 'standard_text_value': None, 'standard_type': 'Inhibition

In [24]:
# filtering the IC50 values 

ic50 = res.filter(standard_type="IC50")

ic50

[{'activity_comment': None, 'activity_id': 1480935, 'activity_properties': [], 'assay_chembl_id': 'CHEMBL829584', 'assay_description': 'In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease)', 'assay_type': 'B', 'bao_endpoint': 'BAO_0000190', 'bao_format': 'BAO_0000357', 'bao_label': 'single protein format', 'canonical_smiles': 'Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21', 'data_validity_comment': None, 'data_validity_description': None, 'document_chembl_id': 'CHEMBL1139624', 'document_journal': 'Bioorg. Med. Chem. Lett.', 'document_year': 2005, 'ligand_efficiency': {'bei': '18.28', 'le': '0.33', 'lle': '3.25', 'sei': '5.90'}, 'molecule_chembl_id': 'CHEMBL187579', 'molecule_pref_name': None, 'parent_molecule_chembl_id': 'CHEMBL187579', 'pchembl_value': '5.14', 'potential_duplicate': False, 'qudt_units': 'http://www.openphacts.org/units/Nanomolar', 'record_id': 384103, 'relation': '=', 'src_id': 1, 'standard_flag': True, 'standard_relation': '='

In [31]:
# converting from dictionary to dataframe

df = pd.DataFrame.from_dict(ic50)
df.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,BAO_0000190,BAO_0000357,single protein format,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,"{'bei': '18.28', 'le': '0.33', 'lle': '3.25', ...",CHEMBL187579,,CHEMBL187579,5.14,False,http://www.openphacts.org/units/Nanomolar,384103,=,1,True,=,,IC50,nM,,7200.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,"{'bei': '12.10', 'le': '0.33', 'lle': '1.22', ...",CHEMBL188487,,CHEMBL188487,5.03,False,http://www.openphacts.org/units/Nanomolar,383984,=,1,True,=,,IC50,nM,,9400.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,"{'bei': '11.56', 'le': '0.29', 'lle': '2.21', ...",CHEMBL185698,,CHEMBL185698,4.87,False,http://www.openphacts.org/units/Nanomolar,384106,=,1,True,=,,IC50,nM,,13500.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,"{'bei': '16.64', 'le': '0.32', 'lle': '1.25', ...",CHEMBL426082,,CHEMBL426082,4.88,False,http://www.openphacts.org/units/Nanomolar,384075,=,1,True,=,,IC50,nM,,13110.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,"{'bei': '16.84', 'le': '0.32', 'lle': '2.16', ...",CHEMBL187717,,CHEMBL187717,5.7,False,http://www.openphacts.org/units/Nanomolar,384234,=,1,True,=,,IC50,nM,,2000.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0


In [32]:
# saving to csv

df.to_csv('bioactivity_data_raw.csv', index=False)

# Data Pre-Processing

### Handling missing data

- If any row is missing the standard type IC50, we need to drop those

In [36]:
df['standard_type'].isna().sum() #no missing values

0

In [39]:
df['standard_value'].isna().sum() #no missing

0

- We need a new dataframe with 3 columns: molecule_chembl_id,canonical_smiles,standard_value

In [49]:
df_new = df[['molecule_chembl_id','canonical_smiles','standard_value']]
df_new

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0
...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,10600.0
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,10100.0
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,11500.0
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,10700.0


- new a new column to categorize standard values as active if < 1000, intermediate if <10,000, inactive if >10,000

In [82]:
df_new['standard_value'] = df_new['standard_value'].astype(float)

df_new['standard_value']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['standard_value'] = df_new['standard_value'].astype(float)


0       7200.0
1       9400.0
2      13500.0
3      13110.0
4       2000.0
        ...   
128    10600.0
129    10100.0
130    11500.0
131    10700.0
132    78900.0
Name: standard_value, Length: 133, dtype: float64

In [83]:
condition = [
    (df_new['standard_value'] >= 10000),
    (df_new['standard_value'] < 1000),
    (df_new['standard_value'] >1000)&(df_new['standard_value']<10000)

]
                                                
values = ['Inactive', 'Active', 'Intermediate']               

In [84]:
df_new['bioactivity_class'] = np.select(condition,values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['bioactivity_class'] = np.select(condition,values)


In [86]:
df_new['bioactivity_class'].value_counts()

Inactive        104
Active           15
Intermediate     14
Name: bioactivity_class, dtype: int64

In [87]:
df_new

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,Intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,Intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,Inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,Inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,Intermediate
...,...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,10600.0,Inactive
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,10100.0,Inactive
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,11500.0,Inactive
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,10700.0,Inactive


In [88]:
# converting to csv

df_new.to_csv('bioactivity_data_processed.csv', index=False)