In [1]:
# Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.
! pip install chembl_webresource_client



In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client

In [3]:
# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,14.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Murine coronavirus,Murine coronavirus,14.0,False,CHEMBL5209664,[],ORGANISM,694005
4,[],Human coronavirus 229E,Human coronavirus 229E,12.0,False,CHEMBL613837,[],ORGANISM,11137
5,[],Human coronavirus OC43,Human coronavirus OC43,12.0,False,CHEMBL5209665,[],ORGANISM,31631
6,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
8,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
9,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [4]:
#We will assign the 8,9 (Replicase polyprotein 1ab) to the *selected_target* variable 
selected_target1 = targets.target_chembl_id[8] #binding assays (CHEMBL5118)
selected_target2 = targets.target_chembl_id[9] #functional assays (CHEMBL4523582)
print("selected_target1 :", selected_target1)
print("selected_target2 :", selected_target2)

selected_target1 : CHEMBL5118
selected_target2 : CHEMBL4523582


In [5]:
# Here, we will retrieve only bioactivity data for coronavirus replicase polyprotein 1ab that are reported as IC
# values in nM (nanomolar) unit.
activity = new_client.activity
res1 = activity.filter(target_chembl_id=selected_target1).filter(standard_type="IC50")
res2 = activity.filter(target_chembl_id=selected_target2).filter(standard_type="IC50")

In [6]:
#put them in a dataframe 
df1 = pd.DataFrame.from_dict(res1)
df2 = pd.DataFrame.from_dict(res2)

In [7]:
#Concatenate the dataframes vertically
data = pd.concat([df1, df2], axis=0)
# Reset the index of the resulting dataframe
data = data.reset_index(drop=True)
data.head(5)
data.shape

(1435, 46)

In [None]:
#Finally we will save the resulting bioactivity data to a CSV file bioactivity_data.csv.
# data.to_csv('replicase_data_raw.csv', index=False)

In [8]:
# If any compounds has missing value for the standard_value column then drop it
mydata = data[data.standard_value.notna()]
mydata
mydata.shape

(1359, 46)

In [9]:
#handle compounds that have no smiles.
newdata = mydata[mydata.canonical_smiles.notna()]
newdata.shape

(1352, 46)

In [10]:
#check to see if there are duplicates
len(newdata.canonical_smiles.unique())
#Drop the duplicates
newdata = newdata.drop_duplicates(['canonical_smiles'])
newdata.shape
#Reset the index
newdata.reset_index(drop=True, inplace=True)

In [11]:
#calculate the pChEMBL value of the rows that are missing using the standard value of the IC50 
newdata['standard_value'] = pd.to_numeric(newdata['standard_value'], errors='coerce')
newdata['pchembl_value'] = pd.to_numeric(newdata['pchembl_value'], errors='coerce')
newdata.loc[newdata['pchembl_value'].isnull(), 'pchembl_value'] = np.log10(newdata.loc[newdata['pchembl_value'].isnull(), 'standard_value'].values)
newdata['pchembl_value'] = newdata['pchembl_value'].round(2)

In [12]:
# define the activity class using 4 as the pchembl value cut-off
bioactivity_class = []
for i in newdata.pchembl_value:
    if float(i) <= 4:
        bioactivity_class.append("no activity")
    elif float(i) > 4 and float(i) <= 5.99:
        bioactivity_class.append("low activity")
    elif float(i) >= 6 and float(i) <= 7.99:
        bioactivity_class.append("moderate activity")
    else:
        bioactivity_class.append("high activity")

len(bioactivity_class)

1087

In [14]:
#slice your data
mynewdata= newdata[['molecule_chembl_id','canonical_smiles','pchembl_value']]
mynewdata.shape
#serialize the bioactivity list
bioactivity_class = pd.Series(bioactivity_class, name='bioactivity_class')
# Reset the index of 'mynewdata' and 'bioactivity_class' to ensure consistent indices
mynewdata.reset_index(drop=True, inplace=True)
bioactivity_class.reset_index(drop=True, inplace=True)

# Concatenate 'mynewdata' and 'bioactivity_class' horizontally (axis=1)
dataframe = pd.concat([mynewdata, bioactivity_class], axis=1)

In [15]:
# Verify the resulting DataFrame
print(dataframe)

     molecule_chembl_id                                   canonical_smiles  \
0          CHEMBL194398  CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...   
1          CHEMBL393608  CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
2          CHEMBL238216  CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
3          CHEMBL235873  CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
4          CHEMBL397154  CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
...                 ...                                                ...   
1082      CHEMBL5220196  CC(C)C[C@H](NC(=O)OCCC12CC3CC(CC(C3)C1)C2)C(=O...   
1083      CHEMBL5220377  CC(C)C[C@H](NC(=O)OCC12CC3CC(CC(O)(C3)C1)C2)C(...   
1084      CHEMBL5219749  CC(C)C[C@H](NC(=O)OCC12CC3CC(CC(O)(C3)C1)C2)C(...   
1085      CHEMBL5218884  [2H]C([2H])(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C...   
1086      CHEMBL5220641  [2H]C([2H])(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](...   

      pchembl_value  bioactivity_class  
0              5.94   

In [None]:
#save the preprocessed data
# dataframe.to_csv('Replicase_bioactivity_data_preprocessed.csv', index=False)