<a href="https://colab.research.google.com/github/clarefausty/Python-Bioinformatics/blob/main/Python_for_Bioinformatics_(Drug_Discovery_using_ML_%26_DA)_1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Collection Using the ChEMBL Database**

**Install the ChEMBL web service client package for easy retrieval of bioactivity data**

In [1]:
pip install chembl_webresource_client


Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-23.2.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, cattrs, requests-cache, chembl_webresource_client
Successfully installed cattrs-23

In [2]:
#import libraries
import pandas as pd
#Access and retrieve data from the ChEMBL database
#new_client is a specific module within this library that provides an interface for querying and retrieving data from the ChEMBL database
from chembl_webresource_client.new_client import new_client

In [3]:
# Look up target proteins in ChEMBL using the keyword "cancer".
target = new_client.target
target_query = target.search("cancer")

# Load only the first seven hits into a DataFrame so they are easy to inspect.
first_hits = target_query[0:7]
targets = pd.DataFrame.from_dict(first_hits)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Cancer-related nucleoside-triphosphatase,13.0,False,CHEMBL4295936,"[{'accession': 'Q9BSD7', 'component_descriptio...",SINGLE PROTEIN,9606.0
1,[],Canis lupus familiaris,ABC-type xenobiotic transporter,13.0,False,CHEMBL4739687,"[{'accession': 'Q38JL0', 'component_descriptio...",SINGLE PROTEIN,9615.0
2,[],Homo sapiens,Cancer/testis antigen 1,13.0,False,CHEMBL4804257,"[{'accession': 'P78358', 'component_descriptio...",SINGLE PROTEIN,9606.0
3,[],Homo sapiens,Kita-kyushu lung cancer antigen 1,13.0,False,CHEMBL5169155,"[{'accession': 'Q5H943', 'component_descriptio...",SINGLE PROTEIN,9606.0
4,[],,Cancer cell lines,11.0,False,CHEMBL614375,[],CELL-LINE,
5,"[{'xref_id': 'O14519', 'xref_name': None, 'xre...",Homo sapiens,Cyclin-dependent kinase 2-associated protein 1,11.0,False,CHEMBL5578,"[{'accession': 'O14519', 'component_descriptio...",SINGLE PROTEIN,9606.0
6,"[{'xref_id': 'P38398', 'xref_name': None, 'xre...",Homo sapiens,Breast cancer type 1 susceptibility protein,11.0,False,CHEMBL5990,"[{'accession': 'P38398', 'component_descriptio...",SINGLE PROTEIN,9606.0


### **Select and retrieve bioactivity data for *Breast cancer type 1 susceptibility protein* (BRCA1, 7th entry)**

In [4]:
# Pick the 7th search hit (position 6, zero-based) and keep its ChEMBL ID.
# In the recorded results this row is "Breast cancer type 1 susceptibility protein" (CHEMBL5990),
# not the "Cancer-related nucleoside-triphosphatase" listed in row 0.
# NOTE(review): the lookup is positional, so it depends on the order of the search results.
selected_target = targets["target_chembl_id"].iloc[6]
selected_target

'CHEMBL5990'

## **Here we retrieve only the bioactivity data that are reported as IC50 values in nM (nanomolar) units**

In [5]:
# Query the activity endpoint for the chosen target, then narrow the result to IC50 records.
activity = new_client.activity
target_activities = activity.filter(target_chembl_id=selected_target)
res = target_activities.filter(standard_type="IC50")


In [6]:
# Load the filtered activity records into a DataFrame (one row per record).
df = pd.DataFrame.from_dict(data=res)



In [7]:
# Preview the first four activity records.
df[:4]

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,6222842,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,4.6
1,,,6222843,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,250.0
2,,,6222844,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,52.8
3,,,6222845,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,250.0


## **Save the resulting bioactivity data to the CSV file `bioactivity_data.csv`**

In [8]:
# Write the raw IC50 records (all columns) to the working directory, without the index column.
df.to_csv(path_or_buf="bioactivity_data.csv", index=False)

## **Copy files to Google Drive**

In [10]:
# Mount Google Drive inside the Colab runtime so the notebook can copy files to it.

import os
from google.colab import drive

# exist_ok=True creates the mount point when it is missing and does nothing when it is
# already there, so no separate existence check is needed.
os.makedirs("/cancer/gdrive", exist_ok=True)

# Mount the Google Drive at the mount point created above.
drive.mount("/cancer/gdrive", force_remount=True)


Mounted at /cancer/gdrive


## **Create a data folder in Colab Notebooks folder on Google Drive**

In [11]:
!mkdir "/cancer/gdrive/MyDrive/Colab Notebooks/data"

mkdir: cannot create directory ‘/cancer/gdrive/MyDrive/Colab Notebooks/data’: File exists


In [12]:
# Copy the bioactivity CSV from the working directory into the data folder on Google Drive.
!cp bioactivity_data.csv "/cancer/gdrive/MyDrive/Colab Notebooks/data"

In [13]:
# List the data folder on Google Drive to check that the file was copied.
!ls -l "/cancer/gdrive/MyDrive/Colab Notebooks/data"

total 28
-rw------- 1 root root 10974 Jul 14 09:56 bioactivity_data.csv
-rw------- 1 root root 14676 Jun 30 16:29 bioactivity_data_csv
-rw------- 1 root root  2435 Jul  7 15:45 bioactivity_preprocessed_data.csv


In [14]:
# List the files in the current working directory (this shows file names, not the CSV's content).
! ls

bioactivity_data.csv  sample_data


In [15]:
# Take a glimpse at the first lines of the bioactivity_data.csv file.
! head bioactivity_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,6222842,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarization assay,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O)N[C@H](C(=O)N[C@@H](Cc1ccccc1)C(N)=O)[C@@H](C)O,,,CHEMBL1781980,J Med Chem,2011,"{'bei': '9.34', 'le': '0.19', 'lle': '7.67', 'sei'

## **Handling missing data**

In [16]:
# Drop every record that has no standard_value or no canonical_smiles.
# Both columns are checked in a single dropna call on df, so the row filter is always
# computed on the same frame it is applied to (no boolean mask borrowed from another frame).
df2 = df.dropna(subset=["standard_value", "canonical_smiles"])

df2


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,6222842,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,4.6
1,,,6222843,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,250.0
2,,,6222844,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,52.8
3,,,6222845,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,250.0
4,,,6222846,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,15.0
5,,,6222847,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,35.0
6,,,6222848,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,3.2
7,,,6222849,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,30.1
8,,,6222850,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,7.1
9,,,6222851,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,18.4


In [17]:
# Count how many distinct SMILES strings remain after removing incomplete records.
len(df2["canonical_smiles"].unique())

19

In [18]:
# Keep only the first record for each distinct canonical_smiles value and store the
# de-duplicated frame as df2_nr.
df2_nr = df2.drop_duplicates(subset="canonical_smiles", keep="first")
df2_nr

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,6222842,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,4.6
1,,,6222843,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,250.0
2,,,6222844,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,52.8
3,,,6222845,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,250.0
4,,,6222846,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,15.0
5,,,6222847,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,35.0
6,,,6222848,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,3.2
7,,,6222849,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,30.1
8,,,6222850,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,7.1
9,,,6222851,[],CHEMBL1785941,Inhibition of BRCA1 by fluorescence polarizati...,B,,,BAO_0000190,...,Homo sapiens,Breast cancer type 1 susceptibility protein,9606,,,IC50,uM,UO_0000065,,18.4


## **Data pre-processing of the bioactivity data**

### **Combine the 3 columns (molecule_chembl_id,canonical_smiles,standard_value) and bioactivity_class into a DataFrame**

In [19]:
# Columns to keep: the compound ID, its SMILES string and the standardized activity value.
selection = [
    "molecule_chembl_id",
    "canonical_smiles",
    "standard_value",
]
df3 = df2_nr[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL1784774,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,4600.0
1,CHEMBL1784771,CC(=O)N[C@@H](CCC(=O)O)C(=O)N1CCC[C@H]1C(=O)N[...,250000.0
2,CHEMBL1784772,CC(=O)N[C@@H](CC(C(=O)O)C(=O)O)C(=O)N1CCC[C@H]...,52800.0
3,CHEMBL1784773,CC(=O)N[C@H](C(=O)N1CCC[C@H]1C(=O)N[C@H](C(=O)...,250000.0
4,CHEMBL1784704,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N[C@@H](C)C(=O)...,15000.0
5,CHEMBL1784770,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,35000.0
6,CHEMBL1784703,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,3200.0
7,CHEMBL1784775,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,30100.0
8,CHEMBL1784776,CC[C@H](C)[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](C...,7100.0
9,CHEMBL1784777,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,18400.0


In [20]:
# Write the three-column selection to disk without the index column.
# Note: this reuses the name "bioactivity_data.csv", so the full raw export written
# earlier in the notebook under the same name is replaced in the working directory.
df3.to_csv(path_or_buf="bioactivity_data.csv", index=False)

### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity values are IC50 measurements in nM. Compounds with an IC50 of 1,000 nM or less are considered **active**, those with an IC50 of 10,000 nM or more are considered **inactive**, and those with values strictly between 1,000 and 10,000 nM are labeled **intermediate**.

In [21]:
# Reload the curated table from the CSV file into a DataFrame.
df4 = pd.read_csv("bioactivity_data.csv")

# Build one label per compound from its standard_value:
#   >= 10000 -> "inactive", <= 1000 -> "active", everything else -> "intermediate".
bioactivity_class = [
    "inactive" if float(ic50) >= 10000
    else "active" if float(ic50) <= 1000
    else "intermediate"
    for ic50 in df4.standard_value
]


In [22]:
# Turn the list of labels into a Series named "class" and append it to df4 as a new column.
class_labels = pd.Series(bioactivity_class, name="class")
df5 = pd.concat([df4, class_labels], axis=1)
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL1784774,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,4600.0,intermediate
1,CHEMBL1784771,CC(=O)N[C@@H](CCC(=O)O)C(=O)N1CCC[C@H]1C(=O)N[...,250000.0,inactive
2,CHEMBL1784772,CC(=O)N[C@@H](CC(C(=O)O)C(=O)O)C(=O)N1CCC[C@H]...,52800.0,inactive
3,CHEMBL1784773,CC(=O)N[C@H](C(=O)N1CCC[C@H]1C(=O)N[C@H](C(=O)...,250000.0,inactive
4,CHEMBL1784704,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N[C@@H](C)C(=O)...,15000.0,inactive
5,CHEMBL1784770,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,35000.0,inactive
6,CHEMBL1784703,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,3200.0,intermediate
7,CHEMBL1784775,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,30100.0,inactive
8,CHEMBL1784776,CC[C@H](C)[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](C...,7100.0,intermediate
9,CHEMBL1784777,CC(=O)N[C@@H](COP(=O)(O)O)C(=O)N1CCC[C@H]1C(=O...,18400.0,inactive


## **Save the preprocessed DataFrame to a CSV file**

In [23]:
# Write the labeled table to its own CSV file, without the index column.
df5.to_csv(path_or_buf="bioactivity_preprocessed_data.csv", index=False)

In [24]:
# List the working directory in long format to check that the CSV files were written.
! ls -l

total 12
-rw-r--r-- 1 root root 2244 Jul 14 09:58 bioactivity_data.csv
-rw-r--r-- 1 root root 2435 Jul 14 09:58 bioactivity_preprocessed_data.csv
drwxr-xr-x 1 root root 4096 Jul 11 13:22 sample_data


In [25]:
# Copy the preprocessed CSV into the data folder on Google Drive.
! cp bioactivity_preprocessed_data.csv "/cancer/gdrive/MyDrive/Colab Notebooks/data"

In [26]:
# List the Drive data folder to confirm the preprocessed file is there.
! ls "/cancer/gdrive/MyDrive/Colab Notebooks/data"

bioactivity_data.csv  bioactivity_data_csv  bioactivity_preprocessed_data.csv
