# **PART 1: DATA COLLECTION AND CURATION**

## **Install and Import Required Libraries**

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-25.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-25.3.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Step 1: Search for target protein**

In [None]:
# Search the ChEMBL target endpoint for the term "CASP8" and tabulate the hits
# so the right target record can be picked in the next step.
target = new_client.target
target_query = target.search("CASP8")
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Mus musculus,Caspase-8,18.0,False,CHEMBL4630806,"[{'accession': 'O89110', 'component_descriptio...",SINGLE PROTEIN,10090
1,[],Homo sapiens,CASP8 and FADD-like apoptosis regulator,15.0,False,CHEMBL1955713,"[{'accession': 'O15519', 'component_descriptio...",SINGLE PROTEIN,9606
2,[],Homo sapiens,Caspase-8,13.0,False,CHEMBL3776,"[{'accession': 'Q14790', 'component_descriptio...",SINGLE PROTEIN,9606
3,[],Homo sapiens,Caspase,3.0,False,CHEMBL3831289,"[{'accession': 'P49662', 'component_descriptio...",PROTEIN FAMILY,9606


## Selecting the target: human Caspase-8 (single protein)

In [None]:
# Pick the human Caspase-8 single-protein target by its attributes rather than
# by row position: the order of search hits is a ranking and may change
# between ChEMBL releases, which would silently select a different target.
human_casp8 = targets[
    (targets['organism'] == 'Homo sapiens')
    & (targets['pref_name'] == 'Caspase-8')
    & (targets['target_type'] == 'SINGLE PROTEIN')
]
# Fail loudly instead of guessing when the match is missing or ambiguous.
assert len(human_casp8) == 1, (
    f"Expected exactly one human Caspase-8 target, found {len(human_casp8)}"
)
selected_target = human_casp8['target_chembl_id'].iloc[0]
selected_target

'CHEMBL3776'

## Retrieving only bioactivity data for the selected target (CHEMBL3776) with reported IC50 values; standard values are given in nM (nanomolar)

In [None]:
# Query the ChEMBL activity endpoint for records on the selected target,
# keeping only those whose standard_type is 'IC50'. No filter on units is
# applied here.
activity = new_client.activity
results = activity.filter(target_chembl_id = selected_target).filter(standard_type = 'IC50')

In [None]:
# Load the query results into a DataFrame (the raw, unfiltered activity table)
# and preview the first rows.
df1 = pd.DataFrame.from_dict(results)
df1.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,185163,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.007
1,,,191485,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.179
2,,,194920,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.77
3,,,266593,[],CHEMBL659710,Inhibition of Caspase-8 enzyme,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.012
4,,,269086,[],CHEMBL659710,Inhibition of Caspase-8 enzyme,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.017


In [None]:
# Distinct values in the standard_type column; only 'IC50' is expected given
# the query filter.
df1['standard_type'].unique()

array(['IC50'], dtype=object)

In [None]:
# Number of records per standard_type value.
df1['standard_type'].value_counts()

Unnamed: 0_level_0,count
standard_type,Unnamed: 1_level_1
IC50,459


In [None]:
# Size of the raw activity table as (rows, columns).
df1.shape

(459, 46)

In [None]:
# Preview the standard_value and standard_units columns side by side.
df1[['standard_value', 'standard_units']].head()

Unnamed: 0,standard_value,standard_units
0,7.0,nM
1,179.0,nM
2,770.0,nM
3,12.0,nM
4,17.0,nM


In [None]:
## Saving the resulting bioactivity data to a CSV file bioactivity_raw_data.csv

In [None]:
# Write the unfiltered activity records to disk; index=False leaves the
# DataFrame index out of the file.
df1.to_csv('bioactivity_raw_data.csv', index = False)

## Copying to the folder 'Datasets'

In [None]:
# Copy the raw CSV into the Datasets folder on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# step is visible in this part of the notebook.
! cp bioactivity_raw_data.csv "/content/drive/MyDrive/Colab Notebooks/Datasets"

In [None]:
# List the destination folder to confirm the copy landed there.
! ls -l "/content/drive/MyDrive/Colab Notebooks/Datasets"

total 95279
-rw------- 1 root root   228762 Jan 31 13:30  bioactivity_raw_data.csv
-rw------- 1 root root  3296741 Dec 23 15:43  books.csv
-rw------- 1 root root    16018 Dec 22 10:41  Car_sales.csv
-rw------- 1 root root   194323 Jan  9 01:19 'Classified Data'
-rw------- 1 root root    23873 Nov 24 13:25  diabetes.csv
-rw------- 1 root root   118350 Dec 20 15:16  gdp.csv
-rw------- 1 root root 47926289 Dec 23 15:52 'IMDb movies.csv'
-rw------- 1 root root 17449108 Dec 23 16:06 'IMDb ratings.csv'
-rw------- 1 root root   198100 Dec 14 04:13  laptop_price.csv
-rw------- 1 root root  3399671 Dec 26 11:27  netflix_titles.csv
-rw------- 1 root root  8945067 Dec 14 21:43  players_20.csv
-rw------- 1 root root    99524 Dec 20 17:31  population_total.csv
-rw------- 1 root root 12437766 Dec 23 15:43  ratings.csv
-rw------- 1 root root    72036 Dec 11 12:12  StudentsPerformance.csv
-rw------- 1 root root   140285 Jan  1 10:57 'SuperMarket Analysis.csv'
-rw------- 1 root root   121621 Dec 20 16:

In [None]:
# Print the first lines of the raw CSV (head shows 10 lines by default) to
# check the header and record layout.
! head bioactivity_raw_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,185163,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,BAO_0000357,single protein format,CC(C)C[C@H](NC(=O)CNc1cccc2ccccc12)C(=O)N[C@H](C=O)CC(=O)O,,,CHEMBL1146752,Bioorg Med Chem Lett,2004,"{'bei': '19.72', 'le': '0.37', 'lle': '6.21', 'sei': '6.54'}",CHEMBL90224,,CHEMBL90224,8.15,

## **Step 3: Bioactivity Data Retrieval**

In [None]:
### Retrieve bioactivity data (IC50) for the selected CASP8 target.

### Inspecting missing values for the standard_type column.

In [None]:
# Count missing entries in the standard_type column.
df1['standard_type'].isnull().sum()

np.int64(0)

### Filter rows with valid bioactivity values.

In [None]:
# Keep only the records that actually report a standard_value; rows where it
# is missing cannot be assigned a bioactivity class later on.
has_standard_value = df1['standard_value'].notna()
df2 = df1.loc[has_standard_value]
df2.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,185163,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.007
1,,,191485,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.179
2,,,194920,[],CHEMBL657316,Binding affinity towards human Caspase-8,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.77
3,,,266593,[],CHEMBL659710,Inhibition of Caspase-8 enzyme,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.012
4,,,269086,[],CHEMBL659710,Inhibition of Caspase-8 enzyme,B,,,BAO_0000190,...,Homo sapiens,Caspase-8,9606,,,IC50,uM,UO_0000065,,0.017


In [None]:
# Size of the table after dropping rows without a standard_value.
df2.shape

(457, 46)

### Assign Bioactivity Classes: Define active, intermediate and inactive classes based on IC50 values (active: IC50 ≤ 1000; inactive: IC50 ≥ 10000; intermediate: anything in between).

In [None]:
# Summary statistics for the numeric columns. describe() skips non-numeric
# columns by default, so a column holding numbers as strings is not summarised.
df2.describe()

Unnamed: 0,activity_id,document_year,potential_duplicate,record_id,src_id,standard_flag
count,457.0,457.0,457.0,457.0,457.0,457.0
mean,3734466.0,2006.656455,0.02407,769273.3,2.774617,1.0
std,5886198.0,4.431512,0.153435,883971.2,9.413944,0.0
min,185163.0,2002.0,0.0,31661.0,1.0,1.0
25%,1421442.0,2005.0,0.0,365453.0,1.0,1.0
50%,1660577.0,2005.0,0.0,411344.0,1.0,1.0
75%,1938683.0,2007.0,0.0,646884.0,1.0,1.0
max,28806500.0,2024.0,1.0,4597317.0,65.0,1.0


In [None]:
# Per-column dtypes and non-null counts for the filtered table.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 457 entries, 0 to 458
Data columns (total 46 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   action_type                3 non-null      object
 1   activity_comment           7 non-null      object
 2   activity_id                457 non-null    int64 
 3   activity_properties        457 non-null    object
 4   assay_chembl_id            457 non-null    object
 5   assay_description          457 non-null    object
 6   assay_type                 457 non-null    object
 7   assay_variant_accession    0 non-null      object
 8   assay_variant_mutation     0 non-null      object
 9   bao_endpoint               457 non-null    object
 10  bao_format                 457 non-null    object
 11  bao_label                  457 non-null    object
 12  canonical_smiles           455 non-null    object
 13  data_validity_comment      27 non-null     object
 14  data_validity_d

In [None]:
def _classify_ic50(raw_value):
    """Map one standard_value entry to a bioactivity class label.

    The entry is converted with float() first. Values of 10000 or more are
    'inactive', values of 1000 or less are 'active', and everything else is
    'intermediate'.
    """
    ic50 = float(raw_value)
    if ic50 >= 10000:
        return 'inactive'
    if ic50 <= 1000:
        return 'active'
    return 'intermediate'


# One label per row of df2, in row order.
bioactivity_class = [_classify_ic50(raw_value) for raw_value in df2.standard_value]

## **Extract Relevant Columns.**

In [None]:
# Pull the three columns of interest out of df2 as plain Python lists, all in
# df2's row order.
molecule_ids = df2['molecule_chembl_id'].tolist()
canonical_smiles = df2['canonical_smiles'].tolist()
standard_values = df2['standard_value'].tolist()

In [None]:
# Combine the parallel lists into one tuple per record. bioactivity_class was
# built by iterating df2.standard_value, so it lines up row-for-row with the
# three lists extracted from df2.
data = list(zip(
    molecule_ids,
    canonical_smiles,
    standard_values,
    bioactivity_class
))

## **Step 4: Preprocessing**

### Create Preprocessed bioactivity Dataset

In [None]:
# Build the reduced dataset: one row per activity record with the molecule ID,
# its SMILES string, the IC50 value and the assigned class.
# Note that the value column is named 'standard_values' (plural) here, unlike
# the 'standard_value' column of the source table.
df3 = pd.DataFrame(
    data,
    columns = [
        'molecule_chembl_id',
        'canonical_smiles',
        'standard_values',
        'bioactivity_class'
    ]
)


df3.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_values,bioactivity_class
0,CHEMBL90224,CC(C)C[C@H](NC(=O)CNc1cccc2ccccc12)C(=O)N[C@H]...,7.0,active
1,CHEMBL90307,CC(C)C[C@H](NC(=O)C(=O)Nc1cccc2ccccc12)C(=O)N[...,179.0,active
2,CHEMBL330535,CC(C)C[C@H](NC(=O)COc1cccc2ccccc12)C(=O)N[C@H]...,770.0,active
3,CHEMBL100643,CC(C)CC(NC(=O)[C@@H](CCC(=O)O)Oc1cccc2ccccc12)...,12.0,active
4,CHEMBL3142757,CC(C)C[C@H](NC(=O)C(C)Nc1cccc2ccccc12)C(=O)N[C...,17.0,active


In [None]:
# Number of records in each bioactivity class.
df3['bioactivity_class'].value_counts()

Unnamed: 0_level_0,count
bioactivity_class,Unnamed: 1_level_1
inactive,212
intermediate,144
active,101


### Remove Compounds without valid SMILES. Drop rows with NaN, empty or None SMILES values.

In [None]:
# Count records that have no SMILES string at all.
df3['canonical_smiles'].isnull().sum()

np.int64(2)

In [None]:
# Drop records without a SMILES string. The explicit copy makes the column
# assignment below operate on an independent frame.
df3 = df3.dropna(subset = ['canonical_smiles']).copy()

# SMILES are case-sensitive: lowercase atom symbols denote aromatic atoms and
# uppercase ones aliphatic atoms. The stored string is therefore only stripped
# of surrounding whitespace and never lower-cased, which would corrupt the
# structures.
df3['canonical_smiles'] = df3['canonical_smiles'].str.strip()

# Lower-case only a temporary view to detect placeholder text such as 'None',
# then remove those rows together with empty strings.
smiles_is_placeholder = df3['canonical_smiles'].str.lower().isin(['none', ''])
df3 = df3[~smiles_is_placeholder]

In [None]:
# Size of the dataset after removing records without a usable SMILES string.
df3.shape

(455, 4)

### Save Preprocessed Bioactivity Data. Save the cleaned dataset to CSV and copy to Google Drive.

In [None]:
# Write the cleaned dataset to disk; index=False leaves the DataFrame index
# out of the file.
df3.to_csv('bioactivity_preprocessed_data.csv', index = False)

In [None]:
# Copy the cleaned CSV into the Datasets folder on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive.
! cp bioactivity_preprocessed_data.csv "/content/drive/MyDrive/Colab Notebooks/Datasets"