In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import time
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
!pip install lazypredict
import lazypredict
from lazypredict.Supervised import LazyRegressor

## Preprocessing and feature engineering of the collected data

In [3]:
# Read the raw bioactivity export from the working directory and list the
# columns it provides, so the fields used by later cells can be checked by eye.
raw_csv_path = "bioactivity_data_raw.csv"
df = pd.read_csv(raw_csv_path)
df.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

### Handling missing data and cleaning

In [4]:
# Cleaning in one pass:
#   1. discard rows lacking either a standard_value or a canonical_smiles entry,
#   2. keep a single row per distinct canonical_smiles string,
#   3. renumber the surviving rows from 0 so the index is contiguous again.
df = (
    df.dropna(subset=['standard_value', 'canonical_smiles'])
      .drop_duplicates(['canonical_smiles'])
      .reset_index(drop=True)
)

In [6]:
# Sanity check: every standard_value should be strictly positive.
# Each value is scaled by 10**-9 before being shown (presumably nM -> M; confirm
# the unit of standard_value). Anything printed here is zero or negative and
# should be investigated before continuing; no output means the check passed.
# The comparison is `<= 0` rather than `< 0` so that a value of exactly zero,
# which is not positive, is reported as well.
scaled_values = df["standard_value"] * (10**-9)
for non_positive in scaled_values[scaled_values <= 0]:
  print(non_positive)

### Simplifying the molecular SMILES notation

In [8]:
# Keep only the columns needed downstream and, for SMILES strings made of several
# dot-separated fragments, retain just the longest fragment (by character count).
selected_features=["molecule_chembl_id", "canonical_smiles", "standard_value"]
df = df[selected_features]

# Every selected column except the SMILES one, which is rebuilt below.
smileless= df.drop(columns=['canonical_smiles'])

# "." separates fragments inside one SMILES string; max(..., key=len) picks the
# longest piece and returns the first one when several share the maximum length.
largest_fragments = [
  max(str(entry).split("."), key=len)
  for entry in df['canonical_smiles'].tolist()
]

# Build the replacement column on df's own index: pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index would silently misalign (and introduce
# NaNs) whenever df does not carry a default RangeIndex.
smiles=pd.DataFrame(data=largest_fragments, columns=["canonical_smiles"], index=df.index)
df = pd.concat([smileless, smiles], axis=1)

### Discretization

Classify each compound's bioactivity as active, inactive, or intermediate according to its standard value.

The bioactivity data are IC50 values expressed in nM. Compounds with values of 1,000 nM or less are considered active, while those with values of 10,000 nM or more are considered inactive. Compounds with values strictly between 1,000 and 10,000 nM are referred to as intermediate.

In [10]:
# Label each compound from its standard_value:
#   value <= ACTIVE_MAX_NM    -> "active"
#   value >= INACTIVE_MIN_NM  -> "inactive"
#   anything in between       -> "intermediate"
ACTIVE_MAX_NM = 1000
INACTIVE_MIN_NM = 10000

bioactivity_class=[]
for raw_value in df['standard_value']:
  value = float(raw_value)  # tolerate values stored as numeric strings
  if value <= ACTIVE_MAX_NM:
    bioactivity_class.append("active")
  elif value >= INACTIVE_MIN_NM:
    bioactivity_class.append("inactive")
  else:
    bioactivity_class.append("intermediate")

# Build the labels on df's own index so they attach row-for-row regardless of
# how df is indexed.
act_series=pd.Series(bioactivity_class, name="class", index=df.index)

# assign() adds the column on the first run and overwrites it on a re-run, so
# executing this cell twice does not leave two "class" columns behind.
df = df.assign(**{"class": act_series})
df.head()

Unnamed: 0,molecule_chembl_id,standard_value,canonical_smiles,class
0,CHEMBL828,26000.0,c1ccc2c(c1)Nc1ccccc1S2,inactive
1,CHEMBL1629795,26000.0,CN(C)CCC(=O)NN1c2ccccc2Sc2cc(Cl)ccc21,inactive
2,CHEMBL3142202,16000.0,CN1CCN(NCCN2c3ccccc3Sc3ccccc32)CC1,inactive
3,CHEMBL2063785,507.1,COc1ccc2c(c1)Sc1cc(OC)ccc1N2,active
4,CHEMBL2063786,1440.0,COc1ccc2c(c1)Sc1cc(C#N)ccc1N2,intermediate


In [11]:
# Tally how many compounds fall into each bioactivity class.
class_counts = df["class"].value_counts()
class_counts

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
intermediate,39
inactive,32
active,17
