# **Bioinformatics Project - Computational Drug Discovery**



---

## **Download PaDEL-Descriptor**

In [None]:
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh

In [None]:
! unzip padel.zip

## **Load bioactivity data**

In [None]:
import pandas as pd

In [None]:
# Load the bioactivity table; later cells read its canonical_smiles,
# molecule_chembl_id and pIC50 columns.
df3 = pd.read_csv('acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv')

In [None]:
# Preview the loaded bioactivity table.
df3

In [None]:
# Export a two-column, tab-separated file without header or index:
# SMILES string first, ChEMBL ID second. This is the molecule file that the
# descriptor step picks up from the working directory.
selection = ['canonical_smiles', 'molecule_chembl_id']
df3_selection = df3.loc[:, selection]
df3_selection.to_csv('molecule.smi', sep='\t', header=False, index=False)

In [None]:
# Show the first five lines of the exported file (SMILES <tab> ChEMBL ID).
! cat molecule.smi | head -5

CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1	CHEMBL133897
O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1	CHEMBL336398
CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1	CHEMBL131588
O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F	CHEMBL130628
CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C	CHEMBL130478


In [None]:
# Count the lines in molecule.smi (one exported molecule per line).
! cat molecule.smi | wc -l

4695


## **Calculate fingerprint descriptors**


### **Calculate PaDEL descriptors**

In [None]:
# Print the launcher script to see the exact PaDEL-Descriptor command it runs.
! cat padel.sh

java -Xms1G -Xmx1G -Djava.awt.headless=true -jar ./PaDEL-Descriptor/PaDEL-Descriptor.jar -removesalt -standardizenitro -fingerprints -descriptortypes ./PaDEL-Descriptor/PubchemFingerprinter.xml -dir ./ -file descriptors_output.csv


In [None]:
# Run PaDEL-Descriptor through the launcher script; per the command printed by
# the previous cell it reads from ./ and writes descriptors_output.csv.
! bash padel.sh

In [None]:
# List the working directory, e.g. to confirm descriptors_output.csv now exists.
! ls -l

## **Preparing the X and Y Data Matrices**

### **X data matrix**

In [None]:
# Load the descriptor table written by padel.sh.
df3_X = pd.read_csv('descriptors_output.csv')

In [None]:
# Preview the raw descriptor table (still includes the 'Name' column).
df3_X

In [None]:
# Remove the 'Name' column (presumably the molecule identifier written by
# PaDEL — confirm) so that only descriptor columns remain.
# errors='ignore' keeps the cell idempotent: it reassigns df3_X, so a second run
# would otherwise raise KeyError because 'Name' is already gone.
df3_X = df3_X.drop(columns=['Name'], errors='ignore')
df3_X

## **Y variable**

### **Select the pIC50 column**

In [None]:
# Target variable: the pIC50 column already present in df3
# (no IC50 -> pIC50 conversion is performed in this cell).
df3_Y = df3['pIC50']
df3_Y

Unnamed: 0,pIC50
0,6.12
1,7.00
2,4.30
3,6.52
4,6.10
...,...
4690,5.61
4691,5.60
4692,5.42
4693,5.46


## **Combining X and Y variable**

In [None]:
# Column-wise join of the descriptor matrix (X) and the pIC50 target (Y).
# pd.concat(axis=1) aligns on the index, so a row-count mismatch (for example if
# the descriptor step skipped a molecule) would silently pad with NaN; fail
# loudly instead.
# NOTE(review): this also assumes descriptors_output.csv keeps the same row
# order as molecule.smi — confirm.
assert len(df3_X) == len(df3_Y), (
    f"descriptor rows ({len(df3_X)}) != pIC50 rows ({len(df3_Y)}); "
    "X and Y cannot be joined by position"
)
dataset3 = pd.concat([df3_X, df3_Y], axis=1)
dataset3

In [None]:
# Persist the combined descriptors + pIC50 table (no index column); the
# modelling sections below reload it from this file.
dataset3.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)

# **Regression Models with Random Forest**

## **Load bioactivity data**

## **1. Import libraries**

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

## **2. Load the data set**

In [None]:
# Load the combined descriptors + pIC50 table saved earlier in the notebook.
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

## **3. Input features**

### **3.1. Input features**

In [None]:
# Feature matrix: every column of df except the pIC50 target.
X = df.drop(columns=['pIC50'])
X

### **3.2. Output features**

In [None]:
# Target vector: the pIC50 column of df.
Y = df['pIC50']
Y

Unnamed: 0,pIC50
0,6.12
1,7.00
2,4.30
3,6.52
4,6.10
...,...
4690,5.61
4691,5.60
4692,5.42
4693,5.46


### **3.3. Let's examine the data dimension**

In [None]:
# (rows, columns) of the feature matrix.
X.shape

(4695, 881)

In [None]:
# Length of the target vector; must match the row count of X.
Y.shape

(4695,)

### **3.4. Remove low variance features**

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop every feature whose variance is below 0.8 * (1 - 0.8) = 0.16. For a 0/1
# feature that is the variance at p = 0.8, i.e. columns holding the same value in
# more than 80% of the rows are removed.
# The fitted `selection` object is reused later to filter the full dataset.
selection = VarianceThreshold(threshold=0.8 * (1 - 0.8))
selection.fit(X)
X = selection.transform(X)

In [None]:
# Shape of X after the low-variance filter.
X.shape

(4695, 137)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# This cell runs before the section-4 split cell, so on a fresh kernel X_train /
# Y_train do not exist yet and the selection below raised NameError. Create the
# 80/20 split here (seeded, so the partition is reproducible) before fitting.
# The selector is fitted again after the section-4 split cell, so the model is
# always trained on features and targets from the same split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Keep the 100 best-scoring features; fit on the training portion only and
# apply the fitted selector to the test portion.
selector = SelectKBest(f_regression, k=100)
X_train_new = selector.fit_transform(X_train, Y_train)
X_test_new = selector.transform(X_test)

## **4. Data split (80/20 ratio)**

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the partition
# (and every metric computed from it) reproducible across kernel restarts, and
# matches the seeded split used in the regressor-comparison section.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Shapes of the selected training features and the training targets.
X_train_new.shape, Y_train.shape

((3756, 100), (3756,))

In [None]:
# Shapes of the selected test features and the test targets.
X_test_new.shape, Y_test.shape

((939, 100), (939,))

Remove irrelevant descriptors

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Keep the 100 best features as scored by f_regression. The selector is fitted
# on the training split only and then applied unchanged to the test split.
selector = SelectKBest(score_func=f_regression, k=100)
selector.fit(X_train, Y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)

## **5. Building a Regression Model using Random Forest**

In [None]:
# Fit a 100-tree random forest on the selected training features and score it
# on the held-out test features.
# random_state removes the forest's own run-to-run randomness so the score shown
# below can be reproduced (given the same train/test split).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_new, Y_train)
r2 = model.score(X_test_new, Y_test)  # R^2 on the test split
r2

0.5418173437077434

In [None]:
# Predicted pIC50 for the held-out test features (used by the scatter plot below).
Y_pred = model.predict(X_test_new)

## **6. Scatter Plot of Experimental vs Predicted pIC50 Values**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Seaborn styling: colour codes enabled, plain white background.
sns.set(color_codes=True)
sns.set_style("white")

# Regression plot of experimental (x) against predicted (y) pIC50;
# semi-transparent points keep overlapping observations visible.
ax = sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha': 0.4})

# Axis labels, the same 0-12 range on both axes, and a square 5 x 5 inch figure.
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set(xlim=(0, 12), ylim=(0, 12))
ax.figure.set_size_inches(5, 5)

# Render the figure.
plt.show()

Top 10 compounds

In [None]:
from sklearn.pipeline import Pipeline  # NOTE(review): not used in this cell

# Reload the original table (SMILES and ChEMBL IDs) and the descriptor table.
df_original = pd.read_csv('acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv')
df_descriptors = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

# The two tables are joined by row position; a length mismatch would otherwise
# be padded with NaN by pd.concat and only surface later as a confusing error.
assert len(df_original) == len(df_descriptors), (
    f"bioactivity rows ({len(df_original)}) != descriptor rows ({len(df_descriptors)})"
)

# Combine identifiers, descriptors and the measured pIC50 in one frame.
df_completo = pd.concat([
    df_original[['molecule_chembl_id', 'canonical_smiles']],
    df_descriptors.drop('pIC50', axis=1),
    df_descriptors['pIC50']
], axis=1)

# Build the full-dataset feature matrix under its own name so the notebook-level
# X used for training is not silently replaced by this cell.
X_all = df_completo.drop(['molecule_chembl_id', 'canonical_smiles', 'pIC50'], axis=1)

# Apply the same fitted filters that were used for training.
X_all = selection.transform(X_all)  # VarianceThreshold
X_all = selector.transform(X_all)   # SelectKBest

# Predict pIC50 for every compound.
# NOTE(review): this includes the compounds the model was trained on, so the
# ranking below is not an out-of-sample estimate.
df_completo['pIC50_predito'] = model.predict(X_all)

# Ten compounds with the highest predicted pIC50.
top_10 = df_completo.sort_values('pIC50_predito', ascending=False)[
    ['molecule_chembl_id', 'canonical_smiles', 'pIC50', 'pIC50_predito']
].head(10)

print("\n💊 Top 10 Compostos Mais Promissores:")
print(top_10.to_markdown(index=False, tablefmt="grid"))


💊 Top 10 Compostos Mais Promissores:
+----------------------+----------------------------------------------------------+---------+-----------------+
| molecule_chembl_id   | canonical_smiles                                         |   pIC50 |   pIC50_predito |
| CHEMBL4209803        | Cc1cccc(C(=O)Nc2ccc3c(c2)CN(C(=O)c2cccc(C)c2)C(=O)C3)c1  | 10.869  |         10.7419 |
+----------------------+----------------------------------------------------------+---------+-----------------+
| CHEMBL4214707        | Cc1cccc(C(=O)Nc2ccc3c(c2)CN(C(=O)c2cccc(Cl)c2)C(=O)C3)c1 | 10.679  |         10.7419 |
+----------------------+----------------------------------------------------------+---------+-----------------+
| CHEMBL4210316        | Cc1cccc(C(=O)N2Cc3cc(NC(=O)c4cccc(Cl)c4)ccc3CC2=O)c1     | 10.8413 |         10.7419 |
+----------------------+----------------------------------------------------------+---------+-----------------+
| CHEMBL4218191        | Cc1cccc(C(=O)N2Cc3cc(NC(=O)c4ccccc4)ccc3C

# **Comparing Regressors**


Comparing several ML algorithms for building regression models of acetylcholinesterase inhibitors.



## **1. Import libraries**

In [None]:
! pip install lazypredict



In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor

## **2. Load the data set**


In [None]:
# Reload the combined descriptors + pIC50 table for the regressor comparison.
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

In [None]:
# Features are every column except the target; the target is the pIC50 column.
X = df.drop(columns=['pIC50'])
Y = df['pIC50']

## **3. Data pre-processing**

In [None]:
# Examine the dimensions of X as (rows, feature columns)
X.shape

(4695, 881)

In [None]:
# Remove low-variance features: anything with variance below 0.8 * (1 - 0.8)
# is dropped, then the reduced shape is shown.
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=0.8 * (1 - 0.8))
selection.fit(X)
X = selection.transform(X)
X.shape

(4695, 137)

In [None]:
# Split the data 80/20 into training and test sets; random_state=42 fixes the
# partition so the comparison tables below are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## **4. Compare ML algorithms**

In [None]:
# Define the LazyRegressor (a regressor comparison, not a classifier) and fit it twice:
#   1) with the training data passed as both the fit and the evaluation set,
#   2) with the held-out test data as the evaluation set.
# NOTE(review): the second returned value of each fit is what the following
# cells display as the performance table — confirm against the lazypredict docs.
clf = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models_train,predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test,predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

In [None]:
# Performance table when evaluating on the training set (80% subset);
# these scores come from data the models were fitted on.
predictions_train

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreeRegressor,0.86,0.86,0.58,0.22
DecisionTreeRegressor,0.86,0.86,0.58,0.15
ExtraTreesRegressor,0.86,0.86,0.58,7.71
GaussianProcessRegressor,0.86,0.86,0.58,7.14
RandomForestRegressor,0.82,0.83,0.64,6.31
XGBRegressor,0.82,0.82,0.65,0.41
BaggingRegressor,0.8,0.81,0.68,0.56
MLPRegressor,0.77,0.78,0.73,9.62
HistGradientBoostingRegressor,0.68,0.69,0.87,1.04
LGBMRegressor,0.65,0.66,0.91,0.4


In [None]:
# Performance table when evaluating on the held-out test set (20% subset).
predictions_test

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,0.45,0.53,1.06,1.27
XGBRegressor,0.44,0.52,1.07,0.34
LGBMRegressor,0.44,0.52,1.08,0.34
RandomForestRegressor,0.43,0.52,1.08,5.8
BaggingRegressor,0.42,0.5,1.1,0.53
MLPRegressor,0.4,0.49,1.11,9.71
SVR,0.38,0.47,1.13,2.32
KNeighborsRegressor,0.38,0.47,1.13,0.12
NuSVR,0.38,0.47,1.13,1.68
GradientBoostingRegressor,0.31,0.41,1.2,2.51


## **5. Data visualization of model performance**

In [None]:
# Bar plot of R-squared values from the training-set table, one bar per model
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): negative R-squared values are not clipped; the 0-1 x-limit
# below only hides them.

plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="R-Squared", data=predictions_train)
ax.set(xlim=(0, 1))

In [None]:
# Bar plot of RMSE values from the training-set table, one bar per model
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="RMSE", data=predictions_train)
# x-axis fixed to 0-10; larger RMSE values would be cut off
ax.set(xlim=(0, 10))

In [None]:
# Bar plot of calculation time ("Time Taken" column of the training-set table),
# one bar per model
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="Time Taken", data=predictions_train)
# x-axis fixed to 0-10; models that took longer would be cut off
ax.set(xlim=(0, 10))