In [7]:
# install some packages
!pip install rdkit
!pip install keras-tuner



In [30]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from rdkit import Chem
from rdkit.Chem import AllChem


# df = pd.read_csv('/content/smiles_embeddings_all.csv')
# Load the table holding one SMILES string and its embedding columns per compound.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm Drive is
# mounted (or switch to the local path above) before running this cell.
df = pd.read_csv('/content/drive/MyDrive/Mydataset/smiles_embeddings_all.csv')
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,DrugBankID,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
0,0,Compound::DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.659347,-0.34423,-0.578348,-0.670077,0.073021,-0.844866,-0.529099,...,0.72216,-0.361777,-0.069529,0.719951,0.765324,-0.828312,-0.750704,-0.360401,-0.359896,-0.820253
1,1,Compound::DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,-0.606968,-0.781302,-0.730112,-0.868258,0.363371,-0.38829,0.217138,...,-0.355017,0.657896,0.039386,-0.395858,-0.218164,-0.540272,-0.603087,-0.853275,0.545669,-0.836144
2,2,Compound::DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,-0.470256,-0.885203,-0.623956,-0.507102,0.395201,-0.298908,-0.156826,...,-0.430036,0.563329,0.419391,-0.074537,0.266462,-0.631508,-0.637233,-0.816603,0.416263,-0.720862
3,3,Compound::DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,-0.761337,-0.709398,0.665223,-0.388625,0.218644,-0.416196,0.594226,...,-0.721291,0.630432,0.817134,0.515158,-0.247925,-0.523338,-0.827212,-0.751571,-0.640211,-0.756264
4,4,Compound::DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-0.764784,-0.956786,-0.445434,-0.611626,0.335249,0.044306,-0.63537,...,-0.204678,-0.404126,-0.018485,-0.401169,0.025806,-0.664728,-0.856459,-0.417736,-0.386456,-0.626406


We use all of the SMILES molecules from the `drugbank_smiles.txt` file.

In [31]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(8807, 403)

In [32]:
# Function to calculate Morgan fingerprints
def calculate_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    '''Convert a SMILES string into a Morgan fingerprint bit vector.

    Parameters
    -----------
    smiles (str): SMILES of the compound.
    radius (int): controls the radius of the fingerprint.
    n_bits (int): controls the length of the fingerprint bit vector.

    Returns
    -------
    arr (NumPy Array): fingerprint of SMILES as a float array, or None when
        `smiles` is not a string (e.g. None/NaN from a missing cell) or when
        RDKit cannot parse it into a molecule.
    '''
    # Missing values coming out of pandas are not strings; report them the same
    # way as an unparsable SMILES instead of passing them on to RDKit.
    if not isinstance(smiles, str):
        return None

    # Convert the input SMILES string into an RDKit molecule object.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # Placeholder array that ConvertToNumpyArray fills with the fingerprint bits.
    arr = np.zeros((1,))
    AllChem.DataStructs.ConvertToNumpyArray(fingerprint, arr)
    return arr


# Function to preprocess data and create embeddings
def preprocess_data(df):
    '''Build the feature matrix (fingerprints) and target matrix (embeddings).

    Parameters
    -----------
    df (pandas DataFrame): table with a 'SMILES' column; every column from
        position 3 onward is treated as a target value.

    Returns
    -------
    morgan_fingerprints (NumPy Array): one fingerprint per kept row.
    embeddings (NumPy Array): target values of the same kept rows, in the
        same order, so both arrays always have the same number of rows.

    Notes
    -----
    A 'morgan_fingerprint' column is added to (or refreshed on) `df` itself.
    Rows with a missing value in any column, including rows whose SMILES
    could not be converted, are left out of both returned arrays.
    '''
    fingerprint_col = 'morgan_fingerprint'
    # create fingerprint column
    df[fingerprint_col] = df['SMILES'].apply(calculate_morgan_fingerprint)
    # Filter once and take BOTH outputs from the filtered frame so that the
    # features and the targets stay row-aligned.
    valid_rows = df.dropna()
    # Targets are the columns from position 3 onward, skipping the fingerprint
    # column so that calling this function again on the same frame still
    # returns purely numeric targets.
    target_positions = [
        position
        for position, column in enumerate(valid_rows.columns)
        if position >= 3 and column != fingerprint_col
    ]
    embeddings = valid_rows.iloc[:, target_positions].values
    # Extract the fingerprints as a NumPy array
    morgan_fingerprints = np.array(valid_rows[fingerprint_col].tolist())
    return morgan_fingerprints, embeddings


# The morgan_fingerprints is X, our features
# The embeddings is y, our target variable
X, y = preprocess_data(df)
# Each fingerprint needs exactly one target row; fail here, next to the cause,
# if rows were dropped from one matrix but not the other.
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)} rows"
print("The length of X is:", len(X))
print(X)

[11:25:25] Unusual charge on atom 0 number of radical electrons set to zero


The length of X is: 8807
[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [33]:
# Show how many target rows there are, followed by the target matrix itself.
print("The length of y is:", len(y))
print(y)

The length of y is: 8807
[[-0.6593468  -0.3442301  -0.57834786 ... -0.36040124 -0.35989577
  -0.8202533 ]
 [-0.6069678  -0.7813019  -0.7301117  ... -0.8532749   0.54566926
  -0.8361439 ]
 [-0.4702557  -0.8852031  -0.62395585 ... -0.816603    0.4162627
  -0.720862  ]
 ...
 [-0.56150216 -0.5052724   0.52687913 ...  0.44958693  0.63655037
   0.42234862]
 [-0.49201965  0.48620903  0.44930667 ... -0.26045257 -0.3269118
   0.47994253]
 [-0.6385004  -0.7725587   0.4874537  ... -0.19911917 -0.8574903
  -0.07390127]]


In [34]:
# Number of target values in a single row of y
len(y[0])

400

In [35]:
# Check the DataFrame again: preprocess_data() stored the fingerprints on it
# in a 'morgan_fingerprint' column.
df.head()

Unnamed: 0.1,Unnamed: 0,DrugBankID,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399,morgan_fingerprint
0,0,Compound::DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.659347,-0.34423,-0.578348,-0.670077,0.073021,-0.844866,-0.529099,...,-0.361777,-0.069529,0.719951,0.765324,-0.828312,-0.750704,-0.360401,-0.359896,-0.820253,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,Compound::DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,-0.606968,-0.781302,-0.730112,-0.868258,0.363371,-0.38829,0.217138,...,0.657896,0.039386,-0.395858,-0.218164,-0.540272,-0.603087,-0.853275,0.545669,-0.836144,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,Compound::DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,-0.470256,-0.885203,-0.623956,-0.507102,0.395201,-0.298908,-0.156826,...,0.563329,0.419391,-0.074537,0.266462,-0.631508,-0.637233,-0.816603,0.416263,-0.720862,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,Compound::DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,-0.761337,-0.709398,0.665223,-0.388625,0.218644,-0.416196,0.594226,...,0.630432,0.817134,0.515158,-0.247925,-0.523338,-0.827212,-0.751571,-0.640211,-0.756264,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,Compound::DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-0.764784,-0.956786,-0.445434,-0.611626,0.335249,0.044306,-0.63537,...,-0.404126,-0.018485,-0.401169,0.025806,-0.664728,-0.856459,-0.417736,-0.386456,-0.626406,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Split Data into Training and Test Sets

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)

print("Number of records in the training set: ", len(X_train))
print("Number of records in the test set: ", len(X_test))

Number of records in the training set:  7045
Number of records in the test set:  1762


In [38]:
# Print each split under its title, with a divider line between sections.
# A single print with a newline separator emits the same text as printing
# every title, array and divider one after another.
print(
    "Matrix of features (Training set)", X_train,
    "--------------------------------------------------",
    "Target Variable (Training set)", y_train,
    "--------------------------------------------------",
    "Matrix of features (Test set)", X_test,
    "--------------------------------------------------",
    "Target Variable (Test set)", y_test,
    sep='\n',
)

Matrix of features (Training set)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
--------------------------------------------------
Target Variable (Training set)
[[-0.6475993  -0.33205217 -0.6282438  ...  0.48398632  0.39195028
  -0.4995752 ]
 [ 0.54510766 -0.44672516 -0.55598974 ...  0.53110033  0.4191979
   0.5081751 ]
 [-0.44276494  0.35407907  0.4147306  ...  0.65115905  0.6978918
  -0.6097889 ]
 ...
 [ 0.61884636  0.4262558  -0.846139   ... -0.03980275 -0.590772
   0.46639615]
 [-0.5929575   0.55380386  0.27138382 ... -0.5842689   0.6167014
  -0.5904082 ]
 [ 0.6723257   0.694391   -0.63432586 ... -0.5061228   0.5818472
  -0.6481115 ]]
--------------------------------------------------
Matrix of features (Test set)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
-----

In [39]:
# Directory passed to the tuner for its trial files; the final trained model is
# also saved inside it (under "model").
TUNER_PROJECT_NAME_A = "tuner_trials"


class TunerRegressorAutoTrainer:
    """Tune and train a dense regression network with a KerasTuner Hyperband search.

    Calling ``fit()`` runs three stages on the data given to the constructor:
    a hyperparameter search, a training run that finds the epoch with the
    lowest validation loss, and a final retraining for that many epochs. The
    final model is saved under ``TUNER_PROJECT_NAME_A`` and is also available
    through ``export_model()``.

    Parameters
    ----------
    X : array with a 2-D ``shape``
        Feature matrix, one row per sample.
    y : array with a 2-D ``shape``
        Target matrix, one row per sample.
    max_epochs : int, default 10
        ``max_epochs`` passed to ``kt.Hyperband``.
    train_epochs : int, default 50
        ``epochs`` passed to the tuner search and to the best-epoch training run.
    validation_split : float, default 0.2
        Fraction of the data held out for validation in every training stage.
    patience : int, default 5
        Patience of the early-stopping callback used during the search.
    seed : int or None, default None
        ``seed`` passed to ``kt.Hyperband``; None keeps the search unseeded.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y, max_epochs=10, train_epochs=50,
                 validation_split=0.2, patience=5, seed=None):
        # Fail at construction rather than part-way through a long search.
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must have the same number of rows, got %d and %d"
                % (X.shape[0], y.shape[0])
            )
        self.X = X
        self.y = y
        self.input_shape = X.shape[1]
        self.output_shape = y.shape[1]
        self.max_epochs = max_epochs
        self.train_epochs = train_epochs
        self.validation_split = validation_split
        self.patience = patience
        self.seed = seed

    def _model_builder(self, hp):
        """Build and compile one candidate model from the hyperparameters ``hp``."""
        model = keras.Sequential()
        hp_units = hp.Int("units", min_value=32, max_value=512, step=32)
        model.add(
            keras.layers.Dense(
                units=hp_units,
                activation="relu", input_shape=(self.input_shape,)
            )
        )
        # Between 1 and 6 further hidden layers, each with its own tunable width.
        for i in range(hp.Int('layers', 1, 6)):
            model.add(
                keras.layers.Dense(
                    units=hp.Int("units_"+str(i), min_value=32, max_value=512, step=32),
                    activation="relu"
                )
            )
        # Output layer without activation: one unit per target value.
        model.add(keras.layers.Dense(self.output_shape))
        # Tune the learning rate for the optimizer
        # Choose an optimal value from 0.01, 0.001, or 0.0001
        hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])

        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
            loss="mean_squared_error",
            metrics=None,
        )

        return model

    def _search(self, X, y):
        """Run the Hyperband search and store the best hyperparameters in ``self.best_hps``."""
        self.tuner = kt.Hyperband(
            self._model_builder,
            objective="val_loss",
            max_epochs=self.max_epochs,
            factor=3,
            seed=self.seed,
            directory=TUNER_PROJECT_NAME_A,
            project_name="trials",
        )
        stop_early = tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=self.patience
        )
        # NOTE(review): Hyperband schedules the epochs of each trial itself (up to
        # max_epochs), so the `epochs` value here is presumably overridden —
        # confirm against the KerasTuner documentation.
        self.tuner.search(
            X,
            y,
            epochs=self.train_epochs,
            validation_split=self.validation_split,
            callbacks=[stop_early],
            verbose=True,
        )
        self.best_hps = self.tuner.get_best_hyperparameters(num_trials=1)[0]

    def _get_best_epoch(self, X, y):
        """Train a fresh best-hyperparameter model and record the epoch with the lowest val_loss."""
        # Build the model with the optimal hyperparameters and train it on the data
        model = self.tuner.hypermodel.build(self.best_hps)
        history = model.fit(
            X, y, epochs=self.train_epochs, validation_split=self.validation_split
        )

        val_per_epoch = history.history["val_loss"]
        # +1 converts the 0-based list index into a 1-based epoch count.
        self.best_epoch = val_per_epoch.index(min(val_per_epoch)) + 1
        print("Best epoch: %d" % (self.best_epoch,))

    def _final_train(self, X, y):
        """Rebuild the best model and train it for ``self.best_epoch`` epochs."""
        self.hypermodel = self.tuner.hypermodel.build(self.best_hps)

        # Retrain the model
        self.hypermodel.fit(
            X, y, epochs=self.best_epoch, validation_split=self.validation_split
        )

    def fit(self):
        """Search, pick the best epoch, retrain, and save the final model to disk."""
        self._search(self.X, self.y)
        self._get_best_epoch(self.X, self.y)
        self._final_train(self.X, self.y)
        self.hypermodel.save(os.path.join(TUNER_PROJECT_NAME_A, "model"))

    def export_model(self):
        """Return the final trained model.

        Raises
        ------
        AttributeError
            If ``fit()`` has not been run yet, so there is no model to return.
        """
        model = getattr(self, "hypermodel", None)
        if model is None:
            raise AttributeError(
                "No trained model available: call fit() before export_model()"
            )
        return model

In [41]:
# Instantiate the trainer on the training split only; the test split is kept for evaluation
trainer = TunerRegressorAutoTrainer(X_train, y_train)
# Run the hyperparameter search, find the best epoch, retrain and save the model
trainer.fit()

Trial 30 Complete [00h 00m 43s]
val_loss: 0.23405392467975616

Best val_loss So Far: 0.22387376427650452
Total elapsed time: 00h 07m 51s
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Best epoch: 3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [42]:
# Get the final trained model from the trainer (this only retrieves it;
# nothing is written to disk here)
model = trainer.export_model()

In [44]:
# Make predictions on the test set: one predicted target row per row of X_test
y_pred = model.predict(X_test)
print(y_pred)

[[-0.09701673  0.21846679 -0.15392062 ...  0.08500749  0.3237924
   0.0655628 ]
 [ 0.08321767  0.39812422 -0.06321672 ...  0.04095723  0.16374755
   0.09303423]
 [-0.19893427  0.12618722 -0.22751634 ...  0.06045969  0.32436612
  -0.34084874]
 ...
 [-0.1610446   0.27712503 -0.13203548 ... -0.02786058  0.23687965
  -0.38614362]
 [-0.02988506  0.24957454  0.11283201 ...  0.01215155  0.11195957
   0.15565857]
 [ 0.10247502  0.45808384 -0.14848125 ...  0.23385845  0.09047642
  -0.17366096]]


### Evaluate the model on the training and test sets

In [47]:
# Reload the model from the path that trainer.fit() saved it to, then compute
# its loss on both splits; the gap between them indicates how well the model
# generalizes beyond the data it was trained on.
loaded_model = keras.models.load_model(os.path.join(TUNER_PROJECT_NAME_A, "model"))
train_loss = loaded_model.evaluate(X_train, y_train)
print("Train Loss:", train_loss)

test_loss = loaded_model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)

Train Loss: 0.1887272149324417
Test Loss: 0.22458785772323608
