In [31]:
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem, DataStructs
import numpy as np
from rdkit.Chem import (
    PandasTools,
    Draw,
    Descriptors,
    MACCSkeys,
    rdFingerprintGenerator,
)
import math
import keras_tuner
import tensorflow as tf
from tensorflow import keras
#import dropout
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from keras.saving import load_model
from tensorflow.keras import backend as K

In [2]:
beta = pd.read_csv('beta_activity_class.csv') #Clean CSV file with beta secretase smiles and activity

In [3]:
beta["activity_class"].value_counts()
#dropna of activity_class
beta = beta.dropna(subset=["activity_class"])

In [4]:
#Create morgan fingerprints for compound smiles using RDKit (refer back to exploratory_DA)
rd_mols = [Chem.MolFromSmiles(smiles) for smiles in beta["Ligand SMILES"]]
beta["RDkit Molecule"] = rd_mols
del rd_mols
beta.dropna(subset=["RDkit Molecule"], inplace=True)
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
morgan = [morgan_gen.GetFingerprint(mol) for mol in beta["RDkit Molecule"]]
tmpX = [list(m) for m in morgan]
print(len(tmpX))

[09:38:30] Explicit valence for atom # 18 N, 4, is greater than permitted


13154


In [5]:
np.unique(beta["activity_class"].values)

array(['inactive', 'moderately_active', 'very_active'], dtype=object)

In [6]:
#transfor activity_class to 0,1,2
beta["activity_class"] = beta["activity_class"].replace("moderately_active", 1)
beta["activity_class"] = beta["activity_class"].replace("inactive", 0)
beta["activity_class"] = beta["activity_class"].replace("very_active", 2)

  beta["activity_class"] = beta["activity_class"].replace("very_active", 2)


In [7]:
beta["activity_class"].value_counts()

activity_class
1    9286
2    2182
0    1686
Name: count, dtype: int64

In [8]:
tmp = beta["activity_class"].values

In [9]:
from sklearn.model_selection import train_test_split
Y = tmp
print(len(Y))
X = pd.DataFrame(tmpX)
print(len(X))
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, train_size=0.8,random_state=42)
#Y_train


13154
13154


In [10]:
np.unique(Y, return_counts=True)

(array([0, 1, 2]), array([1686, 9286, 2182]))

In [11]:
#Train three non-neural net models for classification 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report



rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


rf_model.fit(X_train, Y_train)
cv_scores = cross_val_score(rf_model, X_train, Y_train, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV score: {cv_scores.mean()}')
# Predict and evaluate
y_pred = rf_model.predict(X_test)
print(classification_report(Y_test, y_pred))


Cross-validation scores: [0.83182898 0.81045131 0.81852732 0.81178707 0.79847909]
Mean CV score: 0.8142147521291871
              precision    recall  f1-score   support

           0       0.76      0.65      0.70       358
           1       0.85      0.92      0.88      1838
           2       0.75      0.59      0.66       435

    accuracy                           0.83      2631
   macro avg       0.79      0.72      0.75      2631
weighted avg       0.82      0.83      0.82      2631



In [12]:
from sklearn.svm import SVC
svm_model = SVC(kernel='rbf', C=1.0, gamma='auto')
svm_model.fit(X_train, Y_train)
y_pred = svm_model.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       358
           1       0.70      1.00      0.82      1838
           2       1.00      0.01      0.03       435

    accuracy                           0.70      2631
   macro avg       0.57      0.34      0.28      2631
weighted avg       0.65      0.70      0.58      2631



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(penalty='l2', C=1.0, random_state=42, max_iter=200)
lr_model.fit(X_train, Y_train)

y_pred = lr_model.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.64      0.69       358
           1       0.85      0.91      0.88      1838
           2       0.72      0.61      0.66       435

    accuracy                           0.82      2631
   macro avg       0.77      0.72      0.74      2631
weighted avg       0.82      0.82      0.82      2631



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
#measure accuracy of the models
from sklearn.metrics import accuracy_score
rf_acc = accuracy_score(Y_test, rf_model.predict(X_test))
svm_acc = accuracy_score(Y_test, svm_model.predict(X_test))
lr_acc = accuracy_score(Y_test, lr_model.predict(X_test))
#print
print(f'Random Forest Accuracy: {rf_acc}')
print(f'SVM Accuracy: {svm_acc}')
print(f'Logistic Regression Accuracy: {lr_acc}')


Random Forest Accuracy: 0.8259217027746104
SVM Accuracy: 0.700874192322311
Logistic Regression Accuracy: 0.8209806157354618


In [15]:
from keras.models import Sequential
from keras.layers import Dense
#Neural net model for classification (you can refer to https://psrivasin.medium.com/multiclass-classification-using-tensorflow-850ee138d0ca for a guide)
#Input layer from morgan fingerprints


In [16]:
def build(hp):
    num_layer = hp.Int('num_layer', min_value=1, max_value=4, step=1)
    num_units = hp.Int('num_units', min_value=32, max_value=512, step=16)
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.3, step=0.05)
    learning_rate = hp.Float('learning_rate', min_value=1e-05, max_value =5e-05, step = 1e-05)
    model = Sequential()
    model.add(Dense(num_units,kernel_initializer='normal', activation='relu', input_dim = X_train.shape[1]))
    for i in range(num_layer):
        model.add(Dense(num_units, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [17]:
tuner = keras_tuner.RandomSearch(
    hypermodel=build,
    objective="val_loss",
    max_trials=100,
    executions_per_trial=1,
    overwrite=True,
    directory="hyper_tuning",
    project_name="experiment",
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-09-06 09:42:14.160806: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2024-09-06 09:42:14.161139: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:135] retrieving CUDA diagnostic information for host: pie1
2024-09-06 09:42:14.161175: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:142] hostname: pie1
2024-09-06 09:42:14.161673: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:166] libcuda reported version is: 550.107.2
2024-09-06 09:42:14.161847: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:170] kernel reported version is: 550.90.7
2024-09-06 09:42:14.161894: E external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:252] kernel version 550.90.7 does not match DSO version 550.107.2 -- cannot find working devices in this configuration


In [18]:
tuner.search_space_summary()


Search space summary
Default search space size: 4
num_layer (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 4, 'step': 1, 'sampling': 'linear'}
num_units (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 16, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.3, 'step': 0.05, 'sampling': 'linear'}
learning_rate (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 5e-05, 'step': 1e-05, 'sampling': 'linear'}


In [19]:
trainX = np.array(X_train)
testX = np.array(X_test)
trainY = np.array(Y_train)
testY = np.array(Y_test)

In [20]:
import gc
gc.collect()

1571

In [21]:
tuner.search(x=trainX, y=trainY, validation_data=[testX,testY], epochs=40, callbacks=[keras.callbacks.TensorBoard("tensorboard/tb_logs")])

Trial 100 Complete [00h 01m 52s]
val_loss: 0.4454018175601959

Best val_loss So Far: 0.41266706585884094
Total elapsed time: 02h 36m 14s


In [42]:
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

In [34]:
best_model.get_config()

{'name': 'sequential',
 'trainable': True,
 'dtype': {'module': 'keras',
  'class_name': 'DTypePolicy',
  'config': {'name': 'float32'},
  'registered_name': None},
 'layers': [{'module': 'keras.layers',
   'class_name': 'InputLayer',
   'config': {'batch_shape': (None, 2048),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_layer'},
   'registered_name': None},
  {'module': 'keras.layers',
   'class_name': 'Dense',
   'config': {'name': 'dense',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float32'},
     'registered_name': None},
    'units': 496,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'module': 'keras.initializers',
     'class_name': 'RandomNormal',
     'config': {'mean': 0.0, 'stddev': 0.05, 'seed': None},
     'registered_name': None},
    'bias_initializer': {'module': 'keras.initializers',
     'class_name': 'Zeros',
     'config': {},
     'registered_name'

In [23]:
best_model.save('best_model.keras')

In [43]:
best_model.evaluate(testX,testY)

[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8329 - loss: 0.4230


[0.41266706585884094, 0.8346636295318604]