In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem, DataStructs
import numpy as np
from rdkit.Chem import (
    PandasTools,
    Draw,
    Descriptors,
    MACCSkeys,
    rdFingerprintGenerator,
)
import math

In [100]:
beta = pd.read_csv('beta_activity_class.csv') #Clean CSV file with beta secretase smiles and activity

In [94]:
beta

Unnamed: 0,Ligand SMILES,activity_class
0,CCSc1cnc(cn1)C(=O)Nc1cccc(c1)[C@]1(C)CCSC(N)=N1,moderately_active
1,C[C@]1(CCSC(N)=N1)c1cc(NC(=O)CCc2ccc(O)c(O)c2)...,moderately_active
2,COc1nc(nc(C)c1F)N1C[C@H]2C(=O)N(C)C(=N)N[C@]2(...,moderately_active
3,COc1nc(nc(C)c1F)N1C[C@H]2C(=O)N(C)C(=N)N[C@]2(...,moderately_active
4,COc1nc(nc(C)c1F)N1C[C@H]2C(=O)N(C)C(=N)N[C@]2(...,moderately_active
...,...,...
15992,FC(F)(F)c1ccc(N\N=C\c2coc3cc4oc(cc4cc3c2=O)-c2...,inactive
15993,COc1ccc(cc1)-c1cc2cc3c(cc2o1)occ(\C=N\Nc1ccc(c...,inactive
15994,COc1ccc(CN(CCN(C)CCN)c2ccccn2)cc1,inactive
15995,Fc1cccc(c1)-c1cc2c(ccc3c2occ(\C=N\Nc2ccc(cc2)C...,inactive


In [110]:
#Create morgan fingerprints for compound smiles using RDKit (refer back to exploratory_DA)
rd_mols = [Chem.MolFromSmiles(smiles) for smiles in beta["Ligand SMILES"]]
beta["RDkit Molecule"] = rd_mols
del rd_mols
beta.dropna(subset=["RDkit Molecule"], inplace=True)
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
morgan = [morgan_gen.GetFingerprint(mol) for mol in beta["RDkit Molecule"]]
tmpX = [list(m) for m in morgan]
print(len(tmpX))

15996


In [109]:
#transform the activty_class into integer labels for training
#print(beta)
tmp = [l for l in beta["activity_class"]]
#print(beta['activity_class'])
for i,t in enumerate(tmp):
    if t=='inactive': tmp[i]=0
    if t=='moderately_active': tmp[i]=1
    else: tmp[i]=2
print(len(tmp))
print(len(list(beta['activity_class'])))

15996
15996


In [112]:
from sklearn.model_selection import train_test_split
Y = tmp
print(len(Y))
X = pd.DataFrame(tmpX)
print(len(X))
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, train_size=0.8,random_state=42)
#Y_train


15996
15996


In [117]:
#Train three non-neural net models for classification 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report



rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


rf_model.fit(X_train, Y_train)
cv_scores = cross_val_score(rf_model, X_train, Y_train, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV score: {cv_scores.mean()}')
# Predict and evaluate
y_pred = rf_model.predict(X_test)
print(classification_report(Y_test, y_pred))


Cross-validation scores: [0.7984375  0.7971864  0.80031262 0.80461118 0.80695584]
Mean CV score: 0.801500708284486
              precision    recall  f1-score   support

           1       0.82      0.87      0.84      1828
           2       0.81      0.74      0.77      1372

    accuracy                           0.81      3200
   macro avg       0.81      0.80      0.81      3200
weighted avg       0.81      0.81      0.81      3200



In [115]:
from sklearn.metrics import mean_absolute_error, r2_score
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
print('Mean Absolute Error:', mae)
print('R2 Score:', r2)

Mean Absolute Error: 0.18625
R2 Score: 0.2395582803299502


In [118]:
from sklearn.svm import SVC
svm_model = SVC(kernel='rbf', C=1.0, gamma='auto')
svm_model.fit(X_train, Y_train)
y_pred = svm_model.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           1       0.67      0.96      0.79      1828
           2       0.88      0.36      0.51      1372

    accuracy                           0.70      3200
   macro avg       0.77      0.66      0.65      3200
weighted avg       0.76      0.70      0.67      3200



In [119]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(penalty='l2', C=1.0, random_state=42, max_iter=200)
lr_model.fit(X_train, Y_train)

y_pred = lr_model.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           1       0.79      0.85      0.82      1828
           2       0.78      0.70      0.74      1372

    accuracy                           0.79      3200
   macro avg       0.78      0.78      0.78      3200
weighted avg       0.79      0.79      0.78      3200



In [None]:
#Neural net model for classification (you can refer to https://psrivasin.medium.com/multiclass-classification-using-tensorflow-850ee138d0ca for a guide)
#Input layer from morgan fingerprints
#hidden layer > hyperparameter tune 
#hyperparameter to tune: number of hidden layers (1 to 4), number of units in each layer (32-128), learning rate ( 1e-05 to 5e-05)
#dropout layer > tune dropout rate (0.1 to 0.3)
#Output 1 dense layer with 3 units, softmax activation