# Model Building Setup for Safety Models using RDKit descriptors and ChEMBL31 data only

# Setup

In [None]:
# input
# NOTE(review): model_dir is not read by any cell visible in this notebook.
model_dir = "ACHE_PIC50"
# Tab-separated training table; loaded below with pd.read_csv(..., sep="\t").
data_file_name = "./ACHE_PIC50_train.tsv"
# output
# Destination for the fitted StandardScaler (the "scalar" spelling is part of the path).
scalar_file_name = "./scalar_ACHE_PIC50.pkl"
# Destination for the fitted SVM grid search object.
svm_model_file = "./ACHE_PIC50_svm.pkl"
# Destination for the fitted XGBoost grid search object.
xgb_model_file = "./ACHE_PIC50_xgb.pkl"

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import numpy as np
import pandas as pd
import joblib
from sklearn.svm import SVC
import xgboost
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef, confusion_matrix, balanced_accuracy_score, recall_score
from scripts.utils import rdkit_fpconvert_numpy, rdkit_get_physchem_descr

# Random seed for reproducibility; passed as random_state to train_test_split below.
seed = 1234

# Get Mols & Data then calculate descriptors

In [3]:
# Load the tab-separated training table and preview the first rows.
df = pd.read_csv(data_file_name, sep="\t")
df.head()

Unnamed: 0,parentised_smiles,ID,class_label
0,BrC1=C[C@@]2(CCC1=O)CC=NC1=C2C2=NCCc3c[nH]c(c2...,chembl_606,NEGATIVE
1,BrC1=C[C@@]23C[C@@H](NC4=C2C2=NCCc5c[nH]c(c25)...,chembl_2086,NEGATIVE
2,Brc1ccc(CN2C(=O)C(=O)c3cc(Br)ccc23)cc1,chembl_4127,POSITIVE
3,Brc1ccc(CN2CCN(CC2)c2nccc(NCc3ccccc3)n2)cc1,chembl_3003,NEGATIVE
4,Brc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,chembl_3482,POSITIVE


In [4]:
# Number of rows per class label, to check how imbalanced the data set is.
df.value_counts("class_label")

class_label
POSITIVE    3974
NEGATIVE     971
dtype: int64

In [5]:
# Debug aid (disabled): uncomment the lines below to keep only the first
# 2000 rows when investigating why model fitting fails.
#df = df.head(2000)
#df

## Preparing descriptors and responses now

In [6]:
# Encode the string class labels as integers: "POSITIVE" -> 1, everything else -> 0.
# A direct comparison cast to int is used instead of pd.get_dummies because
# get_dummies yields boolean columns on pandas >= 2.0 and indexing its result
# with "POSITIVE" raises KeyError when no positive row is present.
df['class_label_binary'] = (df.class_label == "POSITIVE").astype(int)
# Plain Python list of 0/1 responses, used for the stratified split and model fitting.
y = df.class_label_binary.to_list()

In [7]:
df.head(3)

Unnamed: 0,parentised_smiles,ID,class_label,class_label_binary
0,BrC1=C[C@@]2(CCC1=O)CC=NC1=C2C2=NCCc3c[nH]c(c2...,chembl_606,NEGATIVE,0
1,BrC1=C[C@@]23C[C@@H](NC4=C2C2=NCCc5c[nH]c(c25)...,chembl_2086,NEGATIVE,0
2,Brc1ccc(CN2C(=O)C(=O)c3cc(Br)ccc23)cc1,chembl_4127,POSITIVE,1


In [None]:
# Parse every SMILES string into an RDKit molecule.
mols = [Chem.MolFromSmiles(smi) for smi in df.parentised_smiles]
# MolFromSmiles returns None for SMILES it cannot parse. Fail fast with a clear
# message instead of an obscure error inside the fingerprint call; rows are not
# silently dropped because that would misalign the descriptors with `y`.
bad_rows = [pos for pos, mol in enumerate(mols) if mol is None]
if bad_rows:
    raise ValueError(
        f"{len(bad_rows)} SMILES could not be parsed by RDKit "
        f"(first row positions: {bad_rows[:10]})"
    )
# generate binary Morgan fingerprint with radius 2
fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
# Convert the fingerprints to a numpy array and append the physchem descriptors
# column-wise (axis=1), giving one row per compound.
# NOTE(review): both helpers live in scripts.utils and are not shown here;
# they are assumed to return 2-D arrays with one row per molecule.
x = rdkit_fpconvert_numpy(fp)
x = np.concatenate((x, rdkit_get_physchem_descr(mols)), axis=1)

## Scale Data and Calculate Splits

In [9]:
# randomly select 20% of compounds as test set but with stratified selection
# Hold out 20% of the compounds as a test set; stratifying on `y` keeps the
# class ratio the same in both partitions, and `seed` makes the split repeatable.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.20,
    random_state=seed,
)

### Investigate size of splits to confirm

In [10]:
# Sizes of (train labels, train features, test features, test labels);
# the first two and the last two must agree.
tuple(len(part) for part in (y_tr, x_tr, x_ts, y_ts))

(3956, 3956, 989, 989)

In [11]:
# Fit the scaler on the training descriptors only, then apply the same
# transformation to the held-out test descriptors.
# Note: x_tr / x_ts are overwritten, so re-running this cell without re-running
# the split cell would scale already-scaled data.
scale = StandardScaler()
x_tr = scale.fit_transform(x_tr)
x_ts = scale.transform(x_ts)

In [12]:
# save scalar
# Persist the fitted scaler so new compounds can be scaled the same way.
joblib.dump(scale, scalar_file_name, compress=3)

['./scalar_ACHE_PIC50.pkl']

In [13]:
# cross validation splits
# cross validation splits
# Five stratified folds shared by both grid searches below. shuffle is left at
# its default (the repr printed after fitting shows shuffle=False), so `seed`
# plays no part in how the folds are formed.
cv = StratifiedKFold(n_splits=5)

# SVM Model

Search for optimal tuning parameters and build the model

In [14]:
# create grid search dictionary
# Search grid for the RBF SVM: C spans 10^0..10^4 and gamma spans 10^-6..10^-1.
svm_c_values = [10 ** exponent for exponent in range(5)]
svm_gamma_values = [10 ** exponent for exponent in range(-6, 0)]
param_grid = dict(C=svm_c_values, gamma=svm_gamma_values)

In [15]:
# setup model building
# Grid search over `param_grid` for an RBF-kernel SVM, using the shared folds in `cv`.
# probability=True makes SVC fit an internal, randomised cross-validation for its
# probability estimates; random_state=seed makes those estimates reproducible
# (the seed was previously defined but never passed to the model).
# Note: `param_grid` is reassigned later for XGBoost, so re-run the SVM grid cell
# before re-running this one.
svm = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=True, random_state=seed),
    param_grid=param_grid,
    n_jobs=2,
    cv=cv,
    verbose=1,
)

In [16]:
# run model building
# Run the grid search on the scaled training data; every C/gamma combination
# is fitted on each of the cross-validation folds.
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SVC(probability=True), n_jobs=2,
             param_grid={'C': [1, 10, 100, 1000, 10000],
                         'gamma': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]},
             verbose=1)

In [17]:
# Best mean cross-validated score found by the search (no `scoring` argument
# was passed, so the estimator's default scorer is used).
svm.best_score_

0.8579403389137902

In [18]:
# Hyper-parameter combination that produced the best cross-validated score.
svm.best_params_

{'C': 10, 'gamma': 0.0001}

In [19]:
# save model
# Persist the whole fitted grid search object (it wraps the refitted best SVM).
joblib.dump(svm, svm_model_file, compress=3)

['./ACHE_PIC50_svm.pkl']

## Test Set Validation

In [20]:
# predict for the test set compounds
# Predict class labels for the scaled held-out test compounds.
pred_svm = svm.predict(x_ts)

In [21]:
# calc statistics
# Test-set statistics for the SVM. MCC and recall are computed once and reused
# for the combined figure reported on the last line.
mcc_svm = matthews_corrcoef(y_ts, pred_svm)
recall_svm = recall_score(y_ts, pred_svm)
svm_report = [
    ("Accuracy = ", accuracy_score(y_ts, pred_svm)),
    ("Balanced Accuracy Score = ", balanced_accuracy_score(y_ts, pred_svm)),
    ("MCC = ", mcc_svm),
    ("Kappa = ", cohen_kappa_score(y_ts, pred_svm)),
    ("Recall = ", recall_svm),
    ("Performance reported as average of matthews and recall is", (mcc_svm + recall_svm) / 2),
]
for label, value in svm_report:
    print(label, value)

Accuracy =  0.8725985844287159
Balanced Accuracy Score =  0.7687803929196655
MCC =  0.5739355302194166
Kappa =  0.5709327548806942
Recall =  0.939622641509434
Performance reported as average of matthews and recall is 0.7567790858644252


In [22]:
# confusion matrix
# confusion matrix
# Rows are true labels and columns are predicted labels; the next cell unpacks
# it in the order tn, fp, fn, tp.
confusion_matrix(y_ts, pred_svm)

array([[116,  78],
       [ 48, 747]])

In [23]:
# Flatten the 2x2 confusion matrix into its four counts.
# Note: this rebinds `fp`, which earlier held the training fingerprints.
cm_svm = confusion_matrix(y_ts, pred_svm)
tn, fp, fn, tp = cm_svm.ravel()
(tn, fp, fn, tp)

(116, 78, 48, 747)

# XGB Model

In [24]:
# create grid search dictionary
# Search grid for XGBoost: tree depth 2..7, number of boosting rounds and
# learning rate. This replaces the SVM grid held in `param_grid`.
param_grid = {
    "max_depth": list(range(2, 8)),
    "n_estimators": [10, 30, 50, 100],
    "learning_rate": [0.01, 0.05, 0.1],
}

In [25]:
# setup model building
# Grid search over the XGBoost `param_grid`, reusing the same folds as the SVM.
xgb_estimator = xgboost.XGBClassifier(base_score=0.5)
xgb = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid,
    n_jobs=2,
    cv=cv,
    verbose=1,
)

In [26]:
# run model building
# Run the grid search on the scaled training data; every parameter combination
# is fitted on each of the cross-validation folds.
xgb.fit(x_tr, y_tr)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=0.5, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_t...
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimator

In [27]:
# Best mean cross-validated score found by the search (no `scoring` argument
# was passed, so the estimator's default scorer is used).
xgb.best_score_

0.8614760116972506

In [28]:
# Hyper-parameter combination that produced the best cross-validated score.
xgb.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}

In [29]:
# save model
# Persist the whole fitted grid search object (it wraps the refitted best XGBoost model).
joblib.dump(xgb, xgb_model_file, compress=3)

['./ACHE_PIC50_xgb.pkl']

## Test Set Validation

In [30]:
# predict for the test set compounds
# Predict class labels for the scaled held-out test compounds.
pred_xgb = xgb.predict(x_ts)

In [None]:
# calc statistics
# Test-set statistics for the XGBoost model.
# MCC and recall are computed once so they can also feed the combined
# "average of matthews and recall" figure, which the SVM section prints;
# it is added here so the two models can be compared on the same number.
mcc_xgb = matthews_corrcoef(y_ts, pred_xgb)
recall_xgb = recall_score(y_ts, pred_xgb)
print("Accuracy = ", accuracy_score(y_ts, pred_xgb))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_ts, pred_xgb))
print("MCC = ", mcc_xgb)
print("Kappa = ", cohen_kappa_score(y_ts, pred_xgb))
print("Recall = ", recall_xgb)
print("Performance reported as average of matthews and recall is", (mcc_xgb + recall_xgb) / 2)

In [32]:
# confusion matrix
# confusion matrix
# Rows are true labels and columns are predicted labels; the next cell unpacks
# it in the order tn, fp, fn, tp.
confusion_matrix(y_ts, pred_xgb)

array([[ 89, 105],
       [ 28, 767]])

In [33]:
# Flatten the 2x2 confusion matrix into its four counts
# (this overwrites the tn/fp/fn/tp values computed for the SVM).
cm_xgb = confusion_matrix(y_ts, pred_xgb)
tn, fp, fn, tp = cm_xgb.ravel()
(tn, fp, fn, tp)

(89, 105, 28, 767)

# Pipeline for making new predictions

In [36]:
# `best_model` was never assigned anywhere in the notebook, so this cell raised
# NameError on a fresh kernel. It is now derived from the test-set predictions.
# NOTE(review): the selection rule (higher average of MCC and recall, the
# figure this notebook reports for the SVM) is an assumption - confirm it is
# the intended criterion, or replace it with an explicit path.
def _selection_score(y_true, y_pred):
    """Return the average of MCC and recall for one set of predictions."""
    return (matthews_corrcoef(y_true, y_pred) + recall_score(y_true, y_pred)) / 2


best_model = (
    svm_model_file
    if _selection_score(y_ts, pred_svm) >= _selection_score(y_ts, pred_xgb)
    else xgb_model_file
)
# joblib.load unpickles the file; only load model files written by this notebook.
predictor_model = joblib.load(best_model)
# Example compounds to predict; the file must provide a `smiles` column.
smi_file = "../smi_file.txt"
pred_example = pd.read_csv(smi_file)
pred_example.head(3)

Unnamed: 0,smiles,id
0,c1ccccc1,example_0
1,c1ccccc1(C(=O)NCC),example_1
2,c1cc(OC)ccn1,example_2


In [37]:
# Build the feature matrix for the NEW compounds exactly as for training:
# Morgan fingerprint (radius 2) + physchem descriptors, then the fitted scaler.
pred_mols = [Chem.MolFromSmiles(smi) for smi in pred_example.smiles]
# Fix: fingerprints must come from `pred_mols`; the original iterated over the
# training molecules (`mols`), so the new compounds were never featurised.
pred_fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in pred_mols]
pred_x = rdkit_fpconvert_numpy(pred_fp)

# Fix: append the physchem descriptors of the new compounds to `pred_x`. The
# original appended the training descriptors to the training matrix `x` a second
# time, which gives the wrong number of columns for `scale` and the model.
pred_x = np.concatenate((pred_x, rdkit_get_physchem_descr(pred_mols)), axis=1)
pred_x = scale.transform(pred_x)
# The prediction cell that follows reads `x`, so the finished matrix is also
# published under that name (this replaces the training matrix, as before).
x = pred_x

In [38]:
# Predict class labels with the loaded model for the scaled feature matrix
# held in `x` (built in the previous cell), then display them.
pred_best_model = predictor_model.predict(x)
pred_best_model

array([1, 1, 1])

A prediction of 1 indicates a predicted POSITIVE class label; a prediction of 0 indicates a predicted NEGATIVE class label.