# Model Building Setup for Safety Models using RdKit descriptors and ChEMBL31 data only

# Setup

In [1]:
# input
model_dir = "AHR_PEC50"
data_file_name = "./AHR_PEC50_train.tsv"
# output
scalar_file_name = "./scalar_AHR_PEC50.pkl"
ecc_model_file = "./AHR_PEC50_ecc.pkl"
brf_model_file = "./AHR_PEC50_brf.pkl"
rusb_model_file = "./AHR_PEC50_rusb.pkl"

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score, balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score 
from utils import rdkit_fpconvert_numpy, rdkit_get_physchem_descr

seed = 1234

In [3]:
from imblearn.ensemble import EasyEnsembleClassifier,BalancedRandomForestClassifier,RUSBoostClassifier

# Get Mols, Data and generate descriptors

In [4]:
df = pd.read_csv(data_file_name, sep="\t")

In [5]:
y_str = df.class_label.tolist() 
y_int = pd.get_dummies(y_str)
y_new = y_int["POSITIVE"] 
df['class_label_binary'] = y_new 
y = df.class_label_binary.to_list()

In [6]:
mols = [Chem.MolFromSmiles(smi) for smi in df.parentised_smiles]
# generate binary Morgan fingerprint with radius 2
fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
# generate binary Morgan fingerprint with radius 2 with physchem as numpy array
x = rdkit_fpconvert_numpy(fp)
x = np.concatenate((x, rdkit_get_physchem_descr(mols)), axis=1)

## Scale using scalar from last run and calculate splits

In [7]:
# randomly select 20% of compounds as test set but with stratified selection
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.20, random_state=seed, stratify=y)

In [8]:
# save scalar
scale = joblib.load(scalar_file_name)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
x_tr = scale.transform(x_tr)
x_ts = scale.transform(x_ts)

# IMBLearn Models

In [10]:
# create grid search dictionary
eec_param_grid = {"n_estimators": [i for i in [10, 30, 50, 100]]}
# model build
eec = GridSearchCV(EasyEnsembleClassifier(n_jobs=-1),
                  verbose=2,
                  scoring='balanced_accuracy',
                  param_grid=eec_param_grid, cv=5,
                  n_jobs=-1)
eec.fit(x_tr, y_tr)
# save model
joblib.dump(eec, ecc_model_file, compress=3)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ....................................n_estimators=10; total time=   1.8s
[CV] END ....................................n_estimators=10; total time=   2.0s
[CV] END ....................................n_estimators=10; total time=   2.0s
[CV] END ....................................n_estimators=10; total time=   2.1s
[CV] END ....................................n_estimators=10; total time=   2.1s
[CV] END ....................................n_estimators=30; total time=   6.2s
[CV] END ....................................n_estimators=30; total time=   6.3s
[CV] END ....................................n_estimators=30; total time=   6.3s
[CV] END ....................................n_estimators=30; total time=   6.1s
[CV] END ....................................n_estimators=30; total time=   6.0s
[CV] END ....................................n_estimators=50; total time=  10.6s
[CV] END ....................................n_es

['./AHR_PEC50_ecc.pkl']

In [11]:
# create grid search dictionary
brf_param_grid = {"n_estimators": [i for i in [10, 30, 50, 100]]}
# model build
brf = GridSearchCV(BalancedRandomForestClassifier(n_jobs=-1, replacement=True, sampling_strategy='all'),
                  verbose=2,
                  scoring='balanced_accuracy',
                  param_grid=eec_param_grid, cv=5,
                  n_jobs=-1)
brf.fit(x_tr, y_tr)
# save model
joblib.dump(brf, brf_model_file, compress=3)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=30; total time=   0.1s
[CV] END ....................................n_estimators=30; total time=   0.1s
[CV] END ....................................n_estimators=30; total time=   0.1s
[CV] END ....................................n_estimators=30; total time=   0.1s
[CV] END ....................................n_estimators=30; total time=   0.1s
[CV] END ....................................n_estimators=50; total time=   0.2s
[CV] END ....................................n_es

['./AHR_PEC50_brf.pkl']

In [12]:
# create grid search dictionary
rusb_param_grid = {"n_estimators": [i for i in [10, 30, 50, 100]]}
# model build
rusb = GridSearchCV(RUSBoostClassifier(),
                  verbose=2,
                  scoring='balanced_accuracy',
                  param_grid=eec_param_grid, cv=5,
                  n_jobs=-1)
rusb.fit(x_tr, y_tr)
# save model
joblib.dump(eec, rusb_model_file, compress=3)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=30; total time=   0.2s
[CV] END ....................................n_estimators=30; total time=   0.1s
[CV] END ....................................n_estimators=30; total time=   0.2s
[CV] END ....................................n_estimators=30; total time=   0.2s
[CV] END ....................................n_estimators=30; total time=   0.2s
[CV] END ....................................n_estimators=50; total time=   0.3s
[CV] END ....................................n_es

['./AHR_PEC50_rusb.pkl']

## Test Set Validation

In [13]:
# predict for the test set compounds
pred_eec_tr = eec.predict(x_tr)
# calc statistics
print("EEC Train:")
print("Precision = ", precision_score(y_tr, pred_eec_tr))
print("ROC-AUC = ", roc_auc_score(y_tr, pred_eec_tr))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_tr, pred_eec_tr))
print("MCC = ", matthews_corrcoef(y_tr, pred_eec_tr))
print("Kappa = ", cohen_kappa_score(y_tr, pred_eec_tr))

EEC Train:
Precision =  1.0
ROC-AUC =  0.9663461538461539
Balanced Accuracy Score =  0.9663461538461539
MCC =  0.9162003475894762
Kappa =  0.9127025614218505


In [14]:
# predict for the test set compounds
pred_eec_ts = eec.predict(x_ts)
# calc statistics
print("EEC Test:")
print("Precision = ", precision_score(y_ts, pred_eec_ts))
print("ROC-AUC = ", roc_auc_score(y_ts, pred_eec_ts))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_ts, pred_eec_ts))
print("MCC = ", matthews_corrcoef(y_ts, pred_eec_ts))
print("Kappa = ", cohen_kappa_score(y_ts, pred_eec_ts))

EEC Test:
Precision =  0.72
ROC-AUC =  0.6274038461538461
Balanced Accuracy Score =  0.6274038461538461
MCC =  0.2520952919018819
Kappa =  0.25178147268408546


In [15]:
# calc statistics
# predict for the test set compounds
pred_brf_tr = brf.predict(x_tr)
# calc statistics
print("BRF Train:")
print("Precision = ", precision_score(y_tr, pred_brf_tr))
print("ROC-AUC = ", roc_auc_score(y_tr, pred_brf_tr))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_tr, pred_brf_tr))
print("MCC = ", matthews_corrcoef(y_tr, pred_brf_tr))
print("Kappa = ", cohen_kappa_score(y_tr, pred_brf_tr))

BRF Train:
Precision =  0.9897959183673469
ROC-AUC =  0.958409645909646
Balanced Accuracy Score =  0.9584096459096458
MCC =  0.902470686576531
Kappa =  0.8999250936329588


In [16]:
# calc statistics
# predict for the test set compounds
pred_brf_ts = brf.predict(x_ts)
# calc statistics
print("BRF Train:")
print("Precision = ", precision_score(y_ts, pred_brf_ts))
print("ROC-AUC = ", roc_auc_score(y_ts, pred_brf_ts))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_ts, pred_brf_ts))
print("MCC = ", matthews_corrcoef(y_ts, pred_brf_ts))
print("Kappa = ", cohen_kappa_score(y_ts, pred_brf_ts))

BRF Train:
Precision =  0.75
ROC-AUC =  0.6850961538461539
Balanced Accuracy Score =  0.6850961538461539
MCC =  0.381356384904845
Kappa =  0.3793103448275862


In [17]:
# calc statistics
# predict for the test set compounds
pred_rusb_tr = rusb.predict(x_tr)
# calc statistics
print("RUSB Train:")
print("Precision = ", precision_score(y_tr, pred_rusb_tr))
print("ROC-AUC = ", roc_auc_score(y_tr, pred_rusb_tr))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_tr, pred_rusb_tr))
print("MCC = ", matthews_corrcoef(y_tr, pred_rusb_tr))
print("Kappa = ", cohen_kappa_score(y_tr, pred_rusb_tr))

RUSB Train:
Precision =  1.0
ROC-AUC =  0.9951923076923077
Balanced Accuracy Score =  0.9951923076923077
MCC =  0.9873752355458539
Kappa =  0.9872955496386459


In [18]:
# calc statistics
# predict for the test set compounds
pred_rusb_ts = rusb.predict(x_ts)
# calc statistics
print("RUSB Train:")
print("Precision = ", precision_score(y_ts, pred_rusb_ts))
print("ROC-AUC = ", roc_auc_score(y_ts, pred_rusb_ts))
print("Balanced Accuracy Score = ", balanced_accuracy_score(y_ts, pred_rusb_ts))
print("MCC = ", matthews_corrcoef(y_ts, pred_rusb_ts))
print("Kappa = ", cohen_kappa_score(y_ts, pred_rusb_ts))

RUSB Train:
Precision =  0.7083333333333334
ROC-AUC =  0.6081730769230769
Balanced Accuracy Score =  0.6081730769230769
MCC =  0.2123019439021117
Kappa =  0.21126760563380276
