# Hyperparemeter Grid Search
### In this module, we perform hyperparameter grid search for the classifiers.

# 1. Import library
Note: Combined from various sources, much more comprehensive than the original code provided. Import 也很讲究 XDXD

Also note: For this "Ab_Virus_02_Model_Selection_v01.ipynb" Jupyter Notebook, you can switch back to your normal pip3 or conda environment.

### 1.0 Import Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1.1 Import OS and Path

In [None]:
import os
from pathlib import Path

### 1.2 Import data structures

In [None]:
import numpy as np
from numpy import arange, logspace
import pandas as pd
import multiprocessing
import logging
import csv
import json

### 1.3 Import visualisation tools

In [None]:
import seaborn as sb
sb.set()
from matplotlib import pyplot
import matplotlib.pyplot as plt
%matplotlib inline

### 1.4 Import Scikit Learn - data analytics

In [None]:
# The code for featurization was borrowed from deepchem. Please refer https://deepchem.io for more information
from scipy import stats
from scipy.stats import randint, uniform
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, average_precision_score, mean_squared_error, r2_score, precision_score,recall_score, f1_score
from sklearn.model_selection import cross_val_score

### 1.5 Import Scikit Learn - classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

### 1.5 Import XGBoost

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_importance

### 1.6 Import other classifiers

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
!pip install catboost
import catboost as cb
from catboost import CatBoostClassifier

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


### 1.7 Import Pytorch Tabnet

In [None]:
import torch
# !pip install pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
# !pip install pytorch_tabular
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1
Collecting pytorch_tabular
  Downloading pytorch_tabular-0.7.0.tar.gz (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 4.1 MB/s 
Collecting category-encoders==2.2.2
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 8.9 MB/s 
Collecting pytorch-lightning==1.3.6
  Downloading pytorch_lightning-1.3.6-py3-none-any.whl (809 kB)
[K     |████████████████████████████████| 809 kB 39.4 MB/s 
[?25hCollecting omegaconf>=2.0.1
  Downloading omegaconf-2.1.1-py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 3.2 MB/s 
[?25hCollecting torchmetrics>=0.3.2
  Downloading torchmetrics-0.6.2-py3-none-any.whl (332 kB)
[K     |████████████████████████████████| 332 kB 53.1 MB/s 
Collecting pytorch-tabnet==3.0.0
  Downloading pytorch_ta

  import pandas.util.testing as tm


# 2. Data structures loading and preparation

### 2.1 Load NumPy files from directory

In [None]:
CURRENT_PATH = "/content/drive/My Drive/URECA_Year_2/PotentialAB"
print(CURRENT_PATH)
os.listdir(CURRENT_PATH)

mean_final_elementwise_sum = np.load(os.path.join(CURRENT_PATH, "mean_final_elementwise_sum.npy"))
print("Loaded file 'mean_final_elementwise_sum.npy'")

mean_final_concatenate = np.load(os.path.join(CURRENT_PATH, "mean_final_concatenate.npy"))
print("Loaded file 'mean_final_concatenate.npy'")

IC50_class = np.load(os.path.join(CURRENT_PATH, "IC50_class.npy"))
print("Loaded file 'IC50_class.npy'")

print(mean_final_elementwise_sum.shape)
print(mean_final_concatenate.shape)
print(IC50_class.shape)

/content/drive/My Drive/URECA_Year_2/PotentialAB
Loaded file 'mean_final_elementwise_sum.npy'
Loaded file 'mean_final_concatenate.npy'
Loaded file 'IC50_class.npy'
(1933, 37)
(1933, 74)
(1933,)


### 2.2 CSV file preparation

In [None]:
csv_file_name = "Ab_Virus_02_Model_Selection_v02.csv"
print(csv_file_name)

Ab_Virus_02_Model_Selection_v02.csv


# 3. Model 1: Random Forest Classifier

### 3.1 Apply Random Forest Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'bootstrap': [True],
          'max_depth': arange(10, 110, 10),         # k belongs {low, ..., high-1}
          'max_features': randint(1, 6),            # k belongs {low, ..., high-1}
          'min_samples_leaf': randint(1, 6),        # k belongs {low, ..., high-1}
          'min_samples_split': randint(1, 13),      # k belongs {low, ..., high-1}
          'n_estimators': arange(100, 110, 100)     # k belongs {low, ..., high-1}
          }

# Random Forest Classifier
rf = RandomForestClassifier()   # change the classifier here

# Apply Randon Forest Classifier with Repeated Stratified 5-Fold 10 times
rf_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# rf_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# rf_rskf = KFold(n_splits=5, randon_state = 1001)

rf_random_search = RandomizedSearchCV(estimator = rf, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = rf_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
rf_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


400 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 459, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3

RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7fc124cb7b50>,
                   estimator=RandomForestClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100]),
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124ce2d90>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124ce2e10>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124c6f190>,
                                        'n_estimators': array([100])},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = rf_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.59%
ROC AUC score: 95.53%


In [None]:
rf_random_search.cv_results_

{'mean_fit_time': array([0.35040246, 0.32083284, 0.37078548, 0.32106816, 0.36439448,
        0.33517504, 0.08442547, 0.34709805, 0.35600809, 0.33598808,
        0.32409217, 0.360697  , 0.36046649, 0.33816507, 0.35659406,
        0.33186598, 0.32911474, 0.35100709, 0.36489248, 0.327575  ,
        0.34450527, 0.32169772, 0.33382061, 0.35703083, 0.35556499,
        0.33201538, 0.33679632, 0.31437977, 0.33169834, 0.08307033,
        0.32598605, 0.31722428, 0.33608819, 0.32094732, 0.08479584,
        0.35853611, 0.33387226, 0.32694508, 0.36001935, 0.32183026,
        0.31656702, 0.32201081, 0.31566776, 0.35200809, 0.31911253,
        0.33003775, 0.34807699, 0.34384051, 0.34752955, 0.32273472,
        0.35661333, 0.33282954, 0.33895952, 0.082688  , 0.36015223,
        0.33270274, 0.34444561, 0.08545431, 0.33846612, 0.33568683,
        0.31745883, 0.32915765, 0.08494045, 0.32982278, 0.33671397,
        0.31646522, 0.34450951, 0.35122894, 0.32875678, 0.32409389,
        0.34231313, 0.33028875,

In [None]:
rf_random_search.best_estimator_

RandomForestClassifier(max_depth=80, max_features=4, min_samples_split=11)

In [None]:
rf_random_search.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 11,
 'n_estimators': 100}

In [None]:
print(rf_random_search.best_score_)
print(rf_random_search.best_index_)
print(rf_random_search.scorer_)
print(rf_random_search.n_splits_)
print(rf_random_search.refit_time_)
print(rf_random_search.multimetric_)
print(rf_random_search.classes_)
print(rf_random_search.n_features_in_)
print(rf_random_search.n_features_in_)

0.9584588504639113
8
make_scorer(accuracy_score)
50
0.21972274780273438
False
[0 1]
37
37


### 3.2 Apply Random Forest Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'bootstrap': [True],
          'max_depth': arange(10, 110, 10),         # k belongs {low, ..., high-1}
          'max_features': randint(1, 6),            # k belongs {low, ..., high-1}
          'min_samples_leaf': randint(1, 6),        # k belongs {low, ..., high-1}
          'min_samples_split': randint(1, 13),      # k belongs {low, ..., high-1}
          'n_estimators': arange(100, 110, 100)     # k belongs {low, ..., high-1}
          }

# Random Forest Classifier
rf = RandomForestClassifier()   # change the classifier here

# Apply Randon Forest Classifier with Repeated Stratified 5-Fold 10 times
rf_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# rf_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# rf_rskf = KFold(n_splits=5, randon_state = 1001)

rf_random_search = RandomizedSearchCV(estimator = rf, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = rf_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
rf_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


400 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 459, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3

RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7fc124204650>,
                   estimator=RandomForestClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100]),
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124d8e6d0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124d8e610>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124d8e910>,
                                        'n_estimators': array([100])},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = rf_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.74%
ROC AUC score: 95.78%


In [None]:
rf_random_search.cv_results_

{'mean_fit_time': array([0.31878813, 0.30192019, 0.34238791, 0.30375014, 0.33602475,
        0.3118267 , 0.08335588, 0.32720975, 0.33413339, 0.31255364,
        0.30987069, 0.33389536, 0.3283717 , 0.31645181, 0.32837576,
        0.3059446 , 0.3186039 , 0.32991267, 0.34134758, 0.30633515,
        0.32306163, 0.31758754, 0.32168187, 0.32749249, 0.33711219,
        0.32294179, 0.31741427, 0.29627182, 0.32563797, 0.08293076,
        0.30608699, 0.29652305, 0.31523136, 0.30472377, 0.08271059,
        0.34173536, 0.32787118, 0.3184664 , 0.34873491, 0.31179254,
        0.3106234 , 0.3089225 , 0.30402469, 0.32771535, 0.3077912 ,
        0.32035121, 0.33979957, 0.32584101, 0.33973733, 0.32445095,
        0.33257181, 0.31774971, 0.32407289, 0.08447912, 0.33477292,
        0.31774927, 0.3298271 , 0.08475556, 0.31667889, 0.31862148,
        0.30294209, 0.3149195 , 0.08301028, 0.32505994, 0.32915431,
        0.30850052, 0.32026039, 0.32867587, 0.31096605, 0.32704051,
        0.32774567, 0.31156443,

In [None]:
rf_random_search.best_estimator_

RandomForestClassifier(max_depth=60, max_features=1, min_samples_split=9)

In [None]:
rf_random_search.best_params_

{'bootstrap': True,
 'max_depth': 60,
 'max_features': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 9,
 'n_estimators': 100}

In [None]:
print(rf_random_search.best_score_)
print(rf_random_search.best_index_)
print(rf_random_search.scorer_)
print(rf_random_search.n_splits_)
print(rf_random_search.refit_time_)
print(rf_random_search.multimetric_)
print(rf_random_search.classes_)
print(rf_random_search.n_features_in_)
print(rf_random_search.n_features_in_)

0.9626482440990212
16
make_scorer(accuracy_score)
50
0.20694422721862793
False
[0 1]
74
74


# 4. Model 2: Decision Tree Classifier

### 4.1 Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, None],
          'max_features': randint(1, 11),        # k belongs {low, ..., high-1}
          'min_samples_leaf': randint(1, 11),    # k belongs {low, ..., high-1}
          'criterion': ["gini", "entropy"]
          }

# Decision Tree Classifier
dectree = DecisionTreeClassifier()   # change the classifier here

# Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times
dectree_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# dectree_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# dectree_rskf = KFold(n_splits=5, randon_state = 1001)

dectree_random_search = RandomizedSearchCV(estimator = dectree, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = dectree_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
dectree_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7fc124204a50>,
                   estimator=DecisionTreeClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10,
                                                      None],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc1243dc8d0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc1243dc050>},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = dectree_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.84%
ROC AUC score: 95.70%


In [None]:
dectree_random_search.cv_results_

{'mean_fit_time': array([0.00430612, 0.00404335, 0.00344558, 0.00287454, 0.00281126,
        0.00375791, 0.00275461, 0.00394738, 0.00440507, 0.00196949,
        0.00382536, 0.00227232, 0.00380351, 0.00190844, 0.00319366,
        0.00398358, 0.002787  , 0.0038242 , 0.00280744, 0.00260068,
        0.00277515, 0.00396112, 0.00210096, 0.00519221, 0.00328088,
        0.00470405, 0.00231236, 0.00389899, 0.00406397, 0.0025464 ,
        0.0044941 , 0.00276233, 0.00333694, 0.0035525 , 0.0046181 ,
        0.00415851, 0.00258235, 0.00324557, 0.00267772, 0.00334815,
        0.00392663, 0.00375833, 0.00490767, 0.0019965 , 0.00414425,
        0.00404527, 0.00189095, 0.00307334, 0.00446116, 0.0033137 ,
        0.004496  , 0.00209653, 0.00247958, 0.00299782, 0.00232688,
        0.00242714, 0.00309713, 0.0029204 , 0.00448144, 0.00254577,
        0.0033882 , 0.00363396, 0.00318374, 0.0045246 , 0.00235821,
        0.00264722, 0.00166854, 0.00303614, 0.00437671, 0.00378696,
        0.00286929, 0.00252205,

In [None]:
dectree_random_search.best_estimator_

DecisionTreeClassifier(max_depth=10, max_features=6)

In [None]:
dectree_random_search.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 6,
 'min_samples_leaf': 1}

In [None]:
print(dectree_random_search.best_score_)
print(dectree_random_search.best_index_)
print(dectree_random_search.scorer_)
print(dectree_random_search.n_splits_)
print(dectree_random_search.refit_time_)
print(dectree_random_search.multimetric_)
print(dectree_random_search.classes_)
print(dectree_random_search.n_features_in_)
print(dectree_random_search.n_features_in_)

0.9498694621841989
7
make_scorer(accuracy_score)
50
0.0019788742065429688
False
[0 1]
37
37


### 4.2 Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, None],
          'max_features': randint(1, 11),        # k belongs {low, ..., high-1}
          'min_samples_leaf': randint(1, 11),    # k belongs {low, ..., high-1}
          'criterion': ["gini", "entropy"]
          }

# Decision Tree Classifier
dectree = DecisionTreeClassifier()   # change the classifier here

# Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times
dectree_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# dectree_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# dectree_rskf = KFold(n_splits=5, randon_state = 1001)

dectree_random_search = RandomizedSearchCV(estimator = dectree, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = dectree_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
dectree_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7fc124338650>,
                   estimator=DecisionTreeClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10,
                                                      None],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc12426eb50>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc124427410>},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = dectree_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.84%
ROC AUC score: 95.77%


In [None]:
dectree_random_search.cv_results_

{'mean_fit_time': array([0.00406605, 0.00376629, 0.00389987, 0.00262931, 0.00346081,
        0.00416567, 0.00264105, 0.00464177, 0.00369607, 0.00262835,
        0.00400502, 0.00298246, 0.00338512, 0.00320025, 0.00378887,
        0.00298656, 0.0030798 , 0.00406092, 0.0026595 , 0.00282163,
        0.00275717, 0.00350756, 0.0018986 , 0.00377289, 0.0031524 ,
        0.00462831, 0.00255492, 0.00313764, 0.0037966 , 0.00272666,
        0.00393956, 0.00327466, 0.00272029, 0.00365014, 0.00409301,
        0.00343906, 0.00343133, 0.00305392, 0.00285104, 0.00329217,
        0.00409351, 0.00412162, 0.00403317, 0.00282849, 0.0036441 ,
        0.00331303, 0.00288295, 0.0030736 , 0.00447461, 0.00287931,
        0.00367133, 0.00262571, 0.00258198, 0.00321578, 0.00252141,
        0.00208311, 0.00356446, 0.0033698 , 0.0041115 , 0.00272179,
        0.00350091, 0.00291595, 0.00368992, 0.00369495, 0.00201057,
        0.0026026 , 0.00235067, 0.0032444 , 0.00418518, 0.00282537,
        0.00281804, 0.00296826,

In [None]:
dectree_random_search.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=9, max_features=5)

In [None]:
dectree_random_search.best_params_

{'criterion': 'entropy',
 'max_depth': 9,
 'max_features': 5,
 'min_samples_leaf': 1}

In [None]:
print(dectree_random_search.best_score_)
print(dectree_random_search.best_index_)
print(dectree_random_search.scorer_)
print(dectree_random_search.n_splits_)
print(dectree_random_search.refit_time_)
print(dectree_random_search.multimetric_)
print(dectree_random_search.classes_)
print(dectree_random_search.n_features_in_)
print(dectree_random_search.n_features_in_)

0.9541117403703258
63
make_scorer(accuracy_score)
50
0.0019474029541015625
False
[0 1]
74
74


# 5. Model 3: Logistic Regression

### 5.1 Apply Logistic Regression with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'penalty' : ['l1', 'l2'],
          'C' : logspace(-4, 4, 20),
          'solver' : ['liblinear']
          }

# Logistic Regression
lr = LogisticRegression()   # change the classifier here

# Apply Logistic Regression Classifier with Repeated Stratified 5-Fold 10 times
lr_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# lr_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# lr_rskf = KFold(n_splits=5, randon_state = 1001)

lr_random_search = RandomizedSearchCV(estimator = lr, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = lr_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
lr_random_search.fit(X, y)

Fitting 50 folds for each of 40 candidates, totalling 2000 fits




RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7efe3644c6d0>,
                   estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear']},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = lr_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 91.93%
ROC AUC score: 86.68%


In [None]:
lr_random_search.cv_results_

{'mean_fit_time': array([3.41684818e-03, 4.23717976e-03, 3.84040356e-03, 5.78008652e-03,
        3.73350620e-03, 5.03538132e-03, 4.90753651e-03, 4.61747646e-03,
        5.48743248e-03, 6.19120598e-03, 2.06666565e-02, 5.62567234e-03,
        1.03090477e-02, 6.74319744e-03, 2.27465549e-01, 7.26195335e-03,
        3.58439517e-01, 7.27409840e-03, 6.63981638e-01, 8.49326611e-03,
        2.34637967e+00, 8.85556221e-03, 3.52076918e+00, 1.03345060e-02,
        3.34208228e+00, 1.21041727e-02, 3.28385177e+00, 1.39111423e-02,
        2.79645119e+00, 1.59545040e-02, 3.42962019e+00, 1.69488287e-02,
        3.50015450e+00, 1.85574102e-02, 3.79791414e+00, 2.21498632e-02,
        2.91728312e+00, 2.28228045e-02, 2.83062104e+00, 2.44478703e-02]),
 'mean_score_time': array([0.00072021, 0.00067733, 0.00089896, 0.00074199, 0.00080094,
        0.00081029, 0.00084651, 0.00080564, 0.00079056, 0.00068952,
        0.00088619, 0.00071811, 0.00115209, 0.00085961, 0.00079201,
        0.00073503, 0.00079113, 0.0007

In [None]:
lr_random_search.best_estimator_

LogisticRegression(C=3792.690190732246, solver='liblinear')

In [None]:
lr_random_search.best_params_

{'C': 3792.690190732246, 'penalty': 'l2', 'solver': 'liblinear'}

In [None]:
print(lr_random_search.best_score_)
print(lr_random_search.best_index_)
print(lr_random_search.scorer_)
print(lr_random_search.n_splits_)
print(lr_random_search.refit_time_)
print(lr_random_search.multimetric_)
print(lr_random_search.classes_)
print(lr_random_search.n_features_in_)
print(lr_random_search.n_features_in_)

0.9175365171171896
37
make_scorer(accuracy_score)
50
0.02026844024658203
False
[0 1]
37
37


### 5.2 Apply Logistic Regression with 5 K-Fold, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'penalty' : ['l1', 'l2'],
          'C' : logspace(-4, 4, 20),
          'solver' : ['liblinear']
          }

# Logistic Regression
lr = LogisticRegression()   # change the classifier here

# Apply Logistic Regression Classifier with Repeated Stratified 5-Fold 10 times
lr_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# lr_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# lr_rskf = KFold(n_splits=5, randon_state = 1001)

lr_random_search = RandomizedSearchCV(estimator = lr, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = lr_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
lr_random_search.fit(X, y)

Fitting 50 folds for each of 40 candidates, totalling 2000 fits




RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7efe35381dd0>,
                   estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear']},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = lr_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 93.43%
ROC AUC score: 89.02%


In [None]:
lr_random_search.cv_results_

{'mean_fit_time': array([6.18459225e-03, 7.71436214e-03, 4.59557533e-03, 7.34085083e-03,
        4.75947857e-03, 6.93861485e-03, 6.75528526e-03, 8.20983887e-03,
        7.78837204e-03, 8.90407085e-03, 9.74507332e-03, 1.03630924e-02,
        7.76434422e-03, 1.10092545e-02, 2.96732426e-02, 1.29405117e-02,
        2.09723425e-01, 1.32792377e-02, 1.00163089e+00, 1.37596846e-02,
        4.86494217e+00, 1.63319492e-02, 7.24691364e+00, 1.86961746e-02,
        7.41522650e+00, 2.14060593e-02, 9.82570313e+00, 2.45049763e-02,
        1.08388845e+01, 2.93882656e-02, 1.15476758e+01, 3.30430079e-02,
        1.14457055e+01, 3.99043560e-02, 9.50463483e+00, 4.64532280e-02,
        1.27908974e+01, 5.27227688e-02, 8.61922555e+00, 5.87710571e-02]),
 'mean_score_time': array([0.00106413, 0.00100526, 0.00071698, 0.00077856, 0.00106723,
        0.00095926, 0.00088378, 0.00097831, 0.0008493 , 0.00112914,
        0.00102818, 0.00085027, 0.00088387, 0.00085119, 0.00085053,
        0.00094152, 0.00086944, 0.0008

In [None]:
lr_random_search.best_estimator_

LogisticRegression(C=206.913808111479, penalty='l1', solver='liblinear')

In [None]:
lr_random_search.best_params_

{'C': 206.913808111479, 'penalty': 'l1', 'solver': 'liblinear'}

In [None]:
print(lr_random_search.best_score_)
print(lr_random_search.best_index_)
print(lr_random_search.scorer_)
print(lr_random_search.n_splits_)
print(lr_random_search.refit_time_)
print(lr_random_search.multimetric_)
print(lr_random_search.classes_)
print(lr_random_search.n_features_in_)
print(lr_random_search.n_features_in_)

0.9276761591088619
30
make_scorer(accuracy_score)
50
9.094917058944702
False
[0 1]
74
74


# 6. Model 4: Support Vector Machine

### 6.1 Apply Support Vector Machine with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'C': arange(2, 10, 2),            # k belongs {low, ..., high-1}
          'gamma': arange(0.1, 1, 0.2)      # k belongs {low, ..., high-1}
          }

# Support Vector Machine
SVM = LinearSVC()   # change the classifier here

# Apply Support Vector Machine Classifier with Repeated Stratified 5-Fold 10 times
svm_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# svm_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# svm_rskf = KFold(n_splits=5, randon_state = 1001)

svm_random_search = RandomizedSearchCV(estimator = SVM, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = svm_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
svm_random_search.fit(X, y)

In [None]:
# Make predictions for validation data
y_pred = svm_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

In [None]:
svm_random_search.cv_results_

In [None]:
svm_random_search.best_estimator_

AttributeError: ignored

In [None]:
svm_random_search.best_params_

AttributeError: ignored

In [None]:
print(svm_random_search.best_score_)
print(svm_random_search.best_index_)
print(svm_random_search.scorer_)
print(svm_random_search.n_splits_)
print(svm_random_search.refit_time_)
print(svm_random_search.multimetric_)
print(svm_random_search.classes_)
print(svm_random_search.n_features_in_)
print(svm_random_search.n_features_in_)

### 6.2 Apply Support Vector Machine with Repeated Stratified 5-Fold 10 times, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'C': arange(2, 10, 2),            # k belongs {low, ..., high-1}
          'gamma': arange(0.1, 1, 0.2)      # k belongs {low, ..., high-1}
          }

# Support Vector Machine
SVM = LinearSVC()   # change the classifier here

# Apply Support Vector Machine Classifier with Repeated Stratified 5-Fold 10 times
svm_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# svm_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# svm_rskf = KFold(n_splits=5, randon_state = 1001)

svm_random_search = RandomizedSearchCV(estimator = SVM, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = svm_rskf.split(X, y),        # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
svm_random_search.fit(X, y)

In [None]:
# Make predictions for validation data
y_pred = svm_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

In [None]:
svm_random_search.cv_results_

In [None]:
svm_random_search.best_estimator_

In [None]:
svm_random_search.best_params_

In [None]:
print(svm_random_search.best_score_)
print(svm_random_search.best_index_)
print(svm_random_search.scorer_)
print(svm_random_search.n_splits_)
print(svm_random_search.refit_time_)
print(svm_random_search.multimetric_)
print(svm_random_search.classes_)
print(svm_random_search.n_features_in_)
print(svm_random_search.n_features_in_)

# 7. Model 5: MLP Classifier

### 7.1 Apply MLP Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'hidden_layer_sizes'  : [(randint(100,600),randint(100,600),), (randint(100,600),)],    # k belongs {low, ..., high-1}
          'activation'          : ['tanh', 'relu', 'logistic'],
          'solver'              : ['sgd', 'adam', 'lbfgs'],
          'alpha'               : [0.0001, 0.001, 0.005],
          'max_iter'            : arange(100, 1000, 100),   # k belongs {low, ..., high-1}
          'learning_rate'       : ['constant','adaptive'],
          'learning_rate_init'  : [0.0001, 0.001, 0.005],
          'early_stopping'      : [True, False]
          }

# MLP Classifier
MLP = MLPClassifier() # change the classifier here

# Apply MLP Classifier with Repeated Stratified 5-Fold 10 times
mlp_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# mlp_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# mlp_rskf = KFold(n_splits=5, randon_state = 1001)

mlp_random_search = RandomizedSearchCV(estimator = MLP, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = mlp_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
mlp_random_search.fit(X, y)

In [None]:
# Make predictions for validation data
y_pred = mlp_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

In [None]:
mlp_random_search.cv_results_

In [None]:
mlp_random_search.best_estimator_

In [None]:
mlp_random_search.best_params_

In [None]:
print(mlp_random_search.best_score_)
print(mlp_random_search.best_index_)
print(mlp_random_search.scorer_)
print(mlp_random_search.n_splits_)
print(mlp_random_search.refit_time_)
print(mlp_random_search.multimetric_)
print(mlp_random_search.classes_)
print(mlp_random_search.n_features_in_)
print(mlp_random_search.n_features_in_)

### 7.2 Apply MLP Classifier with Repeated Stratified 5-Fold 10 times X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'hidden_layer_sizes'  : [(randint(100,600),randint(100,600),), (randint(100,600),)],    # k belongs {low, ..., high-1}
          'activation'          : ['tanh', 'relu', 'logistic'],
          'solver'              : ['sgd', 'adam', 'lbfgs'],
          'alpha'               : [0.0001, 0.001, 0.005],
          'max_iter'            : arange(100, 1000, 100),   # k belongs {low, ..., high-1}
          'learning_rate'       : ['constant','adaptive'],
          'learning_rate_init'  : [0.0001, 0.001, 0.005],
          'early_stopping'      : [True, False]
          }

# MLP Classifier
MLP = MLPClassifier() # change the classifier here

# Apply MLP Classifier with Repeated Stratified 5-Fold 10 times
mlp_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# mlp_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# mlp_rskf = KFold(n_splits=5, randon_state = 1001)

mlp_random_search = RandomizedSearchCV(estimator = MLP, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = mlp_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
mlp_random_search.fit(X, y)

In [None]:
# Make predictions for validation data
y_pred = mlp_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

In [None]:
mlp_random_search.cv_results_

In [None]:
mlp_random_search.best_estimator_

In [None]:
mlp_random_search.best_params_

In [None]:
print(mlp_random_search.best_score_)
print(mlp_random_search.best_index_)
print(mlp_random_search.scorer_)
print(mlp_random_search.n_splits_)
print(mlp_random_search.refit_time_)
print(mlp_random_search.multimetric_)
print(mlp_random_search.classes_)
print(mlp_random_search.n_features_in_)
print(mlp_random_search.n_features_in_)

# 8. Model 6: XGBoost Classifier

### 8.1 Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'min_child_weight': randint(1, 11),            # k belongs {low, ..., high-1}
          'gamma': arange(0.5, 3.0, 0.5),                # k belongs {low, ..., high-1}
          'subsample': arange(0.3, 1.0, 0.1),            # k belongs {low, ..., high-1}
          'colsample_bytree': arange(0.5, 1.0, 0.1),     # k belongs {low, ..., high-1}
          'max_depth': randint(3, 10),                   # k belongs {low, ..., high-1}
          'n_estimators': arange(150, 1001, 50),         # k belongs {low, ..., high-1}
          'learning_rate' : arange(0.01, 0.11, 0.01),    # k belongs {low, ..., high-1}
          }

# XGBoost Classifier
xgb = XGBClassifier() # change the classifier here

# Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times
xgb_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# xgb_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# xgb_rskf = KFold(n_splits=5, randon_state = 1001)

xgb_random_search = RandomizedSearchCV(estimator = xgb, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = xgb_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
xgb_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7fc124456a50>,
                   estimator=XGBClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'gamma': array([0.5, 1. , 1.5, 2. , 2.5]),
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]),
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc1243c4c10>,
                                        'min_child_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc1243c4c90>,
                                        'n_estimators': array([ 150,  200,  250,  300,  350,  400,  450,  500,  550,  600,  650,
        700,  750,  800,  850,  900,  950, 1000]),
                                        'subsample': array([0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
           

In [None]:
# Make predictions for validation data
y_pred = xgb_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.33%
ROC AUC score: 95.00%


In [None]:
xgb_random_search.cv_results_

{'mean_fit_time': array([1.088187  , 1.12465996, 2.32978539, 1.43690752, 0.76696504,
        1.9360784 , 1.40139987, 1.83456213, 2.0426543 , 0.65557425,
        0.61567623, 0.29334964, 1.15742513, 1.96309071, 0.24586879,
        1.17589884, 0.26967905, 1.04427397, 2.04882743, 1.11711969,
        1.03126642, 0.68028211, 1.5909677 , 1.94629123, 1.90216988,
        1.03089282, 2.43299416, 0.80980443, 1.55705667, 2.14614942,
        2.00807694, 1.96805173, 0.61033595, 1.10034894, 0.87188711,
        0.57359673, 1.52194782, 2.98657181, 1.72350181, 1.7347248 ,
        1.24585554, 2.1447084 , 2.10295892, 0.49360515, 0.78876479,
        1.98526408, 1.72940645, 2.12277557, 1.88342714, 1.56201765,
        1.51866383, 1.29042579, 1.43008632, 2.3741901 , 1.42482845,
        2.0586517 , 2.18523734, 1.64266597, 2.01580772, 0.41212547,
        1.99956571, 0.42771369, 1.13552962, 0.98872708, 2.61003425,
        0.34112106, 1.0989234 , 3.16115195, 2.0692494 , 0.39233177,
        2.0172561 , 3.06247763,

In [None]:
xgb_random_search.best_estimator_

XGBClassifier(colsample_bytree=0.7999999999999999, gamma=2.5,
              learning_rate=0.02, max_depth=4, n_estimators=650,
              subsample=0.6000000000000001)

In [None]:
xgb_random_search.best_params_

{'colsample_bytree': 0.7999999999999999,
 'gamma': 2.5,
 'learning_rate': 0.02,
 'max_depth': 4,
 'min_child_weight': 1,
 'n_estimators': 650,
 'subsample': 0.6000000000000001}

In [None]:
print(xgb_random_search.best_score_)
print(xgb_random_search.best_index_)
print(xgb_random_search.scorer_)
print(xgb_random_search.n_splits_)
print(xgb_random_search.refit_time_)
print(xgb_random_search.multimetric_)
print(xgb_random_search.classes_)

0.9572161304574849
48
make_scorer(accuracy_score)
50
1.5747034549713135
False
[0 1]


### 8.2 Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'min_child_weight': randint(1, 11),            # k belongs {low, ..., high-1}
          'gamma': arange(0.5, 3.0, 0.5),                # k belongs {low, ..., high-1}
          'subsample': arange(0.3, 1.0, 0.1),            # k belongs {low, ..., high-1}
          'colsample_bytree': arange(0.5, 1.0, 0.1),     # k belongs {low, ..., high-1}
          'max_depth': randint(3, 10),                   # k belongs {low, ..., high-1}
          'n_estimators': arange(150, 1001, 50),         # k belongs {low, ..., high-1}
          'learning_rate' : arange(0.01, 0.11, 0.01),    # k belongs {low, ..., high-1}
          }

# XGBoost Classifier
xgb = XGBClassifier() # change the classifier here

# Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times
xgb_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# xgb_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# xgb_rskf = KFold(n_splits=5, randon_state = 1001)

xgb_random_search = RandomizedSearchCV(estimator = xgb, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = xgb_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
xgb_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7ff68d8b0c50>,
                   estimator=XGBClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'gamma': array([0.5, 1. , 1.5, 2. , 2.5]),
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]),
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff68d84fed0>,
                                        'min_child_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff68d84fc50>,
                                        'n_estimators': array([ 150,  200,  250,  300,  350,  400,  450,  500,  550,  600,  650,
        700,  750,  800,  850,  900,  950, 1000]),
                                        'subsample': array([0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
           

In [None]:
# Make predictions for validation data
y_pred = xgb_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.69%
ROC AUC score: 95.67%


In [None]:
xgb_random_search.cv_results_

{'mean_fit_time': array([1.71675207, 1.83898014, 3.83968969, 2.28179969, 1.19528509,
        3.0248113 , 2.22630179, 3.02426257, 3.25346358, 1.01741397,
        0.99081274, 0.46080602, 1.85956317, 3.25467018, 0.37702841,
        1.88963549, 0.42440302, 1.67447906, 3.34484404, 1.73568274,
        1.65076121, 1.10420029, 2.60495547, 3.14073561, 3.00042772,
        1.66985513, 3.99694438, 1.32377021, 2.42550027, 3.49896663,
        3.2551316 , 3.21907146, 1.00355476, 1.79598167, 1.35544887,
        0.89974217, 2.5230724 , 4.87369216, 2.85871247, 2.77902537,
        2.05857248, 3.43738379, 3.44513585, 0.7868514 , 1.28376011,
        3.23060176, 2.70226202, 3.39434465, 3.11911948, 2.55813135,
        2.36648432, 2.06951978, 2.39300312, 3.90270513, 2.30917653,
        3.37813676, 3.5102613 , 2.70961501, 3.15716556, 0.65979876,
        3.21267421, 0.66737201, 1.86682005, 1.5302306 , 4.28693318,
        0.55076493, 1.7261385 , 5.31372264, 3.40034861, 0.63614636,
        3.2587281 , 5.11902485,

In [None]:
xgb_random_search.best_estimator_

XGBClassifier(colsample_bytree=0.8999999999999999, gamma=1.0,
              learning_rate=0.06999999999999999, max_depth=8,
              min_child_weight=7, n_estimators=650,
              subsample=0.8000000000000003)

In [None]:
xgb_random_search.best_params_

{'colsample_bytree': 0.8999999999999999,
 'gamma': 1.0,
 'learning_rate': 0.06999999999999999,
 'max_depth': 8,
 'min_child_weight': 7,
 'n_estimators': 650,
 'subsample': 0.8000000000000003}

In [None]:
print(xgb_random_search.best_score_)
print(xgb_random_search.best_index_)
print(xgb_random_search.scorer_)
print(xgb_random_search.n_splits_)
print(xgb_random_search.refit_time_)
print(xgb_random_search.multimetric_)
print(xgb_random_search.classes_)

0.9604762287290304
82
make_scorer(accuracy_score)
50
2.866130828857422
False
[0 1]


# 9. Model 7: LightGBM Classifier

### 9.1 Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'num_leaves': arange(100,600,100),             # k belongs {low, ..., high-1}
          'min_child_weight': arange(0.01,1.01,0.01),    # k belongs {low, ..., high-1}
          'feature_fraction': arange(0.1,0.401,0.01),    # k belongs {low, ..., high-1}
          'bagging_fraction':arange(0.3,0.501,0.01),     # k belongs {low, ..., high-1}
          'min_data_in_leaf': arange(100,1510,10),       # k belongs {low, ..., high-1}
          'objective': ['binary'],
          'max_depth': [-1],
          'learning_rate': arange(0.001,0.0201,0.001),   # k belongs {low, ..., high-1}
          'boosting_type': ['gbdt'],                     # k belongs {low, ..., high-1}
          'bagging_seed': arange(10,42,5),               # k belongs {low, ..., high-1}
          #   'metric': ['auc'],
          'verbosity': [1],
          'reg_alpha': arange(0.3,1,0.2),                # k belongs {low, ..., high-1}
          'reg_lambda':  arange(0.37,0.3901,0.001),      # k belongs {low, ..., high-1}
          'random_state': arange(100,600,100),           # k belongs {low, ..., high-1}
          'n_estimators': arange(100,600,100)            # k belongs {low, ..., high-1}
          }

# LightGBM Classifier
lgbm = LGBMClassifier() # change the classifier here

# Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times
lgbm_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# lgbm_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# lgbm_rskf = KFold(n_splits=5, randon_state = 1001)

lgbm_random_search = RandomizedSearchCV(estimator = lgbm, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = lgbm_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
lgbm_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7f3b5497c350>,
                   estimator=LGBMClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'bagging_fraction': array([0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 ,
       0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 ]),
                                        'bagging_seed': array([10, 15, 20, 25, 30, 35, 40]),
                                        'boosting_type': ['gbdt'],
                                        'fea...
                                        'num_leaves': array([100, 200, 300, 400, 500]),
                                        'objective': ['binary'],
                                        'random_state': array([100, 200, 300, 400, 500]),
                                        'reg_alpha': array([0.3, 0.5, 0.7, 0.9]),
                                        'reg_lambda': array([0.37 , 0.371, 0.372, 0.373, 0.374, 0.375, 0.376, 0.3

In [None]:
# Make predictions for validation data
y_pred = lgbm_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.28%
ROC AUC score: 94.97%


In [None]:
lgbm_random_search.cv_results_

{'mean_fit_time': array([0.08629567, 0.07262516, 0.12825355, 0.21831621, 0.05339012,
        0.14590785, 0.2051856 , 0.10165956, 0.03108716, 0.06619883,
        0.09586921, 0.19075669, 0.17118142, 0.35165751, 0.03193215,
        0.04782083, 0.2196843 , 0.17488851, 0.0854094 , 0.20772056,
        0.07892986, 0.25193639, 0.11891301, 0.10389417, 0.13112033,
        0.05184342, 0.08340581, 0.10268464, 0.05541089, 0.05221354,
        0.06895038, 0.06858287, 0.07268833, 0.1895815 , 0.05040751,
        0.06885142, 0.06200818, 0.03306485, 0.05070658, 0.22632255,
        0.0751224 , 0.03359119, 0.05280618, 0.05212121, 0.38341988,
        0.06830437, 0.0893735 , 0.05569499, 0.03320251, 0.07147491,
        0.15068262, 0.0881002 , 0.26572912, 0.11839315, 0.20214024,
        0.11096902, 0.09094422, 0.30586768, 0.22424016, 0.3388027 ,
        0.03353688, 0.08721535, 0.06221728, 0.07204519, 0.07504598,
        0.09855783, 0.28563216, 0.21474696, 0.26088792, 0.14777228,
        0.03107174, 0.24699008,

In [None]:
lgbm_random_search.best_estimator_

LGBMClassifier(bagging_fraction=0.34, bagging_seed=20,
               feature_fraction=0.3799999999999999, learning_rate=0.015,
               min_child_weight=0.14, min_data_in_leaf=110, n_estimators=500,
               num_leaves=200, objective='binary', random_state=200,
               reg_alpha=0.7, reg_lambda=0.387, verbosity=1)

In [None]:
lgbm_random_search.best_params_

{'bagging_fraction': 0.34,
 'bagging_seed': 20,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.3799999999999999,
 'learning_rate': 0.015,
 'max_depth': -1,
 'min_child_weight': 0.14,
 'min_data_in_leaf': 110,
 'n_estimators': 500,
 'num_leaves': 200,
 'objective': 'binary',
 'random_state': 200,
 'reg_alpha': 0.7,
 'reg_lambda': 0.387,
 'verbosity': 1}

In [None]:
print(lgbm_random_search.best_score_)
print(lgbm_random_search.best_index_)
print(lgbm_random_search.scorer_)
print(lgbm_random_search.n_splits_)
print(lgbm_random_search.refit_time_)
print(lgbm_random_search.multimetric_)
print(lgbm_random_search.classes_)

0.9508516420987803
98
make_scorer(accuracy_score)
50
0.352968692779541
False
[0 1]


### 9.2 Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'num_leaves': arange(100,600,100),             # k belongs {low, ..., high-1}
          'min_child_weight': arange(0.01,1.01,0.01),    # k belongs {low, ..., high-1}
          'feature_fraction': arange(0.1,0.401,0.01),    # k belongs {low, ..., high-1}
          'bagging_fraction':arange(0.3,0.501,0.01),     # k belongs {low, ..., high-1}
          'min_data_in_leaf': arange(100,1510,10),       # k belongs {low, ..., high-1}
          'objective': ['binary'],
          'max_depth': [-1],
          'learning_rate': arange(0.001,0.0201,0.001),   # k belongs {low, ..., high-1}
          'boosting_type': ['gbdt'],                     # k belongs {low, ..., high-1}
          'bagging_seed': arange(10,42,5),               # k belongs {low, ..., high-1}
          #   'metric': ['auc'],
          'verbosity': [1],
          'reg_alpha': arange(0.3,1,0.2),                # k belongs {low, ..., high-1}
          'reg_lambda':  arange(0.37,0.3901,0.001),      # k belongs {low, ..., high-1}
          'random_state': arange(100,600,100),           # k belongs {low, ..., high-1}
          'n_estimators': arange(100,600,100)            # k belongs {low, ..., high-1}
          }

# LightGBM Classifier
lgbm = LGBMClassifier() # change the classifier here

# Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times
lgbm_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# lgbm_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# lgbm_rskf = KFold(n_splits=5, randon_state = 1001)

lgbm_random_search = RandomizedSearchCV(estimator = lgbm, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = lgbm_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
lgbm_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits


RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7f3b53feb0d0>,
                   estimator=LGBMClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'bagging_fraction': array([0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 ,
       0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 ]),
                                        'bagging_seed': array([10, 15, 20, 25, 30, 35, 40]),
                                        'boosting_type': ['gbdt'],
                                        'fea...
                                        'num_leaves': array([100, 200, 300, 400, 500]),
                                        'objective': ['binary'],
                                        'random_state': array([100, 200, 300, 400, 500]),
                                        'reg_alpha': array([0.3, 0.5, 0.7, 0.9]),
                                        'reg_lambda': array([0.37 , 0.371, 0.372, 0.373, 0.374, 0.375, 0.376, 0.3

In [None]:
# Make predictions for validation data
y_pred = lgbm_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.48%
ROC AUC score: 95.32%


In [None]:
lgbm_random_search.cv_results_

{'mean_fit_time': array([0.0819488 , 0.07946256, 0.15861337, 0.25314051, 0.05942539,
        0.1668286 , 0.26002836, 0.11740788, 0.03764458, 0.07336767,
        0.1103761 , 0.21208536, 0.21469974, 0.40264325, 0.03680651,
        0.05418252, 0.25682111, 0.16509298, 0.09819098, 0.23837496,
        0.09685754, 0.32275127, 0.14374846, 0.11189991, 0.14576644,
        0.05554791, 0.09037532, 0.12008992, 0.06418842, 0.05488745,
        0.07373503, 0.08273757, 0.07091721, 0.23814735, 0.0546902 ,
        0.07181068, 0.06873872, 0.0335101 , 0.05717273, 0.27576489,
        0.09689674, 0.03803795, 0.0535446 , 0.05209586, 0.43761971,
        0.07778667, 0.09563795, 0.06306946, 0.03471504, 0.07507072,
        0.18215179, 0.0933981 , 0.3108874 , 0.14655172, 0.24639389,
        0.1098068 , 0.09629863, 0.34136575, 0.26459169, 0.42488554,
        0.03705207, 0.08841521, 0.06071272, 0.07165668, 0.07574308,
        0.1118011 , 0.3354285 , 0.27879605, 0.29445897, 0.17367128,
        0.03695433, 0.28097394,

In [None]:
lgbm_random_search.best_estimator_

LGBMClassifier(bagging_fraction=0.34, bagging_seed=20,
               feature_fraction=0.3799999999999999, learning_rate=0.015,
               min_child_weight=0.14, min_data_in_leaf=110, n_estimators=500,
               num_leaves=200, objective='binary', random_state=200,
               reg_alpha=0.7, reg_lambda=0.387, verbosity=1)

In [None]:
lgbm_random_search.best_params_

{'bagging_fraction': 0.34,
 'bagging_seed': 20,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.3799999999999999,
 'learning_rate': 0.015,
 'max_depth': -1,
 'min_child_weight': 0.14,
 'min_data_in_leaf': 110,
 'n_estimators': 500,
 'num_leaves': 200,
 'objective': 'binary',
 'random_state': 200,
 'reg_alpha': 0.7,
 'reg_lambda': 0.387,
 'verbosity': 1}

In [None]:
print(lgbm_random_search.best_score_)
print(lgbm_random_search.best_index_)
print(lgbm_random_search.scorer_)
print(lgbm_random_search.n_splits_)
print(lgbm_random_search.refit_time_)
print(lgbm_random_search.multimetric_)
print(lgbm_random_search.classes_)

0.9569055173983478
98
make_scorer(accuracy_score)
50
0.41963768005371094
False
[0 1]


# 10. Model 8: CatBoost Classifier

### 10.1 Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'depth'         : randint(4, 11),             # k belongs {low, ..., high-1}
          'learning_rate' : arange(0.01, 0.11, 0.01),   # k belongs {low, ..., high-1}
          'iterations'    : randint(10, 101, 10)        # k belongs {low, ..., high-1}
          }

# CatBoost Classifier
cat = CatBoostClassifier() # change the classifier here

# Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times
cat_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# cat_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# cat_rskf = KFold(n_splits=5, randon_state = 1001)

cat_random_search = RandomizedSearchCV(estimator = cat, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = cat_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
cat_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits
0:	learn: 0.5919633	total: 52ms	remaining: 4s
1:	learn: 0.5015378	total: 57.1ms	remaining: 2.17s
2:	learn: 0.4132146	total: 61.8ms	remaining: 1.54s
3:	learn: 0.3610929	total: 66.4ms	remaining: 1.23s
4:	learn: 0.3222335	total: 70.7ms	remaining: 1.03s
5:	learn: 0.2891851	total: 75ms	remaining: 900ms
6:	learn: 0.2649381	total: 79.5ms	remaining: 806ms
7:	learn: 0.2442721	total: 83.9ms	remaining: 734ms
8:	learn: 0.2236223	total: 88.3ms	remaining: 677ms
9:	learn: 0.2085180	total: 92.7ms	remaining: 630ms
10:	learn: 0.2010281	total: 97ms	remaining: 591ms
11:	learn: 0.1940475	total: 101ms	remaining: 557ms
12:	learn: 0.1854986	total: 106ms	remaining: 528ms
13:	learn: 0.1766497	total: 110ms	remaining: 503ms
14:	learn: 0.1702418	total: 114ms	remaining: 480ms
15:	learn: 0.1649563	total: 119ms	remaining: 460ms
16:	learn: 0.1605809	total: 123ms	remaining: 442ms
17:	learn: 0.1577635	total: 128ms	remaining: 425ms
18:	learn: 0.1546138	tota

RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7f3b53e47cd0>,
                   estimator=<catboost.core.CatBoostClassifier object at 0x7f3b54068b50>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b54068910>,
                                        'iterations': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b540683d0>,
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ])},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = cat_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.79%
ROC AUC score: 95.88%


In [None]:
cat_random_search.cv_results_

{'mean_fit_time': array([1.30723273, 0.90509202, 1.93324365, 3.80524849, 1.42207992,
        0.5886728 , 0.54374773, 4.12053216, 0.40173864, 5.76051048,
        0.32794652, 0.64890213, 0.25891109, 3.75118493, 0.78479919,
        1.7174389 , 1.13925374, 0.58601795, 3.74482353, 0.39147774,
        1.39433102, 0.41823979, 0.81116929, 0.64444701, 1.18485619,
        1.66173418, 0.21592706, 0.39561724, 3.60582648, 0.81806102,
        0.79745271, 3.51680247, 0.92398751, 1.25238932, 2.25447348,
        2.31559836, 0.48835379, 0.34760412, 0.26856147, 0.63099884,
        0.42873579, 0.44407376, 2.06798029, 3.33936275, 0.43133192,
        2.67342977, 1.10716429, 1.08895215, 0.31850758, 2.90364638,
        3.17387188, 0.64489774, 7.19482434, 0.62488663, 0.27115426,
        0.39889451, 0.26448457, 2.69513208, 0.99709745, 2.18712993,
        3.22784792, 6.95214185, 0.46869217, 1.23587595, 7.32617854,
        4.13174156, 2.71645937, 1.14723366, 1.06238319, 0.88629008,
        0.95376707, 7.3681384 ,

In [None]:
cat_random_search.best_estimator_

<catboost.core.CatBoostClassifier at 0x7f3b5408d5d0>

In [None]:
cat_random_search.best_params_

{'depth': 6, 'iterations': 78, 'learning_rate': 0.09999999999999999}

In [None]:
print(cat_random_search.best_score_)
print(cat_random_search.best_index_)
print(cat_random_search.scorer_)
print(cat_random_search.n_splits_)
print(cat_random_search.refit_time_)
print(cat_random_search.multimetric_)
print(cat_random_search.classes_)

0.9579930647601452
69
make_scorer(accuracy_score)
50
0.5015377998352051
False
[0 1]


### 10.2 Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Libraries:
# from numpy import arange
# from scipy.stats import randint, uniform

# Defining parameters
params = {'depth'         : randint(4, 11),             # k belongs {low, ..., high-1}
          'learning_rate' : arange(0.01, 0.11, 0.01),   # k belongs {low, ..., high-1}
          'iterations'    : randint(10, 101, 10)        # k belongs {low, ..., high-1}
          }

# CatBoost Classifier
cat = CatBoostClassifier() # change the classifier here

# Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times
cat_rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# cat_rskf = StratifiedKFold(n_splits=5, random_state = 1001)
# cat_rskf = KFold(n_splits=5, randon_state = 1001)

cat_random_search = RandomizedSearchCV(estimator = cat, 
                                       param_distributions = params, 
                                       n_iter = 100,                    # Number of random hyperparameter combinations we are trying out. Here, we try out 100 combinations of random states
                                       scoring = 'accuracy',            # Evaluate each random hyperparameter combination as according to how much accuracy they achieve
                                       n_jobs = -1,                     # Number of jobs running in parallel (multi-core CPU processing). -1 means using all processors.
                                       cv = cat_rskf.split(X, y),       # Splitting the data into repeated stratified 5-Fold 10 times
                                       verbose = 3,                     # Controls the verbosity: the higher, the more messages.
                                       random_state = 1001)             # Pseudo random generator. Whatever the number is, it doesn't make a difference. Make sure this number (1001) is consistent across all classifiers.

# Model fitting and training
cat_random_search.fit(X, y)

Fitting 50 folds for each of 100 candidates, totalling 5000 fits
0:	learn: 0.6158003	total: 6.63ms	remaining: 610ms
1:	learn: 0.5437031	total: 14.7ms	remaining: 667ms
2:	learn: 0.4869669	total: 20.1ms	remaining: 602ms
3:	learn: 0.4403496	total: 25.6ms	remaining: 569ms
4:	learn: 0.3942302	total: 30.9ms	remaining: 544ms
5:	learn: 0.3584235	total: 36.4ms	remaining: 528ms
6:	learn: 0.3249542	total: 41.7ms	remaining: 513ms
7:	learn: 0.3057384	total: 47.1ms	remaining: 501ms
8:	learn: 0.2852944	total: 52.5ms	remaining: 490ms
9:	learn: 0.2660520	total: 57.8ms	remaining: 480ms
10:	learn: 0.2497835	total: 63.2ms	remaining: 471ms
11:	learn: 0.2321762	total: 68.6ms	remaining: 463ms
12:	learn: 0.2216464	total: 74.1ms	remaining: 456ms
13:	learn: 0.2100248	total: 79.4ms	remaining: 448ms
14:	learn: 0.2014361	total: 84.6ms	remaining: 440ms
15:	learn: 0.1937853	total: 90ms	remaining: 433ms
16:	learn: 0.1853011	total: 95.2ms	remaining: 425ms
17:	learn: 0.1796347	total: 101ms	remaining: 419ms
18:	learn: 0

RandomizedSearchCV(cv=<generator object _RepeatedSplits.split at 0x7f3b53feb8d0>,
                   estimator=<catboost.core.CatBoostClassifier object at 0x7f3b53ff95d0>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b53ff9b90>,
                                        'iterations': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b53ff9e10>,
                                        'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ])},
                   random_state=1001, scoring='accuracy', verbose=3)

In [None]:
# Make predictions for validation data
y_pred = cat_random_search.best_estimator_.predict(X)
predictions = [round(value) for value in y_pred]

# Evaluate predictions: Accuracy score
accuracy = accuracy_score(y, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
print("Accuracy: %.2f%%" % (accuracy))

# Evaluate predictions: ROC AUC score
roc_score = roc_auc_score(y, predictions) * 100
print("ROC AUC score: %.2f%%" % (roc_score))

# # Evaluate predictions: Classification report
# cr = classification_report(y, predictions)
# print("Classification report:")
# print(cr)

# # Evaluate predictions: Confusion Matrix
# cm = confusion_matrix(y, predictions)
# print("Confusion matrix:")
# print(cm)

Accuracy: 96.74%
ROC AUC score: 95.78%


In [None]:
cat_random_search.cv_results_

{'mean_fit_time': array([1.57677966, 1.09093275, 2.33789041, 4.67796828, 1.73156056,
        0.72953963, 0.75639082, 5.12197109, 0.54346879, 7.22011114,
        0.42754935, 0.84991332, 0.33929036, 4.6006396 , 0.97458915,
        2.08612964, 1.42042274, 0.81898673, 4.73837304, 0.58438738,
        1.65982277, 0.54465159, 1.04582897, 0.79835196, 1.48253098,
        1.90866685, 0.30496967, 0.52021509, 4.43620708, 1.05643825,
        1.06666543, 4.41540605, 1.07672714, 1.54493361, 2.8128618 ,
        2.92298963, 0.61329815, 0.46598064, 0.35700312, 0.80096042,
        0.53358047, 0.5847672 , 2.61372962, 4.182411  , 0.54737873,
        3.2683171 , 1.39309574, 1.35609959, 0.40748615, 3.6856456 ,
        3.87323335, 0.83727101, 9.03814053, 0.76610262, 0.36374882,
        0.51798077, 0.33584904, 3.34191307, 1.29801507, 2.6812538 ,
        3.99452831, 8.73808534, 0.62928062, 1.53496131, 9.19794014,
        5.19300649, 3.43855235, 1.49179912, 1.25751332, 1.07312285,
        1.18153856, 9.26291865,

In [None]:
cat_random_search.best_estimator_

<catboost.core.CatBoostClassifier at 0x7f3b53e69510>

In [None]:
cat_random_search.best_params_

{'depth': 6, 'iterations': 93, 'learning_rate': 0.06999999999999999}

In [None]:
print(cat_random_search.best_score_)
print(cat_random_search.best_index_)
print(cat_random_search.scorer_)
print(cat_random_search.n_splits_)
print(cat_random_search.refit_time_)
print(cat_random_search.multimetric_)
print(cat_random_search.classes_)

0.9617686200479308
58
make_scorer(accuracy_score)
50
0.6170327663421631
False
[0 1]


# 11. Model 9: TabNet Classifier

### 11.1 Apply TabNet Classifier with Repeated Stratified 5-Fold 10 times, X input: mean_final_elementwise_sum

In [None]:
# Classifying X and Y
X = mean_final_elementwise_sum
# X = mean_final_concatenate
y = IC50_class

In [None]:
# Apply TabNet Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # TabNet Classifier
    tn = TabNetClassifier() # change the classifier here
    
    # Model fitting and training
    tn.fit(X_train, y_train, eval_set=[(X_validate, y_validate)])
    
    # Make predictions for validation data
    y_pred = tn.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Device used : cpu
epoch 0  | loss: 1.23287 | val_0_auc: 0.77563 |  0:00:00s
epoch 1  | loss: 0.83592 | val_0_auc: 0.27343 |  0:00:00s
epoch 2  | loss: 0.62761 | val_0_auc: 0.39776 |  0:00:00s
epoch 3  | loss: 0.55756 | val_0_auc: 0.5818  |  0:00:00s
epoch 4  | loss: 0.52787 | val_0_auc: 0.57556 |  0:00:00s
epoch 5  | loss: 0.49214 | val_0_auc: 0.62913 |  0:00:01s
epoch 6  | loss: 0.46065 | val_0_auc: 0.57891 |  0:00:01s
epoch 7  | loss: 0.4114  | val_0_auc: 0.43744 |  0:00:01s
epoch 8  | loss: 0.37953 | val_0_auc: 0.62154 |  0:00:01s
epoch 9  | loss: 0.35606 | val_0_auc: 0.75959 |  0:00:01s
epoch 10 | loss: 0.34788 | val_0_auc: 0.82812 |  0:00:01s
epoch 11 | loss: 0.34297 | val_0_auc: 0.58273 |  0:00:02s
epoch 12 | loss: 0.3265  | val_0_auc: 0.60717 |  0:00:02s
epoch 13 | loss: 0.30779 | val_0_auc: 0.64052 |  0:00:02s
epoch 14 | loss: 0.30083 | val_0_auc: 0.79477 |  0:00:02s
epoch 15 | loss: 0.29656 | val_0_auc: 0.64587 |  0:00:02s
epoch 16 | loss: 0.30857 |

In [None]:
# Printing Accuracy score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([58.14, 70.8 , 43.67, 77.2 , 75.39, 25.32, 42.89, 75.45, 75.39,
       75.65, 75.45, 24.55, 24.55, 75.39, 76.94, 75.45, 61.5 , 68.22,
       75.65, 75.65, 75.19, 74.16, 75.45, 54.92, 75.65, 75.45, 73.64,
       75.45, 75.39, 75.65, 75.45, 24.55, 75.45, 58.81, 75.65, 75.45,
       75.45, 79.59, 75.39, 75.13, 75.45, 75.45, 75.45, 75.39, 61.66,
       75.45, 79.84, 75.45, 75.39, 75.65])
Mean   | Max    | Min
68.41% | 79.84% | 24.55%
ROC AUC score array: array([69.77, 77.45, 61.25, 54.04, 50.19, 49.45, 59.32, 50.  , 50.  ,
       50.  , 50.  , 50.  , 50.  , 50.  , 79.35, 50.  , 68.8 , 75.74,
       50.53, 50.  , 50.18, 49.5 , 50.  , 65.14, 50.  , 50.  , 68.33,
       50.  , 50.  , 50.  , 50.  , 50.  , 50.  , 62.4 , 50.  , 50.  ,
       50.  , 80.44, 50.  , 61.92, 50.  , 50.  , 50.71, 50.  , 60.95,
       50.  , 82.38, 50.  , 50.  , 50.  ])
Mean   | Max    | Min
55.56% | 82.38% | 49.45%


In [None]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['tabnet', 'elementwise sum', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['tabnet', 'elementwise sum', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 11.2 Apply TabNet Classifier with Repeated Stratified 5-Fold 10 timesd, X input: mean_final_concatenate

In [None]:
# Classifying X and Y
# X = mean_final_elementwise_sum
X = mean_final_concatenate
y = IC50_class

In [None]:
# Apply TabNet Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # TabNet Classifier
    tn = TabNetClassifier() # change the classifier here
    
    # Model fitting and training
    tn.fit(X_train, y_train, eval_set=[(X_validate, y_validate)])
    
    # Make predictions for validation data
    y_pred = tn.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Device used : cpu
epoch 0  | loss: 0.68615 | val_0_auc: 0.16406 |  0:00:00s
epoch 1  | loss: 0.52768 | val_0_auc: 0.81063 |  0:00:00s
epoch 2  | loss: 0.46155 | val_0_auc: 0.79041 |  0:00:00s
epoch 3  | loss: 0.45017 | val_0_auc: 0.86658 |  0:00:00s
epoch 4  | loss: 0.42008 | val_0_auc: 0.75984 |  0:00:00s
epoch 5  | loss: 0.33379 | val_0_auc: 0.80616 |  0:00:01s
epoch 6  | loss: 0.30581 | val_0_auc: 0.54719 |  0:00:01s
epoch 7  | loss: 0.30497 | val_0_auc: 0.43861 |  0:00:01s
epoch 8  | loss: 0.26072 | val_0_auc: 0.5172  |  0:00:01s
epoch 9  | loss: 0.26757 | val_0_auc: 0.43937 |  0:00:01s
epoch 10 | loss: 0.23407 | val_0_auc: 0.27275 |  0:00:01s
epoch 11 | loss: 0.22499 | val_0_auc: 0.47855 |  0:00:02s
epoch 12 | loss: 0.20464 | val_0_auc: 0.37354 |  0:00:02s
epoch 13 | loss: 0.20969 | val_0_auc: 0.38738 |  0:00:02s

Early stopping occured at epoch 13 with best_epoch = 3 and best_val_0_auc = 0.86658
Best weights from best epoch are automatically used!
Accu

In [None]:
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([82.43, 24.55, 75.45, 75.39, 75.65, 75.45, 75.45, 75.45, 28.5 ,
       75.65, 75.45, 75.45, 72.61, 75.39, 75.65, 45.74, 24.81, 75.45,
       75.39, 24.35, 75.45, 75.45, 75.45, 75.39, 59.84, 29.97, 79.84,
       74.94, 75.39, 75.65, 75.45, 75.45, 76.74, 75.39, 58.03, 75.45,
       74.94, 75.45, 75.39, 75.65, 75.45, 80.62, 75.45, 75.39, 75.39,
       75.45, 24.55, 75.45, 75.65, 75.65])
Mean   | Max    | Min
68.57% | 82.43% | 24.35%
ROC AUC score array: array([64.57, 50.  , 50.  , 50.  , 50.  , 50.  , 50.  , 50.  , 52.58,
       50.  , 50.  , 50.  , 77.94, 50.  , 50.  , 62.98, 50.17, 50.  ,
       50.  , 50.  , 50.  , 50.  , 50.  , 50.  , 68.77, 53.6 , 82.38,
       49.66, 50.  , 50.  , 50.  , 50.  , 75.36, 50.  , 66.85, 50.  ,
       49.66, 50.  , 50.  , 50.  , 50.  , 81.12, 50.  , 50.  , 49.83,
       50.  , 50.  , 50.  , 51.24, 50.  ])
Mean   | Max    | Min
53.73% | 82.38% | 49.66%


In [None]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['tabnet', 'concatenate', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['tabnet', 'concatenate', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()