In [3]:
# imporing all the necessary modules 
import pandas as pd
import sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings("ignore")

In [26]:
with open('/kaggle/input/vgg19-isic-17/X_train_vgg19.txt', 'r') as f:
    X_train = np.loadtxt(f)

with open('/kaggle/input/vgg16-isic-17/y_train_vgg.txt', 'r') as f:
    y_train = np.loadtxt(f)
    
with open('/kaggle/input/vgg19-isic-17/X_test_vgg19.txt', 'r') as f:
    X_test = np.loadtxt(f)

with open('/kaggle/input/vgg16-isic-17/y_test_vgg.txt', 'r') as f:
    y_test = np.loadtxt(f)

In [27]:
with open('/kaggle/input/eb0-3-featurevectors/X_train_b0_3.txt', 'r') as f:
    X_train_xg = np.loadtxt(f)

with open('/kaggle/input/new-sc-feature-vectors/y_train.txt', 'r') as f:
    y_train_xg = np.loadtxt(f)
    
with open('/kaggle/input/eb0-3-featurevectors/X_test_b0_3.txt', 'r') as f:
    X_test_xg = np.loadtxt(f)

with open('/kaggle/input/new-sc-feature-vectors/y_test.txt', 'r') as f:
    y_test_xg = np.loadtxt(f)

In [6]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score, f1_score, roc_auc_score

def eval_clf(y_test, y_pred):
    print(f'Accuracy : {accuracy_score(y_test, y_pred)}')
    print('-----------------------------')
    print(f'Sensitivity/Recall: {recall_score(y_test, y_pred)}')
    print('-----------------------------')
    print(f'Specificity: {recall_score(y_test, y_pred, pos_label=0)}')
    print('-----------------------------')
    print(f'Precision : {precision_score(y_test, y_pred)}')
    print('-----------------------------')
    print(f'ROC:\n{roc_auc_score(y_test, y_pred)}')    
    print('-----------------------------')
    print(f'Confusion_Matrix:\n{confusion_matrix(y_test, y_pred)}')

## Preprocessing

### Standardization

In [28]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
len(X_train), len(y_train), len(X_test), len(y_test)

(3870, 3870, 600, 600)

In [29]:
sc = StandardScaler()
X_train_xf1 = sc.fit_transform(X_train_xg)
X_test_xf1 = sc.transform(X_test_xg)

## Import Libraries

In [9]:
import xgboost as xgb

In [10]:
import sys
!cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.6/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.6"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

## Trial 1

In [11]:
import optuna
from cuml.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# import xgboost as xgb
from sklearn.ensemble import VotingClassifier

In [11]:
def objective(trial):
    # Define the hyperparameters to be optimized for each classifier
    rf_params = {
        "n_estimators": trial.suggest_int("n_estimators_1", 70, 270, 10),
        "max_depth": trial.suggest_int("max_depth_1", 40, 70),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 16),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 16),
        "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        'split_criterion':trial.suggest_categorical('split_criterion', ['gini', 'entropy'])
    }
    
    knn_params = {
        'n_neighbors': trial.suggest_categorical('knn_n_neighbors', [1, 3 ,5]),
        'weights': trial.suggest_categorical('knn_weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['minkowski','euclidean','manhattan'])
    }
    
    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 2, 18),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 270, 700, 10),
        'eta': trial.suggest_discrete_uniform('eta', 0.02, 0.09, 0.01),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 11),
        'gamma':trial.suggest_uniform('gamma', 0, 5),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 36),
        'min_child_weight': trial.suggest_int('min_child_weight', 13, 26),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        'nthread' : -1    

    }
    # Create the classifiers
    rf = RandomForestClassifier(**rf_params)
    xgbclf = xgb.XGBClassifier(random_state=42, 
                             tree_method='gpu_hist', 
                             gpu_id=0, 
                             predictor="gpu_predictor"
                             ,**xgb_params )  
    knn = KNeighborsClassifier(**knn_params)

    # Create the ensemble classifier
    estimators = [('rf', rf), ('xgb', xgbclf), ('knn', knn)]
    ensemble = VotingClassifier(estimators=estimators, voting = 'hard')

    # Train and evaluate the ensemble classifier
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [12]:
study_1 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study_1.optimize(objective, n_trials=1000, show_progress_bar = True)

[32m[I 2023-04-07 05:41:54,026][0m A new study created in memory with name: no-name-63e6a058-78c1-4876-8bbd-d1d26c071638[0m


  0%|          | 0/1000 [00:00<?, ?it/s]

[32m[I 2023-04-07 05:42:09,236][0m Trial 0 finished with value: 0.77 and parameters: {'n_estimators_1': 180, 'max_depth_1': 56, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': True, 'split_criterion': 'gini', 'knn_n_neighbors': 1, 'knn_weights': 'distance', 'metric': 'euclidean', 'max_depth': 17, 'subsample': 0.55, 'n_estimators': 630, 'eta': 0.03, 'reg_alpha': 3, 'gamma': 1.5852972282669402, 'reg_lambda': 7, 'min_child_weight': 24, 'colsample_bytree': 0.6297577745101005}. Best is trial 0 with value: 0.77.[0m
[32m[I 2023-04-07 05:42:13,534][0m Trial 1 finished with value: 0.745 and parameters: {'n_estimators_1': 90, 'max_depth_1': 63, 'min_samples_split': 10, 'min_samples_leaf': 13, 'max_features': 'auto', 'bootstrap': False, 'split_criterion': 'entropy', 'knn_n_neighbors': 1, 'knn_weights': 'uniform', 'metric': 'manhattan', 'max_depth': 6, 'subsample': 0.8, 'n_estimators': 390, 'eta': 0.03, 'reg_alpha': 1, 'gamma': 2.91271830777686, 'reg_lambda

In [13]:
optuna.visualization.plot_slice(study_1)

In [14]:
optuna.visualization.plot_param_importances(study_1)

In [15]:
optuna.visualization.plot_optimization_history(study_1)

In [16]:
study_1.best_params

{'n_estimators_1': 130,
 'max_depth_1': 69,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'bootstrap': False,
 'split_criterion': 'gini',
 'knn_n_neighbors': 1,
 'knn_weights': 'uniform',
 'metric': 'manhattan',
 'max_depth': 16,
 'subsample': 0.8,
 'n_estimators': 510,
 'eta': 0.08,
 'reg_alpha': 1,
 'gamma': 1.4854966985357754,
 'reg_lambda': 13,
 'min_child_weight': 19,
 'colsample_bytree': 0.510773307811268}

## Evaluation

In [30]:
all_rf_st_1 = {'n_estimators': 130,
 'max_depth': 69,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'bootstrap': False,
 'split_criterion': 'gini',}

all_knn_st_1 = { 'n_neighbors': 1,
 'weights': 'uniform',
 'metric': 'manhattan'}

all_xgb_st_1 = {'max_depth': 16,
 'subsample': 0.8,
 'n_estimators': 510,
 'eta': 0.08,
 'reg_alpha': 1,
 'gamma': 1.4854966985357754,
 'reg_lambda': 13,
 'min_child_weight': 19,
 'colsample_bytree': 0.510773307811268}

all_xgb_f1 = {'max_depth': 18,
 'subsample': 0.9,
 'n_estimators': 125,
 'eta': 0.07,
 'reg_alpha': 4,
 'gamma': 0.4203399999338451,
 'reg_lambda': 8,
 'min_child_weight': 9,
 'colsample_bytree': 0.4436555055347376}

In [18]:
rf = RandomForestClassifier(**all_rf_st_1)
xgbclf = xgb.XGBClassifier(random_state=42, 
                         tree_method='gpu_hist', 
                         gpu_id=0, 
                         predictor="gpu_predictor"
                         ,**all_xgb_st_1)  
knn = KNeighborsClassifier(**all_knn_st_1)

# Create the ensemble classifier
estimators = [('rf', rf), ('xgb', xgbclf), ('knn', knn)]
ensemble = VotingClassifier(estimators=estimators, voting = 'hard')

# Train and evaluate the ensemble classifier
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
eval_clf(y_test, y_pred)

Accuracy : 0.8133333333333334
-----------------------------
Sensitivity/Recall: 0.3247863247863248
-----------------------------
Specificity: 0.9316770186335404
-----------------------------
Precision : 0.5352112676056338
-----------------------------
ROC:
0.6282316717099325
-----------------------------
Confusion_Matrix:
[[450  33]
 [ 79  38]]


In [31]:
from sklearn.model_selection import train_test_split
X_train_xg, __, y_train_xg, __  = train_test_split(X_train_xg, y_train_xg ,test_size=0.25,random_state=42, stratify = y_train_xg)

In [32]:
model = xgb.XGBClassifier(random_state=42, 
                             tree_method='gpu_hist', 
                             gpu_id=0, 
                             predictor="gpu_predictor"
                             ,**all_xgb_f1)
model.fit(X_train_xg, y_train_xg)
preds = model.predict(X_test_xg)
eval_clf(y_test, preds)

Accuracy : 0.7916666666666666
-----------------------------
Sensitivity/Recall: 0.6581196581196581
-----------------------------
Specificity: 0.8240165631469979
-----------------------------
Precision : 0.47530864197530864
-----------------------------
ROC:
0.741068110633328
-----------------------------
Confusion_Matrix:
[[398  85]
 [ 40  77]]


In [23]:
from sklearn.ensemble import VotingClassifier
class HierarchicalClassifier:
    def __init__(self, lv1_clf, clf2, clf3, clf4):
        self.lv1_clf = lv1_clf
        self.lv2_clf = VotingClassifier(estimators=[('clf2', clf2), ('clf3', clf3), ('clf4', clf4)], voting='hard')
    
    def fit(self, X_train_xg, y_train_xg, X_train, y_train):
        self.lv1_clf.fit(X_train_xg, y_train_xg)
        self.lv2_clf.fit(X_train, y_train)
    
    def predict(self, X_test, X_test_xg):
        y_pred = []
        for i in range(len(X_test)):
            pred1 = self.lv1_clf.predict(np.array([X_test_xg[i]]))
            if pred1 != 1:
                pred = self.lv2_clf.predict(np.array([X_test[i]]))
            else:
                pred = pred1
            y_pred.append(pred)
        return y_pred

In [22]:
clf1 = xgb.XGBClassifier(**all_xgb_f1, random_state = 42, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
clf2 = KNeighborsClassifier(**all_knn_st_1)
clf3 = xgb.XGBClassifier(random_state = 42, **all_xgb_st_1, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
clf4 = RandomForestClassifier(**all_rf_st_1, random_state = 42)
hclf = HierarchicalClassifier(lv1_clf=clf1, clf2=clf2, clf3=clf3, clf4=clf4)
hclf.fit(X_train_xg, y_train_xg, X_train, y_train)
pred = hclf.predict(X_test, X_test_xf1)
eval_clf(y_test, pred)

Accuracy : 0.82
-----------------------------
Sensitivity/Recall: 0.5128205128205128
-----------------------------
Specificity: 0.8944099378881988
-----------------------------
Precision : 0.5405405405405406
-----------------------------
ROC:
0.7036152253543557
-----------------------------
Confusion_Matrix:
[[432  51]
 [ 57  60]]


## Trial 2

In [14]:
def objective(trial):
    # Define the hyperparameters to be optimized for each classifier
    rf_params = {
        "n_estimators": trial.suggest_int("n_estimators_1", 100, 150, 1),
        "max_depth": trial.suggest_int("max_depth_1", 65, 80),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 7),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 3),
        "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        'split_criterion':trial.suggest_categorical('split_criterion', ['gini', 'entropy'])
    }
    
    knn_params = {
        'n_neighbors': trial.suggest_categorical('knn_n_neighbors', [1, 3 ,5]),
        'weights': trial.suggest_categorical('knn_weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['minkowski','euclidean','manhattan'])
    }
    
    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 390, 590, 1),
        'eta': trial.suggest_discrete_uniform('eta', 0.06, 0.09, 0.005),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'gamma':trial.suggest_uniform('gamma', 1.3, 3.62),
        'reg_lambda': trial.suggest_int('reg_lambda', 4, 24),
        'min_child_weight': trial.suggest_int('min_child_weight', 18, 24),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        'nthread' : -1    

    }
    # Create the classifiers
    rf = RandomForestClassifier(**rf_params)
    xgbclf = xgb.XGBClassifier(random_state=42, 
                             tree_method='gpu_hist', 
                             gpu_id=0, 
                             predictor="gpu_predictor"
                             ,**xgb_params )  
    knn = KNeighborsClassifier(**knn_params)

    # Create the ensemble classifier
    estimators = [('rf', rf), ('xgb', xgbclf), ('knn', knn)]
    ensemble = VotingClassifier(estimators=estimators, voting = 'hard')

    # Train and evaluate the ensemble classifier
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [15]:
study_2 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study_2.optimize(objective, n_trials=1000, show_progress_bar = True)

[32m[I 2023-04-07 08:41:18,146][0m A new study created in memory with name: no-name-281f95ef-5de6-48bc-8882-76c9e6eb3df2[0m


  0%|          | 0/1000 [00:00<?, ?it/s]

[32m[I 2023-04-07 08:41:35,754][0m Trial 0 finished with value: 0.7633333333333333 and parameters: {'n_estimators_1': 100, 'max_depth_1': 69, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'split_criterion': 'gini', 'knn_n_neighbors': 1, 'knn_weights': 'distance', 'metric': 'manhattan', 'max_depth': 20, 'subsample': 0.8, 'n_estimators': 523, 'eta': 0.06999999999999999, 'reg_alpha': 0.5677221885098167, 'gamma': 1.912093039512119, 'reg_lambda': 22, 'min_child_weight': 24, 'colsample_bytree': 0.42311294830024526}. Best is trial 0 with value: 0.7633333333333333.[0m
[32m[I 2023-04-07 08:41:39,134][0m Trial 1 finished with value: 0.7716666666666666 and parameters: {'n_estimators_1': 123, 'max_depth_1': 70, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'bootstrap': False, 'split_criterion': 'entropy', 'knn_n_neighbors': 3, 'knn_weights': 'distance', 'metric': 'euclidean', 'max_depth': 15, 'subsample': 0.75, 'n_estimators

In [16]:
optuna.visualization.plot_param_importances(study_2)

In [17]:
optuna.visualization.plot_slice(study_2)

In [18]:
optuna.visualization.plot_optimization_history(study_2)

In [19]:
study_2.best_params

{'n_estimators_1': 127,
 'max_depth_1': 76,
 'min_samples_split': 6,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'bootstrap': False,
 'split_criterion': 'gini',
 'knn_n_neighbors': 1,
 'knn_weights': 'distance',
 'metric': 'manhattan',
 'max_depth': 17,
 'subsample': 0.6,
 'n_estimators': 525,
 'eta': 0.075,
 'reg_alpha': 0.1943878136909994,
 'gamma': 1.371344978728483,
 'reg_lambda': 12,
 'min_child_weight': 18,
 'colsample_bytree': 0.45713901320775036}

In [33]:
all_rf_st_1 = {'n_estimators': 127,
 'max_depth': 76,
 'min_samples_split': 6,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'bootstrap': False,
 'split_criterion': 'gini',}

all_knn_st_1 = {'n_neighbors': 1,
 'weights': 'distance',
 'metric': 'manhattan'}

all_xgb_st_1 = {'max_depth': 17,
 'subsample': 0.6,
 'n_estimators': 525,
 'eta': 0.075,
 'reg_alpha': 0.1943878136909994,
 'gamma': 1.371344978728483,
 'reg_lambda': 12,
 'min_child_weight': 18,
 'colsample_bytree': 0.45713901320775036}

all_xgb_f1 = {'max_depth': 18,
 'subsample': 0.9,
 'n_estimators': 125,
 'eta': 0.07,
 'reg_alpha': 4,
 'gamma': 0.4203399999338451,
 'reg_lambda': 8,
 'min_child_weight': 9,
 'colsample_bytree': 0.4436555055347376}

In [34]:
model = xgb.XGBClassifier(random_state=42, 
                             tree_method='gpu_hist', 
                             gpu_id=0, 
                             predictor="gpu_predictor"
                             ,**all_xgb_f1)
model.fit(X_train_xg, y_train_xg)
preds = model.predict(X_test_xg)
eval_clf(y_test, preds)

Accuracy : 0.7916666666666666
-----------------------------
Sensitivity/Recall: 0.6581196581196581
-----------------------------
Specificity: 0.8240165631469979
-----------------------------
Precision : 0.47530864197530864
-----------------------------
ROC:
0.741068110633328
-----------------------------
Confusion_Matrix:
[[398  85]
 [ 40  77]]


In [21]:
rf = RandomForestClassifier(**all_rf_st_1)
xgbclf = xgb.XGBClassifier(random_state=42, 
                         tree_method='gpu_hist', 
                         gpu_id=0, 
                         predictor="gpu_predictor"
                         ,**all_xgb_st_1)  
knn = KNeighborsClassifier(**all_knn_st_1)

# Create the ensemble classifier
estimators = [('rf', rf), ('xgb', xgbclf), ('knn', knn)]
ensemble = VotingClassifier(estimators=estimators, voting = 'hard')

# Train and evaluate the ensemble classifier
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
eval_clf(y_test, y_pred)

Accuracy : 0.82
-----------------------------
Sensitivity/Recall: 0.3162393162393162
-----------------------------
Specificity: 0.9420289855072463
-----------------------------
Precision : 0.5692307692307692
-----------------------------
ROC:
0.6291341508732813
-----------------------------
Confusion_Matrix:
[[455  28]
 [ 80  37]]


In [35]:
from sklearn.ensemble import AdaBoostClassifier
clf1 = xgb.XGBClassifier(**all_xgb_f1, random_state = 42, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
clf2 = KNeighborsClassifier(**all_knn_st_1)
clf3 = xgb.XGBClassifier(random_state = 42, **all_xgb_st_1, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
clf4 = RandomForestClassifier(**all_rf_st_1, random_state = 42)
hclf = HierarchicalClassifier(lv1_clf=clf1, clf2=clf2, clf3=clf3, clf4=clf4)
hclf.fit(X_train_xg, y_train_xg, X_train, y_train)
pred = hclf.predict(X_test, X_test_xg)
eval_clf(y_test, pred)

Accuracy : 0.775
-----------------------------
Sensitivity/Recall: 0.7094017094017094
-----------------------------
Specificity: 0.7908902691511387
-----------------------------
Precision : 0.45108695652173914
-----------------------------
ROC:
0.7501459892764241
-----------------------------
Confusion_Matrix:
[[382 101]
 [ 34  83]]


## Trial 3

In [36]:
def objective(trial):
    # Define the hyperparameters to be optimized for each classifier
    rf_params = {
        "n_estimators": trial.suggest_int("n_estimators_1", 116, 141, 1),
        "max_depth": trial.suggest_int("max_depth_1", 72, 79),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 7),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 3),
        "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        'split_criterion':trial.suggest_categorical('split_criterion', ['gini', 'entropy'])
    }
    
    knn_params = {
        'n_neighbors': trial.suggest_categorical('knn_n_neighbors', [1, 3 ,5]),
        'weights': trial.suggest_categorical('knn_weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['minkowski','euclidean','manhattan'])
    }
    
    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 14, 19),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 0.7, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 487, 565, 1),
        'eta': trial.suggest_discrete_uniform('eta', 0.07, 0.07, 0.0025),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.35),
        'gamma':trial.suggest_uniform('gamma', 1.0, 1.85),
        'reg_lambda': trial.suggest_int('reg_lambda', 10, 17),
        'min_child_weight': trial.suggest_int('min_child_weight', 16, 20),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.29, 0.61),
        'nthread' : -1    

    }
    # Create the classifiers
    rf = RandomForestClassifier(**rf_params)
    xgbclf = xgb.XGBClassifier(random_state=42, 
                             tree_method='gpu_hist', 
                             gpu_id=0, 
                             predictor="gpu_predictor"
                             ,**xgb_params )  
    knn = KNeighborsClassifier(**knn_params)

    # Create the ensemble classifier
    estimators = [('rf', rf), ('xgb', xgbclf), ('knn', knn)]
    ensemble = VotingClassifier(estimators=estimators, voting = 'hard')

    # Train and evaluate the ensemble classifier
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [37]:
study_3 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study_3.optimize(objective, n_trials=1000, show_progress_bar = True)

[32m[I 2023-04-07 10:21:14,535][0m A new study created in memory with name: no-name-3ed8ed44-5619-4365-afe8-8f3d8d78c51b[0m


  0%|          | 0/1000 [00:00<?, ?it/s]

[32m[I 2023-04-07 10:21:17,941][0m Trial 0 finished with value: 0.765 and parameters: {'n_estimators_1': 123, 'max_depth_1': 76, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'bootstrap': True, 'split_criterion': 'gini', 'knn_n_neighbors': 5, 'knn_weights': 'distance', 'metric': 'minkowski', 'max_depth': 15, 'subsample': 0.7, 'n_estimators': 531, 'eta': 0.07, 'reg_alpha': 0.05242384720687207, 'gamma': 1.3995924034003957, 'reg_lambda': 12, 'min_child_weight': 20, 'colsample_bytree': 0.3678374768026299}. Best is trial 0 with value: 0.765.[0m
[32m[I 2023-04-07 10:21:22,144][0m Trial 1 finished with value: 0.775 and parameters: {'n_estimators_1': 131, 'max_depth_1': 78, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': False, 'split_criterion': 'entropy', 'knn_n_neighbors': 5, 'knn_weights': 'distance', 'metric': 'minkowski', 'max_depth': 18, 'subsample': 0.4, 'n_estimators': 519, 'eta': 0.07, 'reg_alpha': 0.008827642942239367

In [38]:
optuna.visualization.plot_param_importances(study_3)

In [39]:
optuna.visualization.plot_optimization_history(study_3)

In [40]:
optuna.visualization.plot_slice(study_3)

In [41]:
study_3.best_params

{'n_estimators_1': 123,
 'max_depth_1': 77,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'bootstrap': False,
 'split_criterion': 'gini',
 'knn_n_neighbors': 1,
 'knn_weights': 'uniform',
 'metric': 'manhattan',
 'max_depth': 17,
 'subsample': 0.6000000000000001,
 'n_estimators': 531,
 'eta': 0.07,
 'reg_alpha': 0.12491474326323539,
 'gamma': 1.7341317500337,
 'reg_lambda': 15,
 'min_child_weight': 19,
 'colsample_bytree': 0.3724182079024168}

In [42]:
all_rf_st_1 = {'n_estimators': 123,
 'max_depth': 77,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'bootstrap': False,
 'split_criterion': 'gini',}

all_knn_st_1 = {'n_neighbors': 1,
 'weights': 'uniform',
 'metric': 'manhattan',}

all_xgb_st_1 = { 'max_depth': 17,
 'subsample': 0.6000000000000001,
 'n_estimators': 531,
 'eta': 0.07,
 'reg_alpha': 0.12491474326323539,
 'gamma': 1.7341317500337,
 'reg_lambda': 15,
 'min_child_weight': 19,
 'colsample_bytree': 0.3724182079024168}

all_xgb_f1 = {'max_depth': 17,
 'subsample': 0.9,
 'n_estimators': 95,
 'eta': 0.07600000000000001,
 'reg_alpha': 1,
 'gamma': 2.8591358197630745,
 'reg_lambda': 8,
 'min_child_weight': 9,
 'colsample_bytree': 0.4604302286209221}

In [43]:
rf = RandomForestClassifier(**all_rf_st_1)
xgbclf = xgb.XGBClassifier(random_state=42, 
                         tree_method='gpu_hist', 
                         gpu_id=0, 
                         predictor="gpu_predictor"
                         ,**all_xgb_st_1)  
knn = KNeighborsClassifier(**all_knn_st_1)

# Create the ensemble classifier
estimators = [('rf', rf), ('xgb', xgbclf), ('knn', knn)]
ensemble = VotingClassifier(estimators=estimators, voting = 'hard')

# Train and evaluate the ensemble classifier
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
eval_clf(y_test, y_pred)

Accuracy : 0.8216666666666667
-----------------------------
Sensitivity/Recall: 0.358974358974359
-----------------------------
Specificity: 0.9337474120082816
-----------------------------
Precision : 0.5675675675675675
-----------------------------
ROC:
0.6463608854913203
-----------------------------
Confusion_Matrix:
[[451  32]
 [ 75  42]]


In [44]:
model = xgb.XGBClassifier(random_state=42, 
                             tree_method='gpu_hist', 
                             gpu_id=0, 
                             predictor="gpu_predictor"
                             ,**all_xgb_f1)
model.fit(X_train_xg, y_train_xg)
preds = model.predict(X_test_xg)
eval_clf(y_test, preds)

Accuracy : 0.7916666666666666
-----------------------------
Sensitivity/Recall: 0.6239316239316239
-----------------------------
Specificity: 0.8322981366459627
-----------------------------
Precision : 0.474025974025974
-----------------------------
ROC:
0.7281148802887933
-----------------------------
Confusion_Matrix:
[[402  81]
 [ 44  73]]


In [45]:
from sklearn.ensemble import AdaBoostClassifier
clf1 = xgb.XGBClassifier(**all_xgb_f1, random_state = 42, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
clf2 = KNeighborsClassifier(**all_knn_st_1)
clf3 = xgb.XGBClassifier(random_state = 42, **all_xgb_st_1, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
clf4 = RandomForestClassifier(**all_rf_st_1, random_state = 42)
hclf = HierarchicalClassifier(lv1_clf=clf1, clf2=clf2, clf3=clf3, clf4=clf4)
hclf.fit(X_train_xg, y_train_xg, X_train, y_train)
pred = hclf.predict(X_test, X_test_xg)
eval_clf(y_test, pred)

Accuracy : 0.7783333333333333
-----------------------------
Sensitivity/Recall: 0.7008547008547008
-----------------------------
Specificity: 0.7971014492753623
-----------------------------
Precision : 0.45555555555555555
-----------------------------
ROC:
0.7489780750650317
-----------------------------
Confusion_Matrix:
[[385  98]
 [ 35  82]]
