#                                  MODELS OPTIMIZATION
#
#

# 
# Importing libraries needed for the project
# =============================================================

In [1]:
# Import data processing libraries.

import pandas as pd
import numpy as np

#Importing Scikit Learn library

#-. Split the data into train and test data sets

from sklearn.model_selection import train_test_split

# -. Models

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn import tree,svm

#-.XGBoost Model

import xgboost as xgb

#-.Metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

#-.Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# 
# Importing the data into a DataFrame from our previous cleaned data files.
# =============================================================
# 

In [2]:
#Importing the data from CSV files in a dataframe.

df = pd.read_csv ('heart_disease_clean.csv', index_col= 0)
df.head ()

Unnamed: 0,Heart_Disease,High_BP,High_Chol,Weight,BMI,Smoker,Stroke,Diabetes,Phys_Activ,Eat_Fruits,Eat_Veg,Alcohol,Gen_Health,Ment_Health,Phys_Health,Diff_Walk,Sex,Age
0,0,1,1,280.0,40,1,0,0,0,0,1,0,5,18,15,1,0,9
1,0,0,0,165.0,25,1,0,0,1,0,0,0,3,0,0,0,0,7
3,0,1,1,180.0,28,0,0,0,0,1,0,0,5,30,30,1,0,9
5,0,1,0,145.0,27,0,0,0,1,1,1,0,2,0,0,0,0,11
6,0,1,1,148.0,24,0,0,0,1,1,1,0,2,3,0,0,0,11


# 
# Creating two datsets from the original, one for the terget variable (Y) and other for the dependent variables (X).
#

In [3]:
# Spliting Data into Dependent and Independetn variables, naming Y to dependent or target variable and x to the independent variables.

y = df['Heart_Disease'].copy ()
x = df.drop ('Heart_Disease', axis = 1).copy ()

# 
# ONE-HOT ENCODING. Transforming our categorical variables into dummies.
# =============================================================
# 

In [4]:
#ENCODING variables into dummies

x_categ = pd.get_dummies (x, columns = ['Diabetes', 'Gen_Health', 'Age'])

# 
# Spliting Data into Training and Testing Datasets
# =============================================================



In [5]:
#SPLIT DATA into TRAIN and TEST sets

x_train, x_test, y_train, y_test = train_test_split (x_categ, y, random_state = 42, test_size= 0.3,shuffle= True, stratify = y)

# 
#
# TUNING THE BASE MODELS HYPERPARAMETERS
# ----------------------------------------------------------------------------------------------------------- 
#

#
# 1.-Logistic Regression
# =============================================================
#

# 
## Create base model for hyperparameter tuning.
# 

In [8]:
#CREATE the LOGISTIC REGRESSION MODEL and FIT IT to the training data

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
#Calculating Accuracy of model

lr_y_pred = logreg.predict(x_test)

#Sensitivity
lr_sen = recall_score(y_test,lr_y_pred)*100
print('Sensitivity=', lr_sen,'%')


Sensitivity= 14.475574712643677 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# 
## Hyperparameter tuning with GridSearchCV for Logistic Regression
# 
# 

In [112]:
#Imputing the parameters for the parameter grid

# Solver
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

#Penalty type
penalty = ['l1', 'l2', 'elasticnet', 'none']

#C parameter
C= np.logspace(-3,3,7)

#Class Weight
class_weight = np. arange (1,11,1)

In [113]:
#Creating the parameter grid

lr_param_grid = {'solver':solver, 'penalty':penalty, 'C':C, 'class_weight':class_weight}

print (lr_param_grid)

{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['l1', 'l2', 'elasticnet', 'none'], 'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'class_weight': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])}


In [114]:
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

lr_grid = GridSearchCV (estimator = logreg, param_grid = lr_param_grid, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
lr_grid.fit (x_train, y_train)


Fitting 5 folds for each of 1400 candidates, totalling 7000 fits


3850 fits failed out of a total of 7000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
350 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------

In [115]:
#Getting the best parameters

lr_grid.best_params_

{'C': 1.0, 'class_weight': 1, 'penalty': 'l2', 'solver': 'lbfgs'}

# 
## Training the new model with optimized hyperparameters
# 

In [6]:
#CREATE the OPTIMIZED LOGISTIC REGRESSION MODEL and FIT IT to the training data

lr_grid = LogisticRegression(C= 1, class_weight = 1, penalty = 'l2', solver = 'lbfgs')
lr_grid.fit(x_train, y_train)
#Calculating Accuracy of model

lrgrid_y_pred = lr_grid.predict(x_test)

#Sensitivity
lrgrid_sen = recall_score(y_test,lrgrid_y_pred)*100
print('Sensitivity=', lrgrid_sen,'%')


Sensitivity= 14.475574712643677 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
#Saving the model

import pickle
filename = 'lr_grid.sav'
pickle.dump(lr_grid, open(filename, 'wb'))

# 
## Hyperparameter tuning with OPTUNA for Logistic Regression
# 
# 

In [10]:
#Importing optuna library.

import optuna

In [15]:
def objective(trial):
    
    solver = trial.suggest_categorical ('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    penalty = trial.suggest_categorical ('penalty',['l1''l2','elasticnet', 'none'] )
    C = trial.suggest_int('C', 1, 3,log = True)
    class_weight = trial.suggest_int('class_weight', 1,11)
    
    opt_lr = LogisticRegression(C= C, class_weight = class_weight, penalty = penalty, solver = solver)
    
    score = cross_val_score(opt_lr, x, y, n_jobs=4, cv=5, scoring = 'recall')
    accuracy = score.mean()
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[32m[I 2022-11-13 19:11:11,921][0m A new study created in memory with name: no-name-2fbc5bac-a0cb-40e1-bfb3-cc1f982e903a[0m
[32m[I 2022-11-13 19:11:41,683][0m Trial 0 finished with value: 0.0966920612462668 and parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 2, 'class_weight': 3}. Best is trial 0 with value: 0.0966920612462668.[0m
[32m[I 2022-11-13 19:12:09,920][0m Trial 1 finished with value: 0.10103820306582208 and parameters: {'solver': 'sag', 'penalty': 'none', 'C': 2, 'class_weight': 5}. Best is trial 1 with value: 0.10103820306582208.[0m
[32m[I 2022-11-13 19:12:37,881][0m Trial 2 finished with value: 0.10085860536467266 and parameters: {'solver': 'sag', 'penalty': 'none', 'C': 1, 'class_weight': 3}. Best is trial 1 with value: 0.10103820306582208.[0m
[33m[W 2022-11-13 19:12:38,597][0m Trial 3 failed because of the following error: ValueError('\nAll the 5 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1158, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1169, in _fit_liblinear
    class_weight_ = compute_class_weight(class_weight, classes=classes_, y=y)
  File "C:\Users\pazen\anaconda3\lib\site-packages\sklearn\utils\class_weight.py", line 44, in compute_class_weight
    if class_weight is None or len(class_weight) == 0:
TypeError: object of type 'int' has no len()


In [40]:
optlr_sen = []

In [None]:
#OPTUNA was not able to run due to incompatibility of sevral of the parameters.

# 
# 2. K-Nearest Neighbor
# =============================================================

# 
## Create base model for hyperparameter tuning.
#

In [20]:
#CREATE K-NEAREST NEIGHBOR MODEL and FIT IT to the training data


knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

#Calculating Accuracy of model

knn_y_pred = knn.predict(x_test)

#Sensitivity
knn_sen = recall_score(y_test,knn_y_pred)*100
print('Sensitivity=', knn_sen,'%')


Sensitivity= 7.483237547892721 %


# 
## Hyperparameter tuning with GridSearchCV for K-Nearest Neighbor
# 

In [16]:
#Imputing the parameters for the parameter grid

# n_neighbors
n_neighbors = np.arange (2,11,1)

#weights
weights = ['uniform', 'distance']

#algorithm
algorithm = ['auto', 'ball_tree', 'kd_tree',  'brute']

#leaf_size
leaf_size = np. arange (10,60,10)

In [17]:
#Creating the parameter grid

param_grid = {'n_neighbors':n_neighbors, 'weights':weights, 
              'algorithm':algorithm, 'leaf_size': leaf_size }

print (param_grid)

{'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]), 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size': array([10, 20, 30, 40, 50])}


In [18]:
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

knn_grid = GridSearchCV (estimator = knn, param_grid = param_grid, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
knn_grid.fit (x_train, y_train)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits


In [19]:
#Getting the best parameters

knn_grid.best_params_

{'algorithm': 'kd_tree',
 'leaf_size': 10,
 'n_neighbors': 2,
 'weights': 'distance'}

# 
## Training the new model with the GridSearchCV optimized hyperparameters¶
# 
# 

In [13]:
#CREATE the OPTIMIZED KNN MODEL and FIT IT to the training data

grid_knn = KNeighborsClassifier(n_neighbors =  2, weights = 'distance', algorithm = 'kd_tree', leaf_size = 10 )
grid_knn.fit(x_train, y_train)

#Calculating Accuracy of optimized model

gridknn_y_pred = grid_knn.predict(x_test)

#Sensitivity

gridknn_sen = recall_score(y_test,gridknn_y_pred)*100
print('Sensitivity=', gridknn_sen,'%')


Sensitivity= 14.307950191570882 %


In [14]:
#Saving the model

import pickle
filename = 'knn_grid.sav'
pickle.dump(grid_knn, open(filename, 'wb'))

# 
## Hyperparameter tuning with OPTUNA for K-Nearest Neighbor.
# 
# 

In [24]:
#Importing optuna library.

import optuna

In [26]:
def objective(trial):
    
    weights = trial.suggest_categorical ('weights', ['uniform', 'distance'])
    algorithm = trial.suggest_categorical ('algorithm',['auto', 'ball_tree', 'kd_tree',  'brute'] )
    n_neighbors = trial.suggest_int('n_neighbors', 2, 11)
    leaf_size = trial.suggest_int ('leaf_size', 10,60,10)
    
    opt_knn = KNeighborsClassifier(n_neighbors = n_neighbors, weights = weights, algorithm = algorithm , leaf_size = leaf_size)

    score = cross_val_score(opt_knn, x, y, n_jobs=4, cv=5, scoring = 'recall')
    accuracy = score.mean()
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[32m[I 2022-11-11 19:21:19,931][0m A new study created in memory with name: no-name-732809f3-6385-4e06-877c-835a38f93605[0m
[32m[I 2022-11-11 19:22:56,862][0m Trial 0 finished with value: 0.06957351786263449 and parameters: {'weights': 'distance', 'algorithm': 'auto', 'n_neighbors': 8, 'n_estimators': 950, 'leaf_size': 10}. Best is trial 0 with value: 0.06957351786263449.[0m
[32m[I 2022-11-11 19:24:34,520][0m Trial 1 finished with value: 0.06986084193493126 and parameters: {'weights': 'distance', 'algorithm': 'brute', 'n_neighbors': 7, 'n_estimators': 400, 'leaf_size': 60}. Best is trial 1 with value: 0.06986084193493126.[0m
[32m[I 2022-11-11 19:26:09,037][0m Trial 2 finished with value: 0.15541831655325147 and parameters: {'weights': 'distance', 'algorithm': 'auto', 'n_neighbors': 2, 'n_estimators': 300, 'leaf_size': 30}. Best is trial 2 with value: 0.15541831655325147.[0m
[32m[I 2022-11-11 19:35:20,484][0m Trial 3 finished with value: 0.04198832360171144 and parameters:

In [27]:
# Getting the best score:
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.15541831655325147


In [28]:
# Getting the best parameters:
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'weights': 'distance', 'algorithm': 'auto', 'n_neighbors': 2, 'n_estimators': 300, 'leaf_size': 30}


# 
## Training the new model with the OPTUNA optimized hyperparameters¶
# 
# 

In [15]:
#CREATE the OPTIMIZED KNN MODEL and FIT IT to the training data

opt_knn = KNeighborsClassifier (n_neighbors =  2, weights = 'distance', algorithm = 'auto', leaf_size = 30)
opt_knn.fit(x_train, y_train)

#Calculating Accuracy of optimized model

optknn_y_pred = opt_knn.predict(x_test)

#Sensitivity
optknn_sen = recall_score(y_test,optknn_y_pred)*100
print('Sensitivity=', optknn_sen,'%')


Sensitivity= 14.39176245210728 %


In [18]:
#Saving the model

import pickle
filename = 'knn_optuna.sav'
pickle.dump(opt_knn, open(filename, 'wb'))

#
# 3.-Random Forest
# =============================================================
#

# 
## Create base model for hyperparameter tuning.
# 

In [16]:
#CREATE the RANDOM FOREST MODEL and FIT IT to the training data

rand_forest = RandomForestClassifier()
rand_forest.fit(x_train, y_train)

#Calculating Accuracy of model

rf_y_pred = rand_forest.predict(x_test)

#Sensitivity
rf_sen = recall_score(y_test,rf_y_pred)*100
print('Sensitivity=', rf_sen,'%')


Sensitivity= 12.954980842911878 %


# 
## Hyperparameter tuning with GridSearchCV for Random Forest
# 
# 

In [33]:
#Imputing the parameters for the parameter grid

# Criterion
criterion = ['gini', 'entropy']

#Number of trees
n_estimators = np.arange (300,1100,50)

#Number of features to consider at every split
max_features = ['auto' , 'sqrt','log2']

#Maximum number of level in the trees
max_depth = np. arange (2,10,1)

#Minimum samples required to split a node
min_samples_split = np.arange (2,6,1)

#Minimum samples required at each node
min_samples_leaf = [1, 2, 3]

#Method for selecting the samples for training each tree
bootstrap = [True, False]

#Spliting criterion.
criterion = ['gini', 'entropy']

# 
## ROUND-1
#

In [8]:
#Creating the parameter grid

rf_param_grid = {'n_estimators':n_estimators, 'max_depth':max_depth, 
              'min_samples_split':min_samples_split}

print (rf_param_grid)

{'n_estimators': array([ 300,  350,  400,  450,  500,  550,  600,  650,  700,  750,  800,
        850,  900,  950, 1000, 1050]), 'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9]), 'min_samples_split': array([2, 3, 4, 5])}


In [9]:
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

rf_grid = GridSearchCV (estimator = rand_forest, param_grid = rf_param_grid, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
rf_grid.fit (x_train, y_train)


Fitting 5 folds for each of 512 candidates, totalling 2560 fits


In [10]:
#Getting the best parameters

rf_grid.best_params_

{'max_depth': 9, 'min_samples_split': 4, 'n_estimators': 1050}

# 
## Training the new model with ROUND 1 optimized hyperparameters
# 

In [16]:
#CREATE the OPTIMIZED RANDOM FOREST MODEL and FIT IT to the training data

grid_rf = RandomForestClassifier (n_estimators=1050, max_depth= 9, min_samples_split= 4)
grid_rf.fit(x_train, y_train)

#Calculating Accuracy of optimized model

grid_rf_y_pred = grid_rf.predict(x_test)

#Sensitivity
grid_rf_sen = recall_score(y_test,grid_rf_y_pred)*100
print('Sensitivity=', grid_rf_sen,'%')


Sensitivity= 3.148946360153257 %


# 
## ROUND-2
#

In [34]:
#Creating the parameter grid

param_grid = {'criterion':criterion, 'bootstrap':bootstrap, 
              'max_features': max_features, 'min_samples_leaf':min_samples_leaf}

print (param_grid)

In [37]:
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

rf_grid2 = GridSearchCV (estimator = grid_rf, param_grid = param_grid, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
rf_grid2.fit (x_train, y_train)


Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [39]:
#Getting the best parameters

rf_grid2.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'min_samples_split': 2}

# 
## Training the new model with all the GridSearchCV optimized hyperparameters
# 

In [34]:
#CREATE the OPTIMIZED RANDOM FOREST MODEL and FIT IT to the training data

grid2_rf = RandomForestClassifier (n_estimators=1050, max_depth= 9, min_samples_split= 4, bootstrap = False, criterion = 'gini',
                                  max_features = 'log2', min_samples_leaf = 2)
grid2_rf.fit(x_train, y_train)

#Calculating Accuracy of optimized model

grid2rf_y_pred = grid2_rf.predict(x_test)

#Sensitivity
grid2rf_sen = recall_score(y_test,grid2_rf_y_pred)*100
print('Sensitivity=', grid2rf_sen,'%')


Sensitivity= 3.148946360153257 %


In [19]:
#Saving the model

import pickle
filename = 'rf_grid.sav'
pickle.dump(grid2_rf, open(filename, 'wb'))

# 
## Hyperparameter tuning with OPTUNA for Random Forest
# 
# 

In [32]:
#Importing optuna library.

import optuna

In [33]:
def objective(trial):
    
    criterion = trial.suggest_categorical ('criterion', ['gini', 'entropy'])
    max_features = trial.suggest_categorical ('max_features',['auto', 'sqrt','log2'] )
    max_depth = trial.suggest_int('max_depth', 2, 7, log=True)
    n_estimators = trial.suggest_int('n_estimators', 300,3000,50)
    min_samples_split = trial.suggest_int ('min_samples_split', 2,6)
    min_samples_leaf = trial.suggest_int ('min_samples_leaf', 1,5)
    bootstrap = trial.suggest_categorical ('bootstrap', [True, False])

    opt_rf = RandomForestClassifier(criterion =criterion, min_samples_leaf = min_samples_leaf,
            max_depth=max_depth, n_estimators=n_estimators,min_samples_split=min_samples_split, 
                                max_features = max_features, bootstrap = bootstrap)

    score = cross_val_score(opt_rf, x, y, n_jobs=4, cv=5, scoring = 'recall')
    accuracy = score.mean()
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[32m[I 2022-11-12 15:00:47,442][0m A new study created in memory with name: no-name-91f6320b-bdc8-44ac-9a21-04bd1a3e9d4a[0m
[32m[I 2022-11-12 15:19:53,978][0m Trial 0 finished with value: 0.023490444073617705 and parameters: {'criterion': 'entropy', 'max_features': 'sqrt', 'max_depth': 6, 'n_estimators': 3000, 'min_samples_split': 5, 'min_samples_leaf': 3, 'bootstrap': False}. Best is trial 0 with value: 0.023490444073617705.[0m
[32m[I 2022-11-12 15:24:23,723][0m Trial 1 finished with value: 0.0020833333333333333 and parameters: {'criterion': 'gini', 'max_features': 'sqrt', 'max_depth': 4, 'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 0 with value: 0.023490444073617705.[0m
[32m[I 2022-11-12 15:29:29,277][0m Trial 2 finished with value: 0.009446406936592757 and parameters: {'criterion': 'entropy', 'max_features': 'log2', 'max_depth': 5, 'n_estimators': 1400, 'min_samples_split': 2, 'min_samples_leaf': 3, 'bootstrap': Tr

In [34]:
# Getting the best score:
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.03552294825831832


In [36]:
# Getting the best parameters:
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'criterion': 'gini', 'max_features': 'sqrt', 'max_depth': 7, 'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 5, 'bootstrap': True}


# 
## Training the new model with the OPTUNA optimized hyperparameters¶
# 
# 

In [19]:
#CREATE the OPTIMIZED RANDOM FOREST MODEL and FIT IT to the training data

opt_rf = RandomForestClassifier (max_features = 'sqrt', n_estimators=300, max_depth= 7, min_samples_split= 4, criterion= 'gini', bootstrap = True, min_samples_leaf = 5)
opt_rf.fit(x_train, y_train)

#Calculating Accuracy of optimized model

optrf_y_pred = opt_rf.predict(x_test)

#Sensitivity
optrf_sen = recall_score(y_test,optrf_y_pred)*100
print('Sensitivity=', optrf_sen,'%')


Sensitivity= 2.1551724137931036 %


In [20]:
#Saving the model

import pickle
filename = 'rf_optuna.sav'
pickle.dump(opt_rf, open(filename, 'wb'))

# 
# 4. XGBoost
# =============================================================

# 
## Create base model for hyperparameter tuning.
#

In [23]:
#CREATE XGBOOST model and FIT IT to the training data


xgboost = xgb.XGBClassifier (objective = 'binary:logistic', seed = 42, use_label_encoder=False)
xgboost.fit (x_train, y_train, verbose = True, early_stopping_rounds = 10, eval_metric = 'aucpr', 
             eval_set = ([(x_test, y_test)]))

#Calculating Accuracy of model

xgb_y_pred = xgboost.predict(x_test)

#Sensitivity
xgb_sen = recall_score(y_test,xgb_y_pred)*100
print('Sensitivity=', xgb_sen,'%')


[0]	validation_0-aucpr:0.31653
[1]	validation_0-aucpr:0.32745
[2]	validation_0-aucpr:0.33300
[3]	validation_0-aucpr:0.34011
[4]	validation_0-aucpr:0.34317
[5]	validation_0-aucpr:0.34726
[6]	validation_0-aucpr:0.35199
[7]	validation_0-aucpr:0.35506
[8]	validation_0-aucpr:0.35627
[9]	validation_0-aucpr:0.35784
[10]	validation_0-aucpr:0.35892
[11]	validation_0-aucpr:0.36048
[12]	validation_0-aucpr:0.36159
[13]	validation_0-aucpr:0.36238
[14]	validation_0-aucpr:0.36260
[15]	validation_0-aucpr:0.36339
[16]	validation_0-aucpr:0.36374
[17]	validation_0-aucpr:0.36399
[18]	validation_0-aucpr:0.36456
[19]	validation_0-aucpr:0.36456
[20]	validation_0-aucpr:0.36491
[21]	validation_0-aucpr:0.36517
[22]	validation_0-aucpr:0.36529
[23]	validation_0-aucpr:0.36547
[24]	validation_0-aucpr:0.36544
[25]	validation_0-aucpr:0.36556
[26]	validation_0-aucpr:0.36549
[27]	validation_0-aucpr:0.36537
[28]	validation_0-aucpr:0.36514
[29]	validation_0-aucpr:0.36526
[30]	validation_0-aucpr:0.36524
[31]	validation_0-

# 
## Hyperparameter tuning with GridSearchCV for XGBoost.
# 

In [8]:
#Imputing the parameters for the parameter grid

#Maximum number of level in the trees
max_depth = np. arange (2,15,2)

#Learning rate
learning_rate = (0.05,0.10,0.15,0.20,0.25,0.30)

#Minimum Child Weight
min_child_weight = (1,3,5,7)

#Gamma
gamma = (0.0,0.1,0.2,0.3,0.4)

#Columns Sampled by Tree.
colsample_bytree = (0.3,0.4,0.5,0.6,0.7)

# Scale Post Weight
scale_pos_weight = (1,3,5)

# 
## ROUND 1
#

In [19]:
#Creating the parameter grid

param_grid = {'max_depth':max_depth, 'learning_rate':learning_rate, 
              'min_child_weight':min_child_weight}

print (param_grid)

{'max_depth': array([ 2,  4,  6,  8, 10, 12, 14]), 'learning_rate': (0.05, 0.1, 0.15, 0.2, 0.25, 0.3), 'min_child_weight': (1, 3, 5, 7)}


In [20]:
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

xgb_grid = GridSearchCV (estimator = xgboost, param_grid = param_grid, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
xgb_grid.fit (x_train, y_train)


Fitting 5 folds for each of 168 candidates, totalling 840 fits


In [21]:
#Getting the best parameters

xgb_grid.best_params_

{'learning_rate': 0.3, 'max_depth': 14, 'min_child_weight': 1}

# 
## Training the new model with the ROUND 1 optimized hyperparameters¶
# 
# 

In [7]:
#CREATE OPTIMIZED XGBOOST model and FIT IT to the training data


grid_xgb = xgb.XGBClassifier (objective = 'binary:logistic', seed = 42, use_label_encoder=False, learning_rate = 0.3, 
                            max_depth = 14, min_child_weight = 1)
grid_xgb.fit (x_train, y_train, verbose = True, early_stopping_rounds = 10, eval_metric = 'aucpr', 
             eval_set = ([(x_test, y_test)]))

#Calculating Accuracy of model

gridxgb_y_pred = grid_xgb.predict(x_test)

#Sensitivity
gridxgb_sen = recall_score(y_test,gridxgb_y_pred)*100
print('Sensitivity=', gridxgb_sen,'%')


[0]	validation_0-aucpr:0.31075
[1]	validation_0-aucpr:0.31780
[2]	validation_0-aucpr:0.31918
[3]	validation_0-aucpr:0.32138
[4]	validation_0-aucpr:0.32226
[5]	validation_0-aucpr:0.32344
[6]	validation_0-aucpr:0.32369
[7]	validation_0-aucpr:0.32395
[8]	validation_0-aucpr:0.32424
[9]	validation_0-aucpr:0.32526
[10]	validation_0-aucpr:0.32588
[11]	validation_0-aucpr:0.32644
[12]	validation_0-aucpr:0.32620
[13]	validation_0-aucpr:0.32694
[14]	validation_0-aucpr:0.32643
[15]	validation_0-aucpr:0.32666
[16]	validation_0-aucpr:0.32684
[17]	validation_0-aucpr:0.32628
[18]	validation_0-aucpr:0.32613
[19]	validation_0-aucpr:0.32629
[20]	validation_0-aucpr:0.32616
[21]	validation_0-aucpr:0.32602
[22]	validation_0-aucpr:0.32527
[23]	validation_0-aucpr:0.32459
Sensitivity= 12.176724137931034 %


# 
## ROUND 2
#

In [9]:
#Creating the parameter grid 

param_grid2 = {'scale_pos_weight': scale_pos_weight, 'gamma': gamma, 'colsample_bytree': colsample_bytree }

print (param_grid2)

{'scale_pos_weight': (1, 3, 5), 'gamma': (0.0, 0.1, 0.2, 0.3, 0.4), 'colsample_bytree': (0.3, 0.4, 0.5, 0.6, 0.7)}


In [10]:
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

xgb_grid2 = GridSearchCV (estimator = grid_xgb, param_grid = param_grid2, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
xgb_grid2.fit (x_train, y_train)


Fitting 5 folds for each of 75 candidates, totalling 375 fits


In [11]:
#Getting the best parameters

xgb_grid2.best_params_

{'colsample_bytree': 0.3, 'gamma': 0.2, 'scale_pos_weight': 5}

# 
## Training the new model with all the GridSearchCV optimized hyperparameters¶
# 
# 

In [21]:
#CREATE OPTIMIZED XGBOOST model and FIT IT to the training data


xgb_grid2 = xgb.XGBClassifier (objective = 'binary:logistic', seed = 42, use_label_encoder=False, learning_rate = 0.3, 
                            max_depth = 14, min_child_weight = 1, colsample_bytree = 0.3, gamma = 0.2, scale_pos_weight = 5)
xgb_grid2.fit (x_train, y_train, verbose = True, early_stopping_rounds = 10, eval_metric = 'aucpr', 
             eval_set = ([(x_test, y_test)]))

#Calculating Accuracy of model

xgbgrid2_y_pred = xgb_grid2.predict(x_test)

#Sensitivity
xgbgrid2_sen = recall_score(y_test,xgbgrid2_y_pred)*100
print('Sensitivity=', xgbgrid2_sen,'%')


[0]	validation_0-aucpr:0.20394
[1]	validation_0-aucpr:0.28027
[2]	validation_0-aucpr:0.29643
[3]	validation_0-aucpr:0.30692
[4]	validation_0-aucpr:0.31604
[5]	validation_0-aucpr:0.32108
[6]	validation_0-aucpr:0.32066
[7]	validation_0-aucpr:0.33267
[8]	validation_0-aucpr:0.33553
[9]	validation_0-aucpr:0.33784
[10]	validation_0-aucpr:0.33704
[11]	validation_0-aucpr:0.33703
[12]	validation_0-aucpr:0.33818
[13]	validation_0-aucpr:0.33924
[14]	validation_0-aucpr:0.34023
[15]	validation_0-aucpr:0.34195
[16]	validation_0-aucpr:0.34251
[17]	validation_0-aucpr:0.34345
[18]	validation_0-aucpr:0.34439
[19]	validation_0-aucpr:0.34502
[20]	validation_0-aucpr:0.34510
[21]	validation_0-aucpr:0.34293
[22]	validation_0-aucpr:0.34318
[23]	validation_0-aucpr:0.34341
[24]	validation_0-aucpr:0.34363
[25]	validation_0-aucpr:0.34244
[26]	validation_0-aucpr:0.34011
[27]	validation_0-aucpr:0.34025
[28]	validation_0-aucpr:0.33938
[29]	validation_0-aucpr:0.33932
[30]	validation_0-aucpr:0.33719
Sensitivity= 54.52

In [22]:
#Saving the model

import pickle
filename = 'xgb_grid.sav'
pickle.dump(xgb_grid2, open(filename, 'wb'))

# 
## Hyperparameter tuning with OPTUNA for XGBoost.
# 
# 

In [13]:
#Importing optuna library.

import optuna

In [28]:
def objective(trial):
    
    max_depth = trial.suggest_int('max_depth', 2, 15, log=True)
    min_child_weight =  trial.suggest_int('min_child_weight', 1,15)
    gamma = trial.suggest_float ('gamma', 0,0.4)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.3,0.9)
    scale_pos_weight = trial.suggest_int ('scale_pos_weight', 1,11)
    learning_rate = trial.suggest_float ('learning_rate', 0.05,0.5)
    
    opt_xgb = xgb.XGBClassifier (objective = 'binary:logistic', seed = 42, use_label_encoder=False, max_depth = max_depth,
                                 learning_rate = learning_rate, min_child_weight = min_child_weight, gamma = gamma,
                                 colsample_bytree = colsample_bytree, scale_pos_weight = scale_pos_weight)

    score = cross_val_score(opt_xgb, x, y, n_jobs=4, cv=5, scoring = 'recall')
    accuracy = score.mean()
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[32m[I 2022-11-12 14:48:59,618][0m A new study created in memory with name: no-name-f7b67d2f-46c1-4efa-a0d6-6f47a1a4496c[0m
[32m[I 2022-11-12 14:49:09,547][0m Trial 0 finished with value: 0.8369307826783322 and parameters: {'max_depth': 5, 'min_child_weight': 11, 'gamma': 0.05492940075967834, 'colsample_bytree': 0.847302733427076, 'scale_pos_weight': 11, 'learning_rate': 0.13220759527405818}. Best is trial 0 with value: 0.8369307826783322.[0m
[32m[I 2022-11-12 14:49:15,512][0m Trial 1 finished with value: 0.5671130777312008 and parameters: {'max_depth': 4, 'min_child_weight': 8, 'gamma': 0.179663942890913, 'colsample_bytree': 0.6120208721432394, 'scale_pos_weight': 4, 'learning_rate': 0.3859880033847078}. Best is trial 0 with value: 0.8369307826783322.[0m
[32m[I 2022-11-12 14:49:27,333][0m Trial 2 finished with value: 0.762975306654448 and parameters: {'max_depth': 7, 'min_child_weight': 15, 'gamma': 0.14416310962444653, 'colsample_bytree': 0.729259999015774, 'scale_pos_weig

In [29]:
# Getting the best score:
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.8395886222582728


In [30]:
# Getting the best parameters:
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'max_depth': 3, 'min_child_weight': 4, 'gamma': 0.31697002808481123, 'colsample_bytree': 0.732879490463523, 'scale_pos_weight': 11, 'learning_rate': 0.13164217591978739}


# 
## Training the new model with the OPTUNA optimized hyperparameters¶
# 
# 

In [30]:
#CREATE OPTIMIZED XGBOOST model and FIT IT to the training data


opt_xgb = xgb.XGBClassifier (objective = 'binary:logistic', seed = 42, use_label_encoder=False, learning_rate = 0.13164217591978739, 
                                max_depth = 3, min_child_weight = 4, colsample_bytree = 0.732879490463523, gamma = 0.31697002808481123,
                               scale_pos_weight = 11)
opt_xgb.fit (x_train, y_train, verbose = True, early_stopping_rounds = 10, eval_metric = 'aucpr', 
             eval_set = ([(x_test, y_test)]))

#Calculating Accuracy of model

optxgb_y_pred = opt_xgb.predict(x_test)

#Sensitivity
optxgb_sen = recall_score(y_test,optxgb_y_pred)*100
print('Sensitivity=', optxgb_sen,'%')


[0]	validation_0-aucpr:0.22539
[1]	validation_0-aucpr:0.28210
[2]	validation_0-aucpr:0.29818
[3]	validation_0-aucpr:0.31167
[4]	validation_0-aucpr:0.31075
[5]	validation_0-aucpr:0.31359
[6]	validation_0-aucpr:0.31306
[7]	validation_0-aucpr:0.32235
[8]	validation_0-aucpr:0.32601
[9]	validation_0-aucpr:0.32682
[10]	validation_0-aucpr:0.33236
[11]	validation_0-aucpr:0.33258
[12]	validation_0-aucpr:0.33780
[13]	validation_0-aucpr:0.33967
[14]	validation_0-aucpr:0.34062
[15]	validation_0-aucpr:0.34190
[16]	validation_0-aucpr:0.34405
[17]	validation_0-aucpr:0.34555
[18]	validation_0-aucpr:0.35020
[19]	validation_0-aucpr:0.35085
[20]	validation_0-aucpr:0.35150
[21]	validation_0-aucpr:0.35198
[22]	validation_0-aucpr:0.35322
[23]	validation_0-aucpr:0.35440
[24]	validation_0-aucpr:0.35442
[25]	validation_0-aucpr:0.35609
[26]	validation_0-aucpr:0.35721
[27]	validation_0-aucpr:0.35787
[28]	validation_0-aucpr:0.35819
[29]	validation_0-aucpr:0.35938
[30]	validation_0-aucpr:0.35958
[31]	validation_0-

In [31]:
#Saving the model

import pickle
filename = 'xgb_optuna.sav'
pickle.dump(opt_xgb, open(filename, 'wb'))

# 
# 5. Classification Tree
# =============================================================

# 
## Create base model for hyperparameter tuning.
#

In [27]:
#CREATE CLASSIFICATION TREE model and FIT IT to the training data


classtree = DecisionTreeClassifier (random_state=42)
classtree.fit (x_train, y_train)

#Calculating Accuracy of model

classtree_y_pred = classtree.predict(x_test)

#Sensitivity
classtree_sen = recall_score(y_test,classtree_y_pred)*100
print('Sensitivity=', classtree_sen,'%')


Sensitivity= 27.873563218390807 %


# 
# Cost Complexity Pruning using the Alpha value
# 

In [102]:
#Imputing the parameters for the parameter grid

#Cost complexity pruning alpha value.
ccp_alpha = np. arange (0,0.99,0.001)

In [103]:
#Creating the parameter grid

ct_param_grid = {'ccp_alpha':ccp_alpha}

print (ct_param_grid)

{'ccp_alpha': array([0.   , 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
       0.009, 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017,
       0.018, 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026,
       0.027, 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035,
       0.036, 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044,
       0.045, 0.046, 0.047, 0.048, 0.049, 0.05 , 0.051, 0.052, 0.053,
       0.054, 0.055, 0.056, 0.057, 0.058, 0.059, 0.06 , 0.061, 0.062,
       0.063, 0.064, 0.065, 0.066, 0.067, 0.068, 0.069, 0.07 , 0.071,
       0.072, 0.073, 0.074, 0.075, 0.076, 0.077, 0.078, 0.079, 0.08 ,
       0.081, 0.082, 0.083, 0.084, 0.085, 0.086, 0.087, 0.088, 0.089,
       0.09 , 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098,
       0.099, 0.1  , 0.101, 0.102, 0.103, 0.104, 0.105, 0.106, 0.107,
       0.108, 0.109, 0.11 , 0.111, 0.112, 0.113, 0.114, 0.115, 0.116,
       0.117, 0.118, 0.119, 0.12 , 0.121, 0.122, 0.123, 0.124, 0.125,
      

In [104]:
# Finding the best alpha value using GridSearchCV
# Creating the Grid with previously defined hyperparameters and fiting it to the train data

ct_grid = GridSearchCV (estimator = classtree, param_grid = ct_param_grid, cv = 5, verbose = 2, n_jobs = -1 ,scoring = 'recall')
ct_grid.fit (x_train, y_train)


Fitting 5 folds for each of 990 candidates, totalling 4950 fits


In [105]:
#Getting the best parameters

ct_grid.best_params_

{'ccp_alpha': 0.0}

# 
## Training the new model with the GridSearchCV optimized hyperparameters¶
# 
# 

In [24]:
#CREATE OPTIMIZED CLASSIFICATION TREE model and FIT IT to the training data


ct_grid = DecisionTreeClassifier (random_state=42, ccp_alpha = 0.0 )
ct_grid.fit (x_train, y_train)

#Calculating Accuracy of model

ctgrid_y_pred = ct_grid.predict(x_test)

#Sensitivity
ctgrid_sen = recall_score(y_test,ctgrid_y_pred)*100
print('Sensitivity=', ctgrid_sen,'%')


Sensitivity= 27.873563218390807 %


In [25]:
#Saving the model

import pickle
filename = 'ct_grid.sav'
pickle.dump(ct_grid, open(filename, 'wb'))

# 
## Hyperparameter tuning with OPTUNA for Classification Tree.
# 
# 

In [96]:
#Importing optuna library.

import optuna

In [98]:
def objective(trial):
    
    ccp_alpha = trial.suggest_float ('ccp_alpha', 0,0.99)
    
    
    opt_ct = DecisionTreeClassifier (random_state=42, ccp_alpha = ccp_alpha)

    score = cross_val_score(opt_ct, x, y, n_jobs=4, cv=5, scoring = 'recall')
    accuracy = score.mean()
    return accuracy


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[32m[I 2022-11-13 16:27:18,175][0m A new study created in memory with name: no-name-d9c5e52e-e856-47eb-81ad-ed56d8ff8bfd[0m
[32m[I 2022-11-13 16:27:26,051][0m Trial 0 finished with value: 0.0 and parameters: {'ccp_alpha': 0.5854986844801698}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-13 16:27:32,091][0m Trial 1 finished with value: 0.0 and parameters: {'ccp_alpha': 0.18614567039040242}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-13 16:27:37,883][0m Trial 2 finished with value: 0.0 and parameters: {'ccp_alpha': 0.8449850291151026}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-13 16:27:43,751][0m Trial 3 finished with value: 0.0 and parameters: {'ccp_alpha': 0.3834101737002091}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-13 16:27:49,821][0m Trial 4 finished with value: 0.0 and parameters: {'ccp_alpha': 0.8862187811076142}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-11-13 16:27:55,978][0m Trial 5 finished with value: 0.0 and paramete

In [99]:
# Getting the best score:
print(f"The best value is : \n{study.best_value}")

The best value is : 
0.0


In [100]:
# Getting the best parameters:
print(f"The best parameters are : \n{study.best_params}")

The best parameters are : 
{'ccp_alpha': 0.5854986844801698}


# 
## Training the new model with the OPTUNA optimized hyperparameters¶
# 
# 

In [26]:
#CREATE OPTIMIZED CLASSIFICATION TREE model and FIT IT to the training data


opt_ct = DecisionTreeClassifier (random_state=42, ccp_alpha = 0.5854986844801698)
opt_ct.fit (x_train, y_train)
#Calculating Accuracy of model

optct_y_pred = opt_ct.predict(x_test)

#Sensitivity
optct_sen = recall_score(y_test,optct_y_pred)*100
print('Sensitivity=', optct_sen,'%')


Sensitivity= 0.0 %


In [28]:
#Saving the model

import pickle
filename = 'ct_optuna.sav'
pickle.dump(opt_ct, open(filename, 'wb'))

# 
# Models scores comparision table
# =============================================================
# 

In [43]:
# Building a table to compare the scores of the base and optimized models.

logis = [lr_sen, lrgrid_sen,optlr_sen]
knei = [knn_sen, gridknn_sen, optknn_sen]
rando = [rf_sen, grid2rf_sen, optrf_sen]
xgbo = [xgb_sen, xgbgrid2_sen, optxgb_sen]
classif = [classtree_sen, ctgrid_sen,optct_sen]
scores = ['Base_Model', 'GridSearchCV', 'OPTUNA']

data = list(zip(scores, logis, knei, rando, xgbo, classif))
eval_scores = pd.DataFrame(data, columns=['Scores','Logistic Regresion', 'K-Nearest Neighbor', 'Random Forest', 'XGBoost', 
                                          'Classification Tree'])
eval_scores

Unnamed: 0,Scores,Logistic Regresion,K-Nearest Neighbor,Random Forest,XGBoost,Classification Tree
0,Base_Model,14.475575,7.483238,12.954981,9.530651,27.873563
1,GridSearchCV,14.475575,14.30795,3.148946,54.525862,27.873563
2,OPTUNA,[],14.391762,2.155172,83.967912,0.0


In [None]:
# We can see that the XGBoost is the model that performed the best after optimization. We'll try to improve the sensitivity 
#even more using model esemble.