# Performance Tuning the LogisticRegression Model

## 1. Import the necessary libraries

In [1]:
%matplotlib inline
from IPython.display import display, clear_output
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, make_scorer, precision_recall_curve, precision_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
import ipywidgets as widgets
import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
plt.style.use("ggplot")
import pydot
import graphviz
from IPython.display import Image
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.tree import export_graphviz
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
print("The libraries were successfully loaded!")

The libraries were successfully loaded!


## 2. Import the data

In [2]:
# Import Dataset
df = pd.read_csv('data/cleansed-healthcare-dataset-stroke-data.csv',delimiter=',',header='infer')
# Display top rows of the dataset
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,1,81.0,0,0,1,2,1,186.21,29.0,1,1


## 3. Split the data into a training and testing set

In [3]:
# Separate features and labels
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']
label = 'stroke'
X, y = df[features].values, df[label].values

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)


print('Training cases: %d\nTest cases: %d' % (X_train.size, X_test.size))

Training cases: 34360
Test cases: 14730


## 4. Define the Grid Search Function to test

In [4]:
def grid_search_wrapper(refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    
    #  Define grid search
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    
    # Train the models 
    grid_search.fit(X_train.values, y_train.values)

    # Test the models
    y_pred = grid_search.predict(X_test.values)
    
    # Print the best hyperparameters
    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

## 5. Logistic Regression Optimisation

In [6]:
# Create model
reg = 0.01
clf = LogisticRegression(random_state=0)

# Define models hyperparameters
param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'dual' : [True, False],
    'tol' : [0.00001, 0.0001, 0.001, 0.01, 0.1],
    'C' : [1, 10, 100, 1000, 10000],
    'fit_intercept' :[True, False],
    'intercept_scaling' : [0.05, 0.075, 0.1, 0.125, 0.15],
    'class_weight' : ['balanced'],
    'solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter' : [75, 100, 125, 150, 175, 200],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'warm_start' : [True, False]
}

# Define scores to test
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 1")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
 'dual': [True, False],
 'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1],
 'C': [1, 10, 100, 1000, 10000],
 'fit_intercept': [True, False],
 'intercept_scaling': [0.05, 0.075, 0.1, 0.125, 0.15],
 'class_weight': ['balanced'],
 'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
 'max_iter': [75, 100, 125, 150, 175, 200],
 'multi_class': ['auto', 'ovr', 'multinomial'],
 'warm_start': [True, False]}

In [8]:
# Define new hyperparameters
param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'dual' : [True, False],
    'tol' : [0.00001, 0.0001, 0.001, 0.01, 0.1],
    'C' : [90, 100, 110],
    'fit_intercept' :[True, False],
    'intercept_scaling' : [0.025, 0.05, 0.055],
    'class_weight' : ['balanced'],
    'solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter' : [160, 175, 180],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'warm_start' : [True, False]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 2")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [10]:
# Define new hyperparameters
param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'dual' : [True, False],
    'tol' : [0.0000001, 0.000001, 0.00001, 0.0001],
    'C' : [99, 100, 101],
    'fit_intercept' :[True, False],
    'intercept_scaling' : [0.01, 0.015, 0.02, 0.025, 0.03],
    'class_weight' : ['balanced'],
    'solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter' : [174, 175, 176],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'warm_start' : [True, False]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 3")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [12]:
# Define new hyperparameters
param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'dual' : [True, False],
    'tol' : [0.000000001, 0.00000001, 0.0000001],
    'C' : [99, 100, 101],
    'fit_intercept' :[True, False],
    'intercept_scaling' : [0.001, 0.0075, 0.01],
    'class_weight' : ['balanced'],
    'solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter' : [174, 175, 176],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'warm_start' : [True, False]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 4")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [14]:
# Define new hyperparameters
param_grid = {
    'penalty' : ['l2'],
    'dual' : [False],
    'tol' : [0.000000001, 0.0000000001, 0.00000000001],
    'C' : [100],
    'fit_intercept' :[True],
    'intercept_scaling' : [0.001,0.0001,0.00001],
    'class_weight' : ['balanced'],
    'solver' : ['saga'],
    'max_iter' : [175],
    'multi_class': ['multinomial'],
    'warm_start' : [True]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 5")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [16]:
# Define new hyperparameters

param_grid = {
    'penalty' : ['l2'],
    'dual' : [False],
    'tol' : [0.000000001, 0.0000000011, 0.0000000009],
    'C' : [100],
    'fit_intercept' :[True],
    'intercept_scaling' : [0.001,0.0011,0.0009],
    'class_weight' : ['balanced'],
    'solver' : ['saga'],
    'max_iter' : [175],
    'multi_class': ['multinomial'],
    'warm_start' : [True]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 6")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

## 6. Random Forest Optimisation

In [None]:
# Create model
clf = RandomForestClassifier(random_state=0)

# Define models hyperparameters
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [2, 3, 4, 5], 
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'n_estimators' : [75, 100, 125, 150, 175],
    'max_depth': [10, 15, 20, 25, 30],
    'max_features': [8, 9, 10, 11]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 1")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [70, 75, 80],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 2")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [74, 75, 76],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 3")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters

param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [72, 73, 74],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

print("Round 4")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters

param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [70, 71, 72],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 5")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')


## 7. KNN Optimisation

In [None]:
# Create model
clf = KNeighborsClassifier()

# Define models hyperparameters
param_grid = {
    'n_neighbors': [1, 5, 10],
    'weights': ['uniform', 'distance'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [10, 30, 50],
    'p': [1, 2],
    'n_jobs': [1, 5, 10]
}

print("Round 1")
# Perform grid search to identify best hyperparameters for recall_score
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters
param_grid = {
    'n_neighbors': [1, 2, 3],
    'weights': ['uniform', 'distance'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [5, 10, 15],
    'p': [1, 2, 3],
    'n_jobs': [1, 2, 3]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 2")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters
param_grid = {
    'n_neighbors': [1, 2],
    'weights': ['uniform'], 
    'algorithm': ['auto'],
    'leaf_size' : [4, 5, 6],
    'p': [1, 2, 3],
    'n_jobs': [1, 2]
}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 3")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters
param_grid = {
    'n_neighbors': [1, 2],
    'weights': ['uniform'], 
    'algorithm': ['auto'],
    'leaf_size' : [1, 2, 3, 4],
    'p': [1, 2, 3],
    'n_jobs': [1, 2]
}

print("Round 4")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

## 8. SVC Optimisation

In [None]:
# Create model
clf = SVC(random_state=0, probability=True)

# Define models hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    'degree': [1, 3, 5],
    'gamma' : ['scale', 'auto'],
    'coef0': [0, 1],
    'shrinking': [True, False],
    'tol': [0.00001, 0.001, 0.1],
    'cache_size': [100, 200, 300],
    'verbose': [True, False],
    'max_iter': [1, 3, 5],
    'decision_function_shape': ['ovo', 'ovr'],
    'break_ties': [True, False]
}

print("Round 1")
# Perform grid search to identify best hyperparameters for recall_score
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define new hyperparameters
param_grid = {'C': [0.01, 0.1, 0.5],
 'kernel': ['sigmoid'],
 'degree': [0, 1, 2],
 'gamma': ['auto'],
 'coef0': [0],
 'shrinking': [True],
 'tol': [1.00000005, 1.0000005, 1.000005],
 'cache_size': [50, 100, 150],
 'verbose': [True],
 'max_iter': [0, 1, 2],
 'decision_function_shape': ['ovr'],
 'break_ties': [True]}

# Perform grid search to identify best hyperparameters for recall_score
print("Round 2")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Define models hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    'degree': [1, 3, 5],
    'gamma' : ['scale', 'auto'],
    'coef0': [0, 1],
    'shrinking': [True, False],
    'tol': [0.00001, 0.001, 0.1],
    'cache_size': [100, 200, 300],
    'verbose': [True, False],
    'max_iter': [1, 3, 5],
    'decision_function_shape': ['ovo', 'ovr'],
    'break_ties': [True, False]
}

# Perform grid search to identify best hyperparameters for precision_score
print("Round 3")
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

In [None]:
# Define models hyperparameters
param_grid = {
    'C': [0.001, 0.01, 0.1],
    'kernel': ['poly'], 
    'degree': [5, 10, 15],
    'gamma' : ['auto'],
    'coef0': [0],
    'shrinking': [True],
    'tol': [0.0000001, 0.000001, 0.00001],
    'cache_size': [50, 100, 150],
    'verbose': [True],
    'max_iter': [5, 10, 15],
    'decision_function_shape': ['ovr'],
    'break_ties': [True]
}

# Perform grid search to identify best hyperparameters for precision_score
print("Round 4")
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

In [None]:
 # Define models hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'], 
    'degree': [1, 3, 5],
    'gamma' : ['scale', 'auto'],
    'coef0': [0, 1],
    'shrinking': [True, False],
    'tol': [0.00001, 0.001, 0.1],
    'cache_size': [100, 200, 300],
    'verbose': [True, False],
    'max_iter': [1, 3, 5],
    'decision_function_shape': ['ovo', 'ovr'],
    'break_ties': [True, False]
}

print("Round 5")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')