# Performance Tuning the RandomForestClassifier Model

## 1. Import the necessary libraries

In [1]:
%matplotlib inline
from IPython.display import display, clear_output
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, make_scorer, precision_recall_curve, precision_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
import ipywidgets as widgets
import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

plt.style.use("ggplot")

print("The libraries were successfully loaded!")


The libraries were successfully loaded!


## 2. Import the data

In [3]:
df = pd.read_csv('data/cleansed-healthcare-dataset-stroke-data.csv',delimiter=',',header='infer')

# mimic the pipline for numerical variables
#scaler = StandardScaler()

#x = pd.DataFrame(df["age"], columns = ["age"])
#x["avg_glucose_level"] = df["avg_glucose_level"]
#x["bmi"] = df["bmi"]

#scaled_data = scaler.fit_transform(x)
#scaled_data = pd.DataFrame(scaled_data)

#df["age"] = scaled_data[0]
#df["avg_glucose_level"] = scaled_data[1]
#df["bmi"] = scaled_data[2]

# mimic the peipline for categorical variables 
#encoder = OneHotEncoder(handle_unknown='ignore')

#y = pd.DataFrame(df["gender"], columns = ["gender"])
#y["hypertension"] = df["hypertension"]
#y["heart_disease"] = df["heart_disease"]
#y["ever_married"] = df["ever_married"]
#y["work_type"] = df["work_type"]
#y["Residence_type"] = df["Residence_type"]
#y["smoking_status"] = df["smoking_status"]

#encoder_data = encoder.fit_transform(y)
#encoder_data = pd.DataFrame(encoder_data)

#df["gender"] = encoder_data[0]
#df["hypertension"] = encoder_data[1]
#df["heart_disease"] = encoder_data[2]
#df["ever_married"] = encoder_data[3]
#df["work_type"] = encoder_data[4]
#df["Residence_type"] = encoder_data[5]
#df["smoking_status"] = encoder_data[6]

df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,1,81.0,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4904,0,13.0,0,0,0,4,0,103.08,18.6,0,0
4905,0,81.0,0,0,1,3,1,125.20,40.0,2,0
4906,0,35.0,0,0,1,3,0,82.99,30.6,2,0
4907,1,51.0,0,0,1,2,0,166.29,25.6,1,0


## 3. Split the data into a training and testing set

In [4]:
# Separate features and labels
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status']
label = 'stroke'
X, y = df[features].values, df[label].values

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)


print('Training cases: %d\nTest cases: %d' % (X_train.size, X_test.size))

Training cases: 34360
Test cases: 14730


## 4. Define the RandomForestClassifier parameters to test

In [5]:
# First build a generic classifier and setup a parameter grid; random forests have many tunable parameters, which make it suitable for GridSearchCV. The scorers dictionary can be used as the scoring argument in GridSearchCV. When multiple scores are passed, GridSearchCV.cv_results_ will return scoring metrics for each of the score types provided.
clf = RandomForestClassifier(random_state=0)

param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [2, 3, 4, 5], 
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'n_estimators' : [75, 100, 125, 150, 175],
    'max_depth': [10, 15, 20, 25, 30],
    'max_features': [8, 9, 10, 11]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

param_grid

{'bootstrap': [False, True],
 'min_samples_split': [2, 3, 4, 5],
 'min_samples_leaf': [1, 2, 3, 4, 5],
 'n_estimators': [75, 100, 125, 150, 175],
 'max_depth': [10, 15, 20, 25, 30],
 'max_features': [8, 9, 10, 11]}

In [6]:
def grid_search_wrapper(refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train.values, y_train.values)

    # make the predictions
    y_pred = grid_search.predict(X_test.values)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

In [None]:
grid_search_clf = grid_search_wrapper(refit_score='precision_score')
grid_search_clf

In [None]:
results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_precision_score', ascending=False)

## 5. Test to see which parameters achieve the largest recall_score

In [7]:
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

  self.best_estimator_.fit(X, y, **fit_params)


Best params for recall_score
{'bootstrap': False, 'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 75}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg      1345        73
pos        46         9
Best params for recall_score
{'max_depth': 20, 'max_features': 10, 'min_samples_split': 2, 'n_estimators': 100}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg      1410         8
pos        52         3


In [8]:
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [70, 75, 80],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}


In [9]:
print("Round 2")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

Round 2


 0.06938154 0.06738858 0.06892857        nan        nan        nan
 0.10871906 0.10871906 0.11053724 0.10871906 0.10871906 0.11053724
        nan        nan        nan 0.07805736 0.08235994 0.07610994
 0.07997899 0.07976254 0.07997899        nan        nan        nan
 0.08805556 0.09305556 0.09305556 0.08805556 0.09305556 0.09305556
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan 0.0700887  0.07330128 0.07307692
 0.06915043 0.06772949 0.07002747        nan        nan        nan
 0.09760795 0.09760795 0.10134921 0.09760795 0.09760795 0.10134921
        nan        nan        nan 0.08412879 0.08759804 0.07732207
 0.07331232 0.07309587 0.07331232        nan        nan        nan
 0.08583333 0.09083333 0.09261905 0.08583333 0.09083333 0.09261905
        nan        nan        nan        nan        nan       

Best params for recall_score
{'bootstrap': False, 'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 75}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg      1345        73
pos        46         9


In [10]:
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [74, 75, 76],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

In [11]:
print("Round 3")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

Round 3


 0.06738858 0.06738858 0.07002747        nan        nan        nan
 0.11064214 0.10871906 0.10871906 0.11064214 0.10871906 0.10871906
        nan        nan        nan 0.08319328 0.08235994 0.07767857
 0.07976254 0.07976254 0.07997899        nan        nan        nan
 0.09305556 0.09305556 0.09305556 0.09305556 0.09305556 0.09305556
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan 0.0700887  0.07330128 0.0700887
 0.06863858 0.06772949 0.06911838        nan        nan        nan
 0.09953102 0.09760795 0.10841991 0.09953102 0.09760795 0.10841991
        nan        nan        nan 0.08797683 0.08759804 0.08329545
 0.07309587 0.07309587 0.07331232        nan        nan        nan
 0.09083333 0.09083333 0.09083333 0.09083333 0.09083333 0.09083333
        nan        nan        nan        nan        nan        

Best params for recall_score
{'bootstrap': False, 'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 74}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg      1346        72
pos        46         9


In [12]:
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [72, 73, 74],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

In [13]:
print("Round 4")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

Round 4


 0.07002747 0.06647949 0.06738858        nan        nan        nan
 0.11064214 0.11064214 0.11064214 0.11064214 0.11064214 0.11064214
        nan        nan        nan 0.08319328 0.08319328 0.08319328
 0.07976254 0.07976254 0.07976254        nan        nan        nan
 0.09305556 0.09305556 0.09305556 0.09305556 0.09305556 0.09305556
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan 0.0700887  0.06880665 0.0700887
 0.07002747 0.06786838 0.06863858        nan        nan        nan
 0.10841991 0.10841991 0.09953102 0.10841991 0.10841991 0.09953102
        nan        nan        nan 0.08843137 0.08759804 0.08797683
 0.07309587 0.07309587 0.07309587        nan        nan        nan
 0.09083333 0.09083333 0.09083333 0.09083333 0.09083333 0.09083333
        nan        nan        nan        nan        nan        

Best params for recall_score
{'bootstrap': False, 'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 72}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg      1346        72
pos        46         9


In [14]:
param_grid = {
    'bootstrap': [False, True],
    'min_samples_split': [1, 2, 3], 
    'min_samples_leaf': [1, 2],
    'n_estimators' : [70, 71, 72],
    'max_depth': [19, 20, 21],
    'max_features': [9, 10, 11]
}

In [15]:
print("Round 5")
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

Round 5


 0.06938154 0.06813154 0.07002747        nan        nan        nan
 0.10871906 0.11064214 0.11064214 0.10871906 0.11064214 0.11064214
        nan        nan        nan 0.07805736 0.08235994 0.08319328
 0.07997899 0.07976254 0.07976254        nan        nan        nan
 0.08805556 0.08805556 0.09305556 0.08805556 0.08805556 0.09305556
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan 0.0700887  0.0700887  0.0700887
 0.06915043 0.06915043 0.07002747        nan        nan        nan
 0.09760795 0.10841991 0.10841991 0.09760795 0.10841991 0.10841991
        nan        nan        nan 0.08412879 0.08843137 0.08843137
 0.07331232 0.07309587 0.07309587        nan        nan        nan
 0.08583333 0.08583333 0.09083333 0.08583333 0.08583333 0.09083333
        nan        nan        nan        nan        nan        

Best params for recall_score
{'bootstrap': False, 'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 71}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg      1345        73
pos        46         9


## . Test to see which parameters achieve the largest recall_score

In [None]:
grid_search_clf = grid_search_wrapper(refit_score='precision_score')
grid_search_clf
results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_precision_score', ascending=False)