<a href="https://colab.research.google.com/github/cm-int/machine-learning-fundamentals/blob/main/module_2/Democode/Mod_2_Lesson_3_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validating a Binary Classification Machine Learning Model

In this demonstration, you’ll build and test a Gradient Boosted Tree model. You will measure the accuracy, precision, recall and AUC for the model, and then perform a grid search to optimize the hyperparameters for the model. You’ll also assess the model for its response to bias and variance in the data.

This demonstration uses a modified version of the **Mushroom Classification: Safe to eat or deadly poison?** dataset originally donated to the UCI Machine Learning repository. It is available for use under the **CC0: Public Domain** licence.

# Upload and prepare the data

This is the same dataset used by the previous demonstration.

In [None]:
!wget 'https://raw.githubusercontent.com/cm-int/machine-learning-fundamentals/main/module_2/Democode/mushrooms.csv'

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the mushroom data and expand it into indicator (dummy) columns
mushrooms = pd.get_dummies(pd.read_csv("mushrooms.csv"))

# The two encoded class columns carry the label; every other column is a feature
label_columns = ['class_e', 'class_p']
features = mushrooms.drop(columns=label_columns)
predictions = mushrooms['class_e']

# Hold back 33% of the rows for testing.
# Random state specified to ensure repeatability if necessary
features_train, features_test, predictions_train, predictions_test = train_test_split(
    features,
    predictions,
    test_size=0.33,
    random_state=13,
)

# Create a Gradient Boosted Tree model to classify the data

This is the same procedure used by the previous demonstration.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: These hyperparameters have been deliberately chosen to generate a poor model
poor_hyperparams = {
    'n_estimators': 11,
    'learning_rate': 3,
    'criterion': 'squared_error',
    'max_depth': 10,
}
gbt_model = GradientBoostingClassifier(**poor_hyperparams)

# Assign the return value of fit() so the notebook does not echo it
_ = gbt_model.fit(features_train, predictions_train)

# Test the model using the test dataset

Display the results of the predictions, generate the Confusion Matrix and ROC curve, and measure the AUC, accuracy, precision, and recall.

In [None]:
# Predict the class of each row in the test dataset

results = gbt_model.predict(features_test)
print(results, end='\n\n\n')  # Predictions followed by two blank lines

# Class probabilities for the same rows
probabilities = gbt_model.predict_proba(features_test)
print(probabilities[:100])  # Only the first 100 sets of probabilities

In [None]:
# Plot the confusion matrix comparing the true test labels with the predictions

from sklearn.metrics import ConfusionMatrixDisplay

class_labels = ['Poisonous', 'Edible']
_ = ConfusionMatrixDisplay.from_predictions(
    predictions_test,
    results,
    display_labels=class_labels,
)

In [None]:
# Score the test-set predictions for accuracy, precision, and recall
# All are a bit low

from sklearn.metrics import accuracy_score, precision_score, recall_score

test_accuracy = accuracy_score(predictions_test, results)
test_precision = precision_score(predictions_test, results)
test_recall = recall_score(predictions_test, results)

print(f'Accuracy: {test_accuracy}')
print(f'Precision: {test_precision}')
print(f'Recall: {test_recall}')

In [None]:
# Display the ROC curve
# Model predictions are not much better than random chance

from sklearn.metrics import RocCurveDisplay, auc, roc_curve
import matplotlib.pyplot as plt

# Score each test row with the second column of predict_proba and derive the
# ROC points and the area under the curve from those scores
fpr, tpr, thresholds = roc_curve(predictions_test, probabilities[:, 1])
auc_score = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 10))

# Plot straight from the constructed object. Binding it to a variable named
# `display` would replace the notebook's built-in display() function for the
# rest of the session.
RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name='GBT model', roc_auc=auc_score).plot(ax=ax, c='blue')

# Diagonal reference line for comparison with the model's curve
ax.plot((0, 1), (0, 1), c='red', label='Random chance')
plt.legend()
plt.show()

In [None]:
# Display the raw data for the ROC curve:
# false positive rates, true positive rates, then thresholds,
# each separated from the next by two blank lines

print(fpr, tpr, thresholds, sep='\n\n\n')

# Optimize the model

In [None]:
# Perform a grid search for the combination of relevant hyperparameters that gives the highest precision
# NOTE: This step takes about 10 minutes to run. Go and get a cup of tea!

from sklearn.model_selection import GridSearchCV

# Every combination of the values below is evaluated.
# All learning rates are strictly positive: with a rate of 0.0 each boosting
# stage contributes nothing, so those candidates would spend fitting time on a
# model that never moves away from its initial prediction.
hyperparam_values = [
    {
        'n_estimators': [5, 10, 50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5, 3],
        'criterion': ['friedman_mse', 'squared_error'],
        'max_depth': [2, 3, 5, 10],
    }
]

# Only precision is scored here. Further metrics can be tracked in the same
# search by adding entries such as 'AUC': 'roc_auc', 'Accuracy': 'accuracy',
# or 'Recall': 'recall'; `refit` below names the metric used to pick the best model.
scoring_metrics = {'Precision': 'precision'}

gbt_gridsearcher = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid=hyperparam_values,
    scoring=scoring_metrics,
    refit='Precision',
    return_train_score=True,
    cv=5,
)
_ = gbt_gridsearcher.fit(features_train, predictions_train)

# Show the cross-validation results for every candidate
results = gbt_gridsearcher.cv_results_
print(results)

In [None]:
# Capture the best estimator from the grid search
precision_gbt_model = gbt_gridsearcher.best_estimator_

# Show the hyperparameter combination that produced it
print(gbt_gridsearcher.best_params_)

In [None]:
# Run the best estimator from the grid search against the test dataset

results = precision_gbt_model.predict(features_test)
print(results, end='\n\n\n')  # Predictions followed by two blank lines

probabilities = precision_gbt_model.predict_proba(features_test)
# Display the first 100 sets of probabilities. Compare with those generated previously
print(probabilities[:100])

In [None]:
# Confusion matrix and headline metrics for the latest test-set predictions
class_labels = ['Poisonous', 'Edible']
_ = ConfusionMatrixDisplay.from_predictions(
    predictions_test,
    results,
    display_labels=class_labels,
)

test_accuracy = accuracy_score(predictions_test, results)
test_precision = precision_score(predictions_test, results)
test_recall = recall_score(predictions_test, results)

print(f'Accuracy: {test_accuracy}')
print(f'Precision: {test_precision}')
print(f'Recall: {test_recall}')

# Results should be much improved

In [None]:
# ROC curve for the latest set of test probabilities
fpr, tpr, thresholds = roc_curve(predictions_test, probabilities[:, 1], drop_intermediate=True)
auc_score = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 10))

# Plot straight from the constructed object. Binding it to a variable named
# `display` would replace the notebook's built-in display() function for the
# rest of the session.
RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name='GBT model', roc_auc=auc_score).plot(ax=ax, c='blue')

# Diagonal reference line for comparison with the model's curve
ax.plot((0, 1), (0, 1), c='red', label='Random chance')
plt.legend()
plt.show()

# Model predictions are now much better than random chance

In [None]:
# Raw ROC data: false positive rates, true positive rates, then thresholds,
# each separated from the next by two blank lines
print(fpr, tpr, thresholds, sep='\n\n\n')

In [None]:
# Try again. This time optimize for AUC

# Every combination of the values below is evaluated.
# All learning rates are strictly positive: with a rate of 0.0 each boosting
# stage contributes nothing, so those candidates would spend fitting time on a
# model that never moves away from its initial prediction.
hyperparam_values = [
    {
        'n_estimators': [5, 10, 50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5, 3],
        'criterion': ['friedman_mse', 'squared_error'],
        'max_depth': [2, 3, 5, 10],
    }
]

scoring_metrics = {'AUC': 'roc_auc'}

# `refit` names the metric used to choose (and refit) the best estimator
gbt_gridsearcher = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid=hyperparam_values,
    scoring=scoring_metrics,
    refit='AUC',
    return_train_score=True,
    cv=5,
)
_ = gbt_gridsearcher.fit(features_train, predictions_train)

# Show the cross-validation results for every candidate
results = gbt_gridsearcher.cv_results_
print(results)

In [None]:
# Capture the best estimator from the AUC-driven grid search
auc_gbt_model = gbt_gridsearcher.best_estimator_

# Show the hyperparameter combination that produced it
print(gbt_gridsearcher.best_params_)

In [None]:
# Test the AUC-optimized model against the test dataset

results = auc_gbt_model.predict(features_test)
probabilities = auc_gbt_model.predict_proba(features_test)

_ = ConfusionMatrixDisplay.from_predictions(predictions_test, results, display_labels=['Poisonous', 'Edible'])

print(f'Accuracy: {accuracy_score(predictions_test, results)}')
print(f'Precision: {precision_score(predictions_test, results)}')
print(f'Recall: {recall_score(predictions_test, results)}')

# ROC points and area under the curve, scored with the second column of predict_proba
fpr, tpr, thresholds = roc_curve(predictions_test, probabilities[:, 1], drop_intermediate=True)
auc_score = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 10))

# Plot straight from the constructed object. Binding it to a variable named
# `display` would replace the notebook's built-in display() function for the
# rest of the session.
RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name='GBT model', roc_auc=auc_score).plot(ax=ax, c='blue')

# Diagonal reference line for comparison with the model's curve
ax.plot((0, 1), (0, 1), c='red', label='Random chance')
plt.legend()
plt.show()

# AUC is slightly increased. Precision is slightly reduced

# Tune to balance variance and bias

In [None]:
# Search for better values of ccp_alpha, subsample, max_features, and tol to balance
# variance and bias in the precision optimized model
# NOTE: This step takes about 5 minutes to run.

hyperparam_values = [
    {
        'ccp_alpha': np.arange(0.0, 5.0, 0.5),
        'subsample': np.arange(0.1, 1.0, 0.1),
        # 'auto' is intentionally not listed: scikit-learn 1.3 and later reject it
        # (those fits fail and score NaN), and in earlier releases it selected the
        # same number of features as 'sqrt' for a classifier.
        'max_features': ['sqrt', 'log2'],
        'tol': [1e-1, 1e-2, 1e-4],
    }
]

scoring_metrics = {'Precision': 'precision'}

# Start from the precision optimized model so that its other hyperparameters are kept
gbt_gridsearcher = GridSearchCV(
    precision_gbt_model,
    param_grid=hyperparam_values,
    scoring=scoring_metrics,
    refit='Precision',
    return_train_score=True,
    cv=5,
)

_ = gbt_gridsearcher.fit(features_train, predictions_train)

# Show the cross-validation results for every candidate
results = gbt_gridsearcher.cv_results_
print(results)

In [None]:
# Capture the best estimator from the tuning grid search
gbt_tuned_model = gbt_gridsearcher.best_estimator_

# Show the hyperparameter combination that produced it
print(gbt_gridsearcher.best_params_)

In [None]:
# Run the tuned model against the test dataset

results = gbt_tuned_model.predict(features_test)
print(results, end='\n\n\n')  # Predictions followed by two blank lines

probabilities = gbt_tuned_model.predict_proba(features_test)
# Display the first 100 sets of probabilities. Compare with those generated previously
print(probabilities[:100])

In [None]:
# Confusion matrix and headline metrics for the latest test-set predictions
class_labels = ['Poisonous', 'Edible']
_ = ConfusionMatrixDisplay.from_predictions(
    predictions_test,
    results,
    display_labels=class_labels,
)

test_accuracy = accuracy_score(predictions_test, results)
test_precision = precision_score(predictions_test, results)
test_recall = recall_score(predictions_test, results)

print(f'Accuracy: {test_accuracy}')
print(f'Precision: {test_precision}')
print(f'Recall: {test_recall}')

# Precision marginally improved

In [None]:
# ROC curve for the latest set of test probabilities
fpr, tpr, thresholds = roc_curve(predictions_test, probabilities[:, 1], drop_intermediate=True)
auc_score = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 10))

# Plot straight from the constructed object. Binding it to a variable named
# `display` would replace the notebook's built-in display() function for the
# rest of the session.
RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name='GBT model', roc_auc=auc_score).plot(ax=ax, c='blue')

# Diagonal reference line for comparison with the model's curve
ax.plot((0, 1), (0, 1), c='red', label='Random chance')
plt.legend()
plt.show()

In [None]:
# Plot precision vs subsample

# NOTE: Run this several times. The graphs will vary slightly because subsample values less than 1 cause stochastic gradient boosting

hyperparam_values = [{'subsample': np.arange(0.1, 1.0, 0.1)}]

# Score all four metrics in one search; the following cells plot the others
# from the same results
scoring_metrics = {
    'Precision': 'precision',
    'Recall': 'recall',
    'Accuracy': 'accuracy',
    'AUC': 'roc_auc',
}

gbt_gridsearcher = GridSearchCV(
    gbt_tuned_model,
    param_grid=hyperparam_values,
    scoring=scoring_metrics,
    refit='Precision',
    return_train_score=True,
    cv=5,
)
_ = gbt_gridsearcher.fit(features_train, predictions_train)

results = gbt_gridsearcher.cv_results_

# x-axis values: the subsample setting of each candidate
param_subsample = [candidate['subsample'] for candidate in results['params']]

mean_test_precision = results['mean_test_Precision']
std_test_precision = results['std_test_Precision']
mean_train_precision = results['mean_train_Precision']
std_train_precision = results['std_train_Precision']

plt.figure(figsize=(10, 10))

# One curve per split, each with a shaded band of one standard deviation
# either side of the mean
for curve_label, curve_mean, curve_std in (
    ('Test Precision', mean_test_precision, std_test_precision),
    ('Train Precision', mean_train_precision, std_train_precision),
):
    plt.plot(param_subsample, curve_mean, label=curve_label)
    plt.fill_between(param_subsample, curve_mean - curve_std, curve_mean + curve_std, alpha=0.1)

axis_label_font = {'family': 'serif', 'color': 'darkred', 'weight': 'normal', 'size': 28}
plt.legend(prop={'size': 18})
plt.xlabel('Subsample', fontdict=axis_label_font)
plt.ylabel('Scores', fontdict=axis_label_font)

# Vertical red marker and caption at subsample=0.7
plt.plot((0.7, 0.7), (0.87, 0.92), c='red')
plt.text(0.60, 0.87, 'Subsample=0.7', fontdict={'size': 18})

plt.show()


In [None]:
# Plot recall vs subsample

mean_test_recall = results['mean_test_Recall']
std_test_recall = results['std_test_Recall']
mean_train_recall = results['mean_train_Recall']
std_train_recall = results['std_train_Recall']

plt.figure(figsize=(10, 10))

# One curve per split, each with a shaded band of one standard deviation
# either side of the mean
for curve_label, curve_mean, curve_std in (
    ('Test Recall', mean_test_recall, std_test_recall),
    ('Train Recall', mean_train_recall, std_train_recall),
):
    plt.plot(param_subsample, curve_mean, label=curve_label)
    plt.fill_between(param_subsample, curve_mean - curve_std, curve_mean + curve_std, alpha=0.1)

axis_label_font = {'family': 'serif', 'color': 'darkred', 'weight': 'normal', 'size': 28}
plt.legend(prop={'size': 18})
plt.xlabel('Subsample', fontdict=axis_label_font)
plt.ylabel('Scores', fontdict=axis_label_font)

# Vertical red marker and caption at subsample=0.7
plt.plot((0.7, 0.7), (0.87, 0.93), c='red')
plt.text(0.60, 0.87, 'Subsample=0.7', fontdict={'size': 18})

plt.show()

In [None]:
# Plot accuracy vs subsample

mean_test_accuracy = results['mean_test_Accuracy']
std_test_accuracy = results['std_test_Accuracy']
mean_train_accuracy = results['mean_train_Accuracy']
std_train_accuracy = results['std_train_Accuracy']

plt.figure(figsize=(10, 10))

# One curve per split, each with a shaded band of one standard deviation
# either side of the mean
for curve_label, curve_mean, curve_std in (
    ('Test Accuracy', mean_test_accuracy, std_test_accuracy),
    ('Train Accuracy', mean_train_accuracy, std_train_accuracy),
):
    plt.plot(param_subsample, curve_mean, label=curve_label)
    plt.fill_between(param_subsample, curve_mean - curve_std, curve_mean + curve_std, alpha=0.1)

axis_label_font = {'family': 'serif', 'color': 'darkred', 'weight': 'normal', 'size': 28}
plt.legend(prop={'size': 18})
plt.xlabel('Subsample', fontdict=axis_label_font)
plt.ylabel('Scores', fontdict=axis_label_font)

# Vertical red marker and caption at subsample=0.7
plt.plot((0.7, 0.7), (0.87, 0.93), c='red')
plt.text(0.60, 0.87, 'Subsample=0.7', fontdict={'size': 18})

plt.show()

In [None]:
# Plot AUC vs subsample

mean_test_auc = results['mean_test_AUC']
std_test_auc = results['std_test_AUC']
mean_train_auc = results['mean_train_AUC']
std_train_auc = results['std_train_AUC']

plt.figure(figsize=(10, 10))

# One curve per split, each with a shaded band of one standard deviation
# either side of the mean
for curve_label, curve_mean, curve_std in (
    ('Test AUC', mean_test_auc, std_test_auc),
    ('Train AUC', mean_train_auc, std_train_auc),
):
    plt.plot(param_subsample, curve_mean, label=curve_label)
    plt.fill_between(param_subsample, curve_mean - curve_std, curve_mean + curve_std, alpha=0.1)

axis_label_font = {'family': 'serif', 'color': 'darkred', 'weight': 'normal', 'size': 28}
plt.legend(prop={'size': 18})
plt.xlabel('Subsample', fontdict=axis_label_font)
plt.ylabel('Scores', fontdict=axis_label_font)

# Vertical red marker and caption at subsample=0.7
plt.plot((0.7, 0.7), (0.87, 0.94), c='red')
plt.text(0.60, 0.87, 'Subsample=0.7', fontdict={'size': 18})

plt.show()

# Conclusions

Setting the correct hyperparameter values can have a significant impact on the performance of the model.

The results of tuning to balance bias and variance depend on the data rather than the model. Ideally, tuning should result in small adjustments to the precision, recall, and accuracy. If not, there may be significant skew in the data which should warrant further investigation. For example, is your sampling methodology biased?