In [None]:
# Copyright 2021 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Binary Classification Model Diagnostics Example Notebook

This notebook demonstrates the use of the [Binary Classification Diagnostics](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/diagnostics/binary_classification.py) module.

## Requirements:

A test ML dataset (scored by a binary classification model) containing the original binary label, predicted probabilities (for the positive class) and features is available
as a csv file or a GCP BigQuery table.

## Install and import required modules

In [None]:
# Uncomment to install gps_building_blocks into the environment of the running
# kernel (`%pip` targets the kernel's environment, unlike a shell `!pip`).
# %pip install gps_building_blocks

In [None]:
import pandas as pd
import numpy as np

from gps_building_blocks.cloud.utils import bigquery as bigquery_utils
from gps_building_blocks.ml.diagnostics import binary_classification

## Set parameters

In [None]:
# Use this cell when the scored test dataset is available as a GCP BigQuery
# table.

PROJECT_ID = 'project-id'  # GCP Project ID
DATASET = 'dataset'  # BigQuery dataset name
# BigQuery table name containing the scored test dataset
TEST_DATA_PREDICTIONS_TABLE = 'test_prediction_table'

# Helper used to run queries against the project configured above.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

# SQL that selects every column of the scored table (e.g. a prediction table
# produced with BQML).
sql = f"""
  SELECT *
  FROM `{PROJECT_ID}.{DATASET}.{TEST_DATA_PREDICTIONS_TABLE}`;
"""
print(sql)

# Run the query and load the result into a pandas DataFrame.
df_prediction = bq_utils.run_query(sql).to_dataframe()

In [None]:
# Use this cell when the scored test dataset is available as a csv file.
# NOTE: this assigns `df_prediction`, replacing any value loaded by the
# BigQuery cell, so run only the loading cell that matches your data source.

# Csv file name of the scored test dataset
TEST_DATA_PREDICTIONS_FILE = 'test_predictions.csv'

# Pass the variable itself (not its name in quotes) so that the configured file
# is the one that gets read.
df_prediction = pd.read_csv(TEST_DATA_PREDICTIONS_FILE)

In [None]:
# Preview the first rows of the scored dataset to sanity-check that it loaded
# with the expected columns
df_prediction.head()

In [None]:
# Column names and label encoding used by the diagnostics cells below.

# Name of the column in the prediction table with the predicted probabilities
# (for the positive class)
PREDICTED_LABEL_NAME = 'predicted_label_probs'
# Name of the column in the prediction table with the actual label
ACTUAL_LABEL_NAME = 'label'
# Label value for the positive class: rows whose actual label equals this value
# are mapped to 1.0 and all other rows to 0.0 when the labels are transformed
POSITIVE_CLASS_LABEL = True

## Transform the label values

In [None]:
# Check the actual label values and how many rows carry each of them, to
# confirm which value should be set as POSITIVE_CLASS_LABEL
df_prediction[ACTUAL_LABEL_NAME].value_counts()

In [None]:
# Convert the actual label values into 1.0 (equal to POSITIVE_CLASS_LABEL) and
# 0.0 (any other value). The comparison is vectorized over the whole column
# rather than looping over the rows in Python.
df_prediction['label_numerical'] = (
    df_prediction[ACTUAL_LABEL_NAME].eq(POSITIVE_CLASS_LABEL).astype(float))

# Check the transformed label values
df_prediction['label_numerical'].value_counts()

## Run model diagnostics

## Performance metrics

The following function calculates the metrics below:

* `prop_positives`: Proportion of positive instances in the dataset scored
* `auc_roc`: Area under the ROC curve (more on this below).
* `auc_pr`: Area under the Precision Recall Curve (more on this below).
* `binarize_threshold`: The probability threshold used to binarize the predictions to calculate the following performance metrics.
* `accuracy`: overall accuracy of the predictions.
* `true_positive_rate`: (Recall or Sensitivity) proportion of positive instances correctly predicted out of all the positive instances in the dataset.
* `true_negative_rate`: (Specificity) proportion of negative instances correctly predicted out of all the negative instances in the dataset.
* `precision`: (Confidence) proportion of positive instances correctly predicted out of all the instances predicted as positives.
* `f1_score`: The harmonic mean of Precision and Recall.

In [None]:
# Summary performance metrics computed from the numerical labels and the
# predicted probabilities; the returned value is displayed as the cell output.
binary_classification.calc_performance_metrics(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction[PREDICTED_LABEL_NAME].values)

## ROC curve (True Positive Rate vs False Positive Rate)

The following function plots the receiver operating characteristic (ROC) curve, which illustrates the relationship between true positive rate and false positive rate of the predictions from a binary classifier system. As the major metric to evaluate the ROC curve, the AUC (Area Under The Curve) is also printed.

In [None]:
# Plot the ROC curve for the scored test set.
# The trailing `;` keeps the notebook from echoing the call's return value
# under the plot, as the later plotting cells in this notebook already do.
binary_classification.plot_roc_curve(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction[PREDICTED_LABEL_NAME].values);

## Precision-Recall curve

The following function plots the precision-recall curve, which illustrates the relationship between the precision and recall of the predictions from a binary classifier system. In general a high precision rate relates to a low false positive rate, while a high recall rate relates to a low false negative rate. By showing the tradeoff between precision and recall with different thresholds on the predictions, we are able to find out the optimal threshold for the specific problem. Moreover, the Average Precision (AP), which calculates the area under the precision-recall curve, is also printed for the evaluation of the precision-recall curve.

In [None]:
# Plot the precision-recall curve for the scored test set.
# The trailing `;` keeps the notebook from echoing the call's return value
# under the plot, as the later plotting cells in this notebook already do.
binary_classification.plot_precision_recall_curve(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction[PREDICTED_LABEL_NAME].values);

## Predicted probability distributions

The following function plots the distributions of predicted probabilities for each class. This illustrates how distinguishable the predicted probabilities for different classes are.

In [None]:
# Plot the distribution of the predicted probabilities for each class.
# The trailing `;` keeps the notebook from echoing the call's return value
# under the plot, as the later plotting cells in this notebook already do.
binary_classification.plot_predicted_probabilities(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction[PREDICTED_LABEL_NAME].values);

## Calculate and plot performance metrics for Bins of the Probabilities

The functions below do the following:

1. Sorts predicted probabilities in the descending order, then divides the instances into N number of equal sized bins (e.g. deciles when N = 10) such that the first bin has the instances with the highest probabilities and so on.
2. Calculates the Precision, Precision Uplift (Precision of the bin divided by the proportion of positive instances in the dataset indicating the Precision of selecting a random sample of the size of the bin) and Coverage (or Recall = the proportion of positive instances in the bin out of all the positive instances in the dataset) for each bin.

**Observations:** These plots give a good understanding of the model performance for the different bins of the probability predictions. Generally we would expect to see a monotonically decreasing trend of these metrics going from the top bin to the bottom bin. A different pattern raises doubts about the quality of the model and warrants further investigation.

In [None]:
# Compute the per-bin metrics with the instances divided into 10 bins
# (`decimal_points=4` presumably controls the rounding of the reported values;
# confirm against the module documentation).
bin_metrics = binary_classification.calc_bin_metrics(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction[PREDICTED_LABEL_NAME].values,
    number_bins=10,
    decimal_points=4)

# The trailing `;` keeps the notebook from echoing the call's return value
# under the plots, as the later plotting cells in this notebook already do.
binary_classification.plot_bin_metrics(bin_metrics=bin_metrics);

In [None]:
# Display the per-bin metrics returned by calc_bin_metrics
bin_metrics

## Calculate and plot performance metrics for Cumulative Bins of the Probabilities

The functions below do the following:

1. Sorts predicted probabilities in the descending order, and then divides the instances into N number of bins with increasing size. For example, when N = 10, it creates 10 bins such that the first bin contains the top 10% instances with the highest probability, the second bin contains the top 20% instances with the highest probability and so on where the last bin contains all the instances.
2. Calculates the Precision, Precision Uplift (Precision of the bin divided by the proportion of positive instances in the dataset indicating the Precision of selecting a random sample of the size of the bin) and Coverage (or Recall = the proportion of positive instances in the bin out of all the positive instances in the dataset) for each bin.

**Observations:** From these plots we would generally expect the Precision and Precision uplift to monotonically decrease and Recall to monotonically increase when we go from the high probability bins to the lower ones. These plots give us a good understanding of the expected precision and recall values for example if we select top N% of the instances.


In [None]:
# Compute the metrics for 10 cumulative bins of the instances ranked by
# predicted probability, then plot them.
cumulative_bin_metrics = binary_classification.calc_cumulative_bin_metrics(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction[PREDICTED_LABEL_NAME].values,
    number_bins=10,
    decimal_points=4)

# The trailing `;` suppresses the notebook echo of the plot call's return value.
binary_classification.plot_cumulative_bin_metrics(
    cumulative_bin_metrics=cumulative_bin_metrics);

## Feature exploration plots

This section calculates and plots the distribution of features in the dataset for the equal sized bins of the predicted probability.

How it works:
1. First sorts predicted probabilities in the descending order, and then divides the instances into N number of equal sized bins (e.g. deciles when N = 10) such that the first bin has the instances with the highest probabilities and so on.
2. Then it calculates and plots the distribution of each feature for each bin.

**Expected outputs:**

The output plots can be used to understand the relationships (positive, negative and non-linear correlations) between the features and the predictions (labels) leading to:

* extracting new insights in relation to the business problem (e.g. how the demographics, user behaviour or functionalities in the app/website are related to the conversion rate).
* confirming the relationships the model has learned between features and label makes sense (e.g. it hasn't learned any suspicious relationships caused by label leakages due to data collection or processing issues).


In [None]:
# Use this cell when the scored test dataset is available as a GCP BigQuery
# table: the table schema is read to identify the data types of the features.

sql = f"""SELECT column_name, data_type
       FROM `{PROJECT_ID}.{DATASET}`.INFORMATION_SCHEMA.COLUMNS
       WHERE table_name='{TEST_DATA_PREDICTIONS_TABLE}'"""

print(sql)
schema = bq_utils.run_query(sql).to_dataframe()
schema.columns = ['column_name', 'type']
print(schema.head())

# Separate out numerical and categorical column names by their BigQuery type.
num_features = schema.loc[
    schema['type'].isin(['INT64', 'FLOAT64']), 'column_name'].tolist()
cat_features = schema.loc[
    schema['type'].isin(['STRING']), 'column_name'].tolist()

In [None]:
# Use this cell when the scored test dataset is available as a csv file.
# NOTE: `[...]` below is a placeholder (a list holding Python's Ellipsis
# object); replace it with the actual column names before running. Running this
# cell also overwrites any lists derived from the BigQuery table schema.

# List of numerical features to plot
num_features = [...]
# List of categorical features to plot
cat_features = [...]

In [None]:
# Remove non-feature columns: the ID column, the actual label (raw and
# numerical versions) and the predicted probability column. The configured
# column-name constants are used instead of hard-coded names so this list stays
# in sync with the parameters set earlier in the notebook.
columns_to_remove = [
    'user_id',
    ACTUAL_LABEL_NAME,
    'label_numerical',
    PREDICTED_LABEL_NAME,
]
num_features = [v for v in num_features if v not in columns_to_remove]
cat_features = [v for v in cat_features if v not in columns_to_remove]
print(f'Number of numerical features: {len(num_features)}')
print(f'Number of categorical features: {len(cat_features)}')

# Feature names and their types are kept positionally aligned: numerical
# features first, followed by the categorical ones.
feature_names_to_plot = num_features + cat_features
feature_types = (['numerical'] * len(num_features) +
                 ['categorical'] * len(cat_features))

In [None]:
# Plot the selected features against the bins of the predicted probability.
# `feature_types` is positionally aligned with `feature_names_to_plot` (both
# are built from the numerical features followed by the categorical ones).
# The trailing `;` suppresses the notebook echo of the call's return value.
binary_classification.plot_binned_features(
    data=df_prediction,
    prediction_column_name=PREDICTED_LABEL_NAME,
    feature_names=feature_names_to_plot,
    feature_types=feature_types);