In [None]:
# Copyright 2021 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 5. Model Evaluation and Diagnostics

This notebook demonstrates the evaluation of a propensity model by using the
[Binary Classification Diagnostics](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/diagnostics/binary_classification.py)
module.

This evaluation consists of:
* Model performance with respect to a variety of metrics.
* Plots to understand the model performance in relation to the different propensity groups helping to design media experiments.
* Model insights (the relationship between features and the predictions/label) helping to generate new business insights.
* Insights helping to diagnose the model to make sure it is reasonable (e.g. no label leakage in features).

### Requirements
* The model and the Testing dataset should be available in GCP BigQuery.

## Install and import required modules

In [None]:
# Install the gps_building_blocks package if it is not already installed.
# Uncomment the line below to install it (in Jupyter, `%pip install ...` also
# works and targets the running kernel's environment).
# !pip install gps_building_blocks

In [None]:
import pandas as pd
import numpy as np

from gps_building_blocks.cloud.utils import bigquery as bigquery_utils
from gps_building_blocks.ml.diagnostics import binary_classification

## Set parameters

In [None]:
# GCP Project ID.
PROJECT_ID = 'project-id'
# BigQuery dataset name.
DATASET = 'dataset'
# BigQuery table name with the test dataset ready for scoring.
TEST_DATA_TABLE = 'test_table'
# BigQuery model name.
MODEL_NAME = 'propensity_model'
# BigQuery table name containing the scored test dataset.
TEST_DATA_PREDICTIONS_TABLE = 'test_prediction_table'
# Optional: selected snapshot date (YYYY-MM-DD format) used to filter ML
# instances, reflecting the instances to be evaluated on a given scoring date.
# In this notebook it is only referenced by the commented-out WHERE clause of
# the scoring query.
SELECTED_SNAPSHOT_DATE = '2021-05-31'
# Name of the column in the prediction table holding the predicted class
# probabilities. It is UNNESTed into (label, prob) entries when the
# predictions are read back, so it must be an array column.
PREDICTED_LABEL_NAME = 'predicted_label_probs'
# Name of the column in the prediction table with the actual label.
ACTUAL_LABEL_NAME = 'label'
# Label value for the positive class.
POSITIVE_CLASS_LABEL = True

In [None]:
# BigQuery helper created for PROJECT_ID; used below to run every query.
bq_client = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

## Run Model Diagnostics

## Create Scored Test Dataset (if not already scored)

In [None]:
# Score the test dataset with the model and persist the predictions as a table.
# TODO() replace filtering with list of dates
# NOTE: the parenthesis closing `AS ( ... )` sits on its own line so that it is
# not swallowed by the `--` SQL comment while the optional snapshot filter is
# disabled (otherwise the statement has an unclosed parenthesis).
prediction_query = f"""
  CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET}.{TEST_DATA_PREDICTIONS_TABLE}` AS (
  SELECT *
  FROM ML.PREDICT(MODEL `{PROJECT_ID}.{DATASET}.{MODEL_NAME}`,
                  TABLE `{PROJECT_ID}.{DATASET}.{TEST_DATA_TABLE}`)
  -- Uncomment line below if you want to filter instances on the snapshot date
  -- WHERE snapshot_ts='{SELECTED_SNAPSHOT_DATE}'
  )
"""
print(prediction_query)
bq_client.run_query(prediction_query)

## Read the Prediction Test Dataset (after scoring, in BQML format)

In this step, we assume the prediction dataset is available as a BQ table, for example one produced by an AutoML model using batch scoring. The following functions require a Pandas DataFrame with columns containing the binary label (1.0 and 0.0 values) and the predicted probabilities (between 0.0 and 1.0), so we transform the loaded data into a DataFrame with this format.

In [None]:
# SQL for extracting the prediction dataset when using BQML.
# The predicted-probability array is flattened with UNNEST and only the entry
# for the positive class is kept, so `score` is the predicted probability of
# POSITIVE_CLASS_LABEL.
# The EXCEPT list uses PREDICTED_LABEL_NAME (the same column that is UNNESTed)
# so the query keeps working when that parameter is changed.
# NOTE(review): POSITIVE_CLASS_LABEL is interpolated unquoted, which suits
# boolean/numeric labels; a string label would presumably need quoting.
sql = f"""
SELECT
  predictions.label AS label,
  predicted_label,
  probs.label AS predicted_score_label,
  probs.prob AS score,
  *
  #remove duplicates of columns like label_1
  EXCEPT({PREDICTED_LABEL_NAME},
    label,
    predicted_label,
    prob)
FROM
  `{PROJECT_ID}.{DATASET}.{TEST_DATA_PREDICTIONS_TABLE}` AS predictions,
  UNNEST({PREDICTED_LABEL_NAME}) AS probs
WHERE
  probs.label={POSITIVE_CLASS_LABEL};
"""
print(sql)
df_prediction = bq_client.run_query(sql).to_dataframe()

### Check loaded outputs

In [None]:
# Inspect the prediction DataFrame loaded from BigQuery.
df_prediction

In [None]:
# Number of rows per distinct value of the actual label column.
df_prediction[ACTUAL_LABEL_NAME].value_counts()

In [None]:
# Encode the actual label numerically: 1.0 for rows whose label equals
# POSITIVE_CLASS_LABEL and 0.0 for all other rows.
# The comparison is vectorized over the whole column instead of looping over
# the values in Python.
df_prediction['label_numerical'] = (
    df_prediction[ACTUAL_LABEL_NAME].eq(POSITIVE_CLASS_LABEL).astype(float))

### Check transformed outputs

In [None]:
# Preview the first rows of the DataFrame, including the derived
# `label_numerical` column.
df_prediction.head()

In [None]:
# Number of rows per numeric label value (1.0 = positive class, 0.0 = other).
df_prediction['label_numerical'].value_counts()

## Performance metrics

The following function calculates the following metrics:

* prop_positives: Proportion of positive instances in the dataset scored
* auc_roc: Area under the ROC curve (more on this below).
* auc_pr: Area under the Precision Recall Curve (more on this below).
* binarize_threshold: The probability threshold used to binarize the predictions to calculate the following performance metrics.
* accuracy: overall accuracy of the predictions.
* true_positive_rate: (Recall or Sensitivity) proportion of positive instances correctly predicted out of all the positive instances in the dataset.
* true_negative_rate: (Specificity) proportion of negative instances correctly predicted out of all the negative instances in the dataset.
* precision: (Confidence) proportion of positive instances correctly predicted out of all the instances predicted as positives.
* f1_score: The harmonic mean of Precision and Recall.

We need to provide:

`label` -  an array of true binary labels represented in numerical form (1.0 and 0.0)
`probability predictions` - an array of predicted probabilities between 0.0 and 1.0

In [None]:
# Compute the summary performance metrics from the numeric labels and the
# positive-class scores.
binary_classification.calc_performance_metrics(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction['score'].values)

## ROC Curve (True Positive Rate vs False Positive Rate)

The following function plots the receiver operating characteristic (ROC) curve, which illustrates the relationship between true positive rate and false positive rate of the predictions from a binary classifier system. As the major metric to evaluate the ROC curve, the AUC (Area Under The Curve) is also printed.

In [None]:
# Plot the ROC curve from the numeric labels and the positive-class scores.
binary_classification.plot_roc_curve(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction['score'].values)

## Precision-Recall Curve

The following function plots the precision-recall curve, which illustrates the relationship between the precision and recall of the predictions from a binary classifier system. In general a high precision rate relates to a low false positive rate, while a high recall rate relates to a low false negative rate. By showing the tradeoff between precision and recall with different thresholds on the predictions, we are able to find out the optimal threshold for the specific problem. Moreover, the Average Precision (AP), which calculates the area under the precision-recall curve, is also printed for the evaluation of precision recall curve.

In [None]:
# Plot the precision-recall curve from the numeric labels and the
# positive-class scores.
binary_classification.plot_precision_recall_curve(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction['score'].values)

## Predicted Probability Distributions

The following function plots the distributions of predicted probabilities for each class. This illustrates how distinguishable the predicted probabilities for different classes are.

In [None]:
# Plot the predicted-probability distributions from the numeric labels and the
# positive-class scores.
binary_classification.plot_predicted_probabilities(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction['score'].values)

## Calculate and plot performance metrics for Bins of the Probabilities

The following function does the following:

1. Sorts predicted probabilities in the descending order, then divides the instances into N number of equal sized bins (e.g. deciles when N = 10) such that the first bin has the instances with the highest probabilities and so on.
2. Calculates the Precision, Precision Uplift (Precision of the bin divided by the proportion of positive instances in the dataset indicating the Precision of selecting a random sample of the size of the bin) and Coverage (or Recall = the proportion of positive instances in the bin out of all the positive instances in the dataset) for each bin.

**Observations:** These plots can give a good understanding of the model performance for these different bins of the probability predictions. Generally we would expect to see a monotonically decreasing trend of these metrics going from the top bin to the bottom bin. If we see some different pattern from this plot, it would create some doubts of the quality of the model leading to further investigations.

In [None]:
# Compute the per-bin metrics over 10 bins of the predicted probability,
# passing decimal_points=4, then plot them.
bin_metrics = binary_classification.calc_bin_metrics(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction['score'].values,
    number_bins=10,
    decimal_points=4,
)

binary_classification.plot_bin_metrics(bin_metrics=bin_metrics)

In [None]:
# Display the per-bin metrics computed by calc_bin_metrics.
bin_metrics

## Calculate and plot performance metrics for Cumulative Bins of the Probabilities

The following function does the following:

1. Sorts predicted probabilities in the descending order, and then divides the instances into N number of bins with increasing size. For example, when N = 10, it creates 10 bins such that the first bin contains the top 10% instances with the highest probability, the second bin contains the top 20% instances with the highest probability and so on where the last bin contains all the instances.
2. Calculates the Precision, Precision Uplift (Precision of the bin divided by the proportion of positive instances in the dataset indicating the Precision of selecting a random sample of the size of the bin) and Coverage (or Recall = the proportion of positive instances in the bin out of all the positive instances in the dataset) for each bin.

**Observations:** From these plots we would generally expect the Precision and Precision uplift to monotonically decrease and Recall to monotonically increase when we go from the high probability bins to the lower ones. These plots give us a good understanding of the expected precision and recall values for example if we select top N% of the instances.


In [None]:
# Compute the metrics for 10 cumulative bins of the predicted probability,
# passing decimal_points=4.
cumulative_bin_metrics = binary_classification.calc_cumulative_bin_metrics(
    labels=df_prediction['label_numerical'].values,
    probability_predictions=df_prediction['score'].values,
    number_bins=10,
    decimal_points=4)

# Plot the cumulative-bin metrics computed above.
binary_classification.plot_cumulative_bin_metrics(
    cumulative_bin_metrics=cumulative_bin_metrics)

## Feature Exploration Plots

This section calculates and plots the distribution of features in the dataset for the equal sized bins of the predicted probability.

How it works:
1. First sorts predicted probabilities in the descending order, and then divides the instances into N number of equal sized bins (e.g. deciles when N = 10) such that the first bin has the instances with the highest probabilities and so on.
2. Then it calculates and plots the distribution of each feature for each bin.

**Expected outputs:**

The output plots can be used to understand the relationships (positive, negative and non-linear correlations) between the features and the predictions (labels) leading to:

* extracting new insights in relation to the business problem (e.g. how the demographics, user-behaviour or functionalities in the app/website related to the conversion rate).
* confirming the relationships the model has learned between features and label makes sense (e.g. it hasn't learned any suspicious relationships caused by label leakages due to data collection or processing issues).


### Read in the prediction table schema.

In [None]:
# Look up the name and BigQuery data type of every column of the predictions
# table via the dataset's INFORMATION_SCHEMA.
sql = f"""SELECT column_name, data_type
       FROM `{PROJECT_ID}.{DATASET}`.INFORMATION_SCHEMA.COLUMNS
       WHERE table_name='{TEST_DATA_PREDICTIONS_TABLE}'"""

print(sql)
# Expose the `data_type` column under the shorter name `type`.
schema = (
    bq_client.run_query(sql)
    .to_dataframe()
    .rename(columns={'data_type': 'type'})
)
schema.head()

In [None]:
# Separate out numerical and categorical column names based on their BigQuery
# data types.
num_features = schema.loc[
    schema['type'].isin(['INT64', 'FLOAT64']), 'column_name'].tolist()
cat_features = schema.loc[
    schema['type'].isin(['STRING']), 'column_name'].tolist()
# Remove non-feature columns. The configured label column (ACTUAL_LABEL_NAME)
# is excluded as well, so the label is not treated as a feature when it is not
# literally named 'label'.
columns_to_remove = {'user_id', 'label', ACTUAL_LABEL_NAME}
num_features = [v for v in num_features if v not in columns_to_remove]
cat_features = [v for v in cat_features if v not in columns_to_remove]
print(f'Number of numerical features: {len(num_features)}')
print(f'Number of categorical features: {len(cat_features)}')

In [None]:
# Numerical features come first, then categorical ones; feature_types carries
# the matching type tag for each name, in the same order.
feature_names_to_plot = [*num_features, *cat_features]
feature_types = (
    ['numerical' for _ in num_features]
    + ['categorical' for _ in cat_features]
)

In [None]:
# Plot each selected feature against bins of the predicted score. The trailing
# semicolon suppresses the cell's return-value output.
binary_classification.plot_binned_features(
    data=df_prediction,
    prediction_column_name='score',
    feature_names=feature_names_to_plot,
    feature_types=feature_types);