In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 4. Model Training

This notebook demonstrates how to train a Propensity Model using BigQuery ML.

### Requirements

* Input features used for training need to be stored in a BigQuery table. This can be done using the [2. ML Data Preparation Notebook](2.ml_data_preparation.ipynb).

### Install and import required modules

In [None]:
# Uncomment to install the required Python modules
# !sh ../utils/setup.sh

In [None]:
# Add custom utils module to Python environment
import os
import sys
sys.path.append(os.path.abspath(os.pardir))

from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

from utils import model
from utils import helpers

### Set parameters

In [None]:
# Load the notebook configuration from the shared YAML file.
configs = helpers.get_configs('config.yaml')

# Destination settings (project / dataset) and per-stage run identifiers.
dest_configs = configs.destination
run_id_configs = configs.run_id

# GCP project ID that hosts the BigQuery dataset
PROJECT_ID = dest_configs.project_id
# Name of the BigQuery dataset holding the feature tables and the model
DATASET_NAME = dest_configs.dataset_name

In [None]:
# Identifier used to distinguish separate runs of the training pipeline.
# It is read from the `run_id` section of config.yaml and is appended to every
# table/model name below so that runs do not overwrite each other.
RUN_ID = run_id_configs.train

# BigQuery table name containing the model development dataset
# (used for training and for the first evaluation below).
FEATURES_DEV_TABLE = f'features_dev_table_{RUN_ID}'

# BigQuery table name containing the held-out model testing dataset
FEATURES_TEST_TABLE = f'features_test_table_{RUN_ID}'

# Name under which the trained model is saved in BigQuery
MODEL_NAME = f'propensity_model_{RUN_ID}'

In [None]:
# BigQuery helper bound to the configured project; it is used below to run the
# schema query and is handed to the propensity model.
bq_utils = bigquery_utils.BigQueryUtils(
    project_id=PROJECT_ID,
)

Next, let's configure modeling options.

### Model and features configuration

Model options can be configured in detail based on BigQuery ML specifications
listed in [The CREATE MODEL statement](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create).

**NOTE**: Propensity modeling supports only the following four model types available in BigQuery ML:
- LOGISTIC_REG
- [AUTOML_CLASSIFIER](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-automl)
- [BOOSTED_TREE_CLASSIFIER](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-boosted-tree)
- [DNN_CLASSIFIER](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-dnn-models)

To use specific model options, add them to the following configuration exactly as they are listed in [The CREATE MODEL statement](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create). For example, if you want to train an [AUTOML_CLASSIFIER](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-automl) with `BUDGET_HOURS=1`, you can specify it as:

```python
params = {
  'model_type': 'AUTOML_CLASSIFIER',
  'budget_hours': 1
}
```

In [None]:
# Read the features table schema to select the feature names for model training.
# The dataset-level INFORMATION_SCHEMA.COLUMNS view lists one row per column of
# every table in the dataset; we keep only the rows of the development table.
sql = ("SELECT column_name "
       f"FROM `{PROJECT_ID}.{DATASET_NAME}`.INFORMATION_SCHEMA.COLUMNS "
       f"WHERE table_name='{FEATURES_DEV_TABLE}';")

print(sql)
features_schema = bq_utils.run_query(sql).to_dataframe()

# Columns to remove from the feature list: timestamps, identifiers, the label
# itself and the data split column are not model inputs.
to_remove = ['window_start_ts', 'window_end_ts', 'snapshot_ts', 'user_id',
             'label', 'key', 'data_split']

# Selected features for model training
training_features = [v for v in features_schema['column_name']
                     if v not in to_remove]

# The schema query returns no rows (instead of failing) when the table name
# does not match any table in the dataset, e.g. because the data preparation
# notebook was not run for this RUN_ID. Fail fast with a clear message rather
# than letting the training job fail later with an empty feature list.
if not training_features:
  raise ValueError(
      'No training features found for table '
      f'`{PROJECT_ID}.{DATASET_NAME}.{FEATURES_DEV_TABLE}`. Check that the '
      'table exists and that RUN_ID matches the data preparation run.')

print('Number of training features:', len(training_features))
print(training_features)

In [None]:
# Set parameters for AUTOML_CLASSIFIER model.
# Lower-case keys describe where the model and its training data live and
# which columns to use; upper-case keys are BigQuery ML model options
# (presumably forwarded to the CREATE MODEL statement — verify in utils/model).

FEATURE_COLUMNS = training_features
TARGET_COLUMN = 'label'

params = {
  # Fully qualified name under which the trained model is saved in BigQuery.
  'model_path': f'{PROJECT_ID}.{DATASET_NAME}.{MODEL_NAME}',
  # Fully qualified name of the model development (training) table.
  'features_table_path': f'{PROJECT_ID}.{DATASET_NAME}.{FEATURES_DEV_TABLE}',
  'feature_columns': FEATURE_COLUMNS,
  'target_column': TARGET_COLUMN,
  'MODEL_TYPE': 'AUTOML_CLASSIFIER',
  # Training budget for AutoML, in hours.
  'BUDGET_HOURS': 1.0,
  # ENABLE_GLOBAL_EXPLAIN is not available for AUTOML:
  # https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create#other_model_options
  'ENABLE_GLOBAL_EXPLAIN': False,
  # Enable data_split_col if you want to use custom data split.
  # Details on AUTOML data split column:
  # https://cloud.google.com/automl-tables/docs/prepare#split
  # 'DATA_SPLIT_COL': 'data_split',
  'OPTIMIZATION_OBJECTIVE': 'MAXIMIZE_AU_ROC'
}

## Train the model

First, we initialize `PropensityModel` with config parameters.

In [None]:
# Bind the BigQuery helper and the training configuration to a model object.
propensity_model = model.PropensityModel(bq_utils=bq_utils, params=params)

The next cell triggers a model training job in BigQuery, which takes some time to finish depending on the dataset size and model complexity. Set `verbose=True` if you want to verify the training query details.

In [None]:
# Start the BigQuery ML training job. This call can take a long time (the
# AutoML budget configured above is one hour). Switch to verbose=True to
# inspect the training query details.
propensity_model.train(verbose=False)

The following cell shows detailed information about the input features used to train the model. It provides the following columns:
- input — The name of the column in the input training data.
- min — The sample minimum. This column is NULL for non-numeric inputs.
- max — The sample maximum. This column is NULL for non-numeric inputs.
- mean — The average. This column is NULL for non-numeric inputs.
- stddev — The standard deviation. This column is NULL for non-numeric inputs.
- category_count — The number of categories. This column is NULL for non-categorical columns.
- null_count — The number of NULLs.

For more details refer to [help page](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-feature).

In [None]:
# Display per-feature statistics (min, max, mean, stddev, category_count,
# null_count) for the inputs of the trained model.
propensity_model.get_feature_info()

### Evaluate the model
This section provides a quick model evaluation that reports the following model metrics:

*  recall
*  accuracy
*  f1_score
*  log_loss
*  roc_auc

Two optional parameters can be specified for evaluation:

* eval_table: BigQuery table containing evaluation dataset
* threshold: Custom probability threshold to be used for evaluation (to binarize the predictions). Default value is 0.5.

If neither of these options is specified, the model is evaluated on the evaluation dataset split created during training, with the default threshold of 0.5.

**NOTE:** This evaluation provides basic model performance metrics. For a thorough evaluation, refer to the [5. Model evaluation notebook](5.model_evaluation_and_diagnostics.ipynb).

TODO(): Add sql code to calculate the proportion of positive examples in the evaluation dataset to be used as the *threshold*.

In [None]:
# Evaluate on the model development dataset, i.e. the same data the final
# model has been trained on.

EVAL_TABLE_NAME = FEATURES_DEV_TABLE

eval_params = {
  # Fully qualified BigQuery table to evaluate against.
  'eval_table_path': f'{PROJECT_ID}.{DATASET_NAME}.{EVAL_TABLE_NAME}',
  # Probability cut-off used to binarize the predictions.
  'threshold': 0.5,
}
propensity_model.evaluate(eval_params, verbose=False)

In [None]:
# Evaluate on the held-out test dataset, which was not used for training.

EVAL_TABLE_NAME = FEATURES_TEST_TABLE

eval_params = {
  # Fully qualified BigQuery table to evaluate against.
  'eval_table_path': f'{PROJECT_ID}.{DATASET_NAME}.{EVAL_TABLE_NAME}',
  # Probability cut-off used to binarize the predictions.
  'threshold': 0.5,
}
propensity_model.evaluate(eval_params, verbose=False)

## Next

Use the [5. Model evaluation notebook](5.model_evaluation_and_diagnostics.ipynb) to get detailed performance metrics of the model and decide whether the model actually solves the business problem.