In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 4. Model Training

This notebook demonstrates how to train an LTV Model using BigQuery ML.

### Install and import required modules

In [None]:
# Uncomment to install required python modules
# !sh ../utils/setup.sh

In [None]:
# Add custom utils module to Python environment
import os
import sys
sys.path.append(os.path.abspath(os.pardir))

import numpy as np
import pandas as pd

from gps_building_blocks.cloud.utils import bigquery as bigquery_utils
from utils import model
from utils import helpers

### Set parameters

In [None]:
# Load the notebook configuration from `config.yaml` through the project helper
# and keep a handle on its `destination` section, which supplies the values
# below.
# NOTE(review): 'config.yaml' is a relative path; presumably it is resolved
# against the notebook's working directory - confirm in utils/helpers.py.
configs = helpers.get_configs('config.yaml')
dest_configs = configs.destination

# GCP project ID
PROJECT_ID = dest_configs.project_id
# Name of the BigQuery dataset
DATASET_NAME = dest_configs.dataset_name

In [None]:
# To distinguish the separate runs of the training pipeline. The value is used
# as a suffix of the table names defined in this cell.
RUN_ID = '01'

# BigQuery table name containing model development dataset
FEATURES_DEV_TABLE = f'features_dev_table_{RUN_ID}'

# BigQuery table name containing out of time test dataset
FEATURES_TEST_TABLE = f'features_test_table_{RUN_ID}'

In [None]:
# Initialize the BigQuery client wrapper for PROJECT_ID. The following cells run
# their SQL through `bq_utils.run_query(...)`.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

In [None]:
# Name of the label column. Defined first so that the exclusion list below is
# guaranteed to drop the target from the feature list.
TARGET_COLUMN = 'label'

# Read in Features table schema to select feature names for model training.
# The column names come from the dataset's INFORMATION_SCHEMA.COLUMNS view,
# filtered to the development features table.
sql = ("SELECT column_name "
       f"FROM `{PROJECT_ID}.{DATASET_NAME}`.INFORMATION_SCHEMA.COLUMNS "
       f"WHERE table_name='{FEATURES_DEV_TABLE}';")

print(sql)
features_schema = bq_utils.run_query(sql).to_dataframe()

# Columns to remove from the feature list (non-feature columns, including the
# target column).
to_remove = ['window_start_ts', 'window_end_ts', 'snapshot_ts', 'user_id',
             TARGET_COLUMN, 'key', 'data_split']

# Selected features for model training
training_features = [v for v in features_schema['column_name']
                     if v not in to_remove]

# Fail fast: the schema query returns no rows (rather than an error) when the
# table name does not match, e.g. because of a wrong RUN_ID. Without this check
# the problem would only surface later, inside model training.
if not training_features:
  raise ValueError(
      f'No feature columns found for table `{FEATURES_DEV_TABLE}` in '
      f'`{PROJECT_ID}.{DATASET_NAME}`. Check RUN_ID and that the features '
      'table has been created.')

print('Number of training features:', len(training_features))
print(training_features)


FEATURE_COLUMNS = training_features

## Model Training

### BQML Regression

In [None]:
# Configure a BQML Regression model to be trained on the development dataset.
# This cell only builds the parameters and the model wrapper; training itself
# is started by `ltv_model.train(...)` in the next cell.

# Output BQML model name to save in BigQuery
BQML_MODEL_NAME = f'ltv_model_bqml_{RUN_ID}'

# Set model parameters.
# NOTE(review): presumably the lower-case keys are consumed by
# `model.LTVModel` itself and the upper-case keys are forwarded as BigQuery ML
# CREATE MODEL options - confirm in utils/model.py.
bqml_params = {
  'model_path': f'{PROJECT_ID}.{DATASET_NAME}.{BQML_MODEL_NAME}',
  'features_table_path': f'{PROJECT_ID}.{DATASET_NAME}.{FEATURES_DEV_TABLE}',
  'feature_columns': FEATURE_COLUMNS,
  'target_column': TARGET_COLUMN,
  'MODEL_TYPE': 'LINEAR_REG',
  'L2_REG': 0.0001,
  'MAX_ITERATIONS': 50,
  'LEARN_RATE_STRATEGY': 'LINE_SEARCH',
  'EARLY_STOP': True,
  'MIN_REL_PROGRESS': 0.001,
  # Use data_split_col if you want to use custom data split.
  # Details on AUTO_SPLIT:
  # https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-glm#data_split_method
  # 'CUSTOM' - split data using a customer provided column of type BOOL.
  'DATA_SPLIT_METHOD': 'AUTO_SPLIT',
  # 'DATA_SPLIT_COL': 'data_split',
  # TODO(): Adopt BOOL type data_split_col
  'ENABLE_GLOBAL_EXPLAIN': True,
  # NOTE(review): OPTIMIZATION_OBJECTIVE is listed in the BigQuery ML docs as
  # an AutoML Tables option - confirm how utils/model.py handles it for
  # LINEAR_REG.
  'OPTIMIZATION_OBJECTIVE': 'MINIMIZE_RMSE'
}

# Instantiate the model wrapper (no training happens here).
# NOTE(review): `bq_utils` was already created earlier in the notebook; this
# line re-creates an equivalent client for the same PROJECT_ID.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)
ltv_model = model.LTVModel(bq_utils=bq_utils,
                           params=bqml_params)

In [None]:
# Train the BQML model configured in the previous cell.
# NOTE(review): `verbose=False` presumably suppresses progress output - confirm
# in utils/model.py.
ltv_model.train(verbose=False)

In [None]:
# Display the feature information reported by the model wrapper for the BQML
# model (the cell output is whatever `get_feature_info` returns; see
# utils/model.py for its exact contents).
ltv_model.get_feature_info()

In [None]:
# Extract the performance on the test partition of the development dataset.
# No input table is passed to ML.EVALUATE here, so the metrics are computed on
# the data BigQuery ML set aside during training (see DATA_SPLIT_METHOD in the
# model parameters).
sql = f"""SELECT * FROM
        ML.EVALUATE(MODEL `{PROJECT_ID}.{DATASET_NAME}.{BQML_MODEL_NAME}`);"""

# Show the exact query, then display the evaluation metrics as the cell output.
print(sql)
bq_utils.run_query(sql).to_dataframe()

In [None]:
# Extract the performance on the out of time test dataset: every row of
# FEATURES_TEST_TABLE is passed to ML.EVALUATE as the evaluation input.
sql =  f"""SELECT * FROM
          ML.EVALUATE(MODEL `{PROJECT_ID}.{DATASET_NAME}.{BQML_MODEL_NAME}`,
          (SELECT * FROM `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_TEST_TABLE}`));"""

# Show the exact query, then display the evaluation metrics as the cell output.
print(sql)
bq_utils.run_query(sql).to_dataframe()

### AutoML Model

In [None]:
# Configure an AutoML model to be trained on the development dataset.
# This cell only builds the parameters and the model wrapper; training itself
# is started by `ltv_model.train(...)` in the next cell.

# Output AutoML model name to save in BigQuery
AUTOML_MODEL_NAME = f'ltv_model_automl_{RUN_ID}'

# Set model parameters.
# NOTE(review): presumably the lower-case keys are consumed by
# `model.LTVModel` itself and the upper-case keys are forwarded as BigQuery ML
# CREATE MODEL options - confirm in utils/model.py.
automl_params = {
  'model_path': f'{PROJECT_ID}.{DATASET_NAME}.{AUTOML_MODEL_NAME}',
  'features_table_path': f'{PROJECT_ID}.{DATASET_NAME}.{FEATURES_DEV_TABLE}',
  'feature_columns': FEATURE_COLUMNS,
  'target_column': TARGET_COLUMN,
  'MODEL_TYPE': 'AUTOML_REGRESSOR',
  # Enable data_split_col if you want to use custom data split.
  # Details on AUTOML data split column:
  # https://cloud.google.com/automl-tables/docs/prepare#split
  # 'DATA_SPLIT_COL': 'data_split',
  # TODO(): Adopt BOOL type data_split_col
  'OPTIMIZATION_OBJECTIVE': 'MINIMIZE_RMSE',
  # Training budget in hours (per the BigQuery ML AutoML options).
  'BUDGET_HOURS': 1.0
}

# Instantiate the model wrapper (no training happens here).
# NOTE(review): this rebinds `bq_utils` to a new client for the same PROJECT_ID
# and rebinds `ltv_model`, so the BQML wrapper from the previous section is no
# longer reachable through that name.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)
ltv_model = model.LTVModel(bq_utils=bq_utils,
                           params=automl_params)

In [None]:
# Train the AutoML model configured in the previous cell. The parameters set
# BUDGET_HOURS to 1.0, so expect this cell to be long-running.
# NOTE(review): `verbose=False` presumably suppresses progress output - confirm
# in utils/model.py.
ltv_model.train(verbose=False)

In [None]:
# Display the feature information reported by the model wrapper for the AutoML
# model (the cell output is whatever `get_feature_info` returns; see
# utils/model.py for its exact contents).
ltv_model.get_feature_info()

In [None]:
# Extract the performance on the test partition of the development dataset.
# No input table is passed to ML.EVALUATE here, so the metrics are computed on
# the data set aside during training rather than on a table chosen in this
# notebook.
sql = f"""SELECT * FROM
        ML.EVALUATE(MODEL `{PROJECT_ID}.{DATASET_NAME}.{AUTOML_MODEL_NAME}`);"""

# Show the exact query, then display the evaluation metrics as the cell output.
print(sql)
bq_utils.run_query(sql).to_dataframe()

In [None]:
# Extract the performance on the out of time test dataset: every row of
# FEATURES_TEST_TABLE is passed to ML.EVALUATE as the evaluation input.
sql =  f"""SELECT * FROM
          ML.EVALUATE(MODEL `{PROJECT_ID}.{DATASET_NAME}.{AUTOML_MODEL_NAME}`,
          (SELECT * FROM `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_TEST_TABLE}`));"""

# Show the exact query, then display the evaluation metrics as the cell output.
print(sql)
bq_utils.run_query(sql).to_dataframe()