In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 7. Batch Scoring Based on Pretrained Propensity Model

This notebook demonstrates how to create a scoring dataset and use it to predict probabilities based on a pretrained propensity model.

### Requirements

This notebook requires a pretrained model stored in BigQuery. Such a model can be created using the [4. Model training notebook](4.model_training.ipynb).

### Install and import required modules

In [None]:
# Uncomment to install required python modules
# !sh ../utils/setup.sh

In [None]:
# Add custom utils module to Python environment.
# The absolute path of the parent of the current working directory is appended
# to sys.path; this has to run before the `from utils import ...` lines below.
import os
import sys
sys.path.append(os.path.abspath(os.pardir))

from gps_building_blocks.ml.data_prep.ml_windowing_pipeline import ml_windowing_pipeline
from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

from utils import model
from utils import helpers

### Step 1. Run scoring dataset generation pipeline

The following cells execute MLWP's [Running Prediction Pipeline](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline#running-prediction-pipeline) to generate the dataset for scoring. For features, make sure to use the same parameters that were used when creating the training ML dataset. For the detailed parameters of `run_prediction_pipeline.py`, refer to [Step 4. Run Features Pipeline](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline#step-4-run-features-pipeline) of MLWP.

Before generating the scoring dataset, first configure the following variables based on your GCP project:

In [None]:
# Load the notebook settings from config.yaml and pull out the three sections
# used below: source data, destination, and run IDs.
configs = helpers.get_configs('config.yaml')
source_configs = configs.source
dest_configs = configs.destination
run_id_configs = configs.run_id

# GCP project ID (taken from the destination section of the config)
PROJECT_ID = dest_configs.project_id
# Name of the BigQuery dataset (taken from the destination section)
DATASET_NAME = dest_configs.dataset_name
# Full path (project.dataset.table) of the BigQuery table containing the
# original data used to create the scoring dataset.
# Example: 'bigquery-public-data.google_analytics_sample.ga_sessions_*' for
# the Google Merchandise Store GA360 dataset
SOURCE_TABLE_PATH = (f'{source_configs.project_id}'
                     f'.{source_configs.dataset_name}'
                     f'.{source_configs.table_name}')

In [None]:
# Identifier used to distinguish separate scoring runs (read from the
# `score` entry of the run ID config)
RUN_ID = run_id_configs.score

# Snapshot date to make predictions from (written as YYYY-MM-DD here)
SNAPSHOT_DATE = '2017-06-15'
# Length of the lookback window in days.
# This should be the same as in the training ML dataset creation step
LOOKBACK_WINDOW_SIZE_IN_DAYS = 30
# Gap in days between the end of the lookback window and the snapshot date.
# This should be the same as in the training ML dataset creation step
LOOKBACK_WINDOW_GAP_IN_DAYS = 1
# Local directory holding MLWP SQL templates, in case customized SQL was used
# to create the training ML dataset. Resolved relative to the current working
# directory.
LOCAL_TEMPLATE_DIR = f'{os.getcwd()}/templates'

In [None]:
# Feature specifications that are shared by several aggregation options below.
# Keeping each one in a single place prevents the copies from drifting apart.
_numeric_values = 'totals_visits;totals_pageviews'
_categorical_values = 'trafficSource_medium:[cpm,cpc,referral,affiliate,organic]:[Other];device_isMobile:[false,true]:[Other]'

# Parameters handed to the MLWP prediction pipeline. The feature options
# should match the ones used when the training ML dataset was created.
scoring_parameters = {
    # Run identification and data locations
    'run_id': RUN_ID,
    'project_id': PROJECT_ID,
    'dataset_id': DATASET_NAME,
    'analytics_table': SOURCE_TABLE_PATH,
    # Windowing configuration
    'snapshot_date': SNAPSHOT_DATE,
    'lookback_window_size_in_days': LOOKBACK_WINDOW_SIZE_IN_DAYS,
    'lookback_window_gap_in_days': LOOKBACK_WINDOW_GAP_IN_DAYS,
    # SQL templates
    'templates_dir': LOCAL_TEMPLATE_DIR,
    'sessions_sql': 'sessions_google_analytics.sql',
    'features_sql': 'features_from_input.sql',
    # Aggregations over numeric values
    'sum_values': _numeric_values,
    'avg_values': _numeric_values,
    'min_values': _numeric_values,
    'max_values': _numeric_values,
    # Aggregations over categorical values
    'count_values': _categorical_values,
    'latest_values': _categorical_values,
    'proportions_values': _categorical_values,
    'mode_values': _categorical_values,
}

ml_windowing_pipeline.run_prediction_pipeline(scoring_parameters)

### Step 2. Run batch scoring

In [None]:
# Initialize the BigQuery helper for PROJECT_ID; the cells below use its
# `run_query` method to execute SQL.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

In [None]:
# Name of the BigQuery table that holds the dataset to be scored
SCORING_FEATURES_TABLE = f'features_{RUN_ID}'
# Name of the BigQuery table that will store the predictions from scoring
SCORING_PREDICTION_TABLE = f'scored_{RUN_ID}'
# Name of the BigQuery ML model, derived from the training run ID
MODEL_NAME = f'propensity_model_{run_id_configs.train}'

In [None]:
# List the input features of the pretrained model via ML.FEATURE_INFO, so the
# features used to train the propensity model can be checked by eye.
sql = f"""
  SELECT
    input
    FROM ML.FEATURE_INFO(MODEL `{PROJECT_ID}.{DATASET_NAME}.{MODEL_NAME}`);
"""
# Keep the query result and the extracted list under separate names.
feature_info = bq_utils.run_query(sql).to_dataframe()
expected_features = feature_info['input'].tolist()
print(f'Model expects following input features: \n {expected_features}')

In [None]:
# Run batch scoring: apply the pretrained model to the scoring features table
# with ML.PREDICT and store the result in the prediction table. The statement
# is CREATE OR REPLACE, so an existing prediction table of that name is
# replaced.
scoring_query = f"""
  CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_NAME}.{SCORING_PREDICTION_TABLE}`
  AS (
    SELECT *
    FROM ML.PREDICT(MODEL `{PROJECT_ID}.{DATASET_NAME}.{MODEL_NAME}`,
                    TABLE `{PROJECT_ID}.{DATASET_NAME}.{SCORING_FEATURES_TABLE}`)
  );
"""
# Show the rendered SQL before executing it.
print(scoring_query)
bq_utils.run_query(scoring_query)