In [None]:
# Copyright 2021 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 7. Batch Scoring Based on Pretrained Propensity Model

This notebook demonstrates how to create a scoring dataset and use it to predict probabilities based on a pretrained propensity model.

## Requirements

This notebook requires a pretrained model stored in BigQuery. Such a model can be created using the [4. Model training notebook](4.model_training.ipynb).

## Install and import required modules

In [None]:
# Install the gps_building_blocks package if it is not already installed.
# %pip install gps_building_blocks

In [None]:
import os

from utils import model
from gps_building_blocks.ml.data_prep.ml_windowing_pipeline import ml_windowing_pipeline
from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

## Step 1. Run Scoring Dataset Generation Pipeline

The following executes MLWP's [Running Prediction Pipeline](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline#running-prediction-pipeline) to generate the dataset for scoring. For the features, make sure to use the same parameters that were used when creating the training ML dataset. For the detailed parameters of `run_prediction_pipeline.py`, refer to [Step 4. Run Features Pipeline](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline#step-4-run-features-pipeline) of MLWP.

Before generating the scoring dataset, first configure the following variables based on your GCP project:

In [None]:
# GCP Project ID
PROJECT_ID = 'project-id'
# BigQuery dataset name
DATASET_NAME = 'dataset'
# BigQuery table name containing the original data to create the scoring
# dataset from.
# Example: 'bigquery-public-data.google_analytics_sample.ga_sessions_*' for
# the Google Merchandise Store GA360 dataset
DATA_TABLE_NAME = 'table'

# Identifier used to distinguish separate runs of the MLWP
RUN_ID = '01_score'
# Snapshot date to make predictions from
SNAPSHOT_DATE = '2021-01-01'
# Length of the lookback window in days.
# This should be the same as in the training ML dataset creation step
LOOKBACK_WINDOW_SIZE_IN_DAYS = 7
# Days between the end of the lookback window and the snapshot date.
# This should be the same as in the training ML dataset creation step
LOOKBACK_WINDOW_GAP_IN_DAYS = 0
# Local dir for MLWP SQL templates in case customized SQL was used to create
# the training ML dataset. Built with os.path.join so the path separator is
# correct on every platform instead of a hard-coded '/'.
LOCAL_TEMPLATE_DIR = os.path.join(os.getcwd(), 'templates')

In [None]:
# Column spec fed to the sum/avg/min/max aggregations.
numeric_agg_columns = 'totals_visits;totals_pageviews'
# Column spec fed to the count/latest/proportions/mode aggregations.
categorical_agg_columns = (
    'trafficSource_medium:[cpm,cpc,referral,affiliate,organic]:[Other];'
    'device_isMobile:[false,true]:[Other]'
)

# Parameters for the MLWP prediction pipeline. The feature settings should
# mirror the ones used when the training ML dataset was created.
scoring_parameters = {
    'run_id': RUN_ID,
    'project_id': PROJECT_ID,
    'dataset_id': DATASET_NAME,
    'analytics_table': DATA_TABLE_NAME,
    'snapshot_date': SNAPSHOT_DATE,
    'lookback_window_size_in_days': LOOKBACK_WINDOW_SIZE_IN_DAYS,
    'lookback_window_gap_in_days': LOOKBACK_WINDOW_GAP_IN_DAYS,
    'templates_dir': LOCAL_TEMPLATE_DIR,
    'sessions_sql': 'sessions_google_analytics.sql',  # or sessions_firebase.sql
    'features_sql': 'features_from_input.sql',
    'sum_values': numeric_agg_columns,
    'avg_values': numeric_agg_columns,
    'min_values': numeric_agg_columns,
    'max_values': numeric_agg_columns,
    'count_values': categorical_agg_columns,
    'latest_values': categorical_agg_columns,
    'proportions_values': categorical_agg_columns,
    'mode_values': categorical_agg_columns,
}

# Generate the scoring dataset in BigQuery.
ml_windowing_pipeline.run_prediction_pipeline(scoring_parameters)

## Step 2. Run Batch Score

In [None]:
# Initialize the BigQuery client wrapper for PROJECT_ID. It is reused below to
# query the model's feature info and is passed to the PropensityModel for
# batch scoring.
bq_client = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

In [None]:
# Configure the features input table to be scored, the output table for the
# predictions and the name of the pretrained model. All three names are
# combined with PROJECT_ID and DATASET_NAME when the scoring params are built.
# NOTE(review): `features_{RUN_ID}` presumably matches the table written by the
# prediction pipeline in Step 1 -- confirm against the MLWP output naming.
FEATURES_TABLE_NAME = f'features_{RUN_ID}'
PREDICTION_TABLE_NAME = f'scored_{RUN_ID}'
MODEL_NAME = 'test_model'

In [None]:
# Look up the input features of the pretrained model via ML.FEATURE_INFO.
# This makes it easy to verify which features the propensity model was
# trained on before scoring.
sql = f"""
  SELECT
    input
  FROM ML.FEATURE_INFO(MODEL `{PROJECT_ID}.{DATASET_NAME}.{MODEL_NAME}`);
"""
# Run the query and flatten the `input` column into a plain Python list.
expected_features = bq_client.run_query(sql).to_dataframe()['input'].tolist()
print(f'Model expects following input features: \n {expected_features}')

In [None]:
# Parameters handed to the PropensityModel object for batch prediction
# (distinct from `scoring_parameters`, which configures the MLWP run).
# If binary predictions are required, additionally set a `threshold` entry.
# Example: {'threshold': 0.5}.
dataset_path = f'{PROJECT_ID}.{DATASET_NAME}'
scoring_params = {
    'model_path': f'{dataset_path}.{MODEL_NAME}',
    'features_table_path': f'{dataset_path}.{FEATURES_TABLE_NAME}',
    'output_table_path': f'{dataset_path}.{PREDICTION_TABLE_NAME}',
    'feature_columns': expected_features,
}

In [None]:
# Build the propensity model wrapper around the BigQuery client and execute
# the batch prediction job with the scoring params defined above.
# NOTE(review): `overwrite_table=True` presumably replaces an existing output
# table on re-runs -- confirm against utils.model.PropensityModel.predict.
propensity_model = model.PropensityModel(
    bq_client=bq_client,
    params=scoring_params,
)
propensity_model.predict(
    params=scoring_params,
    verbose=False,
    overwrite_table=True,
)