In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

This notebook demonstrates the generation of a propensity audience for a remarketing use case. It relies on the sample size calculations from the [6.media_experiment_design.ipynb](https://source.corp.google.com/piper///depot/google3/third_party/professional_services/solutions/compass/packages/propensity/6.media_experiment_design.ipynb) notebook to create the Test and Control audiences, which are written to a new BigQuery table. This data can then be uploaded via Measurement Protocol to GA and used for activation with Google Ads products, as demonstrated in the [9.audience_upload.ipynb](google3/third_party/professional_services/solutions/compass/packages/propensity/9.audience_upload.ipynb) notebook.

**Requirements:**


* An already scored dataset from the [7.batch_scoring.ipynb](google3/third_party/professional_services/solutions/compass/packages/propensity/7.batch_scoring.ipynb) notebook: this is the model prediction dataset containing the ML prediction for each `user_id` and `snapshot_ts`, from which we create the remarketing audience.
* Statistical sample size calculations from the [6.media_experiment_design.ipynb](https://source.corp.google.com/piper///depot/google3/third_party/professional_services/solutions/compass/packages/propensity/6.media_experiment_design.ipynb) notebook for each propensity audience group.

## Install and import required modules

In [None]:
# Uncomment to install required python modules
# !sh ../utils/setup.sh

In [None]:
import numpy as np
import pandas as pd
import random

from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

import utils

## Set parameters

In [None]:
# Load the notebook configuration from config.yaml and keep a handle on its
# `destination` section, which supplies the project and dataset used below.
configs = utils.get_configs('config.yaml')
dest_configs = configs.destination

# GCP project ID
PROJECT_ID = dest_configs.project_id
# Name of the BigQuery dataset: the destination for the tables created for
# modelling and activation.
DATASET_NAME = dest_configs.dataset_name

In [None]:
# To distinguish the separate runs of the MWLP. The value is embedded in the
# names of both the predictions table and the audience export table below.
RUN_ID = 'SCORE-01'
# BigQuery table name containing the predictions (e.g. generated by
# 7.batch_scoring.ipynb notebook)
PREDICTIONS_TABLE = f'scored_{RUN_ID}'
# Snapshot date to select the ML instances to create the marketing audience in
# YYYY-MM-DD format
SELECTED_SNAPSHOT_DATE = '2017-06-15'
# Name of the column in the predictions table with the predicted label values.
# The query below UNNESTs this column and reads `label` and `prob` from each
# element.
PREDICTED_LABEL_NAME = 'predicted_label_probs'
# Label value for the positive class. It is interpolated as-is into the SQL
# filter on `probs.label`.
POSITIVE_CLASS_LABEL = True
# Number of propensity audience groups to divide the scored users into
# (e.g. 3 bins for High, Medium and Low propensity audience groups)
AUDIENCE_GROUPS = 3
# Minimum sample sizes to select as the Test and Control groups for each of the
# propensity audience groups based on the output of the
# 6.media_experiment_design.ipynb notebook (following are some example numbers).
# Entry i applies to audience group i, and that many users are drawn for
# Control and again for Test.
MIN_SAMPLE_SIZES = [1000, 2000, 3000]
# Name of the BigQuery table with exported audience
AUDIENCE_EXPORT_TABLE = f'audience_export_{RUN_ID}'

In [None]:
# BigQuery helper bound to PROJECT_ID; used below to run the prediction query.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

### Read the prediction dataset

In this step, we assume the prediction dataset is available as a BigQuery table.

In [None]:
# Query for extracting the prediction dataset when using BQML: unnest the
# per-label probabilities and keep only the positive-class entry of each row
# for the selected snapshot date.
predictions_table_path = f'{PROJECT_ID}.{DATASET_NAME}.{PREDICTIONS_TABLE}'
sql = f"""
  SELECT
    user_id,
    snapshot_ts,
    days_since_latest_activity,
    days_since_first_activity,
    probs.label AS predicted_score_label,
    probs.prob AS score
  FROM
    `{predictions_table_path}` AS predictions,
    UNNEST({PREDICTED_LABEL_NAME}) AS probs
  WHERE
    probs.label={POSITIVE_CLASS_LABEL}
    AND snapshot_ts='{SELECTED_SNAPSHOT_DATE}';
"""
# Echo the rendered query so it can be inspected before the results arrive.
print(sql)
query_result = bq_utils.run_query(sql)
df_prediction = query_result.to_dataframe()

In [None]:
# Preview the first rows of the prediction dataset.
df_prediction.head()

In [None]:
# If required, the users can be filtered by using days_since_latest_activity
# (recency) and days_since_first_activity (tenure) columns before creating the
# audience groups

### Create audience groups


In [None]:
# Separate the users into <AUDIENCE_GROUPS> number of audience groups.
# Sort by descending score and renumber the rows 0..N-1. drop=True discards the
# previous index instead of inserting it as an extra 'index' column: that column
# would otherwise be exported with the audience table and, because this cell
# reassigns df_prediction, would make re-running the cell fail once pandas runs
# out of free names ('index', then 'level_0') for the inserted column.
df_prediction = df_prediction.sort_values(
    by='score', ascending=False).reset_index(drop=True)
# To avoid duplicate edges of bins we use the index in the qcut function below
df_prediction['audience_group'] = pd.qcut(df_prediction.index,
                                          q=AUDIENCE_GROUPS, labels=False)
# When AUDIENCE_GROUPS=3, audience_group column contains 0, 1 and 2 values
# representing 'High', 'Medium' and 'Low' propensity groups respectively

# Separate each audience group into Test and Control.
# A dedicated, seeded generator makes the split reproducible for the same
# df_prediction without touching the global `random` state. Change the seed to
# draw a different split.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

df_prediction['test_control'] = 'NA'
for i, min_sample_size in enumerate(MIN_SAMPLE_SIZES):
  # Distinct user ids of this audience group, in row order. A list is kept
  # (rather than round-tripping through a set) so that the order fed to the
  # sampler does not depend on per-process hash ordering.
  group_user_ids = (
      df_prediction.loc[df_prediction['audience_group'] == i, 'user_id']
      .drop_duplicates().tolist())
  # Fail early with an actionable message instead of the generic error that
  # random.sample raises when asked for more items than are available.
  if len(group_user_ids) < 2 * min_sample_size:
    raise ValueError(
        f'Audience group {i} has {len(group_user_ids)} users, fewer than the '
        f'{2 * min_sample_size} needed for Test and Control groups of '
        f'{min_sample_size} users each.')

  # Select Control set size based on the minimum sample size
  control_user_ids = rng.sample(group_user_ids, min_sample_size)
  control_user_id_set = set(control_user_ids)
  remaining_user_ids = [user_id for user_id in group_user_ids
                        if user_id not in control_user_id_set]

  # Select Test set size based on the minimum sample size
  test_user_ids = rng.sample(remaining_user_ids, min_sample_size)
  # Alternatively, select Test set to include all the remaining users as below
  # or a subset of users greater than min_sample_size depending on the
  # available campaign budget
  # test_user_ids = remaining_user_ids

  df_prediction.loc[df_prediction['user_id'].isin(test_user_ids),
                    'test_control'] = 'Test'
  df_prediction.loc[df_prediction['user_id'].isin(control_user_ids),
                    'test_control'] = 'Control'

# Summarise the size of every audience_group / test_control segment together
# with the distribution of its predicted probabilities. Each Test and Control
# pair is expected to show similar statistics.
score_statistics = ['count', 'min', 'mean', 'median', 'max']
segment_keys = ['audience_group', 'test_control']
df_prediction.groupby(segment_keys).agg({'score': score_statistics})

# TODO(): Add box plots to visualize the probabilities of Test and
# Control groups

### Write the audience data to a BigQuery table

In [None]:
# Write the audience dataframe to the export table in BigQuery.
# if_exists='replace' overwrites a table of the same name from an earlier run.
export_table_id = f'{DATASET_NAME}.{AUDIENCE_EXPORT_TABLE}'
df_prediction.to_gbq(destination_table=export_table_id,
                     project_id=PROJECT_ID,
                     if_exists='replace')