In [None]:
# Copyright 2021 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

This notebook demonstrates how to generate a propensity audience. It exports the audience, segmented into Test and Control groups, to a BigQuery table. This data can then be uploaded to Google Analytics (GA) via the Measurement Protocol and used for activation with Google Ads products.

**Requirements:**
* An already scored dataset.
* This is the prediction dataset. It contains ML instances keyed by `user_id` and `snapshot_ts`, from which we can select the audience to target.

## Install and import required modules

In [None]:
# Uncomment to install required python modules
# !pip install -r requirements.txt -q

In [None]:
import numpy as np
import pandas as pd
from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

## Set parameters

In [None]:
# GCP project ID that hosts the BigQuery dataset used by this notebook.
PROJECT_ID = 'project'
# Name of the BigQuery dataset that holds the prediction table and receives
# the exported audience table.
DATASET = 'dataset'
# Name of the BigQuery table containing the predictions.
PREDICTIONS_TABLE = 'test_prediction_table'
# Snapshot date used to select the ML instances (compared against
# `snapshot_ts` in the query below). Format: YYYY-MM-DD.
SELECTED_SNAPSHOT_DATE = '2021-05-31'
# Name of the repeated column in the prediction table that is unnested in the
# query below to obtain the per-class `label` and `prob` values.
PREDICTED_LABEL_NAME = 'predicted_label_probs'
# Label value of the positive class. It is interpolated unquoted into the SQL
# filter `probs.label=...`.
POSITIVE_CLASS_LABEL = True
# Name of the BigQuery table that receives the exported audience.
AUDIENCE_EXPORT_TABLE = 'audience_export'
# Original GA dataset for joining client_id and fullvisitor_id,
# e.g. bigquery-public-data.google_analytics_sample.ga_sessions_* for the
# Google Merchandise Store GA360 dataset.
# NOTE(review): not referenced by any cell visible in this notebook section.
SOURCE_DATA = 'project.dataset.ga_sessions_*'

In [None]:
# BigQuery helper bound to PROJECT_ID; used below to run the prediction query.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

## Read the Prediction Test Dataset (if already scored)

In this step, we assume the prediction dataset is available as a BigQuery table.

In [None]:
# Query for a BQML-style prediction table: the per-class probability array is
# flattened with UNNEST and only the entries for POSITIVE_CLASS_LABEL on the
# selected snapshot date are kept, giving one `score` per returned row.
sql = f"""
  SELECT
    user_id,
    snapshot_ts,
    days_since_latest_activity,
    days_since_first_activity,
    probs.label AS predicted_score_label,
    probs.prob AS score
  FROM
    `{PROJECT_ID}.{DATASET}.{PREDICTIONS_TABLE}` AS predictions,
    UNNEST({PREDICTED_LABEL_NAME}) AS probs
  WHERE
    probs.label={POSITIVE_CLASS_LABEL}
    AND snapshot_ts='{SELECTED_SNAPSHOT_DATE}';
"""
# Echo the rendered query so the substituted parameters can be checked.
print(sql)
# Run the query, then materialise the result as a pandas DataFrame.
query_result = bq_utils.run_query(sql)
df_prediction = query_result.to_dataframe()

In [None]:
# Preview the prediction rows returned by the query.
df_prediction

### Add quantiles to dataframe with user_ids

In [None]:
# Set the number of quantiles that you want to divide users into.
number_bins = 10

In [None]:
# Score expressed as a percentage, shared by both fixed-width binnings.
score_pct = df_prediction['score'] * 100

# Fixed-width bins of 10 and of 1 percentage point(s).
# `include_lowest=True` keeps rows whose score is exactly 0 inside the first
# bin; with pd.cut's default right-closed intervals such rows fall outside
# (0, 10] / (0, 1] and would silently be labelled NaN.
df_prediction['bins_10'] = pd.cut(
    score_pct, bins=np.arange(0, 101, 10), include_lowest=True)
df_prediction['bins_100'] = pd.cut(
    score_pct, bins=np.arange(0, 101, 1), include_lowest=True)

# Equal-frequency quantiles of the raw score: first as integer codes
# (0 = lowest-scoring bucket, number_bins - 1 = highest), then as the
# corresponding score intervals for readability.
df_prediction['score_quantile'] = pd.qcut(
    df_prediction['score'], q=number_bins, labels=False)
df_prediction['score_quantile_labels'] = pd.qcut(
    df_prediction['score'], q=number_bins)

In [None]:
# Preview the predictions together with the bin and quantile columns.
df_prediction

## Check scores

In [None]:
# Check score distribution.
df_prediction.score.hist(bins=100)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the scores.
df_prediction['score'].describe()

In [None]:
# Per-quantile overview: number of rows, quantile interval label and the
# min / max / mean score, plus each quantile's share of all rows.
segments = ['score_quantile']
by_quantile = df_prediction.groupby(segments)
quantile_scores = by_quantile['score']

df_summary = by_quantile[['user_id']].count()
df_summary['score_quantile_labels'] = (
    by_quantile['score_quantile_labels'].min())
df_summary['score_min'] = quantile_scores.min()
df_summary['score_max'] = quantile_scores.max()
df_summary['score_mean'] = quantile_scores.mean()

# Share of rows per quantile, in percent.
total_records = df_summary['user_id'].sum()
df_summary['records_pct'] = (df_summary['user_id'] / total_records) * 100

df_summary

## Exporting data audience to the table

### Creating control and test groups


This is a very simple illustrative method of splitting users into Test and Control groups.
If you are going to do multiple refresh uploads, it is strongly advised to use another method based on unique fingerprinting of users and tracking their participation in the Control/Test groups. We can also score and upload only new users that have not been assigned before.

In [None]:
# Number of users to draw at random into the Control group.
# Series.sample raises ValueError if this exceeds the number of rows.
n_control = 10000
# Fixed seed so that re-running the notebook reproduces the same Control/Test
# split; an unseeded sample assigns different users to Control on every run.
control_sample_seed = 42
control_ids = df_prediction['user_id'].sample(
    n_control, random_state=control_sample_seed).values
control_ids

In [None]:
# Flag each row by group membership: users whose user_id is in the randomly
# drawn control_ids are assigned to Control (True); all remaining users are
# assigned to Test (False).
df_prediction['control'] = df_prediction['user_id'].isin(control_ids)
df_prediction

In [None]:
# Lowest score quantile (inclusive) that is selected for upload.
quantile_threshold = 9

# The export contains every Control user plus every user at or above the
# quantile threshold. Users above the threshold that were drawn into Control
# stay in Control; the remaining ones form the Test group.
in_top_quantiles = df_prediction['score_quantile'] >= quantile_threshold
# `control` is already a boolean column (result of `isin`), so it can be used
# as a mask directly.
in_control = df_prediction['control']
mask = in_top_quantiles | in_control
df_selected = df_prediction[mask]

In [None]:
# Preview the rows selected for export (Control users plus top quantiles).
df_selected

### Checking stats by control-test split

By looking at the mean score difference between Test and Control groups we can get an idea about the performance we can expect and select the groups accordingly.

In [None]:
# Compare Control (True) and Test (False): number of rows, score
# min / max / mean / median and each group's share of the selected rows.
segments = ['control']
by_group = df_selected.groupby(segments)
group_scores = by_group['score']

df_summary = by_group[['user_id']].count()
df_summary['score_min'] = group_scores.min()
df_summary['score_max'] = group_scores.max()
df_summary['score_mean'] = group_scores.mean()
df_summary['score_median'] = group_scores.median()

# Share of selected rows per group, in percent.
total_selected = df_summary['user_id'].sum()
df_summary['records_pct'] = (df_summary['user_id'] / total_selected) * 100
df_summary

In [None]:
# How many times better are the average scores in the Test group than Control.
df_summary['score_mean'][False] / df_summary['score_mean'][True]

### Export data for experiment to the BigQuery table

In [None]:
# Columns written to the audience export table; the final expression previews
# exactly the data that will be exported.
cols = ['user_id',
        'snapshot_ts',
        'score',
        'days_since_first_activity',
        'days_since_latest_activity',
        'score_quantile',
        'control']
df_selected[cols]

In [None]:
# Export destination in `dataset.table` form; the project is passed
# separately to the export call.
destination_table = f'{DATASET}.{AUDIENCE_EXPORT_TABLE}'
destination_table

In [None]:
# Write the selected audience to BigQuery. `if_exists='replace'` overwrites
# any existing table with the same name, so re-running this cell replaces the
# previously exported audience.
# NOTE(review): DataFrame.to_gbq is deprecated since pandas 2.2 in favour of
# pandas_gbq.to_gbq - consider migrating when the pandas version is bumped.
df_selected[cols].to_gbq(
    destination_table=destination_table,
    project_id=PROJECT_ID,
    if_exists='replace')