In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 2. ML Data Preparation for Lifetime-value (LTV) Modeling

This notebook demonstrates how to create an ML dataset using the [ML Data Windowing Pipeline (MLWP)](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline) and [Data Visualization](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/data_visualizer) modules.

* MLWP creates a rich ML dataset by extracting a series of data snapshots over time in a faster, easier and cheaper way.
* Data Visualization visualizes the input and output data of MLWP to check their consistency and accuracy, avoiding a garbage-in, garbage-out situation in modeling.

The [Google Merchandise Store GA360 dataset](https://support.google.com/analytics/answer/7586738?hl=en) is used as an example.

## Requirements

* [Google Analytics dataset stored in BigQuery.](https://support.google.com/analytics/answer/3437618?hl=en)

## Install and import required modules

In [None]:
# Uncomment to install required python modules
# !sh ../utils/setup.sh

In [None]:
# Add custom utils module to Python environment
import os
import shutil
import sys
sys.path.append(os.path.abspath(os.pardir))

import inspect
import matplotlib.pyplot as plt

from matplotlib.backends.backend_pdf import PdfPages
from gps_building_blocks.cloud.utils import bigquery as bigquery_utils
from gps_building_blocks.ml import utils as pipeline_utils
from gps_building_blocks.ml.data_prep.ml_windowing_pipeline import ml_windowing_pipeline
from gps_building_blocks.ml.data_prep.data_visualizer import instance_visualizer
from gps_building_blocks.ml.data_prep.data_visualizer import fact_visualizer
from gps_building_blocks.ml.data_prep.data_visualizer import feature_visualizer

from utils import helpers

## Configure MLWP module

The following cells copy all MLWP SQL templates to the current project directory so that they can be customized for your needs.

In [None]:
# MLWP SQL template directory for this project
MLWP_TEMPLATE_DIR = 'mlwp_templates'

In [None]:
# Locate the SQL templates that ship next to the ml_windowing_pipeline module.
templates_dir = os.path.dirname(inspect.getfile(ml_windowing_pipeline))
source_templates = os.path.join(templates_dir, 'templates')

# Copy the templates only when the project copy does not exist yet. Copying
# into an existing directory with `cp -r` would nest a second `templates`
# folder inside it, and overwriting would discard local customizations.
if os.path.isdir(MLWP_TEMPLATE_DIR):
  print(f'{MLWP_TEMPLATE_DIR} already exists; keeping the existing templates.')
else:
  shutil.copytree(source_templates, MLWP_TEMPLATE_DIR)

Next, set up the GCP project details and the MLWP dataset configuration. Refer to [this page](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline) for more details on the configurable MLWP parameters.

In [None]:
# Read the project configuration and split it into its source and destination
# sections.
configs = helpers.get_configs('config.yaml')
source_configs = configs.source
dest_configs = configs.destination

# GCP project ID
PROJECT_ID = dest_configs.project_id
# BigQuery dataset name
DATASET_NAME = dest_configs.dataset_name

# Full path (project.dataset.table) of the BigQuery table containing the
# original data, e.g. bigquery-public-data.google_analytics_sample.ga_sessions_*
# for the Google Merchandise Store GA360 dataset
SOURCE_TABLE_PATH = '{}.{}.{}'.format(
    source_configs.project_id,
    source_configs.dataset_name,
    source_configs.table_name)

In [None]:
# To distinguish the separate runs of the MLWP
RUN_ID = '01'

# First data snapshot date in YYYY-MM-DD format
SNAPSHOT_START_DATE = '2016-11-17'
# Last data snapshot date in YYYY-MM-DD format
SNAPSHOT_END_DATE = '2017-07-01'
# Sliding window length (in days) between snapshots
SLIDE_INTERVAL_IN_DAYS = 7
# Number of days between the snapshot date and the start of the prediction
# window
PREDICTION_WINDOW_GAP_IN_DAYS = 1
# The length of the prediction window in days
PREDICTION_WINDOW_SIZE_IN_DAYS = 14
# Number of days between the end of the lookback window and the snapshot date
LOOKBACK_WINDOW_GAP_IN_DAYS = 1
# The length of the lookback window in days
LOOKBACK_WINDOW_SIZE_IN_DAYS = 30

# Name of the pdf file containing output instance table plots
INSTANCE_TABLE_PLOTS_FILE = 'instance_plots.pdf'
# Name of the pdf file containing output numerical fact plots
NUMERICAL_FACTS_PLOTS_FILE = 'numerical_fact_plots.pdf'
# Name of the pdf file containing output categorical fact plots
CATEGORICAL_FACTS_PLOTS_FILE = 'categorical_fact_plots.pdf'
# Name of the pdf file containing output feature plots
FEATURE_PLOT_FILES = 'feature_plots.pdf'

## Run MLWP and Data Visualization

The following steps are executed below to create and visualize the ML dataset:

*   Step 1: Run Data Extraction Pipeline
*   Step 2: Run Data Exploration Pipeline
*   Step 3: Visualize Instances and Facts
*   Step 4: Run Windowing Pipeline
*   Step 5: Run Feature Generation Pipeline
*   Step 6 (optional): Merge with Customized Features
*   Step 7: Visualize Features

### Step 1. Run Data Extraction Pipeline

This step extracts and formats the original data from the BigQuery table into
several temporary tables for further processing.

This step first requires updating the following SQL template files in the local template directory (`mlwp_templates`):
*   For GA360 data:
    * to define the label definition: [*conversions_google_analytics_regression.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/conversions_google_analytics_regression.sql) and [*prediction_window_conversions_to_label_regression.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/prediction_window_conversions_to_label_regression.sql) (optional)
    * to define the variables to extract (optional): [*sessions_google_analytics.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/sessions_google_analytics.sql)

*   For Firebase data:
    * to define the label definition: [*conversions_firebase.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/conversions_firebase.sql) and [*prediction_window_conversions_to_label_regression.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/prediction_window_conversions_to_label_regression.sql) (optional)
    * to define the variables to extract (optional): [*sessions_firebase.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/sessions_firebase.sql)

For example, the following code derived from  [*conversions_google_analytics_regression.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/conversions_google_analytics_regression.sql) file specifies the variable and value used to define a numerical label counting the life-time value of each client by summing up completed purchase actions.

```sql
CREATE OR REPLACE TABLE `{{conversions_table}}`
AS (
  SELECT DISTINCT
    IFNULL(NULLIF(GaTable.clientId, ''), GaTable.fullVisitorId) AS user_id,
    TIMESTAMP_SECONDS(GaTable.visitStartTime) AS conversion_ts,
    1 AS label
  FROM
    `{{analytics_table}}` AS GaTable, UNNEST(GaTable.hits) AS hits
  WHERE
    hits.eCommerceAction.action_type = '6'  -- Google Analytics code for "Completed purchase"
);
```

For example, the following SQL code (the default setting) in [*prediction_window_conversions_to_label_regression.sql*](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/prediction_window_conversions_to_label_regression.sql) can be used to create a numerical label for the prediction window, where the label is assigned the sum of the purchases that occurred in the prediction window.

```sql
IFNULL(
  (
    SELECT SUM(Conversions.label)
    FROM UNNEST(PredictionWindowConversions.conversions) AS Conversions
  ), 0)
```

Then run data extraction pipeline to extract variables and the label as follows:

In [None]:
# Parameters for the data extraction step. For Firebase data, use
# conversions_firebase_regression.sql and sessions_firebase.sql instead of the
# Google Analytics templates named below.
data_extract_params = dict(
    project_id=PROJECT_ID,
    dataset_id=DATASET_NAME,
    analytics_table=SOURCE_TABLE_PATH,
    conversions_sql='conversions_google_analytics_regression.sql',
    sessions_sql='sessions_google_analytics.sql',
    templates_dir=MLWP_TEMPLATE_DIR,
    run_id=RUN_ID,
)
ml_windowing_pipeline.run_data_extraction_pipeline(data_extract_params)

### Step 2. Run Data Exploration Pipeline

This step outputs facts and instances into BigQuery tables (*numeric_facts_{run_id}*, *categorical_facts_{run_id}* and *instances_{run_id}*) for data exploration and analysis.

In [None]:
# Parameters for the data exploration step: snapshot schedule, prediction
# window and the SQL template that turns conversions into the label.
data_explo_params = dict(
    project_id=PROJECT_ID,
    dataset_id=DATASET_NAME,
    analytics_table=SOURCE_TABLE_PATH,
    snapshot_start_date=SNAPSHOT_START_DATE,
    snapshot_end_date=SNAPSHOT_END_DATE,
    slide_interval_in_days=SLIDE_INTERVAL_IN_DAYS,
    prediction_window_gap_in_days=PREDICTION_WINDOW_GAP_IN_DAYS,
    prediction_window_size_in_days=PREDICTION_WINDOW_SIZE_IN_DAYS,
    templates_dir=MLWP_TEMPLATE_DIR,
    run_id=RUN_ID,
    prediction_window_conversions_to_label_sql=(
        'prediction_window_conversions_to_label_regression.sql'),
)

ml_windowing_pipeline.run_data_exploration_pipeline(data_explo_params)

### Step 3. Visualize Instances and Facts

This step visualizes instances and facts



### Step 3.1. Visualize Instances

Instance table in BigQuery contains all the instances (e.g. users) selected for each snapshot date with some additional information such as their label, days since the first activity and days since the last activity.

This step generates the following plots:
* plots with the number of total instances, number of positive instances and proportion of positive instances for each snapshot. These plots are helpful to understand how the label is distributed over time, any seasonality and trends, and whether there are any inconsistencies. Based on this we can drop specific periods of snapshots having any data issues and consider what additional features to add to capture the seasonality or any trends of the label over time.
* class specific distribution plots for the *days_since_first_activity* and *days_since_latest_activity* features in the Instance table. From these plots, we can determine a good lookback window period to use to create features and whether it’s worth only using customers having a particular history and recency for modeling.

In [None]:
# BigQuery helper whose client is shared by the visualizers in this notebook
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

# Instance table written for this run
instances_table_path = f'{PROJECT_ID}.{DATASET_NAME}.instances_{RUN_ID}'

instance_viz = instance_visualizer.InstanceVisualizer(
    bq_client=bq_utils.client,
    instance_table_path=instances_table_path,
    num_instances=100000,  # no. of random instances used for plotting
    label_column='label',  # name of the label column
    label_type='numerical',  # label type
)

instance_plots = instance_viz.plot_instances()

In [None]:
# Write the instance plots to a pdf file whose name is prefixed with the run ID
instance_plots_pdf = '{}_{}'.format(RUN_ID, INSTANCE_TABLE_PLOTS_FILE)
helpers.save_to_pdf(
    filename=instance_plots_pdf,
    plots=instance_plots,
)

### Step 3.2. Visualize Facts Table

The Facts tables in BigQuery are created by the Data Exploration Pipeline of the ML Windowing Pipeline. They contain the original GA variables transformed into a facts format with *user_id*, *timestamp*, *fact_name* and *fact_value* columns.

This step generates plots of numerical and categorical fact variables, which can be used to explore their validity and distribution over time. Based on that we can make decisions such as which facts variables (and which levels in categorical fact variables) to use to generate features in the following steps.

In [None]:
# Fact tables written for this run
numerical_facts_table = f'{PROJECT_ID}.{DATASET_NAME}.numeric_facts_{RUN_ID}'
categorical_facts_table = (
    f'{PROJECT_ID}.{DATASET_NAME}.categorical_facts_{RUN_ID}')

fact_viz = fact_visualizer.FactVisualizer(
    bq_client=bq_utils.client,
    numerical_facts_table_path=numerical_facts_table,
    categorical_facts_table_path=categorical_facts_table,
    number_top_categories=5,  # No. top categories to explore for categorical variables
)

numerical_fact_plots = fact_viz.plot_numerical_facts()
categorical_fact_plots = fact_viz.plot_categorical_facts()
plt.close('all')  # Don't show plots in notebook

In [None]:
# Output file names are prefixed with the run ID
numerical_fact_plots_pdf = '{}_{}'.format(RUN_ID, NUMERICAL_FACTS_PLOTS_FILE)
categorical_fact_plots_pdf = '{}_{}'.format(
    RUN_ID, CATEGORICAL_FACTS_PLOTS_FILE)

# Save each set of fact plots to its own pdf file
fact_plot_outputs = [
    (numerical_fact_plots_pdf, numerical_fact_plots),
    (categorical_fact_plots_pdf, categorical_fact_plots),
]
for pdf_name, fact_plots in fact_plot_outputs:
  helpers.save_to_pdf(pdf_name, fact_plots)

### Step 4. Run Data Windowing Pipeline

This step segments the user data into multiple, potentially overlapping time windows, with each window containing a lookback window and a prediction window. This generates an internal table in BigQuery (*windows_{run_id}*) for further processing.

The windows can be defined in two ways:
* based on calendar dates and a sliding window. This is implemented in the *sliding_windows.sql* and used as the default.
* based on each session of each user. This is implemented in the *session_windows.sql* and you can use the *windows_sql* parameter to specify it.

In [None]:
# Parameters for the windowing step: snapshot schedule plus the lookback and
# prediction window definitions.
# `templates_dir` is passed here, as in the extraction, exploration and feature
# generation steps, so that the label SQL template is taken from the
# project's customizable template copy rather than a default location.
windowing_params = {
 'project_id': PROJECT_ID,
 'dataset_id': DATASET_NAME,
 'snapshot_start_date': SNAPSHOT_START_DATE,
 'snapshot_end_date': SNAPSHOT_END_DATE,
 'slide_interval_in_days': SLIDE_INTERVAL_IN_DAYS,
 'prediction_window_gap_in_days': PREDICTION_WINDOW_GAP_IN_DAYS,
 'prediction_window_size_in_days': PREDICTION_WINDOW_SIZE_IN_DAYS,
 'lookback_window_gap_in_days': LOOKBACK_WINDOW_GAP_IN_DAYS,
 'lookback_window_size_in_days': LOOKBACK_WINDOW_SIZE_IN_DAYS,
 'templates_dir': MLWP_TEMPLATE_DIR,
 'run_id': RUN_ID,
 'prediction_window_conversions_to_label_sql': 'prediction_window_conversions_to_label_regression.sql'
}

ml_windowing_pipeline.run_windowing_pipeline(windowing_params)

### Step 5. Run Feature Generation Pipeline

This step generates features from the windows of data computed in Data Windowing Pipeline and outputs to *features_{run_id}* table in BigQuery.

In this step, we can select the variables, feature types (aggregation functions) and input values based on prior knowledge or the exploration of facts done at the Fact Visualization step.

For numerical variables, the following feature types (aggregated functions) are supported:
* Sum: sum of all the values over the lookback window
* Average: average of all the values over the lookback window
* Min: minimum of all the values over the lookback window
* Max: maximum of all the values over the lookback window

These options expect a semicolon-separated list of numerical fact names to create the corresponding features (e.g. `'sum_values': 'variable_1;variable_2;...'`).

For categorical variables, the following feature types (aggregated functions) are supported:
* Counts: total occurrence of each category
* Proportions: proportion of occurrence of each category
* Latest value: the latest category value
* Mode value: the most frequent category value

These options expect a semicolon-separated list of categorical Feature Options (`<feature_option1>;<feature_option2>;<feature_option3>`). Each Feature Option should contain a categorical fact name, a list of categorical values to consider and a default value. The default value is used as the common value for any value that is not in the provided list. Feature Option = `<fact_name>:[<value1>,…,<valueN>]:[<default_value>]` (e.g. `'count_values': 'trafficSource_medium:[cpm,cpc,referral,affiliate,organic]:[Other];device_isMobile:[false,true]:[Other]'`).

In [None]:
# Numerical facts, as a semi-colon separated list; the same list feeds every
# numerical aggregation below.
numerical_facts = 'totals_visits;totals_pageviews'

# Categorical Feature Options, each in the
# <fact_name>:[<values>]:[<default_value>] format; the same list feeds every
# categorical aggregation below.
categorical_fact_options = (
    'trafficSource_medium:[cpm,cpc,referral,affiliate,organic]:[Other];'
    'device_isMobile:[false,true]:[Other]')

features_params = dict(
    project_id=PROJECT_ID,
    dataset_id=DATASET_NAME,
    features_sql='features_from_input.sql',
    sum_values=numerical_facts,
    avg_values=numerical_facts,
    min_values=numerical_facts,
    max_values=numerical_facts,
    count_values=categorical_fact_options,
    latest_values=categorical_fact_options,
    proportions_values=categorical_fact_options,
    mode_values=categorical_fact_options,
    templates_dir=MLWP_TEMPLATE_DIR,
    run_id=RUN_ID,
)

ml_windowing_pipeline.run_features_pipeline(features_params)

### (Optional) Step 6. Merge with Customized Features
This step merges the feature table generated above with any additional customized feature table (e.g. CRM data aggregated at the user_id level), in order to generate the final dataset for modeling. To ensure the generated dataset is fair for modeling and does not contain data leakage, the customized feature table should be generated and merged for each ML instance defined by user_id and [snapshot_ts](https://github.com/google/gps_building_blocks/blob/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline/templates/instances.sql#L41).

In this example, we assume that:
- The model label (LTV) is generated in the MLWP dataset from GA / Firebase data in the previous sections. To use a label from CRM data instead, please drop the label column in the MLWP dataset.
- The lookback window used to create features is defined by the parameters in this notebook (`LOOKBACK_WINDOW_SIZE_IN_DAYS` and `LOOKBACK_WINDOW_GAP_IN_DAYS`). To use a customized feature window based on CRM data (e.g. a different window size for each user), the MLWP dataset needs to be generated with the same time window.



In [None]:
# BigQuery table paths for the merge step; adjust them here if necessary.
# All three tables are addressed within the destination project and dataset.
dataset_path = f'{PROJECT_ID}.{DATASET_NAME}'
mlwp_feature_table = f'{dataset_path}.features_{RUN_ID}'
custom_features_table = f'{dataset_path}.crm_feature'
merged_features_table = f'{dataset_path}.merged_feature'

In [None]:
# SQL template that merges the MLWP features with the customized features
merge_sql_path = os.path.join(
    '../utils/templates', 'customized_dataset_merge.sql')

# Values passed to the template as query parameters
merge_query_params = {
    'merged_dataset_table': merged_features_table,
    'mlwp_feature_table': mlwp_feature_table,
    'user_dataset_table': custom_features_table,
    'crm_data_date_start': '2020-01-01',
    'crm_data_date_end': '2020-07-01',
    'crm_user_id': 'user_id',
    'crm_snapshot_ts': 'snapshot_ts',
}

query = pipeline_utils.configure_sql(
    sql_path=merge_sql_path,
    query_params=merge_query_params)

bq_utils.run_query(query)

### Step 7. Visualize Features

This step visualizes the statistics calculated from the Features table in BigQuery. The plots include class-specific distribution plots of numerical and categorical features, which can be used to explore the validity of the features and potentially identify issues such as label leakage, and plots of the distribution of the features over time, which help to understand their consistency.

In [None]:
# Query the dataset's INFORMATION_SCHEMA for the column names and data types
# of the Features table; they are used to pick numerical and categorical
# features.
# If Step 6 was executed, replace the table name below with the name of the
# merged table (see merged_features_table).
features_table_name = f'features_{RUN_ID}'
sql = (
    'SELECT column_name, data_type '
    f'FROM `{PROJECT_ID}.{DATASET_NAME}`.INFORMATION_SCHEMA.COLUMNS '
    f"WHERE table_name='{features_table_name}';"
)

features_schema = bq_utils.run_query(sql).to_dataframe()
# Rename the two selected columns to `column_name` and `type`
features_schema.columns = ['column_name', 'type']
print(features_schema.head(10))

In [None]:
# Split the schema rows by BigQuery data type
is_numerical = features_schema['type'].isin(['INT64', 'FLOAT64'])
is_categorical = features_schema['type'].isin(['STRING'])

# Columns to remove if any
to_remove = ['user_id', 'label']

# Numerical and categorical feature names, without the removed columns
numerical_features = [
    name for name in features_schema.loc[is_numerical, 'column_name']
    if name not in to_remove
]
categorical_features = [
    name for name in features_schema.loc[is_categorical, 'column_name']
    if name not in to_remove
]

print('No. of numerical features:', len(numerical_features))
print('No. of categorical features:', len(categorical_features))

In [None]:
# Plot the selected features from the Features table of this run.
# If Step 6 was executed, point this path at the merged table instead
# (see merged_features_table).
features_table_path = f'{PROJECT_ID}.{DATASET_NAME}.features_{RUN_ID}'

feature_viz = feature_visualizer.FeatureVisualizer(
    bq_client=bq_utils.client,
    features_table_path=features_table_path,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    label_column='label',  # name of the label column
    label_type='numerical',  # label type
)

feature_plots = feature_viz.plot_features()
plt.close('all')  # Don't show plots in notebook

In [None]:
# Write the feature plots to a pdf file whose name is prefixed with the run ID
feature_plots_pdf = '{}_{}'.format(RUN_ID, FEATURE_PLOT_FILES)
helpers.save_to_pdf(
    filename=feature_plots_pdf,
    plots=feature_plots,
)