In [None]:
# Copyright 2022 Google LLC.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 3. ML Data Preprocessing

This notebook demonstrates how to prepare an already created ML dataset for model development. It is vital to split machine learning datasets in such a way that model performance can be tuned and fairly assessed. This notebook shows an example of dividing a dataset into an `out-of-time TEST` dataset (containing one or more selected full snapshots) and a `DEVELOPMENT` dataset (the remaining snapshots, randomly split into `TRAIN`, `VALIDATE` and `TEST`). These names are designed to be used directly in the AutoML [DATA_SPLIT_COL](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-automl#data_split_col).

### Requirements
1. The features table has already been created with the [ML Windowing Pipeline (MLWP)](https://github.com/google/gps_building_blocks/tree/master/py/gps_building_blocks/ml/data_prep/ml_windowing_pipeline).

### Install and import required modules

In [None]:
# Uncomment to install required python modules
# !sh ../utils/setup.sh

In [None]:
# Add custom utils module to Python environment
import os
import sys
sys.path.append(os.path.abspath(os.pardir))

import google.auth
from gps_building_blocks.cloud.utils import bigquery as bigquery_utils

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from utils import helpers

### Notebook custom settings

In [None]:
# Display the result of every top-level expression in a cell, not only the
# last one, so cells do not need an explicit display() call for each output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### Set parameters

In [None]:
# Read the notebook settings from config.yaml and keep a handle on the
# `destination` section, which describes where the tables live.
configs = helpers.get_configs('config.yaml')
dest_configs = configs.destination

# GCP project ID, and the name of the BigQuery dataset with MLWP tables.
PROJECT_ID, DATASET_NAME = dest_configs.project_id, dest_configs.dataset_name

In [None]:
# Suffix appended to every table name below so that separate runs of the
# training pipeline read and write their own set of tables.
RUN_ID = '01'

# Input: features table produced by MLWP for this run.
FEATURES_TABLE = f'features_{RUN_ID}'

# ML datasets
# These 4 tables will be created in {DATASET_NAME}
# (full table with the split column, out-of-time test rows, development
# rows, and a balanced variant of the development rows).
FEATURES_SPLIT_TABLE = f'features_split_{RUN_ID}'
FEATURES_TEST_TABLE = f'features_test_table_{RUN_ID}'
FEATURES_DEV_TABLE = f'features_dev_table_{RUN_ID}'
FEATURES_DEV_BALANCED_TABLE = f'features_dev_table_balanced_{RUN_ID}'

In [None]:
# Initialize the BigQuery utils object, bound to PROJECT_ID, that the query
# cells in this notebook use to run their SQL.
bq_utils = bigquery_utils.BigQueryUtils(project_id=PROJECT_ID)

### Check feature dataset.

1. Determine the right splitting strategy.
2. Verify the date to use as a cut-off for the `OUT-OF-TIME TEST` dataset based on the positive rate trends.
3. Check imbalance in the dataset and decide on a balancing strategy.
4. Consider additional filtering of training data based on snapshot dates.
5. Consider selecting a subset of the columns.


In [None]:
# Load the features table into pandas to see which columns (candidate
# features) are available and to help decide whether to keep only a subset.
# NOTE: the query has no LIMIT, so the whole table is downloaded.
sql = f"""
SELECT
 *
FROM
  `{DATASET_NAME}.{FEATURES_TABLE}`
"""
print(sql)
features_rows = bq_utils.run_query(sql)
df_raw = features_rows.to_dataframe()
df_raw.head()

In [None]:
# Summarise the loaded features table: column names, dtypes and non-null
# counts.
df_raw.info()

### Check target variable (web conversions) distribution in the features table.

In [None]:
# Total the label per snapshot date (newest first) so the trend of positives
# over time can be inspected before choosing the out-of-time cut-off.
sql = f"""
SELECT
  EXTRACT(DATE FROM snapshot_ts) AS effective_date,
  SUM(label) AS conversions
FROM
  `{DATASET_NAME}.{FEATURES_TABLE}`
GROUP BY
  1
ORDER BY
  1 DESC;
"""
df_raw = bq_utils.run_query(sql).to_dataframe()
# Work on an independent copy so later edits do not touch the query result.
df_target_check = df_raw.copy(deep=True)

line_options = dict(
    x='effective_date',
    y='conversions',
    title='Conversions per day',
    height=400,
)
fig = px.line(df_target_check, **line_options)
fig.show()
df_target_check.head(5)

## Create dataset with split on snapshot dates.

#### Get recent effective dates.

In [None]:
# Number of most recent effective dates to pick.
n_last_dates = 3
# Order the effective dates newest first, then keep the first N of them.
dates_newest_first = df_target_check['effective_date'].sort_values(
    ascending=False)
recent_dates = dates_newest_first.head(n_last_dates).values

In [None]:
# Keep this if you want to use data driven values for last dates in the dataset.
test_dates = [str(x) for x in recent_dates]

# Define dates here if you want to overwrite with curated dates.
# It is useful to keep looking at a date that was used when
# evaluating original model so we can make sure all performs as expected.
# test_dates = ('2021-05-15', '2021-05-09')

# `test_dates` is interpolated straight into `... IN {test_dates}` in the SQL
# below, so it has to render as a valid, non-empty SQL value list.

# A single curated date written as ('2021-05-15') is a plain string, not a
# tuple; wrap it so it is treated as one date instead of being split into
# individual characters.
if isinstance(test_dates, str):
  test_dates = [test_dates]
test_dates = tuple(test_dates)

# An empty tuple would render as `IN ()`, which is invalid SQL; fail here with
# a clear message instead of inside a later query.
if not test_dates:
  raise ValueError(
      'test_dates is empty: no effective dates were found and no curated '
      'dates were provided.')

if len(test_dates) == 1:
  # A one-element tuple renders as ('2021-05-15',); build the parenthesised
  # list by hand to avoid the trailing comma.
  test_dates = f"('{test_dates[0]}')"
test_dates

In [None]:
# Make sure the destination dataset exists; exists_ok=True makes this a no-op
# when it is already there.
# TODO(michalszczecinski): Fix the dataset creation with bq_client(utils version).
dataset_id = '.'.join((PROJECT_ID, DATASET_NAME))
dataset = bq_utils.client.create_dataset(dataset_id, exists_ok=True)

# Print a link that opens the dataset in the Cloud Console.
console_url = 'https://console.cloud.google.com/bigquery?project='
print(f'{console_url}{PROJECT_ID}&p={PROJECT_ID}&d={DATASET_NAME}&page=dataset')

### Create dataset with columns indicating allocation to TRAIN/VALIDATE/TEST.

In [None]:
# Add a `data_split` column saying which rows belong to the train, validate
# and test splits. This is compliant with automl split conventions.
#
# * Rows whose snapshot date is one of `test_dates` are the out-of-time TEST
#   set.
# * All other rows are bucketed by a hash of `user_id`, so every row of a
#   given user lands in the same split: buckets 0-7 -> TRAIN,
#   bucket 8 -> VALIDATE, bucket 9 -> TEST.
#
# The bucket is computed as ABS(MOD(key, 10)) rather than MOD(ABS(key), 10):
# both give the same bucket, but ABS() on the raw fingerprint raises an
# overflow error when the hash is the minimum INT64 value, whereas the
# remainder is always within -9..9.
#
# The explicit NOT IN conditions are kept so that rows with a NULL
# `snapshot_ts` keep a NULL `data_split`.
sql = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_SPLIT_TABLE}` AS (
WITH
  ds_features_key AS (
  SELECT
    *,
    FARM_FINGERPRINT(user_id) AS key,
  FROM
    `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_TABLE}`)
SELECT
  *,
  CASE
    WHEN EXTRACT(DATE FROM TIMESTAMP (snapshot_ts)) IN {test_dates}
      THEN 'TEST'
    WHEN EXTRACT(DATE FROM TIMESTAMP (snapshot_ts)) NOT IN {test_dates}
      AND ABS(MOD(key, 10)) IN (0,1,2,3,4,5,6,7)
      THEN 'TRAIN'
    WHEN EXTRACT(DATE FROM TIMESTAMP (snapshot_ts)) NOT IN {test_dates}
      AND ABS(MOD(key, 10)) IN (8)
      THEN 'VALIDATE'
    WHEN EXTRACT(DATE FROM TIMESTAMP (snapshot_ts)) NOT IN {test_dates}
      AND ABS(MOD(key, 10)) IN (9)
      THEN 'TEST'
  END as data_split,
FROM ds_features_key
);
"""
print(sql)
df_raw = bq_utils.run_query(sql).to_dataframe()
df_raw.head()

### Create TEST (OUT-OF-TIME) dataset

In [None]:
# Out-of-time TEST table: every row of the split table whose snapshot date is
# one of the held-out `test_dates`.
sql = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_TEST_TABLE}`
AS (
  SELECT *
  FROM `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_SPLIT_TABLE}`
  WHERE EXTRACT(DATE FROM TIMESTAMP (snapshot_ts)) IN {test_dates}
);
"""
print(sql)
test_table_rows = bq_utils.run_query(sql)
df_raw = test_table_rows.to_dataframe()
df_raw.head()

### Create DEVELOPMENT (IN-TIME) dataset

In [None]:
# DEVELOPMENT (in-time) table: every row of the split table whose snapshot
# date is not one of the held-out `test_dates`.
sql = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_DEV_TABLE}`
AS (
  SELECT *
  FROM `{PROJECT_ID}.{DATASET_NAME}.{FEATURES_SPLIT_TABLE}`
  WHERE EXTRACT(DATE FROM TIMESTAMP (snapshot_ts)) NOT IN {test_dates}
);
"""
print(sql)
dev_table_rows = bq_utils.run_query(sql)
df_raw = dev_table_rows.to_dataframe()
df_raw.head()