Copyright 2021 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# About

This notebook builds an AutoML model for predicting the future value of a business KPI such as sales, store visits, or leads. It executes the following steps:

- generates an ML dataset (instances, features and label per hourly, daily or weekly window)
- generates plots for Exploratory Data Analysis (EDA)
- separates development and out-of-time (OOT) test data partitions
- generates correlation plots between features and the label
- trains an AutoML model using the development set
- validates the model predictions

## Load Libraries

In [None]:
import datetime
import pandas as pd
import forecaster_util
from utils import template_util
from google.cloud import bigquery

## Input parameters

In [None]:
# Date of this training run (usually today). All output BigQuery tables,
# including the features table and the model, will have this date suffix.
# NB: run_date can be overridden with any date in the format: 'YYYYMMDD'.
run_date = datetime.datetime.today().strftime('%Y%m%d')

parameters = {
    # GCP project.
    'project_id': '',
    # BigQuery dataset to store the output features and model. Must be located
    # in the US or EU, as required by AutoML.
    'dataset_id': '',

    # BigQuery SQL query to extract the raw training data. Note there must be
    # one column called 'ts' of type TIMESTAMP, and one column for the label (
    # i.e. the KPI to forecast). The label column can have any name. The query
    # must also extract any numeric (INT64, FLOAT64) columns used in the model.
    'data_query': """""",
    # Name of the BigQuery column containing the numeric key business objective
    # that the model will predict.
    'label': '',
    # BigQuery column names of numeric features in the data_query that will be
    # used to help predict the label.
    'numeric_features': [],
    # By default, all data in the data_query is used. Specify a start_date or
    # end_date below to restrict the date range. Date format: 'YYYY-MM-DD'.
    'start_date': '',
    'end_date': '',
    # Reserve this latest fraction of data for testing. Must be in the range
    # (0, 1), non-inclusive.
    'oot_test_fraction': 0.1,

    # Window size.
    'window_size': 'HOUR',  # One of HOUR, DAY, WEEK.

    # Make predictions for this many prediction windows in the future. These
    # are default values for each window_size and can be overridden.
    'num_hour_prediction_windows': 72,
    'num_day_prediction_windows': 28,
    'num_week_prediction_windows': 4,

    # List of windows. Numeric features are constructed over historical window
    # periods. Each window is specified with a pair (window_start, window_end),
    # which corresponds to the range (today - window_start day) to
    # (today + window_end days) inclusive.
    # NOTE(review): pairs such as (7, 1) suggest the range actually ends at
    # (today - window_end); confirm against templates/features.sql.
    'hour_lookback_windows': [
        (1, 1), (2, 2), (3, 3), (4, 4), (24, 24), (2 * 24, 2 * 24),
        (7 * 24, 7 * 24), (14 * 24, 14 * 24), (21 * 24, 21 * 24),
        (28 * 24, 28 * 24), (7*24, 1), (21*24, 7*24), (35*24, 21*24)],
    'day_lookback_windows': [
        (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (14, 14),
        (21, 21), (28, 28), (7, 1), (21, 8), (49, 22)],
    # NOTE(review): identical to day_lookback_windows; confirm these offsets
    # are intended when the unit is weeks.
    'week_lookback_windows': [
        (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (14, 14),
        (21, 21), (28, 28), (7, 1), (21, 8), (49, 22)],

    # List of BigQuery aggregation functions to apply to the historical windows.
    'aggregate_functions': ['SUM', 'AVG'],

    # Name of the output features table (default: features_YYYYMMDD).
    'features_table': f'features_{run_date}',
    # Name of the output model table (default: model_YYYYMMDD).
    'model_table': f'model_{run_date}',
    # Development ML dataset table (default: ml_development_data_YYYYMMDD).
    'ml_development_table': f'ml_development_data_{run_date}',
    # OOT testing ML dataset table (default: ml_oot_testing_data_YYYYMMDD).
    'ml_oot_testing_table': f'ml_oot_testing_data_{run_date}',

    # SQL template locations.
    'create_model_input_data_template':
        'templates/create_model_input_data.sql',
    'features_template': 'templates/features.sql',
    'train_model_template': 'templates/train_model.sql',
    'evaluate_model_template': 'templates/evaluate_model.sql',
    'prediction_template': 'templates/prediction.sql'
}


def derive_window_parameters(params):
  """Returns the parameters that depend on params['window_size'].

  Args:
    params: Parameter dict containing 'window_size' (one of 'HOUR', 'DAY',
      'WEEK') plus the matching 'num_*_prediction_windows' and
      '*_lookback_windows' entries.

  Returns:
    A dict with the keys 'micros_per_window', 'num_prediction_periods',
    'lookback_windows' and 'grouping_column_name'.

  Raises:
    ValueError: If params['window_size'] is not HOUR, DAY or WEEK.
  """
  micros_per_hour = 60 * 60 * 1000000
  # window_size -> (window length in microseconds, key holding the number of
  # prediction windows, key holding the lookback windows, grouping column).
  settings_by_window_size = {
      'HOUR': (micros_per_hour, 'num_hour_prediction_windows',
               'hour_lookback_windows', 'hournum'),
      'DAY': (24 * micros_per_hour, 'num_day_prediction_windows',
              'day_lookback_windows', 'weekday'),
      'WEEK': (7 * 24 * micros_per_hour, 'num_week_prediction_windows',
               'week_lookback_windows', 'weeknum'),
  }
  window_size = params['window_size']
  if window_size not in settings_by_window_size:
    # An explicit error is required here: `assert (message, value)` asserts a
    # non-empty tuple, which is always true and therefore never fires.
    raise ValueError(
        f'Unknown window_size {window_size!r}; expected one of HOUR, DAY, '
        'WEEK.')
  micros_per_window, periods_key, windows_key, grouping_column = (
      settings_by_window_size[window_size])
  return {
      'micros_per_window': micros_per_window,
      'num_prediction_periods': params[periods_key],
      'lookback_windows': params[windows_key],
      'grouping_column_name': grouping_column,
  }


parameters.update(derive_window_parameters(parameters))

# Largest window_start across the selected lookback windows.
parameters['max_lookback'] = max(
    window_start for (window_start, _) in parameters['lookback_windows'])
parameters['training_mode'] = True

### Create ML Dataset

In [None]:
# Connect to BigQuery and make sure the output dataset exists
# (exists_ok=True makes re-running this cell safe).
client = bigquery.Client(project=parameters['project_id'])
client.create_dataset(parameters['dataset_id'], exists_ok=True)

# Render the model input data SQL template and run it to completion; per the
# original notes this creates the features table.
create_features_table_query = template_util.render_template(
    parameters['create_model_input_data_template'],
    parameters,
)
client.query(create_features_table_query).result();


### Exploratory Data Analysis (EDA) and ML Data Partition

In [None]:
# Load the features table from BigQuery, ordered by timestamp and prediction
# period, with exact duplicate rows removed.
features_table = '{}.{}.{}'.format(
    parameters['project_id'],
    parameters['dataset_id'],
    parameters['features_table'])
ml_data = (
    client.list_rows(features_table)
    .to_dataframe()
    .sort_values(by=['ts', 'prediction_period'])
    .drop_duplicates())

# Plot the label against time, using only the prediction_period == 0 rows.
label_column = 'label_{}'.format(parameters['label'])
ml_data.loc[ml_data['prediction_period'] == 0, ['ts', label_column]].plot(
    x='ts', figsize=(30, 10));

In [None]:
# Separate the ML data into Development and Out-of-time (OOT) test sets.
# ml_data is sorted by 'ts', so the leading rows form the development set and
# the trailing `oot_test_fraction` of rows form the OOT test set.
# NOTE(review): the split is by row count, so rows sharing the boundary
# timestamp (different prediction_period values) may land in both sets.

development_instances = int(len(ml_data)*(1 - parameters['oot_test_fraction']))

development = ml_data.iloc[:development_instances]
oot_test = ml_data.iloc[development_instances:]

# Take min/max directly on the 'ts' column; indexing the result of
# df[['ts']].min() with [0] relies on deprecated positional Series access.
print(f"Number of development instances ({development['ts'].min()} "
      f"to {development['ts'].max()}): {development.shape[0]}")
print(f"Number of OOT testing instances ({oot_test['ts'].min()} "
      f"to {oot_test['ts'].max()}): {oot_test.shape[0]}")

In [None]:
# AutoML requires at least 1000 instances. When the development set is
# smaller, top it up with rows re-sampled (with replacement) from itself.
shortfall = 1000 - len(development)
if shortfall > 0:
  development = pd.concat(
      [development, development.sample(n=shortfall, replace=True)])
print('Number of development instances:', len(development))

In [None]:
# Visualize the label distribution of the data over time. Each frame is
# filtered with a mask built from its own prediction_period column; a mask
# built from ml_data only works through implicit index re-alignment and
# triggers a pandas warning.
label_column = f"label_{parameters['label']}"
ml_data[ml_data.prediction_period == 0].plot(
    x='ts',
    y=label_column,
    figsize=(25, 10),
    title='All Instances Label Distribution')

# Visualize the label distribution of the development dataset over time
development[development.prediction_period == 0].plot(
    x='ts',
    y=label_column,
    figsize=(25, 10),
    title='Development Instances Label Distribution')

# Visualize the label distribution of the OOT Test partition over time
oot_test[oot_test.prediction_period == 0].plot(
    x='ts',
    y=label_column,
    figsize=(25, 10),
    title='OOT Test Instances Label Distribution');

In [None]:
# Explore the correlation between the features and the label for the
# Development and OOT Test partitions.
forecaster_util.plot_dev_and_test_feature_correlations_with_label(
    development, oot_test, label_column);

In [None]:
# Write both ML partitions to BigQuery tables in the output dataset.
# WRITE_TRUNCATE replaces the contents of a table that already exists.
job_config = bigquery.job.LoadJobConfig()
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
partitions_to_save = (
    (development, parameters['ml_development_table']),
    (oot_test, parameters['ml_oot_testing_table']),
)
for partition, table_name in partitions_to_save:
  destination = f"{parameters['dataset_id']}.{table_name}"
  # Block until each load job finishes before starting the next one.
  client.load_table_from_dataframe(
      partition, destination, job_config=job_config).result()

### Train AutoML Model

In [None]:
# Train the model on the development table. NB: this step takes hours to run.

training_context = dict(
    parameters, input_features_table=parameters['ml_development_table'])
create_model_query = template_util.render_template(
    parameters['train_model_template'], training_context)
print('Running query:', create_model_query)
client.query(create_model_query).result()

### Evaluate the Model

In [None]:
# Evaluate the model on the testing partition of the development set and
# print every returned metric as a left-aligned name and a 2-decimal value.

evaluation_query = template_util.render_template(
    parameters['evaluate_model_template'], parameters)
evaluation_rows = list(client.query(evaluation_query).result())
for evaluation_row in evaluation_rows:
  metrics = dict(evaluation_row)
  for metric, value in metrics.items():
    print(f'{metric:<25}{value:=10.2f}')

In [None]:
# Score the whole development set with the trained model.
predicted_label_column = 'predicted_{}'.format(label_column)

# Render the prediction SQL against the development table and fetch the
# predictions into a DataFrame.
dev_prediction_context = dict(
    parameters, input_features_table=parameters['ml_development_table'])
prediction_query = template_util.render_template(
    parameters['prediction_template'], dev_prediction_context)
dev_pred_data = client.query(prediction_query).to_dataframe()


In [None]:
# Compare predictions with labels on the development set.

# Reload forecaster_util so that local edits to the module are picked up
# without restarting the kernel.
import importlib
importlib.reload(forecaster_util)

# Only the prediction_period == 0 rows are plotted and scored.
is_dev_period_zero = dev_pred_data['prediction_period'] == 0

forecaster_util.plot_predictions_against_labels(
    dev_pred_data[is_dev_period_zero],
    label_column,
    predicted_label_column)
# Print the performance metrics per value of the configured grouping column
# (hournum, weekday or weeknum, depending on window_size).
dev_performance = forecaster_util.calculate_performance_by_grouping(
    dev_pred_data[is_dev_period_zero],
    label_column,
    predicted_label_column,
    parameters['grouping_column_name'])
print(dev_performance)

In [None]:
# Score the out-of-time (OOT) testing set with the trained model.

# Render the prediction SQL against the OOT testing table and fetch the
# predictions into a DataFrame.
oot_prediction_context = dict(
    parameters, input_features_table=parameters['ml_oot_testing_table'])
prediction_query = template_util.render_template(
    parameters['prediction_template'], oot_prediction_context)
test_pred_data = client.query(prediction_query).to_dataframe()

# Only the prediction_period == 0 rows are plotted and scored.
is_test_period_zero = test_pred_data['prediction_period'] == 0

# Compare predictions with labels on the OOT testing set.
forecaster_util.plot_predictions_against_labels(
    test_pred_data[is_test_period_zero],
    label_column,
    predicted_label_column)
# Print the performance metrics per value of the configured grouping column
# (hournum, weekday or weeknum, depending on window_size).
test_performance = forecaster_util.calculate_performance_by_grouping(
    test_pred_data[is_test_period_zero],
    label_column,
    predicted_label_column,
    parameters['grouping_column_name'])
print(test_performance)