# 02 - ML Experimentation with Custom Model

The purpose of this notebook is to use [custom training](https://cloud.google.com/ai-platform-unified/docs/training/custom-training) to train a TFDF classifier. The notebook covers the following tasks:
1. Preprocess the data locally using Apache Beam.
2. Train and test a custom model locally using a TFDF implementation.
3. Submit a Dataflow job to preprocess the data.
4. Submit a custom training job to Vertex AI using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers).
5. Upload the trained model to Vertex AI.
6. Track experiment parameters with [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).

We use [Vertex TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview) 
and [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction) to track, visualize, and compare ML experiments.

## Setup

### Import libraries

In [None]:
import os
import logging
from datetime import datetime
import numpy as np

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow.keras as keras
import tensorflow_decision_forests as tfdf
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import hyperparameter_tuning as hp_tuning

from src.common import features, datasource_utils
from src.model_training import data, model_tfdf, defaults, trainer, exporter
from src.preprocessing import etl

# Raise both the root Python logger and TensorFlow's own logger to INFO.
logging.root.setLevel(logging.INFO)
tf.get_logger().setLevel('INFO')

# Report the versions of the core libraries, one per line.
print(
    f"TensorFlow: {tf.__version__}",
    f"TensorFlow Transform: {tft.__version__}",
    f"TensorFlow Decision Forests: {tfdf.__version__}",
    sep="\n",
)

### Setup Google Cloud project

In [None]:
PROJECT = '' # Change to your project id.
REGION = '' # Change to your region.
BUCKET =  ''
SERVICE_ACCOUNT = ""

if PROJECT == "" or PROJECT is None or PROJECT == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT = shell_output[0]
    
if SERVICE_ACCOUNT == "" or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == "[your-service-account]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null
    SERVICE_ACCOUNT = shell_output[0]
    
if BUCKET == "" or BUCKET is None or BUCKET == "[your-bucket-name]":
    # Get your bucket name to GCP projet id
    BUCKET = PROJECT
    # Try to create the bucket if it doesn't exist
    ! gsutil mb -l $REGION gs://$BUCKET
    print("")
    
PARENT = f"projects/{PROJECT}/locations/{REGION}"
    
print("Project ID:", PROJECT)
print("Region:", REGION)
print("Bucket name:", BUCKET)
print("Service Account:", SERVICE_ACCOUNT)
print("Vertex API Parent URI:", PARENT)

### Set configurations

In [None]:
# --- Naming -----------------------------------------------------------------
# All display names derive from the dataset name and the model version.
DATASET_DISPLAY_NAME = 'tfdf-rules'
VERSION = 'v01'
MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'
TENSORBOARD_DISPLAY_NAME = f'tb-{DATASET_DISPLAY_NAME}'
# The experiment shares the model's display name.
EXPERIMENT_NAME = MODEL_DISPLAY_NAME

# --- Locations --------------------------------------------------------------
# Cloud workspace for this dataset, and the experiments folder beneath it.
WORKSPACE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}'
EXPERIMENT_ARTIFACTS_DIR = os.path.join(WORKSPACE, 'experiments')
# Raw data schema, relative to the notebook's working directory.
RAW_SCHEMA_LOCATION = 'src/raw_schema/schema.pbtxt'
# Local folder holding the CSV data.
LOCAL_CSV_DIR = os.path.join(os.curdir, 'data')

## Create Vertex TensorBoard instance 

In [None]:
# Create the Vertex TensorBoard instance. Project and location are passed
# explicitly because vertex_ai.init() has not been called yet at this point in the
# notebook, so the SDK would otherwise fall back to its environment defaults.
tensorboard_resource = vertex_ai.Tensorboard.create(
    display_name=TENSORBOARD_DISPLAY_NAME,
    project=PROJECT,
    location=REGION,
)
# Full resource name, used later when uploading logs.
tensorboard_resource_name = tensorboard_resource.gca_resource.name
print("TensorBoard resource name:", tensorboard_resource_name)

## Initialize workspace

In [None]:
# Flip to True to wipe artifacts from previous experiments before starting.
REMOVE_EXPERIMENT_ARTIFACTS = False

# Optional clean-up of an existing artifacts directory.
if tf.io.gfile.exists(EXPERIMENT_ARTIFACTS_DIR):
    if REMOVE_EXPERIMENT_ARTIFACTS:
        print("Removing previous experiment artifacts...")
        tf.io.gfile.rmtree(EXPERIMENT_ARTIFACTS_DIR)

# Ensure the artifacts directory exists (first run, or just removed above).
if not tf.io.gfile.exists(EXPERIMENT_ARTIFACTS_DIR):
    print("Creating new experiment artifacts directory...")
    tf.io.gfile.mkdir(EXPERIMENT_ARTIFACTS_DIR)

print("Workspace is ready.")
print("Experiment directory:", EXPERIMENT_ARTIFACTS_DIR)

## Initialize Vertex AI experiment

In [None]:
# Unique, timestamped id for this local run. The previous hard-coded override
# ('run-local-20220421180000') was removed: it made the timestamp dead code and
# caused every execution to write into the same run. To resume a specific run,
# assign its id to run_id explicitly after this line.
run_id = f"run-local-{datetime.now().strftime('%Y%m%d%H%M%S')}"

# Artifacts of a run live under <artifacts dir>/<experiment name>/<run id>.
EXPERIMENT_RUN_DIR = os.path.join(EXPERIMENT_ARTIFACTS_DIR, EXPERIMENT_NAME, run_id)
print("Experiment run directory:", EXPERIMENT_RUN_DIR)

# Per-run locations for the training logs and the exported model.
LOG_DIR = os.path.join(EXPERIMENT_RUN_DIR, 'logs')
EXPORT_DIR = os.path.join(EXPERIMENT_RUN_DIR, 'model')

In [None]:
# Point the Vertex AI SDK at the project, region, staging bucket and experiment
# configured above.
vertex_ai.init(project=PROJECT, location=REGION, staging_bucket=BUCKET, experiment=EXPERIMENT_NAME)

# Open the experiment run identified by run_id.
vertex_ai.start_run(run_id)

## 1. Preprocess the data

Note that it is not necessary to preprocess the data when using Random Forests; see the [TensorFlow Decision Forests migration guide](https://www.tensorflow.org/decision_forests/migration#feature_normalization_preprocessing).

## 2. Train a custom model locally using Decision Forests

The `TFDF` implementation of the custom model is in the [model_training](src/model_training) directory.

In [None]:
# Locations for the exported model and the training logs of the current run,
# both placed directly under the run directory.
EXPORT_DIR = os.path.join(EXPERIMENT_RUN_DIR, 'model')
LOG_DIR = os.path.join(EXPERIMENT_RUN_DIR, 'logs')

### Train the model locally.

In [None]:
# Training and evaluation both read the same local CSV file, so the two
# patterns are given the same path.
train_data_file_pattern = eval_data_file_pattern = os.path.join(LOCAL_CSV_DIR, 'data.csv')

print(f"train_data_file_pattern: {train_data_file_pattern}")

In [None]:
# Train through the project's trainer module on the local CSV data; no
# fn_args are supplied for this local run.
trained_model = trainer.train(fn_args=None, csv_data_dir=train_data_file_pattern)

In [None]:
# Display the summary of the locally trained model.
trained_model.summary()

In [None]:
# Retrieve the training logs from the model inspector and print them.
logs = trained_model.make_inspector().training_logs()
print(logs)

In [None]:
LOG_DIR = ""  # Replace with path to the tensorboard directory on your machine
%load_ext tensorboard
trained_model.make_inspector().export_to_tensorboard(LOG_DIR)

In [None]:
# Show the variable importances reported by the model inspector.
trained_model.make_inspector().variable_importances()

In [None]:
# Render the tree at index 0 of the trained model with the TFDF model plotter.
tfdf.model_plotter.plot_model_in_colab(trained_model, tree_idx=0)

In [None]:
# Extract the tree at index 0 through the model inspector and print it.
tree = trained_model.make_inspector().extract_tree(tree_idx=0)
print(tree)

In [None]:
# Evaluate on the evaluation pattern (previously the training pattern was
# passed here) and show the resulting metrics.
# NOTE: in this notebook the evaluation pattern points at the same CSV file as
# the training pattern, so these numbers are not a held-out estimate.
val_loss, val_accuracy = trainer.evaluate(trained_model, eval_data_file_pattern)
print(f"val_loss: {val_loss}")
print(f"val_accuracy: {val_accuracy}")

In [None]:
# Upload to the TensorBoard instance created earlier in this notebook rather
# than a resource name hard-coded for one specific project. To target a
# different, pre-existing instance, assign its full resource name here, e.g.
#   tensorboard_resource_name = 'projects/<project-number>/locations/<region>/tensorboards/<id>'
print("TensorBoard resource name:", tensorboard_resource_name)

In [None]:
# Upload the contents of LOG_DIR to the Vertex TensorBoard instance under the
# current experiment name, as a one-shot upload.
!tb-gcp-uploader --tensorboard_resource_name={tensorboard_resource_name} \
  --logdir={LOG_DIR} \
  --experiment_name={EXPERIMENT_NAME} --one_shot=True

### Export the trained model

In [None]:
# The serving model is written straight into the run's export directory.
saved_model_dir = EXPORT_DIR

print(f"saved_model_dir: {saved_model_dir}")
print(f"RAW_SCHEMA_LOCATION: {RAW_SCHEMA_LOCATION}")

In [None]:
# Export the trained model for serving through the project's exporter module,
# handing it the target directory and the raw schema location.
exporter.export_serving_model(trained_model, saved_model_dir, RAW_SCHEMA_LOCATION)

### Inspect model serving signatures

In [None]:
# Show the 'serving_tf_example' signature of the exported SavedModel.
!saved_model_cli show --dir={saved_model_dir} --tag_set=serve --signature_def=serving_tf_example

In [None]:
# Show the 'serving_default' signature of the exported SavedModel.
!saved_model_cli show --dir={saved_model_dir} --tag_set=serve --signature_def=serving_default

### Test the exported SavedModel

In [None]:
# Load the exported SavedModel back from saved_model_dir for testing.
serving_model = tf.saved_model.load(saved_model_dir)
print("Saved model is loaded.")

In [None]:
# Test the serving_default signature with a feature dictionary.
# (tfdv and schema_utils are imported in the notebook's top import cell.)

# Load the raw schema and derive the feature spec from it; the spec provides
# the dtype of each input feature.
raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION)
raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec

In [None]:
# Build a single test instance: each raw value is wrapped in a nested list and
# converted to a tensor of the dtype declared for that feature in the raw
# feature spec.
instance = {
    feature_name: tf.constant([[raw_value]], raw_feature_spec[feature_name].dtype)
    for feature_name, raw_value in {
        "feature_1": 1,
        "feature_2": 0,
        "feature_3": 1,
        "feature_4": 1,
        "feature_5": 0,
    }.items()
}

In [None]:
# Call the 'serving_default' signature with the instance tensors as keyword
# arguments, then print every returned output as a NumPy value.
predictions = serving_model.signatures['serving_default'](**instance)
for output_name, output_tensor in predictions.items():
    print(f"{output_name}: {output_tensor.numpy()}")

## Start a new Vertex AI experiment run

In [None]:
# Re-initialise the SDK for the cloud run. location is passed explicitly, like
# in the first vertex_ai.init() call of this notebook, so this cell does not
# depend on the region having been set by an earlier cell.
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=BUCKET,
    experiment=EXPERIMENT_NAME)

# Start a new, timestamped run for the cloud training job.
run_id = f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}"
vertex_ai.start_run(run_id)

# Artifacts of this run live under <artifacts dir>/<experiment name>/<run id>.
EXPERIMENT_RUN_DIR = os.path.join(EXPERIMENT_ARTIFACTS_DIR, EXPERIMENT_NAME, run_id)
print("Experiment run directory:", EXPERIMENT_RUN_DIR)

## 4. Submit a Custom Training Job to Vertex AI

In [None]:
# Re-derive the model export and log locations so they point into the new
# run's directory.
EXPORT_DIR = os.path.join(EXPERIMENT_RUN_DIR, 'model')
LOG_DIR = os.path.join(EXPERIMENT_RUN_DIR, 'logs')

### Test the training task locally

In [None]:
# Run the training task module locally, passing it the model/log directories,
# data patterns, experiment/run names and project settings defined in this notebook.
!python -m src.model_training.task \
    --model-dir={EXPORT_DIR} \
    --log-dir={LOG_DIR} \
    --train-data-dir={train_data_file_pattern} \
    --eval-data-dir={eval_data_file_pattern}  \
    --experiment-name={EXPERIMENT_NAME} \
    --run-name={run_id} \
    --project={PROJECT} \
    --region={REGION} \
    --staging-bucket={BUCKET}

### Prepare training package

In [None]:
# Name of the trainer package (derived from the model display name) and the
# workspace folder it gets uploaded to.
TRAINER_PACKAGE_NAME = f'{MODEL_DISPLAY_NAME}_trainer'
TRAINER_PACKAGE_DIR = os.path.join(WORKSPACE, 'trainer_packages')

print("Trainer package upload location:", TRAINER_PACKAGE_DIR)
print(f"TRAINER_PACKAGE_NAME: {TRAINER_PACKAGE_NAME}")

In [None]:
#rm -r src/__pycache__/
#rm -r src/.ipynb_checkpoints/
#rm -r src/raw_schema/.ipynb_checkpoints/
!rm -f {TRAINER_PACKAGE_NAME}.tar {TRAINER_PACKAGE_NAME}.tar.gz

!mkdir {TRAINER_PACKAGE_NAME}

!cp setup.py {TRAINER_PACKAGE_NAME}/
!cp -r src {TRAINER_PACKAGE_NAME}/
!cp -r data {TRAINER_PACKAGE_NAME}/
!tar cvf {TRAINER_PACKAGE_NAME}.tar {TRAINER_PACKAGE_NAME}
!gzip {TRAINER_PACKAGE_NAME}.tar
!gsutil cp {TRAINER_PACKAGE_NAME}.tar.gz {TRAINER_PACKAGE_DIR}/
!rm -r {TRAINER_PACKAGE_NAME}
!rm -r {TRAINER_PACKAGE_NAME}.tar.gz

### Prepare the training job

A custom job with a custom image can't be used. An error is thrown: "Please use an image offered by Vertex AI for python package training."