 ============================================================================== \
 Copyright 2021 Google LLC. This software is provided as-is, without warranty \
 or representation for any use or purpose. Your use of it is subject to your \
 agreement with Google. \
 ============================================================================== 
 
 Author: Chanchal Chatterjee, Elvin Zhu \
 Email: cchatterjee@google.com, elvinzhu@google.com \
<img src="img/google-cloud-icon.jpg" alt="Drawing" style="width: 200px;"/>

### Import packages

In [2]:
!cd /home/jupyter/vapit/ai-platform-tf/Vertex
!python3 -m pip install -r ./requirements-tf26.txt -U -q --user
!python3 -m pip install -U -q google-cloud-aiplatform
!python3 -m pip install -U -q google-cloud-storage==1.32
!gcloud components update --quiet
!python3 -m pip install -U -q build


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.18.0 requires tensorflow-io-gcs-filesystem==0.18.0, which is not installed.
explainable-ai-sdk 1.3.2 requires xai-image-widget, which is not installed.
tensorflow-transform 1.3.0 requires absl-py<0.13,>=0.9, but you have absl-py 0.14.1 which is incompatible.
tensorflow-transform 1.3.0 requires pyarrow<3,>=1, but you have pyarrow 5.0.0 which is incompatible.
tensorflow-io 0.18.0 requires tensorflow<2.6.0,>=2.5.0, but you have tensorflow 2.6.0 which is incompatible.
pandas-profiling 3.0.0 requires tangled-up-in-unicode==0.1.0, but you have tangled-up-in-unicode 0.2.0 which is incompatible.
cloud-tpu-client 0.10 requires google-api-python-client==1.8.0, but you have google-api-python-client 2.24.0 which is incompatible.
black 21.9b0 requires typing-extensions>=3.10.0.0, but you have typing-extens

In [3]:
import tensorflow as tf
# Show which TensorFlow version the notebook kernel actually imports.
print(tf.__version__)

2.6.0


### Create training application package

The easiest (and recommended) way to create a training application package uses gcloud to package and upload the application when you submit your training job. This method allows you to create a very simple file structure. For this tutorial, the file structure of your training application package should appear similar to the following:

```
trainer/ 
    __init__.py
    train.py
    train_hpt.py
```


In [4]:
%%writefile ./setup.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = [
    'tensorflow==2.6.0',
    'numpy==1.18.0',
    'pandas==1.2.1',
    'scipy==1.4.1',
    'scikit-learn==0.22',
    'google-cloud-storage==1.23.0',
    'xgboost==1.3.3',
    'cloudml-hypertune',
    ]
 
setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Trainer package for XGBoost Task'
)


Overwriting ./setup.py


In [5]:
# Create the `trainer` package directory that the following cells write the
# training modules into (-p: no error if it already exists).
!mkdir -p trainer

In [6]:
%%writefile ./trainer/__init__.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


Overwriting ./trainer/__init__.py


In [7]:
%%writefile ./trainer/train.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

import argparse
import json
import hypertune
import os
import warnings

import datetime as datetime
import numpy as np
import pandas as pd
import tensorflow as tf

from pytz import timezone

# from .trainer import model
# from .trainer import inputs

warnings.filterwarnings("ignore")

#0 = all messages are logged (default behavior)
#1 = INFO messages are not printed
#2 = INFO and WARNING messages are not printed
#3 = INFO, WARNING, and ERROR messages are not printed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

def parse_arguments():
    """Parse the training command line.

    Options that are not declared here are ignored rather than rejected,
    because parsing is done with ``parse_known_args``.

    Returns:
      argparse.Namespace with one attribute per declared option
      (``--job-dir`` is exposed as ``job_dir``).
    """
    # (flag, default, type, help) for the tunable hyperparameters.
    typed_options = (
        ('--depth', 3, int, 'Hyperparameter: depth of network'),
        ('--dropout_rate', 0.02, float, 'Hyperparameter: Drop out rate'),
        ('--learning_rate', 0.0001, float,
         'Hyperparameter: initial learning rate'),
        ('--batch_size', 4, int,
         'Hyperparameter: batch size of the deep network'),
        ('--epochs', 1, int, 'number of epochs.'),
    )
    # (flag, help) for the string options, which all default to "".
    text_options = (
        ('--job-dir', 'Directory to store model checkpoints and logs.'),
        ('--train_feature_name', 'GCS path to train feature csv.'),
        ('--test_feature_name', 'GCS path to test feature csv.'),
        ('--train_label_name', 'GCS path to train label csv.'),
        ('--test_label_name', 'GCS path to test label csv.'),
    )

    cli = argparse.ArgumentParser()
    for flag, fallback, caster, text in typed_options:
        cli.add_argument(flag, default=fallback, type=caster, help=text)
    for flag, text in text_options:
        cli.add_argument(flag, default="", help=text)
    cli.add_argument('--verbosity', default='FATAL',
                     choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'])

    known, _ignored = cli.parse_known_args()
    return known

def tf_model(input_dim, output_dim, depth, dropout_rate):
    """Build the (uncompiled) fully connected Keras model.

    Layout: a 128-unit ReLU layer, ``depth - 1`` L2-regularised ReLU layers
    whose widths step down from ``input_dim``, one Dropout layer and a softmax
    output layer.

    Args:
      input_dim: Number of input features.
      output_dim: Width of the softmax output layer.
      depth: Controls the number of hidden layers and the width step between
        them; must be non-zero because it is used as a divisor.
      dropout_rate: Rate passed to the single Dropout layer.

    Returns:
      The assembled ``Sequential`` model.
    """
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout

    # Width step between consecutive hidden layers.
    # NOTE(review): `^` is bitwise XOR in Python (it flips the lowest bit of
    # the step), not exponentiation. Left unchanged so the layer widths stay
    # exactly as before -- confirm the intent.
    decr = int((input_dim-output_dim-16)/depth) ^ 1

    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation=tf.nn.relu))
    for i in range(1, depth):
        model.add(Dense(input_dim-i*decr, activation=tf.nn.relu, kernel_regularizer='l2'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(output_dim, activation=tf.nn.softmax))
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the log.
    model.summary()

    return model

# def custom_loss(y_true, y_pred):
#     custom_loss = mean(square(y_true - y_pred), axis=-1)
#     return custom_loss

# def custom_metric(y_true, y_pred):
#     custom_metric = mean(square(y_true - y_pred), axis=-1)
#     return custom_metric

def get_callbacks(args, early_stop_patience: int = 3):
    """Creates Keras callbacks for model training.

    Args:
      args: Parsed arguments; only ``args.job_dir`` is read.
      early_stop_patience: Epochs without ``val_accuracy`` improvement after
        which training stops.

    Returns:
      List ``[checkpoint_cb, tensorboard_cb, earlystop_cb]``.
    """

    # Get trialId from the TF_CONFIG environment variable ("task" -> "trial").
    # str() keeps the path concatenation below working for a non-string id.
    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
    trialId = str(tf_config.get("task", {}).get("trial", ""))
    if trialId == '':
        trialId = '0'
    print("trialId=", trialId)

    curTime = datetime.datetime.now(timezone('US/Pacific')).strftime('%H%M%S')

    # Modify model_dir paths to include trialId. The {val_accuracy:.4f}
    # placeholder is left for the checkpoint callback to fill in.
    model_dir = args.job_dir + "/checkpoints/cp-"+curTime+"-"+trialId+"-{val_accuracy:.4f}"
    log_dir   = args.job_dir + "/log_dir"

    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)
    checkpoint_cb  = tf.keras.callbacks.ModelCheckpoint(model_dir, monitor='val_accuracy', mode='max',
                                                        verbose=0, save_best_only=True,
                                                        save_weights_only=False)
    # Honour the caller's patience; it used to be hard-coded to 3, which made
    # the early_stop_patience parameter a no-op.
    earlystop_cb   = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                      patience=early_stop_patience)

    return [checkpoint_cb, tensorboard_cb, earlystop_cb]


if __name__ == "__main__":
    # Script entry point: read the CSVs, build, compile and fit the model,
    # then save it to --job-dir.
    # NOTE: the notebook creates trainer/train_hpt.py by copying this file and
    # appending indented statements that continue this block and read
    # final_epoch_accuracy / final_epoch_count. Keep this block last in the
    # file and keep those two names defined at this level.

    args = parse_arguments()
    print(args)
    print("Input and pre-process data ...")
    # Features use pandas' default header handling; labels are read with
    # header=None, i.e. every row is data.
    x_train = pd.read_csv(args.train_feature_name)
    y_train = pd.read_csv(args.train_label_name, header=None)
    x_test = pd.read_csv(args.test_feature_name)
    y_test = pd.read_csv(args.test_label_name, header=None)

    print("Shapes:")
    print(x_train.shape)
    print(y_train.shape)
    # Train model
    print("Creating model ...")
    model = tf_model(x_train.shape[1], y_train.shape[1],
                     depth=args.depth,
                     dropout_rate=args.dropout_rate)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
                  loss='mean_squared_error',
                  metrics=['accuracy'])

    print("Fitting model ...")
    callbacks = get_callbacks(args, 3)
    # All four data sets are handed to Keras as NumPy arrays (the validation
    # labels used to be passed as a DataFrame, unlike the other three).
    hist = model.fit(np.array(x_train), np.array(y_train),
                     epochs=args.epochs,
                     batch_size=args.batch_size,
                     validation_data=(np.array(x_test), np.array(y_test)),
                     callbacks=callbacks)

    # TBD save history for visualization
    # Training accuracy of the last epoch that ran, and how many epochs ran.
    final_epoch_accuracy = hist.history['accuracy'][-1]
    final_epoch_count = len(hist.history['accuracy'])

    print('final_epoch_accuracy = %.6f' % final_epoch_accuracy)
    print('final_epoch_count = %2d' % final_epoch_count)

    model.save(args.job_dir)


Overwriting ./trainer/train.py


In [8]:
# Start trainer/train_hpt.py as a copy of trainer/train.py; the next cell
# appends the hyperparameter-metric reporting code to it.
!cp ./trainer/train.py ./trainer/train_hpt.py


In [9]:
%%writefile -a ./trainer/train_hpt.py

    """This method updates a CAIP HPTuning Job with a final metric for the job.
    In TF2.X the user must either use hypertune or a custom callback with
    tf.summary.scalar to update CAIP HP Tuning jobs. This function uses
    hypertune, which appears to be the preferred solution. Hypertune also works
    with containers, without code change.
    Args:
        metric_tag: The metric being optimized.  This MUST MATCH the
          hyperparameterMetricTag specificed in the hyperparameter tuning yaml.
        metric_value: The value to report at the end of model training.
        global_step: An int value to specify the number of trainin steps completed
          at the time the metric was reported.
    """
    
    # The default name of the metric is training/hptuning/metric. 
    # We recommend that you assign a custom name. The only functional difference is that 
    # if you use a custom name, you must set the hyperparameterMetricTag value in the 
    # HyperparameterSpec object in your job request to match your chosen name.
    # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#HyperparameterSpec
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        metric_value=final_epoch_accuracy,
        hyperparameter_metric_tag='accuracy',
        global_step=final_epoch_count
    )

Appending to ./trainer/train_hpt.py


In [10]:
# Automatically restart kernel after installs
# (presumably so the newly installed package versions are the ones imported).
import IPython
app = IPython.Application.instance()
# The True argument asks for a restart; the recorded reply is
# {'status': 'ok', 'restart': True}.
app.kernel.do_shutdown(True)  

{'status': 'ok', 'restart': True}

In [10]:
# Import packages

import json
import logging
import pandas as pd
import numpy as np
from datetime import datetime
from pytz import timezone
from googleapiclient import discovery
from google.cloud import aiplatform

### Configure Global Variables

List your current GCP project name

In [11]:
# Capture the active gcloud project. The `!` assignment yields a list of
# output lines, so the project ID itself is element 0.
project_id = !gcloud config list --format 'value(core.project)' 2>/dev/null
project_id

['cchatterjee-sandbox']

Configure your system variables

In [12]:
# Configure your global variables
PROJECT = project_id[0]  # Replace with your project ID
USER = 'cchatterjee'             # Replace with your user name
BUCKET_NAME = 'vapit_data'       # Replace with your GCS bucket name - globally unique

# Folder inside the bucket for job outputs, time zone used for job-name
# timestamps, and the region used for jobs, models and endpoints.
FOLDER_NAME = 'tf_models'
TIMEZONE = 'US/Pacific'
REGION = 'us-central1'
# Destination of the built trainer source distribution, and the pre-processed
# train/test feature and label CSV files.
PACKAGE_URIS = f"gs://{BUCKET_NAME}/trainer/tensorflow/trainer-0.1.tar.gz" 
TRAIN_FEATURE_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_x_train.csv" 
TRAIN_LABEL_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_y_train.csv" 
TEST_FEATURE_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_x_test.csv" 
TEST_LABEL_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_y_test.csv"


### Authenticate your GCP account

This is required if you run the notebook in Colab

In [13]:
# Authenticate only when running inside Colab. Outside Colab the import fails
# and the cell is a no-op. A failed authentication is reported instead of
# being silently swallowed by a bare `except`.
try:
    from google.colab import auth
except ImportError:
    # Not running in Colab: nothing to authenticate here.
    pass
else:
    try:
        auth.authenticate_user()
    except Exception as exc:  # best effort: report and let the notebook go on
        print("Colab authentication failed:", exc)
    else:
        print("Colab user is authenticated.")

Create your bucket

In [14]:
# Create the bucket in $REGION. If the name is already taken, gsutil reports
# a 409 ServiceException, as in the recorded output.
!gsutil mb -l $REGION gs://$BUCKET_NAME 

Creating gs://vapit_data/...
ServiceException: 409 A Cloud Storage bucket named 'vapit_data' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


Build python package and upload to your bucket

In [15]:
!cd /home/jupyter/vapit/ai-platform-tf/Vertex
!python3 -m build
!gsutil cp ./dist/trainer-0.1.tar.gz $PACKAGE_URIS

[1m* Creating virtualenv isolated environment...[0m
DEBUG:filelock:Attempting to acquire lock 140390617913680 on /home/jupyter/.local/share/virtualenv/wheel/3.7/image/1/CopyPipInstall/pip-21.2.4-py3-none-any.lock
DEBUG:filelock:Lock 140390617913680 acquired on /home/jupyter/.local/share/virtualenv/wheel/3.7/image/1/CopyPipInstall/pip-21.2.4-py3-none-any.lock
DEBUG:filelock:Attempting to release lock 140390617913680 on /home/jupyter/.local/share/virtualenv/wheel/3.7/image/1/CopyPipInstall/pip-21.2.4-py3-none-any.lock
DEBUG:filelock:Lock 140390617913680 released on /home/jupyter/.local/share/virtualenv/wheel/3.7/image/1/CopyPipInstall/pip-21.2.4-py3-none-any.lock
[1m* Installing packages in isolated environment... (setuptools >= 40.8.0, wheel)[0m
[1m* Getting dependencies for sdist...[0m
running egg_info
creating trainer.egg-info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to trainer.egg-info/requires.txt


In [16]:
# Freddie Mac public mortgage data (don't change it)
INPUT_DATA = "gs://tuti_asset/datasets/mortgage_structured.csv" # public mortgage data 
TARGET_COLUMN = "TARGET" # Column name for target labels

-----------
### Special functions


In [17]:
#------
def find_best_model_dir(model_dir, offset=1, maxFlag=1):
    # Get a list of model directories
    all_models = ! gsutil ls $model_dir
    print("")
    print("All Models = ")
    print(*all_models, sep='\n')

    # Check if model dirs exist
    if (("CommandException" in all_models[0]) or (len(all_models) <= 1)):
        print("Create the models first.")
        return ""

    # Find the best model from checkpoints
    import re
    best_acc = -np.Inf
    if (maxFlag != 1):
        best_acc = np.Inf
    best_model_dir = ""
    tup_list = []
    for i in range(1,len(all_models)):
        all_floats = re.findall(r"[-+]?\d*\.\d+|\d+", all_models[i]) #Find the floats in the string
        cur_acc = -float(all_floats[-offset]) #which item is the model optimization metric
        tup_list.append([all_models[i],cur_acc])
        if (maxFlag*(cur_acc > best_acc) or (1-maxFlag)*(cur_acc < best_acc)):
            best_acc = cur_acc
            best_model_dir = all_models[i]
    if maxFlag:
        tup_list.sort(key=lambda tup: tup[1], reverse=False)
    else:
        tup_list.sort(key=lambda tup: tup[1], reverse=True)
    #for i in range(len(tup_list)):
    #    print(tup_list[i][0])
    print("Best Accuracy  from Checkpoints = ", best_acc)
    print("Best Model Dir from Checkpoints = ", best_model_dir)
    
    return best_model_dir


-----------
### Dataset preprocessing

Preprocess input data by

    1. Dropping unique ID column;
    2. Convert categorical into one-hot encodings;
    3. Count number of unique classes;
    4. Split train/test
    5. Save process data into gcs

In [18]:
# Run the pre-processing script on the raw CSV, passing the four train/test
# feature and label paths and the name of the target column.
!python3 preprocessing.py \
    --input_file $INPUT_DATA \
    --x_train_name $TRAIN_FEATURE_PATH \
    --x_test_name $TEST_FEATURE_PATH \
    --y_train_name $TRAIN_LABEL_PATH \
    --y_test_name $TEST_LABEL_PATH \
    --target_column $TARGET_COLUMN

INFO:root:Preprocessing raw data:
INFO:root: => Drop id column:
INFO:root: => One hot encoding categorical features
INFO:root: => Count number of classes
INFO:root: => Perform train/test split
INFO:root:Reading raw data file: gs://tuti_asset/datasets/mortgage_structured.csv
INFO:root:Drop unique id column which is not an useful feature for ML: LOAN_SEQUENCE_NUMBER
INFO:root:Convert categorical columns into one-hot encodings
INFO:root:categorical feature: first_time_home_buyer_flag
INFO:root:categorical feature: occupancy_status
INFO:root:categorical feature: channel
INFO:root:categorical feature: property_state
INFO:root:categorical feature: property_type
INFO:root:categorical feature: loan_purpose
INFO:root:categorical feature: seller_name
INFO:root:categorical feature: service_name
INFO:root:Count number of unique classes ...
INFO:root:No. of Classes: 4
INFO:root:Perform train/test split ...
INFO:root:Get feature/label shapes ...
INFO:root:x_train shape = (93639, 149)
INFO:root:x_tes

------
### Training with Google Vertex AI 

For the full article, please visit: https://cloud.google.com/vertex-ai/docs

Where Vertex AI fits in the ML workflow \
The diagram below gives a high-level overview of the stages in an ML workflow. The blue-filled boxes indicate where Vertex AI provides managed services and APIs:

<img src="img/ml-workflow.svg" alt="Drawing">

As the diagram indicates, you can use Vertex AI to manage the following stages in the ML workflow:

- Train an ML model on your data:
 - Train model
 - Evaluate model accuracy
 - Tune hyperparameters
 
 
- Deploy your trained model.

- Send prediction requests to your model:
 - Online prediction
 - Batch prediction (for TensorFlow only)
 
 
- Monitor the predictions on an ongoing basis.

- Manage your models and model versions.

- For the latest list, see 
  - Pre-built containers for training: https://cloud.google.com/vertex-ai/docs/training/pre-built-containers
    and 
  - Pre-built containers for prediction: https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers


#### Train at local

Before submitting training jobs to Cloud AI Platform, you can test your train.py code in the local environment. You can test by running your Python script from the command line, but another, and maybe better, choice is to use the `gcloud ai-platform local train` command. The latter method makes sure that your entire Python package is ready to be submitted to the remote VMs.

In [19]:
# Train on local machine with python command
# No hyperparameter flags are passed, so train.py falls back to its argparse
# defaults. Outputs go to ./models.
!python3 trainer/train.py \
    --job-dir ./models \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --test_feature_name $TEST_FEATURE_PATH \
    --test_label_name $TEST_LABEL_PATH

Namespace(batch_size=4, depth=3, dropout_rate=0.02, epochs=1, job_dir='./models', learning_rate=0.0001, test_feature_name='gs://vapit_data/tf_data/mortgage_structured_x_test.csv', test_label_name='gs://vapit_data/tf_data/mortgage_structured_y_test.csv', train_feature_name='gs://vapit_data/tf_data/mortgage_structured_x_train.csv', train_label_name='gs://vapit_data/tf_data/mortgage_structured_y_train.csv', verbosity='FATAL')
Input and pre-process data ...
Shapes:
(93639, 149)
(93639, 4)
Creating model ...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               19200     
_________________________________________________________________
dense_1 (Dense)              (None, 107)               13803     
_________________________________________________________________
dense_2 (Dense)              (None, 65)                7020      
_________

------
### Hyperparameter Tuning

To use hyperparameter tuning in your training job you must perform the following steps:

- Specify the hyperparameter tuning configuration for your training job by including a HyperparameterSpec in your TrainingInput object.

- Include the following code in your training application:

 - Parse the command-line arguments representing the hyperparameters you want to tune, and use the values to set the hyperparameters for your training trial.
 - Add your hyperparameter metric to the summary for your graph.


In [20]:
# Google Vertex AI requires each job to have a unique name.
# Therefore, we use prefix + timestamp to form job names.
# The timestamp is taken once so that the job name and the output folder can
# never disagree (two separate now() calls could straddle a minute boundary).
_hpt_timestamp = datetime.now(timezone(TIMEZONE))

JOBNAME_HPT = 'tensorflow_train_{}_{}_hpt'.format(
    USER,
    _hpt_timestamp.strftime("%m%d%y_%H%M")
    ) # define unique job name

# We use the timestamp in the folder name that stores the job outputs.
JOB_DIR_HPT = 'gs://{}/{}/jobdir/{}'.format(
    BUCKET_NAME,
    FOLDER_NAME,
    _hpt_timestamp.strftime("model_%m%d%y_%H%M")
    )

print("JOB_NAME_HPT = ", JOBNAME_HPT)
print("JOB_DIR_HPT = ", JOB_DIR_HPT)


JOB_NAME_HPT =  tensorflow_train_cchatterjee_101721_1037_hpt
JOB_DIR_HPT =  gs://vapit_data/tf_models/jobdir/model_101721_1037


### Submit the hyperparameter job to vertex AI

In [21]:
# Build and submit a Vertex AI hyperparameter tuning job that runs the
# trainer.train_hpt module from the uploaded package.
# NOTE(review): the executor image is the TF 2.2 CPU training container while
# setup.py pins tensorflow==2.6.0 -- confirm the intended version.
executor_image_uri = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-2:latest'
python_module =  "trainer.train_hpt"
api_endpoint = "{}-aiplatform.googleapis.com".format(REGION)
machine_type = "n1-standard-4"

# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": api_endpoint}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.JobServiceClient(client_options=client_options)

# study_spec
# The metric id matches the hyperparameter_metric_tag ('accuracy') that
# train_hpt.py reports through hypertune.
metric = {
    "metric_id": "accuracy",
    "goal": aiplatform.gapic.StudySpec.MetricSpec.GoalType.MAXIMIZE,
}

# Search space: each parameter_id matches a command-line flag of the trainer.
# NOTE: the names depth, dropout_rate, learning_rate, batch_size and epochs
# are rebound to plain strings by the later "best metrics" cell.
depth = {
        "parameter_id": "depth",
        "integer_value_spec": {"min_value": 1, "max_value": 10},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
}
dropout_rate = {
        "parameter_id": "dropout_rate",
        "double_value_spec": {"min_value": 0.001, "max_value": 0.1},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
}
learning_rate = {
        "parameter_id": "learning_rate",
        "double_value_spec": {"min_value": 0.00001, "max_value": 0.01},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
}
batch_size = {
        "parameter_id": "batch_size",
        "integer_value_spec": {"min_value": 1, "max_value": 16},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
}
epochs = {
        "parameter_id": "epochs",
        "integer_value_spec": {"min_value": 1, "max_value": 5},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
}

# trial_job_spec
# Every trial runs on a single replica of machine_type with the fixed
# data-path and job-dir arguments below.
machine_spec = {
    "machine_type": machine_type,
}
worker_pool_spec = {
    "machine_spec": machine_spec,
    "replica_count": 1,
    "python_package_spec": {
        "executor_image_uri": executor_image_uri,
        "package_uris": [PACKAGE_URIS],
        "python_module": python_module,
        "args": [
            '--job-dir',
            JOB_DIR_HPT,
            '--train_feature_name',
            TRAIN_FEATURE_PATH,
            '--train_label_name',
            TRAIN_LABEL_PATH,
            '--test_feature_name',
            TEST_FEATURE_PATH,
            '--test_label_name',
            TEST_LABEL_PATH,
        ],
    },
}

# hyperparameter_tuning_job
# At most 4 trials in total, 2 of them running in parallel.
hyperparameter_tuning_job = {
    "display_name": JOBNAME_HPT,
    "max_trial_count": 4,
    "parallel_trial_count": 2,
    "study_spec": {
        "metrics": [metric],
        "parameters": [depth, dropout_rate, learning_rate, batch_size, epochs],
#         "algorithm": aiplatform.gapic.StudySpec.Algorithm.RANDOM_SEARCH,
    },
    "trial_job_spec": {"worker_pool_specs": [worker_pool_spec]},
}
parent = f"projects/{PROJECT}/locations/{REGION}"
response = client.create_hyperparameter_tuning_job(
    parent=parent, hyperparameter_tuning_job=hyperparameter_tuning_job
)
print("response:", response)
# Keep only the trailing job ID of the returned resource name; the status
# cell uses it to look the job up again.
job_name_hpt = response.name.split('/')[-1]


response: name: "projects/901951554789/locations/us-central1/hyperparameterTuningJobs/2341033978372292608"
display_name: "tensorflow_train_cchatterjee_101721_1037_hpt"
study_spec {
  metrics {
    metric_id: "accuracy"
    goal: MAXIMIZE
  }
  parameters {
    parameter_id: "depth"
    integer_value_spec {
      min_value: 1
      max_value: 10
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "dropout_rate"
    double_value_spec {
      min_value: 0.001
      max_value: 0.1
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "learning_rate"
    double_value_spec {
      min_value: 1e-05
      max_value: 0.01
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "batch_size"
    integer_value_spec {
      min_value: 1
      max_value: 16
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "epochs"
    integer_value_spec {
      min_value: 1
      max_value: 5
    }
    scale_type: UNIT_LINEAR_

#### Check the status of Long Running Operation (LRO) with Google API Client

Send an API request to Vertex AI to get the detailed information. The most interesting piece of information is the hyperparameter values in the trial with best performance metric.

In [22]:
# Fetch the current state of the hyperparameter tuning job.
client_options = {"api_endpoint": api_endpoint}
client = aiplatform.gapic.JobServiceClient(client_options=client_options)
name = client.hyperparameter_tuning_job_path(
    project=PROJECT,
    location=REGION,
    hyperparameter_tuning_job=job_name_hpt,
)
response = client.get_hyperparameter_tuning_job(name=name)
print("Job status = ", response.state)
print("response:", response)
# Compare enum members directly. Comparing against the text
# "JobState.JOB_STATE_SUCCEEDED" depends on how the enum is rendered by
# str(), which is not guaranteed to stay the same across Python versions.
if response.state == aiplatform.gapic.JobState.JOB_STATE_SUCCEEDED:
    print("Job state succeeded.")


Job status =  JobState.JOB_STATE_PENDING
response: name: "projects/901951554789/locations/us-central1/hyperparameterTuningJobs/2341033978372292608"
display_name: "tensorflow_train_cchatterjee_101721_1037_hpt"
study_spec {
  metrics {
    metric_id: "accuracy"
    goal: MAXIMIZE
  }
  parameters {
    parameter_id: "depth"
    integer_value_spec {
      min_value: 1
      max_value: 10
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "dropout_rate"
    double_value_spec {
      min_value: 0.001
      max_value: 0.1
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "learning_rate"
    double_value_spec {
      min_value: 1e-05
      max_value: 0.01
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "batch_size"
    integer_value_spec {
      min_value: 1
      max_value: 16
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "epochs"
    integer_value_spec {
      min_value: 1
      max_va

#### Get the hyperparameters associated with the best metrics

In [None]:
# Pick the trial with the highest reported metric. Trials that carry no final
# measurement (presumably unfinished or failed ones) are skipped instead of
# raising IndexError on metrics[0].
max_ind = None
max_val = float("-inf")
for ind, trials in enumerate(response.trials):
    reported = trials.final_measurement.metrics
    if not reported:
        continue
    value = reported[0].value
    print("Metrics Value (larger is better):", value)
    if value > max_val:
        max_val = value
        max_ind = ind

if max_ind is None:
    raise RuntimeError(
        "No trial has reported a final metric yet; re-run the status cell "
        "after the hyperparameter tuning job has finished."
    )

# Map parameter id -> tuned value for the winning trial.
param_dict = {}
for params in response.trials[max_ind].parameters:
    param_dict[params.parameter_id] = params.value

print(param_dict)

# Rebind the search-space names to strings so they can be passed as
# command-line arguments to the final training job.
depth=str(int(param_dict['depth']))
dropout_rate=str(param_dict['dropout_rate'])
learning_rate=str(param_dict['learning_rate'])
batch_size=str(int(param_dict['batch_size']))
epochs=str(int(param_dict['epochs']))


#### Get the best model

In [None]:
# Look under the tuning job's checkpoints folder. offset=1 reads the last
# number in each path as the metric, maxFlag=1 keeps the largest.
best_model_dir_hpt = find_best_model_dir(JOB_DIR_HPT+'/checkpoints', offset=1, maxFlag=1)


------
### Training with Tuned Parameters

Once your hyperparameter tuning jobs are done, you can use the optimized combination of hyperparameters from your trials and start a single training job on Cloud AI Platform to train your final model.

In [None]:
# Google Vertex AI requires each job to have a unique name.
# Therefore, we use prefix + timestamp to form job names.
# The timestamp is taken once so that the job name and the output folder can
# never disagree (two separate now() calls could straddle a minute boundary).
_trn_timestamp = datetime.now(timezone(TIMEZONE))

JOBNAME_TRN = 'tensorflow_train_{}_{}'.format(
    USER,
    _trn_timestamp.strftime("%m%d%y_%H%M")
    )
# We use the timestamp in the folder name that stores the job outputs.
JOB_DIR_TRN = 'gs://{}/{}/jobdir/{}'.format(
    BUCKET_NAME,
    FOLDER_NAME,
    _trn_timestamp.strftime("model_%m%d%y_%H%M")
    )

print("JOB_NAME_TRN = ", JOBNAME_TRN)
print("JOB_DIR_TRN = ", JOB_DIR_TRN)


In [None]:
# Submit a single Vertex AI custom job that runs trainer.train with the tuned
# hyperparameters.
# NOTE(review): the executor image is the TF 2.2 CPU training container while
# setup.py pins tensorflow==2.6.0 -- confirm the intended version.
executor_image_uri = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-2:latest'
python_module = "trainer.train"
api_endpoint = "{}-aiplatform.googleapis.com".format(REGION)
machine_type = "n1-standard-4"
        
# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": api_endpoint}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.JobServiceClient(client_options=client_options)
# depth, dropout_rate, learning_rate, batch_size and epochs must already be
# the strings produced by the "best metrics" cell; before that cell runs they
# are still the search-space dicts of the tuning cell.
custom_job = {
    "display_name": JOBNAME_TRN,
    "job_spec": {
        "worker_pool_specs": [
            {
                "machine_spec": {
                    "machine_type": machine_type,
                },
                "replica_count": 1,
                "python_package_spec": {
                    "executor_image_uri": executor_image_uri,
                    "package_uris": [PACKAGE_URIS],
                    "python_module": python_module,
                    "args": [
                        '--job-dir',
                        JOB_DIR_TRN,
                        '--train_feature_name',
                        TRAIN_FEATURE_PATH,
                        '--train_label_name',
                        TRAIN_LABEL_PATH,
                        '--test_feature_name',
                        TEST_FEATURE_PATH,
                        '--test_label_name',
                        TEST_LABEL_PATH,
                        '--depth',
                        depth,
                        '--dropout_rate',
                        dropout_rate,
                        '--learning_rate',
                        learning_rate,
                        '--batch_size',
                        batch_size,
                        '--epochs',
                        epochs
                    ],
                },
            }
        ]
    },
}
parent = f"projects/{PROJECT}/locations/{REGION}"
response = client.create_custom_job(parent=parent, custom_job=custom_job)
print("response:", response)
# Keep only the trailing job ID of the returned resource name; the status
# cell uses it to look the job up again.
job_id_trn = response.name.split('/')[-1]


Check the training job status

In [None]:
# Check the training job status: rebuild the job's resource name from its ID,
# fetch the job and print its current state.
client_options = {"api_endpoint": api_endpoint}
client = aiplatform.gapic.JobServiceClient(client_options=client_options)
name = client.custom_job_path(
    project=PROJECT,
    location=REGION,
    custom_job=job_id_trn,
)
response = client.get_custom_job(name=name)
print(response.state)


#### Get the best model

In [None]:
# Look under the training job's checkpoints folder. offset=1 reads the last
# number in each path as the metric, maxFlag=1 keeps the largest.
best_model_dir_trn = find_best_model_dir(JOB_DIR_TRN+'/checkpoints', offset=1, maxFlag=1)


--------
### Deploy the Model

Vertex AI provides tools to upload your trained ML model to the cloud, so that you can send prediction requests to the model.

In order to deploy your trained model on Vertex AI, you must save your trained model using the tools provided by your machine learning framework. This involves serializing the information that represents your trained model into a file which you can deploy for prediction in the cloud.

Then you upload the saved model to a Cloud Storage bucket, and create a model resource on Vertex AI, specifying the Cloud Storage path to your saved model.

When you deploy your model, you can also provide custom code (beta) to customize how it handles prediction requests.



#### Import model artifacts to Vertex AI 

When you import a model, you associate it with a container for Vertex AI to run prediction requests. You can use pre-built containers provided by Vertex AI, or use your own custom containers that you build and push to Container Registry or Artifact Registry.

You can use a pre-built container if your model meets the following requirements:

- Trained in Python 3.7 or later
- Trained using TensorFlow, scikit-learn, or XGBoost
- Exported to meet framework-specific requirements for one of the pre-built prediction containers

The link to the list of pre-built predict container images:

https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers

In [None]:
MODEL_NAME = "my_first_tensorflow_model"

# Pin the project and region explicitly, as the endpoint and deploy cells do,
# instead of relying on whatever defaults the SDK picks up from the
# environment.
aiplatform.init(project=PROJECT, location=REGION)

# Register the best checkpoint from the tuning run as a Vertex AI model served
# by the pre-built TF2 CPU prediction container.
# NOTE(review): the serving image is the TF 2.2 container while setup.py pins
# tensorflow==2.6.0 for training -- confirm the versions are compatible.
response = aiplatform.Model.upload(
    display_name = MODEL_NAME,
    serving_container_image_uri = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-2:latest',
    artifact_uri = best_model_dir_hpt, #best_model_dir_trn,
)

# Keep only the trailing ID component; the deploy cell looks the model up by it.
model_id = response.name.split('/')[-1]
print("model_id = ", model_id)


#### Create Endpoint

You need the endpoint ID to deploy the model.

In [None]:
MODEL_ENDPOINT_DISPLAY_NAME = "my_first_tensorflow_model_endpoint"

# Create an (initially empty) endpoint in the configured project and region.
aiplatform.init(project=PROJECT, location=REGION)
endpoint = aiplatform.Endpoint.create(
    display_name=MODEL_ENDPOINT_DISPLAY_NAME, project=PROJECT, location=REGION,
)

# The endpoint ID is the last path component of the resource name.
endpoint_id = endpoint.resource_name.split('/')[-1]

print("endpoint.display_name  = ", endpoint.display_name)
print("endpoint.resource_name = ", endpoint.resource_name)
#print(endpoint.uri)
print("endpoint_id = ", endpoint_id)


#### Deploy Model to the endpoint

You must deploy a model to an endpoint before that model can be used to serve online predictions; deploying a model associates physical resources with the model so it can serve online predictions with low latency. An undeployed model can serve batch predictions, which do not have the same low latency requirements.

In [None]:
MODEL_NAME = "my_first_tensorflow_model"
DEPLOYED_MODEL_DISPLAY_NAME = "my_first_tensorflow_model_deployed"

aiplatform.init(project=PROJECT, location=REGION)

# Look the uploaded model up again by the ID captured in the upload cell.
model = aiplatform.Model(model_name=model_id)

# Deploy onto the endpoint created above; sync=True blocks until the call
# returns, so the prints below run only after deployment has finished.
model.deploy(
    endpoint=endpoint,
    machine_type="n1-standard-4",
    deployed_model_display_name=DEPLOYED_MODEL_DISPLAY_NAME,
    sync=True,
)

print(model.display_name)
print(model.resource_name)


### Explore models and endpoints

In [None]:
# List the models and endpoints in REGION with the gcloud CLI.
# IPython substitutes $REGION with the notebook variable before the shell
# command runs.
print("Models:")
!gcloud beta ai models list --region=$REGION
print("Endpoints:")
!gcloud beta ai endpoints list --region=$REGION


In [None]:
from google.cloud.aiplatform import gapic as aip
def list_models():
    """Return a summary dict for each model listed under PROJECT/REGION.

    Uses ``aip.ModelServiceClient`` pointed at the regional API host. Each
    returned dict carries the model's ``name``, ``display_name``,
    ``create_time``, ``container`` (the container spec's image URI) and
    ``artifact_uri``.
    """
    regional_host = "{}-aiplatform.googleapis.com".format(REGION)
    service = aip.ModelServiceClient(
        client_options={"api_endpoint": regional_host}
    )
    parent_path = "/".join(("projects", PROJECT, "locations", REGION))
    return [
        {
            "name": entry.name,
            "display_name": entry.display_name,
            "create_time": entry.create_time,
            "container": entry.container_spec.image_uri,
            "artifact_uri": entry.artifact_uri,
        }
        for entry in service.list_models(parent=parent_path)
    ]


# Fetch the summaries and display them as the cell output.
model_list = list_models()
model_list


In [None]:
from google.cloud.aiplatform import gapic as aip
def list_endpoints():
    """Return a summary dict for each endpoint listed under PROJECT/REGION.

    Uses ``aip.EndpointServiceClient`` pointed at the regional API host. Each
    returned dict carries the endpoint's ``name``, ``display_name`` and
    ``create_time``. ``deployed_models`` holds the model reference of the
    FIRST deployed model only, or an empty string when nothing is deployed.
    """
    regional_host = "{}-aiplatform.googleapis.com".format(REGION)
    service = aip.EndpointServiceClient(
        client_options={"api_endpoint": regional_host}
    )
    parent_path = "/".join(("projects", PROJECT, "locations", REGION))

    summaries = []
    for entry in service.list_endpoints(parent=parent_path):
        deployed = entry.deployed_models
        first_model = deployed[0].model if len(deployed) > 0 else ''
        summaries.append(
            {
                "name": entry.name,
                "display_name": entry.display_name,
                "create_time": entry.create_time,
                "deployed_models": first_model,
            }
        )
    return summaries


# Fetch the summaries and display them as the cell output.
endpoint_list = list_endpoints()
endpoint_list


In [None]:
# deployed_model_id = endpoint.list_models()[0].id
# print(deployed_model_id)
# endpoint.undeploy(deployed_model_id=deployed_model_id)

In [None]:
# print(endpoint.list_models())
# print(endpoint.resource_name)

------
### Send inference requests to your model

Vertex AI provides the services you need to request predictions from your model in the cloud.

There are two ways to get predictions from trained models: online prediction (sometimes called HTTP prediction) and batch prediction. In both cases, you pass input data to a cloud-hosted machine-learning model and get inferences for each data instance.

Vertex AI online prediction is a service optimized to run your data through hosted models with as little latency as possible. You send small batches of data to the service and it returns your predictions in the response.

#### Call Google API for online inference

In [None]:
from google.api_core import exceptions as api_exceptions
from googleapiclient import errors

# Load the test features.
x_test = pd.read_csv(TEST_FEATURE_PATH)

# Fill nan value with zeros (Prediction lacks the ability to handle nan values for now)
x_test = x_test.fillna(0)

pprobas = []      # predictions collected across all batches
batch_size = 16   # instances sent per online request
n_samples = min(160, x_test.shape[0])
print("batch_size=", batch_size)
print("n_samples=", n_samples)

aiplatform.init(project=PROJECT, location=REGION)

# Build the endpoint handle once instead of once per batch.
online_endpoint = aiplatform.Endpoint(endpoint_id)

for i in range(0, n_samples, batch_size):
    j = min(i + batch_size, n_samples)
    print("Processing samples", i, j)
    try:
        # The request is the statement that can fail, so it must be inside
        # the try block.
        response = online_endpoint.predict(
            instances=x_test.iloc[i:j].values.tolist()
        )
    except (api_exceptions.GoogleAPICallError, errors.HttpError) as err:
        # The Vertex AI SDK reports request failures as google.api_core
        # exceptions; HttpError is kept for backward compatibility.
        tf.compat.v1.logging.error('There was an error getting online predictions, check the details:')
        tf.compat.v1.logging.error(str(err))
        break
    pprobas.extend(response.predictions)


In [None]:
# Display the predictions collected in pprobas as a NumPy array.
np.array(pprobas)

#### Call the gcloud CLI for online inference

In [None]:
# Load the test features.
x_test = pd.read_csv(TEST_FEATURE_PATH)

# Fill nan value with zeros (Prediction lacks the ability to handle nan values for now)
x_test = x_test.fillna(0)

# Write the first batch of instances to a temporary JSON request file that
# the gcloud predict command in the next cell reads.
JSON_TEMP = 'tf_test_data.json' # temp json file name to hold the inference data
batch_size = 100                # data batch size
start = 0
# Clamp to the dataset length so a short test set still yields a valid slice.
end = min(start + batch_size, len(x_test))
body = {'instances': x_test.iloc[start:end].values.tolist()}
with open(JSON_TEMP, 'w') as fp:
    json.dump(body, fp)


In [None]:
# Send the JSON request file written by the previous cell to the deployed
# endpoint through the gcloud CLI. IPython substitutes $endpoint_id, $REGION
# and $JSON_TEMP with the notebook variables.
!gcloud beta ai endpoints predict $endpoint_id \
  --region=$REGION \
  --json-request=$JSON_TEMP


#### Call Google API for batch inference

In [None]:
# Write batch data to file in GCS

import json
import os
import shutil

# Start from an empty local staging directory.
DATA_DIR = './batch_data'
shutil.rmtree(DATA_DIR, ignore_errors=True)
os.makedirs(DATA_DIR)

n_samples = min(1000, x_test.shape[0])
nFiles = 10
# Integer division: any remainder rows beyond nFiles * nRecsPerFile are not
# written.
nRecsPerFile = min(1000, n_samples // nFiles)
print("n_samples =", n_samples)
print("nFiles =", nFiles)
print("nRecsPerFile =", nRecsPerFile)

# Convert the DataFrame to plain Python lists once, outside the loops.
batch_rows = np.array(x_test)[:nFiles * nRecsPerFile].tolist()

# Create nFiles JSON Lines files with nRecsPerFile records each.
# NOTE(review): "dense_input" is presumably the model's input layer name
# (the notebook mentions model_layers[0] as an alternative) -- confirm.
for i in range(nFiles):
    with open(f'{DATA_DIR}/unkeyed_batch_{i}.json', "w") as file:
        for row in batch_rows[i * nRecsPerFile:(i + 1) * nRecsPerFile]:
            # json.dumps guarantees valid JSON (e.g. double-quoted strings),
            # which a Python list repr does not.
            print(json.dumps({"dense_input": row}), file=file)

# Copy the local ./batch_data directory (recursively) into
# gs://BUCKET_NAME/FOLDER_NAME/ so the batch prediction job can read it.
!gsutil -m cp -r ./batch_data gs://$BUCKET_NAME/$FOLDER_NAME/
    
# Remove old batch prediction results so output from earlier runs is not
# mixed with the new job's output.
# NOTE(review): presumably gsutil just reports an error here when the path
# does not exist yet (first run) -- confirm this is harmless.
!gsutil -m rm -r gs://$BUCKET_NAME/$FOLDER_NAME/batch_predictions


In [None]:
# Job name: fixed prefix, USER, and the current time in TIMEZONE formatted as
# month-day-year_hour-minute.
JOBNAME_BATCH = (
    f"tensorflow_batch_{USER}_"
    f"{datetime.now(timezone(TIMEZONE)):%m%d%y_%H%M}"
)

# The job name doubles as the name of a folder under the bucket.
JOB_DIR_BATCH = f"gs://{BUCKET_NAME}/{FOLDER_NAME}/{JOBNAME_BATCH}"

# Input files uploaded by the previous cell, and the prefix under which the
# batch predictions are written.
INPUT_PATH = 'gs://' + '/'.join((BUCKET_NAME, FOLDER_NAME, 'batch_data/*'))
OUTPUT_PATH = 'gs://' + '/'.join((BUCKET_NAME, FOLDER_NAME, 'batch_predictions'))

print("JOB_NAME_BATCH = ", JOBNAME_BATCH)
print("JOB_DIR_BATCH = ", JOB_DIR_BATCH)


In [None]:
aiplatform.init(project=PROJECT, location=REGION)

# Look the uploaded model up by the ID captured in the upload cell.
my_model = aiplatform.Model(model_name=model_id)

# Run a batch prediction job over the JSON Lines files at INPUT_PATH and write
# JSON Lines output under OUTPUT_PATH. sync=True blocks until the call
# returns, so the job attributes printed below reflect the finished call.
batch_prediction_job = my_model.batch_predict(
    job_display_name=JOBNAME_BATCH,
    gcs_source=INPUT_PATH,
    gcs_destination_prefix=OUTPUT_PATH,
    instances_format="jsonl",
    predictions_format="jsonl",
    model_parameters=None,
    machine_type="n1-standard-4",
    starting_replica_count=1,
    max_replica_count=1,
    sync=True,
)

print(batch_prediction_job.display_name)
print(batch_prediction_job.resource_name)
print(batch_prediction_job.state)


In [None]:
print("errors")
!gsutil cat $OUTPUT_PATH/prediction.errors_stats-00000-of-00001
print("batch prediction results")
!gsutil cat $OUTPUT_PATH/prediction.results-00000-of-00010
