 ============================================================================== \
 Copyright 2021 Google LLC. This software is provided as-is, without warranty \
 or representation for any use or purpose. Your use of it is subject to your \
 agreement with Google. \
 ============================================================================== 
 
 Author: Elvin Zhu, Chanchal Chatterjee \
 Email: elvinzhu@google.com \
<img src="img/google-cloud-icon.jpg" alt="Drawing" style="width: 200px;"/>

### Import packages

In [None]:
!cd /home/jupyter/vapit/ai-platform-tf/Vertex
!python3 -m pip install -r ./requirements.txt -U -q --user
!python3 -m pip install -U -q google-cloud-aiplatform
!python3 -m pip install -U -q google-cloud-storage==1.32
!gcloud components update --quiet
!python3 -m pip install -U -q build


In [121]:
# Automatically restart kernel after installs
import IPython
# Grab the running IPython application to reach its kernel object
app = IPython.Application.instance()
# True requests a restart rather than a plain shutdown (the reply reports 'restart': True)
app.kernel.do_shutdown(True)  

{'status': 'ok', 'restart': True}

In [None]:
# Import packages

import json
import logging
import pandas as pd
import numpy as np
from datetime import datetime
from pytz import timezone
from googleapiclient import discovery
from google.cloud import aiplatform

### Configure Global Variables

List your current GCP project name

In [12]:
# Ask gcloud for the active project; IPython captures the command's output
# lines as a list, so the project name is element 0. stderr is discarded.
project_id = !gcloud config list --format 'value(core.project)' 2>/dev/null
project_id

['cchatterjee-sandbox']

Configure your system variables

In [13]:
# Configure your global variables
PROJECT = project_id[0]  # Replace with your project ID
USER = 'cchatterjee'             # Replace with your user name
BUCKET_NAME = 'vapit_data'       # Replace with your GCS bucket name - must be globally unique

FOLDER_NAME = 'tf_models'  # folder inside the bucket used when building job output paths
TIMEZONE = 'US/Pacific'    # time zone used for the timestamps embedded in job names
REGION = 'us-central1'     # region used for the bucket and the regional Vertex AI endpoint
# Cloud Storage location of the packaged trainer source distribution
PACKAGE_URIS = f"gs://{BUCKET_NAME}/trainer/tensorflow/trainer-0.1.tar.gz" 
# Cloud Storage locations of the train/test feature and label CSV files
TRAIN_FEATURE_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_x_train.csv" 
TRAIN_LABEL_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_y_train.csv" 
TEST_FEATURE_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_x_test.csv" 
TEST_LABEL_PATH = f"gs://{BUCKET_NAME}/tf_data/mortgage_structured_y_test.csv"


### Authenticate your GCP account

This is required if you run the notebook in Colab

In [14]:
# Authenticate only when running inside Colab. Outside Colab the module does
# not exist, which is the one failure that is safe to ignore; a genuine
# authentication error is no longer swallowed by a bare `except`.
try:
  from google.colab import auth
except ImportError:
  # Not running in Colab: nothing to authenticate here.
  pass
else:
  auth.authenticate_user()
  print("Colab user is authenticated.")

Create your bucket

In [15]:
# Create the bucket in REGION; gsutil reports a 409 ServiceException if the name is already taken
!gsutil mb -l $REGION gs://$BUCKET_NAME 

Creating gs://vapit_data/...
ServiceException: 409 A Cloud Storage bucket named 'vapit_data' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


Build python package and upload to your bucket

In [5]:
!cd /home/jupyter/vapit/ai-platform-tf/Vertex
!python3 -m build
!gsutil cp ./dist/trainer-0.1.tar.gz $PACKAGE_URIS

Found existing installation: setuptools 47.1.0
Uninstalling setuptools-47.1.0:
  Successfully uninstalled setuptools-47.1.0
Collecting setuptools>=40.8.0
  Using cached setuptools-57.0.0-py3-none-any.whl (821 kB)
Collecting wheel
  Using cached wheel-0.36.2-py2.py3-none-any.whl (35 kB)
Installing collected packages: setuptools, wheel
Successfully installed setuptools-57.0.0 wheel-0.36.2
You should consider upgrading via the '/tmp/build-env-mg9qwwwt/bin/python -m pip install --upgrade pip' command.[0m
running egg_info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to trainer.egg-info/requires.txt
writing top-level names to trainer.egg-info/top_level.txt
reading manifest file 'trainer.egg-info/SOURCES.txt'
writing manifest file 'trainer.egg-info/SOURCES.txt'
running sdist
running egg_info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to tra

In [6]:
# Freddie Mac public mortgage data (do not change this path)
INPUT_DATA = "gs://tuti_asset/datasets/mortgage_structured.csv" # public mortgage data 
TARGET_COLUMN = "TARGET" # Column name for target labels

-----------
### Special functions


In [8]:
#------
def find_best_model_dir(model_dir, offset=1, maxFlag=1):
    # Get a list of model directories
    all_models = ! gsutil ls $model_dir
    print("")
    print("All Models = ")
    print(*all_models, sep='\n')

    # Check if model dirs exist
    if (("CommandException" in all_models[0]) or (len(all_models) <= 1)):
        print("Create the models first.")
        return ""

    # Find the best model from checkpoints
    import re
    best_acc = -np.Inf
    if (maxFlag != 1):
        best_acc = np.Inf
    best_model_dir = ""
    tup_list = []
    for i in range(1,len(all_models)):
        all_floats = re.findall(r"[-+]?\d*\.\d+|\d+", all_models[i]) #Find the floats in the string
        cur_acc = -float(all_floats[-offset]) #which item is the model optimization metric
        tup_list.append([all_models[i],cur_acc])
        if (maxFlag*(cur_acc > best_acc) or (1-maxFlag)*(cur_acc < best_acc)):
            best_acc = cur_acc
            best_model_dir = all_models[i]
    if maxFlag:
        tup_list.sort(key=lambda tup: tup[1], reverse=False)
    else:
        tup_list.sort(key=lambda tup: tup[1], reverse=True)
    #for i in range(len(tup_list)):
    #    print(tup_list[i][0])
    print("Best Accuracy  from Checkpoints = ", best_acc)
    print("Best Model Dir from Checkpoints = ", best_model_dir)
    
    return best_model_dir


-----------
### Dataset preprocessing

Preprocess input data by

    1. Dropping the unique ID column;
    2. Converting categorical columns into one-hot encodings;
    3. Counting the number of unique classes;
    4. Splitting the data into train and test sets;
    5. Saving the processed data to GCS.

In [9]:
# Run the preprocessing script on the raw CSV; the four output names are the
# Cloud Storage paths configured above for the train/test features and labels.
!python3 preprocessing.py \
    --input_file $INPUT_DATA \
    --x_train_name $TRAIN_FEATURE_PATH \
    --x_test_name $TEST_FEATURE_PATH \
    --y_train_name $TRAIN_LABEL_PATH \
    --y_test_name $TEST_LABEL_PATH \
    --target_column $TARGET_COLUMN

INFO:root:Preprocessing raw data:
INFO:root: => Drop id column:
INFO:root: => One hot encoding categorical features
INFO:root: => Count number of classes
INFO:root: => Perform train/test split
INFO:root:Reading raw data file: gs://tuti_asset/datasets/mortgage_structured.csv
INFO:root:Drop unique id column which is not an useful feature for ML: LOAN_SEQUENCE_NUMBER
INFO:root:Convert categorical columns into one-hot encodings
INFO:root:categorical feature: first_time_home_buyer_flag
INFO:root:categorical feature: occupancy_status
INFO:root:categorical feature: channel
INFO:root:categorical feature: property_state
INFO:root:categorical feature: property_type
INFO:root:categorical feature: loan_purpose
INFO:root:categorical feature: seller_name
INFO:root:categorical feature: service_name
INFO:root:Count number of unique classes ...
INFO:root:No. of Classes: 4
INFO:root:Perform train/test split ...
INFO:root:Get feature/label shapes ...
INFO:root:x_train shape = (93639, 149)
INFO:root:x_tes

------
### Training with Google Vertex AI 

For the full article, please visit: https://cloud.google.com/vertex-ai/docs

Where Vertex AI fits in the ML workflow \
The diagram below gives a high-level overview of the stages in an ML workflow. The blue-filled boxes indicate where Vertex AI provides managed services and APIs:

<img src="img/ml-workflow.svg" alt="Drawing">

As the diagram indicates, you can use Vertex AI to manage the following stages in the ML workflow:

- Train an ML model on your data:
 - Train model
 - Evaluate model accuracy
 - Tune hyperparameters
 
 
- Deploy your trained model.

- Send prediction requests to your model:
 - Online prediction
 - Batch prediction (for TensorFlow only)
 
 
- Monitor the predictions on an ongoing basis.

- Manage your models and model versions.

- For the latest list, see 
  - Pre-built containers for training: https://cloud.google.com/vertex-ai/docs/training/pre-built-containers
    and 
  - Pre-built containers for prediction: https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers


#### Train at local

Before submitting training jobs to Cloud AI Platform, you can test your train.py code in the local environment. You can test by running your Python script from the command line, but another, and perhaps better, choice is to use the `gcloud ai-platform local train` command. The latter method helps make sure that your entire Python package is ready to be submitted to the remote VMs.

In [10]:
# Train on local machine with python command
!python3 trainer/train.py \
    --job-dir ./models \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --test_feature_name $TEST_FEATURE_PATH \
    --test_label_name $TEST_LABEL_PATH

2021-06-21 15:23:42.360886: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2021-06-21 15:23:42.361003: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2021-06-21 15:23:42.361022: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
Namespace(batch_size=4, depth=3, dropout_rate=0.02, epochs=1, job_dir='./models', learnin

------
### Hyperparameter Tuning

To use hyperparameter tuning in your training job you must perform the following steps:

- Specify the hyperparameter tuning configuration for your training job by including a `study_spec` (the metric to optimize and the parameters to search) in your hyperparameter tuning job definition.

- Include the following code in your training application:

 - Parse the command-line arguments representing the hyperparameters you want to tune, and use the values to set the hyperparameters for your training trial.
 - Add your hyperparameter metric to the summary for your graph.


In [40]:
# Google Vertex AI requires each job to have unique name,
# Therefore, we use prefix + timestamp to form job names.
# The timestamp is taken once so that the job name and its output folder can
# never disagree when the cell happens to run across a minute boundary.
_hpt_timestamp = datetime.now(timezone(TIMEZONE)).strftime("%m%d%y_%H%M")

JOBNAME_HPT = 'tensorflow_train_{}_{}_hpt'.format(
    USER,
    _hpt_timestamp
    ) # define unique job name

# The job output folder carries the same timestamp as the job name.
JOB_DIR_HPT = 'gs://{}/{}/jobdir/model_{}'.format(
    BUCKET_NAME,
    FOLDER_NAME,
    _hpt_timestamp
    )

print("JOB_NAME_HPT = ", JOBNAME_HPT)
print("JOB_DIR_HPT = ", JOB_DIR_HPT)


JOB_NAME_HPT =  tensorflow_train_cchatterjee_062121_1058_hpt
JOB_DIR_HPT =  gs://vapit_data/tf_models/jobdir/model_062121_1058


### Submit the hyperparameter job to vertex AI

In [41]:
# Container image that runs the trainer, the module to execute inside the
# uploaded package, the regional API host and the VM type for each trial.
executor_image_uri = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-2:latest'
python_module =  "trainer.train_hpt"
api_endpoint = "{}-aiplatform.googleapis.com".format(REGION)
machine_type = "n1-standard-4"

# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": api_endpoint}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.JobServiceClient(client_options=client_options)

# study_spec
# Metric the study optimises; the goal is to maximise it.
metric = {
    "metric_id": "accuracy",
    "goal": aiplatform.gapic.StudySpec.MetricSpec.GoalType.MAXIMIZE,
}

# Search space: one spec per hyperparameter, each with its value range and
# the scale used when sampling it.
# NOTE: the names depth, dropout_rate, learning_rate, batch_size and epochs are
# rebound to the selected values (as strings) by the best-trial cell further down.
depth = {
        "parameter_id": "depth",
        "integer_value_spec": {"min_value": 1, "max_value": 10},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
}
dropout_rate = {
        "parameter_id": "dropout_rate",
        "double_value_spec": {"min_value": 0.001, "max_value": 0.1},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
}
learning_rate = {
        "parameter_id": "learning_rate",
        "double_value_spec": {"min_value": 0.00001, "max_value": 0.01},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
}
batch_size = {
        "parameter_id": "batch_size",
        "integer_value_spec": {"min_value": 1, "max_value": 16},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
}
epochs = {
        "parameter_id": "epochs",
        "integer_value_spec": {"min_value": 1, "max_value": 5},
        "scale_type": aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
}

# trial_job_spec
machine_spec = {
    "machine_type": machine_type,
}
# One replica per trial, running the packaged trainer with the fixed data
# arguments below; the tuned hyperparameters are not listed here.
worker_pool_spec = {
    "machine_spec": machine_spec,
    "replica_count": 1,
    "python_package_spec": {
        "executor_image_uri": executor_image_uri,
        "package_uris": [PACKAGE_URIS],
        "python_module": python_module,
        "args": [
            '--job-dir',
            JOB_DIR_HPT,
            '--train_feature_name',
            TRAIN_FEATURE_PATH,
            '--train_label_name',
            TRAIN_LABEL_PATH,
            '--test_feature_name',
            TEST_FEATURE_PATH,
            '--test_label_name',
            TEST_LABEL_PATH,
        ],
    },
}

# hyperparameter_tuning_job
# Up to 4 trials in total, 2 running at a time. No search algorithm is set
# because the "algorithm" entry below is commented out.
hyperparameter_tuning_job = {
    "display_name": JOBNAME_HPT,
    "max_trial_count": 4,
    "parallel_trial_count": 2,
    "study_spec": {
        "metrics": [metric],
        "parameters": [depth, dropout_rate, learning_rate, batch_size, epochs],
#         "algorithm": aiplatform.gapic.StudySpec.Algorithm.RANDOM_SEARCH,
    },
    "trial_job_spec": {"worker_pool_specs": [worker_pool_spec]},
}
# Jobs are created under the project/region resource path.
parent = f"projects/{PROJECT}/locations/{REGION}"
response = client.create_hyperparameter_tuning_job(
    parent=parent, hyperparameter_tuning_job=hyperparameter_tuning_job
)
print("response:", response)
# The last path segment of the resource name is the numeric job ID used by the status cell.
job_name_hpt = response.name.split('/')[-1]


response: name: "projects/901951554789/locations/us-central1/hyperparameterTuningJobs/2001959985528963072"
display_name: "tensorflow_train_cchatterjee_062121_1058_hpt"
study_spec {
  metrics {
    metric_id: "accuracy"
    goal: MAXIMIZE
  }
  parameters {
    parameter_id: "depth"
    integer_value_spec {
      min_value: 1
      max_value: 10
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "dropout_rate"
    double_value_spec {
      min_value: 0.001
      max_value: 0.1
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "learning_rate"
    double_value_spec {
      min_value: 1e-05
      max_value: 0.01
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "batch_size"
    integer_value_spec {
      min_value: 1
      max_value: 16
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "epochs"
    integer_value_spec {
      min_value: 1
      max_value: 5
    }
    scale_type: UNIT_LINEAR_

#### Check the status of Long Running Operation (LRO) with Google API Client

Send an API request to Vertex AI to get the detailed information. The most interesting piece of information is the hyperparameter values in the trial with best performance metric.

In [63]:
# Re-create the client and rebuild the job's resource path from its ID.
client_options = {"api_endpoint": api_endpoint}
client = aiplatform.gapic.JobServiceClient(client_options=client_options)
name = client.hyperparameter_tuning_job_path(
    project=PROJECT,
    location=REGION,
    hyperparameter_tuning_job=job_name_hpt,
)
# Fetch the current job record; it includes the state and the trials.
response = client.get_hyperparameter_tuning_job(name=name)
print("Job status = ", response.state)
print("response:", response)
# Compare against the enum member itself rather than its text rendering, so
# the check does not depend on how the enum is converted to a string.
if response.state == aiplatform.gapic.JobState.JOB_STATE_SUCCEEDED:
    print("Job state succeeded.")


Job status =  JobState.JOB_STATE_SUCCEEDED
response: name: "projects/901951554789/locations/us-central1/hyperparameterTuningJobs/2001959985528963072"
display_name: "tensorflow_train_cchatterjee_062121_1058_hpt"
study_spec {
  metrics {
    metric_id: "accuracy"
    goal: MAXIMIZE
  }
  parameters {
    parameter_id: "depth"
    integer_value_spec {
      min_value: 1
      max_value: 10
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "dropout_rate"
    double_value_spec {
      min_value: 0.001
      max_value: 0.1
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "learning_rate"
    double_value_spec {
      min_value: 1e-05
      max_value: 0.01
    }
    scale_type: UNIT_LOG_SCALE
  }
  parameters {
    parameter_id: "batch_size"
    integer_value_spec {
      min_value: 1
      max_value: 16
    }
    scale_type: UNIT_LINEAR_SCALE
  }
  parameters {
    parameter_id: "epochs"
    integer_value_spec {
      min_value: 1
      max_

#### Get the hyperparameters associated with the best metrics

In [64]:
# Collect (metric value, trial index) for every trial that reported a final
# measurement. Trials without one (presumably failed or stopped trials) are
# skipped instead of raising IndexError on metrics[0].
scored_trials = []
for ind, trials in enumerate(response.trials):
    trial_metrics = trials.final_measurement.metrics
    if not trial_metrics:
        continue
    value = trial_metrics[0].value
    print("Metrics Value (larger is better):", value)
    scored_trials.append((value, ind))

if not scored_trials:
    raise RuntimeError(
        "No trial reported a final measurement; cannot select hyperparameters."
    )

# max() keeps the first of several equal values, like the strict '>' scan it
# replaces, but it also works when every metric is <= 0.
max_val, max_ind = max(scored_trials, key=lambda pair: pair[0])

# Map parameter id -> value for the winning trial.
param_dict = {}
for params in response.trials[max_ind].parameters:
    param_dict[params.parameter_id] = params.value

print(param_dict)

# Convert to the string form passed as command-line args to the training job.
# Values arrive as floats (e.g. 10.0), so integer parameters are truncated first.
# NOTE: these names replace the parameter-spec dicts of the same names built
# in the tuning-job submission cell.
depth=str(int(param_dict['depth']))
dropout_rate=str(param_dict['dropout_rate'])
learning_rate=str(param_dict['learning_rate'])
batch_size=str(int(param_dict['batch_size']))
epochs=str(int(param_dict['epochs']))


Metrics Value (larger is better): 0.9525838494300842
Metrics Value (larger is better): 0.9389036893844604
Metrics Value (larger is better): 0.9535022974014282
Metrics Value (larger is better): 0.9531925916671753
{'batch_size': 10.0, 'depth': 7.0, 'dropout_rate': 0.00912489357267528, 'epochs': 4.0, 'learning_rate': 0.0003417788166851768}


#### Get the best model

In [65]:
# Pick the checkpoint with the highest metric (maxFlag=1) from the tuning job's
# checkpoints folder; the metric is the last number in each path (offset=1).
best_model_dir_hpt = find_best_model_dir(JOB_DIR_HPT+'/checkpoints', offset=1, maxFlag=1)



All Models = 
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-110939-0-0.9289/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-110939-0-0.9476/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-110955-0-0.9411/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-110955-0-0.9544/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-110955-0-0.9552/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-111537-0-0.9544/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-111539-0-0.9523/
gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-111539-0-0.9548/
Best Accuracy  from Checkpoints =  0.9552
Best Model Dir from Checkpoints =  gs://vapit_data/tf_models/jobdir/model_062121_1058/checkpoints/cp-110955-0-0.9552/


------
### Training with Tuned Parameters

Once your hyperparameter tuning jobs are done, you can use the optimized combination of hyperparameters from your trials to start a single training job on Cloud AI Platform that trains your final model.

In [17]:
# Google Cloud AI Platform requires each job to have unique name,
# Therefore, we use prefix + timestamp to form job names.
# The timestamp is taken once so that the job name and its output folder can
# never disagree when the cell happens to run across a minute boundary.
_trn_timestamp = datetime.now(timezone(TIMEZONE)).strftime("%m%d%y_%H%M")

JOBNAME_TRN = 'tensorflow_train_{}_{}'.format(
    USER,
    _trn_timestamp
    )
# The job output folder carries the same timestamp as the job name.
JOB_DIR_TRN = 'gs://{}/{}/jobdir/model_{}'.format(
    BUCKET_NAME,
    FOLDER_NAME,
    _trn_timestamp
    )

print("JOB_NAME_TRN = ", JOBNAME_TRN)
print("JOB_DIR_TRN = ", JOB_DIR_TRN)


JOB_NAME_TRN =  tensorflow_train_cchatterjee_062121_0900
JOB_DIR_TRN =  gs://vapit_data/tf_models/jobdir/model_062121_0900


In [18]:
# Container image that runs the trainer, the module to execute inside the
# uploaded package, the regional API host and the VM type for the job.
executor_image_uri = 'us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-2:latest'
python_module = "trainer.train"
api_endpoint = "{}-aiplatform.googleapis.com".format(REGION)
machine_type = "n1-standard-4"
        
# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": api_endpoint}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client = aiplatform.gapic.JobServiceClient(client_options=client_options)
# Single-replica custom job. Besides the data paths, the args pass the
# hyperparameter values; depth, dropout_rate, learning_rate, batch_size and
# epochs must already hold the strings set by the best-trial cell.
custom_job = {
    "display_name": JOBNAME_TRN,
    "job_spec": {
        "worker_pool_specs": [
            {
                "machine_spec": {
                    "machine_type": machine_type,
                },
                "replica_count": 1,
                "python_package_spec": {
                    "executor_image_uri": executor_image_uri,
                    "package_uris": [PACKAGE_URIS],
                    "python_module": python_module,
                    "args": [
                        '--job-dir',
                        JOB_DIR_TRN,
                        '--train_feature_name',
                        TRAIN_FEATURE_PATH,
                        '--train_label_name',
                        TRAIN_LABEL_PATH,
                        '--test_feature_name',
                        TEST_FEATURE_PATH,
                        '--test_label_name',
                        TEST_LABEL_PATH,
                        '--depth',
                        depth,
                        '--dropout_rate',
                        dropout_rate,
                        '--learning_rate',
                        learning_rate,
                        '--batch_size',
                        batch_size,
                        '--epochs',
                        epochs
                    ],
                },
            }
        ]
    },
}
# Jobs are created under the project/region resource path.
parent = f"projects/{PROJECT}/locations/{REGION}"
response = client.create_custom_job(parent=parent, custom_job=custom_job)
print("response:", response)
# The last path segment of the resource name is the numeric job ID used by the status cell.
job_id_trn = response.name.split('/')[-1]


response: name: "projects/901951554789/locations/us-central1/customJobs/7417538537441984512"
display_name: "tensorflow_train_cchatterjee_062121_0900"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "n1-standard-4"
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    python_package_spec {
      executor_image_uri: "us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-2:latest"
      package_uris: "gs://vapit_data/trainer/tensorflow/trainer-0.1.tar.gz"
      python_module: "trainer.train"
      args: "--job-dir"
      args: "gs://vapit_data/tf_models/jobdir/model_062121_0900"
      args: "--train_feature_name"
      args: "gs://vapit_data/tf_data/mortgage_structured_x_train.csv"
      args: "--train_label_name"
      args: "gs://vapit_data/tf_data/mortgage_structured_y_train.csv"
      args: "--test_feature_name"
      args: "gs://vapit_data/tf_data/mortgage_structured_x_test.csv"
      args: "--test_label_name"

Check the training job status

In [20]:
# Check the training job status
# Re-create the client and rebuild the job's resource path from its ID.
client_options = {"api_endpoint": api_endpoint}
client = aiplatform.gapic.JobServiceClient(client_options=client_options)
name = client.custom_job_path(
    project=PROJECT,
    location=REGION,
    custom_job=job_id_trn,
)
# Fetch the current job record and show its state (e.g. JobState.JOB_STATE_SUCCEEDED)
response = client.get_custom_job(name=name)
print(response.state)


JobState.JOB_STATE_SUCCEEDED


#### Get the best model

In [21]:
# Pick the checkpoint with the highest metric (maxFlag=1) from the training job's
# checkpoints folder; the metric is the last number in each path (offset=1).
best_model_dir_trn = find_best_model_dir(JOB_DIR_TRN+'/checkpoints', offset=1, maxFlag=1)



All Models = 
gs://vapit_data/tf_models/jobdir/model_062121_0900/checkpoints/
gs://vapit_data/tf_models/jobdir/model_062121_0900/checkpoints/cp-091203-0-0.9518/
gs://vapit_data/tf_models/jobdir/model_062121_0900/checkpoints/cp-091203-0-0.9540/
Best Accuracy  from Checkpoints =  0.954
Best Model Dir from Checkpoints =  gs://vapit_data/tf_models/jobdir/model_062121_0900/checkpoints/cp-091203-0-0.9540/


--------
### Deploy the Model

Vertex AI provides tools to upload your trained ML model to the cloud, so that you can send prediction requests to the model.

In order to deploy your trained model on Vertex AI, you must save your trained model using the tools provided by your machine learning framework. This involves serializing the information that represents your trained model into a file which you can deploy for prediction in the cloud.

Then you upload the saved model to a Cloud Storage bucket, and create a model resource on Vertex AI, specifying the Cloud Storage path to your saved model.

When you deploy your model, you can also provide custom code (beta) to customize how it handles prediction requests.



#### Import model artifacts to Vertex AI 

When you import a model, you associate it with a container for Vertex AI to run prediction requests. You can use pre-built containers provided by Vertex AI, or use your own custom containers that you build and push to Container Registry or Artifact Registry.

You can use a pre-built container if your model meets the following requirements:

- Trained in Python 3.7 or later
- Trained using TensorFlow, scikit-learn, or XGBoost
- Exported to meet framework-specific requirements for one of the pre-built prediction containers

The link to the list of pre-built predict container images:

https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers

In [24]:
MODEL_NAME = "my_first_tensorflow_model"

# Point the SDK at the configured project and region before uploading, so the
# model is created in the same PROJECT/REGION that the endpoint and deployment
# cells use instead of whatever the SDK defaults to.
aiplatform.init(project=PROJECT, location=REGION)

# Register the saved model with a pre-built TF2 CPU serving container.
# The artifact comes from the tuning job's best checkpoint; the commented name
# is the alternative from the final training job.
response = aiplatform.Model.upload(
    display_name = MODEL_NAME,
    serving_container_image_uri = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-2:latest',
    artifact_uri = best_model_dir_hpt, #best_model_dir_trn,
)

# Keep only the trailing ID segment for use by the deployment cell.
model_id = response.name.split('/')[-1]
print("model_id = ", model_id)


INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/901951554789/locations/us-central1/models/2825173137337876480/operations/2312682246416367616
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/901951554789/locations/us-central1/models/2825173137337876480
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/901951554789/locations/us-central1/models/2825173137337876480')
model_id =  2825173137337876480


#### Create Endpoint

You need the endpoint ID to deploy the model.

In [25]:
MODEL_ENDPOINT_DISPLAY_NAME = "my_first_tensorflow_model_endpoint"

# Set the SDK defaults, then create an empty endpoint in the same project/region.
aiplatform.init(project=PROJECT, location=REGION)
endpoint = aiplatform.Endpoint.create(
    display_name=MODEL_ENDPOINT_DISPLAY_NAME, project=PROJECT, location=REGION,
)

# The last path segment of the resource name is the numeric endpoint ID.
endpoint_id = endpoint.resource_name.split('/')[-1]

print("endpoint.display_name  = ", endpoint.display_name)
print("endpoint.resource_name = ", endpoint.resource_name)
#print(endpoint.uri)
print("endpoint_id = ", endpoint_id)


INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/901951554789/locations/us-central1/endpoints/7537108227939368960/operations/1550448009483911168
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/901951554789/locations/us-central1/endpoints/7537108227939368960
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/901951554789/locations/us-central1/endpoints/7537108227939368960')
endpoint.display_name  =  my_first_tensorflow_model_endpoint
endpoint.resource_name =  projects/901951554789/locations/us-central1/endpoints/7537108227939368960
endpoint_id =  7537108227939368960


#### Deploy Model to the endpoint

You must deploy a model to an endpoint before that model can be used to serve online predictions; deploying a model associates physical resources with the model so it can serve online predictions with low latency. An undeployed model can serve batch predictions, which do not have the same low latency requirements.

In [26]:
# NOTE: MODEL_NAME is re-declared here but not used by the deployment call below.
MODEL_NAME = "my_first_tensorflow_model"
DEPLOYED_MODEL_DISPLAY_NAME = "my_first_tensorflow_model_deployed"

# Look the uploaded model up by the ID saved in the upload cell.
aiplatform.init(project=PROJECT, location=REGION)
model = aiplatform.Model(model_name=model_id)

# The explanation_metadata and explanation_parameters should only be
# provided for a custom trained model and not an AutoML model.
# Neither is passed here. sync=True makes the call wait for the deployment.
model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=DEPLOYED_MODEL_DISPLAY_NAME,
    machine_type = "n1-standard-4",
    sync=True
)

print(model.display_name)
print(model.resource_name)


INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/901951554789/locations/us-central1/endpoints/7537108227939368960
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/901951554789/locations/us-central1/endpoints/7537108227939368960/operations/100288929470611456
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/901951554789/locations/us-central1/endpoints/7537108227939368960
my_first_tensorflow_model
projects/901951554789/locations/us-central1/models/2825173137337876480


### Explore models and endpoints

In [98]:
# List the models and endpoints in REGION with the gcloud CLI
print("Models:")
!gcloud beta ai models list --region=$REGION
print("Endpoints:")
!gcloud beta ai endpoints list --region=$REGION


Models:
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
7870893569853095936  my_first_tensorflow_model
2825173137337876480  my_first_tensorflow_model
Endpoints:
Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ENDPOINT_ID          DISPLAY_NAME
5035921584888479744  my_first_tensorflow_model_endpoint
7537108227939368960  my_first_tensorflow_model_endpoint


In [10]:
from google.cloud.aiplatform import gapic as aip
def list_models():
    """Return one summary dict per model found in PROJECT / REGION.

    Each dict holds the model's resource name, display name, creation time,
    serving container image URI and artifact URI, as returned by the
    ModelServiceClient listing.
    """
    location_path = "projects/" + PROJECT + "/locations/" + REGION
    regional_host = "{}-aiplatform.googleapis.com".format(REGION)
    # The client must talk to the regional API host
    model_service = aip.ModelServiceClient(
        client_options={"api_endpoint": regional_host}
    )
    return [
        {
            "name": entry.name,
            "display_name": entry.display_name,
            "create_time": entry.create_time,
            "container":  entry.container_spec.image_uri,
            "artifact_uri": entry.artifact_uri
        }
        for entry in model_service.list_models(parent=location_path)
    ]

# Fetch the model summaries and display them as the cell output
model_list = list_models()
model_list


[{'name': 'projects/901951554789/locations/us-central1/models/6353180495429238784',
  'display_name': 'freddiemacdata_20216247028',
  'create_time': DatetimeWithNanoseconds(2021, 6, 24, 7, 1, 20, 984405, tzinfo=datetime.timezone.utc),
  'container': '',
  'artifact_uri': ''},
 {'name': 'projects/901951554789/locations/us-central1/models/7870893569853095936',
  'display_name': 'my_first_tensorflow_model',
  'create_time': DatetimeWithNanoseconds(2021, 6, 23, 5, 7, 45, 691387, tzinfo=datetime.timezone.utc),
  'container': 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-4:latest',
  'artifact_uri': 'gs://vapit_data/tf_models/jobdir/model_062121_0825/checkpoints/cp-084311-0-0.9555/'},
 {'name': 'projects/901951554789/locations/us-central1/models/2825173137337876480',
  'display_name': 'my_first_tensorflow_model',
  'create_time': DatetimeWithNanoseconds(2021, 6, 21, 16, 20, 45, 560741, tzinfo=datetime.timezone.utc),
  'container': 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-

In [11]:
from google.cloud.aiplatform import gapic as aip
def list_endpoints():
    """Summarise the Vertex AI endpoints visible in PROJECT / REGION.

    Uses the notebook-level ``PROJECT`` and ``REGION`` globals to build both the
    regional API host and the parent resource path, then iterates over the
    EndpointService listing response.

    Returns:
        list[dict]: one dict per listed endpoint with the keys ``name``,
        ``display_name``, ``create_time`` and ``deployed_models``. Despite the
        plural key, only the ``model`` field of the FIRST deployed model is
        reported; it is ``''`` when nothing is deployed on the endpoint.
    """
    regional_host = "{}-aiplatform.googleapis.com".format(REGION)
    endpoint_client = aip.EndpointServiceClient(
        client_options={"api_endpoint": regional_host}
    )
    location_path = "/".join(["projects", PROJECT, "locations", REGION])
    return [
        {
            "name": item.name,
            "display_name": item.display_name,
            "create_time": item.create_time,
            "deployed_models": (
                item.deployed_models[0].model
                if len(item.deployed_models) > 0
                else ''
            ),
        }
        for item in endpoint_client.list_endpoints(parent=location_path)
    ]

# Query the endpoints under PROJECT / REGION and show the summaries as the cell output.
endpoint_list = list_endpoints()
endpoint_list


[{'name': 'projects/901951554789/locations/us-central1/endpoints/5417038703354707968',
  'display_name': 'freddimac_deployed',
  'create_time': DatetimeWithNanoseconds(2021, 6, 24, 15, 26, 21, 375987, tzinfo=datetime.timezone.utc),
  'deployed_models': 'projects/901951554789/locations/us-central1/models/6353180495429238784'},
 {'name': 'projects/901951554789/locations/us-central1/endpoints/5035921584888479744',
  'display_name': 'my_first_tensorflow_model_endpoint',
  'create_time': DatetimeWithNanoseconds(2021, 6, 23, 5, 14, 35, 882989, tzinfo=datetime.timezone.utc),
  'deployed_models': 'projects/901951554789/locations/us-central1/models/7870893569853095936'},
 {'name': 'projects/901951554789/locations/us-central1/endpoints/7537108227939368960',
  'display_name': 'my_first_tensorflow_model_endpoint',
  'create_time': DatetimeWithNanoseconds(2021, 6, 21, 16, 20, 50, 482709, tzinfo=datetime.timezone.utc),
  'deployed_models': 'projects/901951554789/locations/us-central1/models/28251731

In [None]:
# Optional cleanup, intentionally left commented out: uncomment to undeploy the
# first model listed on `endpoint`.
# deployed_model_id = endpoint.list_models()[0].id
# print(deployed_model_id)
# endpoint.undeploy(deployed_model_id=deployed_model_id)

In [None]:
# Optional inspection, intentionally left commented out: uncomment to print the
# models deployed on `endpoint` and the endpoint's resource name.
# print(endpoint.list_models())
# print(endpoint.resource_name)

------
### Send inference requests to your model

Vertex AI provides the services you need to request predictions from your model in the cloud.

There are two ways to get predictions from trained models: online prediction (sometimes called HTTP prediction) and batch prediction. In both cases, you pass input data to a cloud-hosted machine-learning model and get inferences for each data instance.

Vertex AI online prediction is a service optimized to run your data through hosted models with as little latency as possible. You send small batches of data to the service and it returns your predictions in the response.

#### Call Google API for online inference

In [29]:
from googleapiclient import errors

# Load test features (labels are not needed to request predictions)
x_test = pd.read_csv(TEST_FEATURE_PATH)
#y_test = pd.read_csv(TEST_LABEL_PATH)

# Fill nan values with zeros (Prediction lacks the ability to handle nan values for now)
x_test = x_test.fillna(0)

pprobas = []        # predictions collected across all batches
batch_size = 16     # number of instances sent per online request
n_samples = min(160, x_test.shape[0])
print("batch_size=", batch_size)
print("n_samples=", n_samples)

aiplatform.init(project=PROJECT, location=REGION)

# Create the endpoint handle once instead of once per batch.
online_endpoint = aiplatform.Endpoint(endpoint_id)

for i in range(0, n_samples, batch_size):
    j = min(i+batch_size, n_samples)
    print("Processing samples", i, j)
    try:
        # The request is the statement that can fail, so it must be inside the try.
        response = online_endpoint.predict(instances=x_test.iloc[i:j].values.tolist())
    except errors.HttpError as err:
        # NOTE(review): Endpoint.predict belongs to the google-cloud-aiplatform SDK;
        # confirm that it raises googleapiclient's HttpError. If it raises a different
        # exception type, this handler will not fire and the error will propagate.
        # Something went wrong, log some information and stop sending requests.
        logging.error('There was an error getting the predictions, Check the details:')
        logging.error(err._get_reason())
        break
    pprobas.extend(response.predictions)


batch_size= 16
n_samples= 160
Processing samples 0 16
Processing samples 16 32
Processing samples 32 48
Processing samples 48 64
Processing samples 64 80
Processing samples 80 96
Processing samples 96 112
Processing samples 112 128
Processing samples 128 144
Processing samples 144 160


In [30]:
# Stack the collected predictions into one array (one row per predicted instance) for display.
np.array(pprobas)

array([[9.99997377e-01, 2.00586533e-06, 5.29692898e-07, 9.32733428e-08],
       [9.55196559e-01, 1.73548907e-02, 1.54394908e-02, 1.20089827e-02],
       [9.98132408e-01, 9.90457367e-04, 5.93925361e-04, 2.83146452e-04],
       [9.94612217e-01, 2.60814885e-03, 1.78224512e-03, 9.97423194e-04],
       [9.91742134e-01, 3.84100503e-03, 2.76598916e-03, 1.65084680e-03],
       [9.95701000e-01, 2.12387322e-03, 1.41158560e-03, 7.63524000e-04],
       [9.99646187e-01, 2.12135885e-04, 1.03383027e-04, 3.82123071e-05],
       [6.87042415e-01, 6.49056211e-02, 1.03922844e-01, 1.44129068e-01],
       [9.97439384e-01, 1.32306630e-03, 8.24943000e-04, 4.12572583e-04],
       [9.64753091e-01, 1.40774259e-02, 1.21303163e-02, 9.03925207e-03],
       [9.58985448e-01, 1.60693582e-02, 1.41284261e-02, 1.08167827e-02],
       [8.68227720e-01, 3.90487537e-02, 4.58238311e-02, 4.68996316e-02],
       [9.97919261e-01, 1.09388109e-03, 6.64780000e-04, 3.22173437e-04],
       [9.86150682e-01, 6.12180540e-03, 4.69711376e

#### Call the Google Cloud CLI (gcloud) for online inference

In [31]:
# Load test features (labels are not needed to request predictions)
x_test = pd.read_csv(TEST_FEATURE_PATH)
#y_test = pd.read_csv(TEST_LABEL_PATH)

# Fill nan values with zeros (Prediction lacks the ability to handle nan values for now)
x_test = x_test.fillna(0)

# Create a temporary json file to contain data to be predicted
JSON_TEMP = 'tf_test_data.json' # temp json file name to hold the inference data
batch_size = 100                # data batch size
start = 0
# Clamp the slice end to the number of rows available. This is derived from `start`
# so the cell does not depend on a variable left over from another cell.
end = min(start + batch_size, len(x_test))
body = {'instances': x_test.iloc[start:end].values.tolist()}
with open(JSON_TEMP, 'w') as fp:
    json.dump(body, fp)


In [32]:
# Send the request file JSON_TEMP to the endpoint `endpoint_id` in REGION through the gcloud CLI.
!gcloud beta ai endpoints predict $endpoint_id \
  --region=$REGION \
  --json-request=$JSON_TEMP


Using endpoint [https://us-central1-prediction-aiplatform.googleapis.com/]
[[0.999997377, 2.00586715e-06, 5.29693921e-07, 9.32733428e-08], [0.955196559, 0.0173548907, 0.0154394908, 0.0120089827], [0.998132408, 0.000990457367, 0.000593925361, 0.000283146452], [0.994612217, 0.00260814629, 0.00178224419, 0.000997422379], [0.991742134, 0.00384100503, 0.00276598916, 0.0016508468], [0.995701, 0.00212387228, 0.0014115849, 0.000763523218], [0.999646187, 0.000212135885, 0.000103383027, 3.82123071e-05], [0.687042356, 0.0649056435, 0.103922866, 0.144129127], [0.997439384, 0.00132306502, 0.000824942195, 0.000412572204], [0.964753091, 0.0140774325, 0.0121303275, 0.00903926138], [0.958985329, 0.0160693731, 0.0141284373, 0.0108167864], [0.86822772, 0.0390487723, 0.0458238758, 0.0468996763], [0.997919261, 0.00109388051, 0.000664779393, 0.000322173117], [0.986150682, 0.0061218054, 0.00469711376, 0.0030303516], [0.989074767, 0.00494527677, 0.0036855978, 0.00229437626], [0.995706, 0.00212164293, 0.001409

#### Call Google API for batch inference

In [None]:
# Write batch data to local JSON Lines files (one instance per line); the gsutil
# command that follows uploads them to GCS.

import shutil
import os

# Start from an empty local staging directory
DATA_DIR = './batch_data'
shutil.rmtree(DATA_DIR, ignore_errors=True)
os.makedirs(DATA_DIR)

n_samples = min(1000,x_test.shape[0])
nFiles = 10
# Integer division: up to nFiles-1 trailing samples are left out when
# n_samples is not a multiple of nFiles.
nRecsPerFile = min(1000,n_samples//nFiles)
print("n_samples =", n_samples)
print("nFiles =", nFiles)
print("nRecsPerFile =", nRecsPerFile)

# Convert the features to an array once, not once per record
x_test_rows = np.array(x_test)

# Create nFiles files with nRecsPerFile records each.
# NOTE(review): "dense_input" is presumably the model's input layer name (an
# earlier variant of this cell used model_layers[0] as the key) — confirm it
# matches the deployed model. A keyed variant would add a "key" field per record.
for i in range(nFiles):
    with open(f'{DATA_DIR}/unkeyed_batch_{i}.json', "w") as batch_file:
        for z in range(nRecsPerFile):
            record = {"dense_input": x_test_rows[i*nRecsPerFile+z].tolist()}
            # json.dumps guarantees that every line is valid JSON
            print(json.dumps(record), file=batch_file)

# Upload the local ./batch_data directory to gs://BUCKET_NAME/FOLDER_NAME/ (recursive, parallel copy)
!gsutil -m cp -r ./batch_data gs://$BUCKET_NAME/$FOLDER_NAME/
    
# Remove old batch prediction results (everything under .../batch_predictions)
!gsutil -m rm -r gs://$BUCKET_NAME/$FOLDER_NAME/batch_predictions


In [None]:
# Job name: fixed prefix + user + current time (in TIMEZONE) down to the minute.
JOBNAME_BATCH = f'tensorflow_batch_{USER}_{datetime.now(timezone(TIMEZONE)):%m%d%y_%H%M}'

# The job name doubles as the folder name under which outputs are stored.
JOB_DIR_BATCH = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/{JOBNAME_BATCH}'

# Batch input files (wildcard) and the destination prefix for predictions.
INPUT_PATH = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/batch_data/*'
OUTPUT_PATH = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/batch_predictions'

print("JOB_NAME_BATCH = ", JOBNAME_BATCH)
print("JOB_DIR_BATCH = ", JOB_DIR_BATCH)


In [None]:
# Point the SDK at the project/region and get a handle to the model `model_id`.
aiplatform.init(location=REGION, project=PROJECT)
my_model = aiplatform.Model(model_name=model_id)

# Submit the batch prediction job through the SDK.
# Input and output are JSON Lines; a single n1-standard-4 replica is used.
# NOTE(review): sync=True presumably blocks until the job finishes — confirm in the SDK docs.
batch_prediction_job = my_model.batch_predict(
    job_display_name=JOBNAME_BATCH,
    gcs_source=INPUT_PATH,
    gcs_destination_prefix=OUTPUT_PATH,
    instances_format="jsonl",
    predictions_format="jsonl",
    model_parameters=None,
    machine_type="n1-standard-4",
    starting_replica_count=1,
    max_replica_count=1,
    sync=True,
)

# Report the job's display name, resource name and state, one per line.
print(
    batch_prediction_job.display_name,
    batch_prediction_job.resource_name,
    batch_prediction_job.state,
    sep="\n",
)


In [None]:
print("errors")
!gsutil cat $OUTPUT_PATH/prediction.errors_stats-00000-of-00001
print("batch prediction results")
!gsutil cat $OUTPUT_PATH/prediction.results-00000-of-00010
