In [None]:
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


In [None]:
# Print the project ID of the active gcloud configuration (stderr is discarded).
!gcloud config list --format 'value(core.project)' 2>/dev/null

In [None]:
# Import packages
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from datetime import datetime
from pytz import timezone

-----------
# Dataset preprocessing

In [None]:
# Run the preprocessing script located next to this notebook.
# NOTE(review): presumably this produces the train/test CSV files referenced
# in the cells below -- confirm against preprocessing.py.
!python3 preprocessing.py

------
# Hyperparameter Tuning

In [None]:
# ---- Settings for the hyperparameter-tuning (HPT) job: edit for your environment ----
PROJECT_ID = 'img-seg-3d'      # GCP project ID
USER = 'elvinzhu'              # user name embedded in the job name
BUCKET_NAME = 'tuti_asset'     # GCS bucket that holds the job directory
FOLDER_NAME = 'xgb_train_job'  # folder inside the bucket
REGION = 'us-central1'         # region the job is submitted to
TIMEZONE = 'US/Pacific'        # time zone used for the job-name timestamp

# Job name layout: xgb_train_<USER>_<MMDDYY_HHMM>_hpt
JOBNAME = '_'.join([
    'xgb_train',
    USER,
    datetime.now(timezone(TIMEZONE)).strftime("%m%d%y_%H%M"),
    'hpt',
])

# Every HPT run shares the same job directory under the bucket/folder.
JOB_DIR = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/jobdir'
JOB_CONFIG = "./config/config_hpt.yaml"

print("JOB_NAME = ", JOBNAME)
print("JOB_DIR = ", JOB_DIR)
print("JOB_CONFIG = ", JOB_CONFIG)

# Dataset locations handed to the trainer.
# NOTE(review): these are literal paths and do not follow BUCKET_NAME -- update
# them by hand if your data lives in a different bucket.
TRAIN_FEATURE_PATH = 'gs://tuti_asset/datasets/mortgage_structured_x_train.csv'
TRAIN_LABEL_PATH = 'gs://tuti_asset/datasets/mortgage_structured_y_train.csv'
VAL_FEATURE_NAME = 'gs://tuti_asset/datasets/mortgage_structured_x_test.csv'
VAL_LABEL_NAME = 'gs://tuti_asset/datasets/mortgage_structured_y_test.csv'

In [None]:
# Submit the hyperparameter tuning job to AI Platform Training.
# Flags before the bare `--` are consumed by gcloud (package location, entry
# module trainer.train_hpt, Python 3.7 / runtime 2.2, job dir, region, config
# file); everything after it is forwarded to the trainer module as arguments.
# Relies on JOBNAME, JOB_DIR, REGION, JOB_CONFIG and the dataset path variables
# being defined in the kernel.
!gcloud ai-platform jobs submit training $JOBNAME \
    --package-path $(pwd)/trainer \
    --module-name trainer.train_hpt \
    --python-version 3.7 \
    --runtime-version 2.2 \
    --job-dir $JOB_DIR \
    --region $REGION \
    --config $JOB_CONFIG \
    -- \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --val_feature_name $VAL_FEATURE_NAME \
    --val_label_name $VAL_LABEL_NAME

In [None]:
# Check the status of the hyperparameter tuning job with gcloud.
# Uses the current value of JOBNAME, so run it before JOBNAME is redefined.
!gcloud ai-platform jobs describe $JOBNAME

In [None]:
# Check the hyperparameter tuning job status through the Google API client

import json
from googleapiclient import discovery

# Fully qualified job resource name: projects/<project>/jobs/<job>
job_id = f'projects/{PROJECT_ID}/jobs/{JOBNAME}'

# Client for the 'ml' v1 API (discovery-document caching disabled)
ml = discovery.build('ml', 'v1', cache_discovery=False)

# Fetch the job description; despite its name, `request` holds the response dict
request = (
    ml.projects()
    .jobs()
    .get(name=job_id)
    .execute()
)

# Pretty-print the response
print(json.dumps(request, indent=4))

In [None]:
import pandas as pd


def _metric_field(column, key):
    """Return `key` from every dict in `column`; None where the entry is not a dict.

    A trial that reports no final metric shows up as NaN (not a dict) once the
    trial list is loaded into a DataFrame, so plain `entry[key]` would raise.
    """
    return column.apply(
        lambda entry: entry.get(key) if isinstance(entry, dict) else None
    )


# One row per tuning trial, taken from the job description stored in `request`.
trials = pd.DataFrame(request['trainingOutput']['trials'])

# Flatten the nested hyperparameter dict into plain columns.
trials['hyperparameters.booster'] = trials['hyperparameters'].apply(lambda x: x['booster'])
trials['hyperparameters.max_depth'] = trials['hyperparameters'].apply(lambda x: x['max_depth'])
trials['hyperparameters.n_estimators'] = trials['hyperparameters'].apply(lambda x: x['n_estimators'])

# Flatten the final metric; trials without one get None / NaN instead of raising.
trials['finalMetric.trainingStep'] = _metric_field(trials['finalMetric'], 'trainingStep')
trials['finalMetric.objectiveValue'] = pd.to_numeric(
    _metric_field(trials['finalMetric'], 'objectiveValue')
)

# Rank trials by objective value, highest first (NaN rows sort last).
# NOTE(review): descending order assumes the tuning goal is to maximize the
# metric -- confirm against config_hpt.yaml.
# The index is reset so that row label 0 is the top-ranked trial; without this,
# a label lookup such as trials['hyperparameters'][0] would return the first
# trial of the API response rather than the best one.
trials = (
    trials
    .sort_values(['finalMetric.objectiveValue'], ascending=False)
    .reset_index(drop=True)
)

------
# Training with Tuned Parameters

In [None]:
# ---- Settings for the final training job: edit for your environment ----
PROJECT_ID = 'img-seg-3d' # Replace with your project ID
USER = 'elvinzhu' # Replace with your User name
BUCKET_NAME = 'tuti_asset' # Replace with your bucket name
FOLDER_NAME = 'xgb_train_job' # Replace with your Folder name
REGION = 'us-central1' # Replace with your region
TIMEZONE = 'US/Pacific'

# Job name layout: xgb_train_<USER>_<MMDDYY_HHMM>. This rebinds JOBNAME, so
# from here on it no longer refers to the hyperparameter tuning job.
JOBNAME = 'xgb_train_{}_{}'.format(
    USER,
    datetime.now(timezone(TIMEZONE)).strftime("%m%d%y_%H%M")
    )

# Each training run gets its own job directory named after the job.
JOB_DIR = 'gs://{}/{}/{}'.format(
    BUCKET_NAME,
    FOLDER_NAME,
    JOBNAME,
    )
JOB_CONFIG = "./config/config.yaml"

print("JOB_NAME = ", JOBNAME)
print("JOB_DIR = ", JOB_DIR)
print("JOB_CONFIG = ", JOB_CONFIG)

# MODEL_PATH = 'models/{}/model.bst'.format(JOBNAME)
TRAIN_FEATURE_PATH = 'gs://tuti_asset/datasets/mortgage_structured_x_train.csv'
TRAIN_LABEL_PATH = 'gs://tuti_asset/datasets/mortgage_structured_y_train.csv'
N_CLASSES = 4

print("TRAIN_FEATURE_PATH = ", TRAIN_FEATURE_PATH)
print("TRAIN_LABEL_PATH = ", TRAIN_LABEL_PATH)
print("N_CLASSES = ", N_CLASSES)

# Get the best hypertuned model parameters.
# `trials` is sorted by objective value (best first), so the best trial is the
# first row *by position*. Use .iloc[0]: a plain [0] is a label lookup and would
# return whichever trial carries index label 0, regardless of the sort order.
BEST_HYPERPARAMETERS = trials['hyperparameters'].iloc[0]
BOOSTER = BEST_HYPERPARAMETERS['booster']
MAX_DEPTH = BEST_HYPERPARAMETERS['max_depth']
N_ESTIMATORS = BEST_HYPERPARAMETERS['n_estimators']

print("BOOSTER = ", BOOSTER)
print("MAX_DEPTH = ", MAX_DEPTH)
print("N_ESTIMATORS = ", N_ESTIMATORS)

In [None]:
# Train on the local machine by running the trainer script directly.
# The tuned hyperparameters (N_ESTIMATORS, MAX_DEPTH, BOOSTER) and the dataset
# paths are interpolated from the notebook's Python variables.
!python3 trainer/train.py \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --no_classes $N_CLASSES \
    --n_estimators $N_ESTIMATORS \
    --max_depth $MAX_DEPTH \
    --booster $BOOSTER

In [None]:
# Run the same trainer module locally through `gcloud ai-platform local train`.
# Flags before the bare `--` belong to gcloud; the ones after it are forwarded
# to trainer.train.
! gcloud ai-platform local train \
    --job-dir $JOB_DIR \
    --package-path $(pwd)/trainer \
    --module-name trainer.train \
    -- \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --no_classes $N_CLASSES \
    --n_estimators $N_ESTIMATORS \
    --max_depth $MAX_DEPTH \
    --booster $BOOSTER

In [None]:
# Reference: https://cloud.google.com/sdk/gcloud/reference/ai-platform/jobs/submit/training

# Submit the training job to AI Platform Training with the tuned hyperparameters.
# Flags before the bare `--` are consumed by gcloud (job dir, package, entry
# module trainer.train, region, Python 3.7 / runtime 2.2, config file);
# everything after it is forwarded to the trainer module.
! gcloud ai-platform jobs submit training $JOBNAME \
    --job-dir $JOB_DIR \
    --package-path $(pwd)/trainer \
    --module-name trainer.train \
    --region $REGION \
    --python-version 3.7 \
    --runtime-version 2.2 \
    --config $JOB_CONFIG \
    -- \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --no_classes $N_CLASSES \
    --n_estimators $N_ESTIMATORS \
    --max_depth $MAX_DEPTH \
    --booster $BOOSTER

In [None]:
# Check the status of the training job identified by the current JOBNAME.
! gcloud ai-platform jobs describe $JOBNAME

--------
# Deploy the Model


In [None]:
# Deployment settings for the AI Platform model resource and its version.
MODEL_NAME = 'xgb_model'                 # model resource name
MODEL_VERSION = 'elvinzhu_xgb_bst_v0_1'  # version created under MODEL_NAME
MODEL_FRAMEWORK = 'XGBOOST'              # framework passed to `versions create`
MODEL_DESCRIPTION = 'best_xgb_hpt'       # free-text version description

# NOTE: this rebinds REGION (the training cells set it to a regional value).
# Re-run the training configuration cell before submitting another training job.
REGION = 'global'

In [None]:
# List all models available on the endpoint selected by REGION.
!gcloud ai-platform models list --region $REGION

In [None]:
# Create the model resource MODEL_NAME with logging enabled.
# NOTE(review): the command itself does not check for an existing model; if the
# model already exists gcloud presumably reports an error that can be ignored
# -- confirm.
!gcloud ai-platform models create $MODEL_NAME --region $REGION --enable-logging

In [None]:
# List the versions that already exist under MODEL_NAME.
!gcloud ai-platform versions list --model $MODEL_NAME --region $REGION

In [None]:
# GCS directory of the most recent training job; used as the origin of the deployed version.
LATEST_MODEL_DIR = f"gs://{BUCKET_NAME}/{FOLDER_NAME}/{JOBNAME}"
print("LATEST_MODEL_DIR: ", LATEST_MODEL_DIR)

In [None]:
# Disabled alternative: discover the newest model directory with gsutil instead
# of using LATEST_MODEL_DIR from the previous cell.
# NOTE(review): the snippet reads MODEL_DIR, which is not defined in the cells
# shown here -- define it before re-enabling.
# # Get a list of model directories
# ALL_MODEL_DIRS = ! gsutil ls $MODEL_DIR
# # Pick the directory with the latest timestamp, in case you have trained multiple times
# if ("CommandException" in ALL_MODEL_DIRS[0]):
#     print("Create the model directory first")
# else:
#     LATEST_MODEL_DIR = ALL_MODEL_DIRS[-1]
# print("Latest Model Directory = ", LATEST_MODEL_DIR)

# Deploy the model: create version MODEL_VERSION under MODEL_NAME from the
# artifacts in LATEST_MODEL_DIR (runtime 2.2, Python 3.7).
! gcloud beta ai-platform versions create $MODEL_VERSION \
  --model=$MODEL_NAME \
  --origin=$LATEST_MODEL_DIR \
  --runtime-version=2.2 \
  --python-version=3.7 \
  --framework=$MODEL_FRAMEWORK \
  --description=$MODEL_DESCRIPTION \
  --region=$REGION 


In [None]:
# Verify the deployment.
# List all models on the endpoint selected by REGION
!gcloud ai-platform models list --region $REGION
# List all versions of the created model
!gcloud ai-platform versions list --model $MODEL_NAME --region $REGION
# Describe the model resource itself
!gcloud ai-platform models describe $MODEL_NAME --region $REGION


------
# Predictions from the deployed model with test data

### Load testing data

In [None]:
# Read the held-out test features and labels straight from Cloud Storage
test_feature_url, test_label_url = (
    'gs://tuti_asset/datasets/mortgage_structured_x_test.csv',
    'gs://tuti_asset/datasets/mortgage_structured_y_test.csv',
)

# One DataFrame per CSV: features first, labels second
x_test, y_test = (
    pd.read_csv(url) for url in (test_feature_url, test_label_url)
)

### Call Google API for online prediction

In [None]:
# Create the Google API client and request online predictions in batches
import googleapiclient.discovery
import numpy as np

PROJECT_ID = "img-seg-3d"
MODEL_NAME = "xgb_model"
# Query the version deployed by this notebook. A separately typed literal can
# drift from MODEL_VERSION and point at a version that was never created.
VERSION = MODEL_VERSION
prediction_file = './prediction.json'
batch_size = 1000  # number of rows sent per predict request

# Fully qualified version resource name expected by the predict API
model_name = 'projects/{}/models/{}/versions/{}'.format(
    PROJECT_ID,
    MODEL_NAME,
    VERSION
    )

service = googleapiclient.discovery.build(
    'ml',
    'v1',
    cache_discovery=False,
    cache=False
    )

prediction_list = []

# Send the test set in consecutive slices of at most `batch_size` rows.
for start in range(0, len(x_test), batch_size):
    end = min(start + batch_size, len(x_test))
    response = service.projects().predict(
        name=model_name,
        body={'instances': x_test.iloc[start:end].values.tolist()}
        ).execute()
    # A failed prediction returns an 'error' entry instead of 'predictions';
    # surface the service message rather than failing with a bare KeyError.
    if 'error' in response:
        raise RuntimeError(
            "Online prediction failed for rows {}-{}: {}".format(
                start, end, response['error']
            )
        )
    prediction_list += response['predictions']

# Each prediction is a row of per-class scores; take the highest-scoring class.
# NOTE(review): assumes the model returns one score per class for every
# instance -- confirm against the trainer's objective.
prediction_list = np.array(prediction_list)
preds = np.argmax(prediction_list, axis=1)
print("Predict array size = ", preds.shape)

# Alternative: request predictions from Cloud AI Platform with the gcloud command

In [None]:
# Serialize the first 1000 test rows as the JSON request body for gcloud
body = {'instances': x_test.head(1000).values.tolist()}
with open('xgb_test_data.json', 'w') as fp:
    fp.write(json.dumps(body))

In [None]:
%%time
# Request predictions for the rows stored in xgb_test_data.json and capture the
# command's output lines in `predict_results`; %%time reports the cell's
# wall-clock time.
# NOTE: the region is the literal 'global' here, not the REGION variable.
predict_results = !gcloud ai-platform predict \
  --model=$MODEL_NAME \
  --version=$MODEL_VERSION \
  --format='text' \
  --json-request='xgb_test_data.json' \
  --region='global'
