In [None]:
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


In [None]:
# Print the project ID of the active gcloud configuration.
# stderr is redirected to /dev/null so only the value itself is shown.
!gcloud config list --format 'value(core.project)' 2>/dev/null

In [None]:
# Import packages
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from datetime import datetime
from pytz import timezone

-----------
# Dataset preprocessing

In [None]:
# Run the preprocessing script found in the current working directory.
# NOTE(review): presumably this produces the mortgage_structured_* CSV files
# referenced by the cells below — confirm against preprocessing.py.
!python3 preprocessing.py

------
# Hyperparameter Tuning

In [340]:
# --- Environment settings for the hyperparameter-tuning job ---
PROJECT_ID = 'img-seg-3d'      # Replace with your project ID
USER = 'elvinzhu'              # Replace with your User name
BUCKET_NAME = 'tuti_asset'     # Replace with your bucket name
FOLDER_NAME = 'xgb_train_job'  # Replace with your Folder name
REGION = 'us-central1'         # Replace with your region
TIMEZONE = 'US/Pacific'

# The job name embeds the current time in TIMEZONE (MMDDYY_HHMM) and ends in
# "_hpt"; formatting a datetime with a format spec is the same as strftime().
JOBNAME = f'xgb_train_{USER}_{datetime.now(timezone(TIMEZONE)):%m%d%y_%H%M}_hpt'

# Tuning runs use a fixed "jobdir" folder under the bucket/folder above.
JOB_DIR = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/jobdir'
JOB_CONFIG = "./config/config_hpt.yaml"

# Training and validation CSVs passed to the trainer on submission.
TRAIN_FEATURE_PATH = 'gs://tuti_asset/datasets/mortgage_structured_x_train.csv'
TRAIN_LABEL_PATH = 'gs://tuti_asset/datasets/mortgage_structured_y_train.csv'
VAL_FEATURE_NAME = 'gs://tuti_asset/datasets/mortgage_structured_x_test.csv'
VAL_LABEL_NAME = 'gs://tuti_asset/datasets/mortgage_structured_y_test.csv'

# Echo the derived job settings for the record.
print("JOB_NAME = ", JOBNAME)
print("JOB_DIR = ", JOB_DIR)
print("JOB_CONFIG = ", JOB_CONFIG)

JOB_NAME =  xgb_train_elvinzhu_022221_2153_hpt
JOB_DIR =  gs://tuti_asset/xgb_train_job/jobdir
JOB_CONFIG =  ./config/config_hpt.yaml


In [341]:
# Submit the hyperparameter-tuning job to AI Platform Training.
# Flags before the bare `--` are consumed by gcloud (package, module, runtime,
# job dir, region, tuning config); flags after it are forwarded to the
# trainer.train_hpt module.
!gcloud ai-platform jobs submit training $JOBNAME \
    --package-path $(pwd)/trainer \
    --module-name trainer.train_hpt \
    --python-version 3.7 \
    --runtime-version 2.2 \
    --job-dir $JOB_DIR \
    --region $REGION \
    --config $JOB_CONFIG \
    -- \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --val_feature_name $VAL_FEATURE_NAME \
    --val_label_name $VAL_LABEL_NAME

Job [xgb_train_elvinzhu_022221_2153_hpt] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe xgb_train_elvinzhu_022221_2153_hpt

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs xgb_train_elvinzhu_022221_2153_hpt
jobId: xgb_train_elvinzhu_022221_2153_hpt
state: QUEUED


In [342]:
!gcloud ai-platform jobs describe xgb_train_elvinzhu_022221_2153_hpt

createTime: '2021-02-23T05:53:07Z'
etag: F0WHi7tWEFE=
jobId: xgb_train_elvinzhu_022221_2153_hpt
startTime: '2021-02-23T05:53:10Z'
state: RUNNING
trainingInput:
  args:
  - --train_feature_name
  - gs://tuti_asset/datasets/mortgage_structured_x_train.csv
  - --train_label_name
  - gs://tuti_asset/datasets/mortgage_structured_y_train.csv
  - --val_feature_name
  - gs://tuti_asset/datasets/mortgage_structured_x_test.csv
  - --val_label_name
  - gs://tuti_asset/datasets/mortgage_structured_y_test.csv
  hyperparameters:
    enableTrialEarlyStopping: true
    goal: MAXIMIZE
    hyperparameterMetricTag: roc_auc
    maxParallelTrials: 5
    maxTrials: 50
    params:
    - maxValue: 20.0
      minValue: 2.0
      parameterName: max_depth
      type: INTEGER
    - maxValue: 200.0
      minValue: 10.0
      parameterName: n_estimators
      type: INTEGER
    - categoricalValues:
      - gbtree
      - gblinear
      - dart
      parameterName: booster
      type: CATEGORICAL
  jobDir: gs://tuti_a

In [None]:
# Query the hyperparameter-tuning job through the AI Platform REST API
# (google-api-python-client) instead of the gcloud CLI.

from googleapiclient import discovery
import json

# Fully-qualified job resource name expected by the jobs.get endpoint.
job_id = f'projects/{PROJECT_ID}/jobs/{JOBNAME}'

# Client for the 'ml' v1 API; discovery-document caching is turned off.
ml = discovery.build('ml', 'v1', cache_discovery=False)

# NOTE: despite its name, `request` holds the executed call's *response* dict;
# the next cell reads the trial results from it.
request = ml.projects().jobs().get(name=job_id).execute()

# Pretty-print the whole job description.
print(json.dumps(request, indent=4))

In [None]:
# Flatten the tuning trials from the job-status response (`request`) into a
# DataFrame ranked best-first by the objective metric.
import pandas as pd


def _trial_field(record, key):
    """Return ``record[key]`` when ``record`` is a dict holding ``key``, else None.

    If a trial entry lacks a nested dict (pandas then stores NaN in that
    cell), plain ``record[key]`` indexing would raise; this returns None
    instead so one incomplete trial does not break the whole table.
    """
    if isinstance(record, dict):
        return record.get(key)
    return None


trials = request['trainingOutput']['trials']
trials = pd.DataFrame(trials)

# Promote the nested values to flat, dotted column names.
trials['hyperparameters.booster'] = trials['hyperparameters'].apply(
    lambda params: _trial_field(params, 'booster'))
trials['hyperparameters.max_depth'] = trials['hyperparameters'].apply(
    lambda params: _trial_field(params, 'max_depth'))
trials['hyperparameters.n_estimators'] = trials['hyperparameters'].apply(
    lambda params: _trial_field(params, 'n_estimators'))
trials['finalMetric.trainingStep'] = trials['finalMetric'].apply(
    lambda metric: _trial_field(metric, 'trainingStep'))
# Coerce to numeric so trials without a metric become NaN and sort last.
trials['finalMetric.objectiveValue'] = pd.to_numeric(
    trials['finalMetric'].apply(lambda metric: _trial_field(metric, 'objectiveValue')),
    errors='coerce',
)

# Best trial first. The index is reset so that both positional access and
# label 0 refer to the best trial; without the reset, label 0 would still be
# whichever trial came first in the response.
trials = (
    trials
    .sort_values(['finalMetric.objectiveValue'], ascending=False)
    .reset_index(drop=True)
)

------
# Training with Tuned Parameters

In [307]:
# --- Environment settings for the final training job ---
PROJECT_ID = 'img-seg-3d' # Replace with your project ID
USER = 'elvinzhu' # Replace with your User name
BUCKET_NAME = 'tuti_asset' # Replace with your bucket name
FOLDER_NAME = 'xgb_train_job' # Replace with your Folder name
REGION = 'us-central1' # Replace with your region
TIMEZONE = 'US/Pacific'

# Job name embeds the current time in TIMEZONE (MMDDYY_HHMM).
JOBNAME = 'xgb_train_{}_{}'.format(
    USER,
    datetime.now(timezone(TIMEZONE)).strftime("%m%d%y_%H%M")
    )

# Each training job gets its own folder, named after the job.
JOB_DIR = 'gs://{}/{}/{}'.format(
    BUCKET_NAME,
    FOLDER_NAME,
    JOBNAME,
    )
JOB_CONFIG = "./config/config.yaml"

print("JOB_NAME = ", JOBNAME)
print("JOB_DIR = ", JOB_DIR)
print("JOB_CONFIG = ", JOB_CONFIG)

# Training data and number of target classes passed to the trainer.
TRAIN_FEATURE_PATH = 'gs://tuti_asset/datasets/mortgage_structured_x_train.csv'
TRAIN_LABEL_PATH = 'gs://tuti_asset/datasets/mortgage_structured_y_train.csv'
N_CLASSES = 4

print("TRAIN_FEATURE_PATH = ", TRAIN_FEATURE_PATH)
print("TRAIN_LABEL_PATH = ", TRAIN_LABEL_PATH)
print("N_CLASSES = ", N_CLASSES)

# Get the best hypertuned model parameters.
# `trials` is sorted best-first by the tuning section, so the best trial is
# the first row *by position*. `.iloc[0]` is used because `[0]` on a Series
# looks up index label 0, which after sort_values is not necessarily the
# top row.
_best_params = trials['hyperparameters'].iloc[0]
BOOSTER = _best_params['booster']
MAX_DEPTH = _best_params['max_depth']
N_ESTIMATORS = _best_params['n_estimators']

print("BOOSTER = ", BOOSTER)
print("MAX_DEPTH = ", MAX_DEPTH)
print("N_ESTIMATORS = ", N_ESTIMATORS)

JOB_NAME =  xgb_train_elvinzhu_022221_2003
JOB_DIR =  gs://tuti_asset/xgb_train_job/xgb_train_elvinzhu_022221_2003
JOB_CONFIG =  ./config/config.yaml
TRAIN_FEATURE_PATH =  gs://tuti_asset/datasets/mortgage_structured_x_train.csv
TRAIN_LABEL_PATH =  gs://tuti_asset/datasets/mortgage_structured_y_train.csv
N_CLASSES =  4
BOOSTER =  gbtree
MAX_DEPTH =  14
N_ESTIMATORS =  40


In [308]:
# Train on local machine
!python3 trainer/train.py \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --no_classes $N_CLASSES \
    --n_estimators $N_ESTIMATORS \
    --max_depth $MAX_DEPTH \
    --booster $BOOSTER

Namespace(booster='gbtree', job_dir='./', max_depth=14, n_estimators=40, no_classes=4, train_feature_name='gs://tuti_asset/datasets/mortgage_structured_x_train.csv', train_label_name='gs://tuti_asset/datasets/mortgage_structured_y_train.csv')
(93639, 149)
(93639, 1)
CommandException: cp: "file://model.bst" and "file://./model.bst" are the same file - abort.
Traceback (most recent call last):
  File "trainer/train.py", line 88, in <module>
  File "trainer/train.py", line 73, in train_xgboost
    return xgb_model
  File "/opt/conda/lib/python3.7/subprocess.py", line 363, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['gsutil', 'cp', 'model.bst', './model.bst']' returned non-zero exit status 1.


In [None]:
# Run the trainer.train module locally through `gcloud ai-platform local train`.
# Flags before the bare `--` are consumed by gcloud; flags after it are
# forwarded to the module, carrying the tuned hyperparameters.
! gcloud ai-platform local train \
    --job-dir $JOB_DIR \
    --package-path $(pwd)/trainer \
    --module-name trainer.train \
    -- \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --no_classes $N_CLASSES \
    --n_estimators $N_ESTIMATORS \
    --max_depth $MAX_DEPTH \
    --booster $BOOSTER

In [None]:
# Reference: https://cloud.google.com/sdk/gcloud/reference/ai-platform/jobs/submit/training

# Submit the training job to AI Platform Training with the tuned
# hyperparameters. Flags after the bare `--` are forwarded to trainer.train.
! gcloud ai-platform jobs submit training $JOBNAME \
    --job-dir $JOB_DIR \
    --package-path $(pwd)/trainer \
    --module-name trainer.train \
    --region $REGION \
    --python-version 3.7 \
    --runtime-version 2.2 \
    --config $JOB_CONFIG \
    -- \
    --train_feature_name $TRAIN_FEATURE_PATH \
    --train_label_name $TRAIN_LABEL_PATH \
    --no_classes $N_CLASSES \
    --n_estimators $N_ESTIMATORS \
    --max_depth $MAX_DEPTH \
    --booster $BOOSTER

In [None]:
# Check the status of the training job identified by the current JOBNAME.
! gcloud ai-platform jobs describe $JOBNAME

--------
# Deploy the Model


In [344]:
# Settings for deploying the trained model as an AI Platform model version.
MODEL_NAME = "xgb_model"
MODEL_VERSION = "elvinzhu_xgb_bst_v0_1"
# NOTE: this rebinds REGION, which the job-config cells set to 'us-central1'.
# Every later `$REGION` reference sees "global" — including the job-submit
# cells if they are re-run without re-running their config cell first.
REGION = "global"
MODEL_FRAMEWORK = "XGBOOST"
MODEL_DESCRIPTION = "best_xgb_hpt"

In [345]:
# List all models available in REGION (check here whether MODEL_NAME exists).
!gcloud ai-platform models list --region $REGION

Using endpoint [https://ml.googleapis.com/]
NAME       DEFAULT_VERSION_NAME
xgb_model  elvinzhu_xgb_bst


In [None]:
# Create the model resource, with logging enabled.
# NOTE(review): the command itself has no "if not exists" guard; presumably it
# reports an error when MODEL_NAME already exists — run it only when the
# listing above does not show the model.
!gcloud ai-platform models create $MODEL_NAME --region $REGION --enable-logging

In [None]:
# List the versions that currently exist under MODEL_NAME.
!gcloud ai-platform versions list --model $MODEL_NAME --region $REGION

In [None]:
# Cloud Storage folder of the current training job (same layout as JOB_DIR);
# the deploy command below uses it as the version's origin.
LATEST_MODEL_DIR = f"gs://{BUCKET_NAME}/{FOLDER_NAME}/{JOBNAME}"
print("LATEST_MODEL_DIR: ", LATEST_MODEL_DIR)

In [None]:
# Deploy the model: create version MODEL_VERSION under MODEL_NAME from the
# artifacts stored in LATEST_MODEL_DIR, using the same runtime and Python
# versions as the training job.
! gcloud beta ai-platform versions create $MODEL_VERSION \
  --model=$MODEL_NAME \
  --origin=$LATEST_MODEL_DIR \
  --runtime-version=2.2 \
  --python-version=3.7 \
  --framework=$MODEL_FRAMEWORK \
  --description=$MODEL_DESCRIPTION \
  --region=$REGION 


In [348]:
# Verify the deployment.
# 1) List all models in REGION.
!gcloud ai-platform models list --region $REGION
# 2) List all versions of the created model (the new version should appear).
!gcloud ai-platform versions list --model $MODEL_NAME --region $REGION
# 3) Describe the model, including its default version.
!gcloud ai-platform models describe $MODEL_NAME --region $REGION


Using endpoint [https://ml.googleapis.com/]
NAME       DEFAULT_VERSION_NAME
xgb_model  elvinzhu_xgb_bst
Using endpoint [https://ml.googleapis.com/]
NAME                   DEPLOYMENT_URI                                                STATE
elvinzhu_xgb_bst       gs://tuti_asset/datasets/models/model_20210131_061916         READY
elvinzhu_xgb_bst_v0_1  gs://tuti_asset/xgb_train_job/xgb_train_elvinzhu_020821_2057  READY
Using endpoint [https://ml.googleapis.com/]
defaultVersion:
  createTime: '2021-02-02T05:23:31Z'
  deploymentUri: gs://tuti_asset/datasets/models/model_20210131_061916
  description: best_xgb_hpt
  etag: MYHO2cqGljM=
  framework: XGBOOST
  isDefault: true
  machineType: mls1-c1-m2
  name: projects/img-seg-3d/models/xgb_model/versions/elvinzhu_xgb_bst
  pythonVersion: '3.7'
  runtimeVersion: '2.2'
  state: READY
etag: WXWsM_7Tg8w=
name: projects/img-seg-3d/models/xgb_model
onlinePredictionLogging: true
regions:
- us-central1


------
# Predictions from the deployed model with test data

### Load testing data

In [229]:
# Cloud Storage locations of the held-out test features and labels.
test_feature_url = 'gs://tuti_asset/datasets/mortgage_structured_x_test.csv'
test_label_url = 'gs://tuti_asset/datasets/mortgage_structured_y_test.csv'

# Read both CSVs into DataFrames (features first, then labels).
x_test, y_test = (pd.read_csv(url) for url in (test_feature_url, test_label_url))

### Call Google API for online prediction

In [108]:
# Online prediction through the AI Platform REST API
# (google-api-python-client): send x_test in batches and collect the scores.
import googleapiclient.discovery
import numpy as np

PROJECT_ID = "img-seg-3d"
MODEL_NAME = "xgb_model"
VERSION = "elvinzhu_xgb_bst"
prediction_file = './prediction.json'
batch_size = 1000  # number of instances sent per predict request

# Fully-qualified resource name of the model version to query.
model_name = 'projects/{}/models/{}/versions/{}'.format(
    PROJECT_ID,
    MODEL_NAME,
    VERSION
    )

service = googleapiclient.discovery.build(
    'ml',
    'v1',
    cache_discovery=False,
    cache=False
    )

prediction_list = []

for ind in range(0, len(x_test), batch_size):
    start = ind
    end = min(ind+batch_size, len(x_test))
    response = service.projects().predict(
        name=model_name,
        body={'instances': x_test.iloc[start:end].values.tolist()}
        ).execute()
    # A failed prediction comes back as an 'error' entry in the response body
    # rather than as an exception; surface it instead of hitting an opaque
    # KeyError on 'predictions' below.
    if 'error' in response:
        raise RuntimeError(
            "Online prediction failed for rows {}-{}: {}".format(
                start, end - 1, response['error']
            )
        )
    prediction_list += response['predictions']

# One row of class scores per instance; the predicted class is the arg-max.
prediction_list = np.array(prediction_list)
preds = np.argmax(prediction_list, axis=1)
print("Predict array size = ", preds.shape)

Predict array size =  (10405,)


# Alternative: call the Cloud AI Platform API for prediction using the gcloud command

In [306]:
def post_process(predict, n_sample, n_class):
    """Turn text-formatted gcloud prediction output into predicted class ids.

    The lines are parsed with a regular expression and ``ast.literal_eval``
    rather than ``exec``: the text comes from an external command, so it must
    not be executed as code, and unrelated lines (blank lines, warnings) are
    skipped instead of raising.

    Args:
        predict: sequence of output lines. The first element is skipped
            (treated as a header line). Remaining lines are used when they
            look like ``predictions[i]: [p0, p1, ...]`` or
            ``predictions[i][j]: p``; anything else is ignored.
        n_sample: number of instances in the batch (rows of the score matrix).
        n_class: number of classes (columns of the score matrix).

    Returns:
        list: arg-max class index for each of the ``n_sample`` rows.

    Raises:
        ValueError / SyntaxError: if a ``predictions[...]`` line carries a
            value that is not a plain Python literal.
    """
    import ast
    import re

    # Matches keys such as "predictions[3]" or "predictions[3][1]".
    key_pattern = re.compile(r"predictions((?:\[\d+\])+)")

    # Rows that never receive a value stay uninitialised, as before.
    predictions = np.empty([n_sample, n_class])
    for entry in predict[1:]:
        key, sep, value = entry.partition(":")
        matched = key_pattern.fullmatch(key.strip())
        if not sep or matched is None:
            continue  # not a prediction line
        index = tuple(int(i) for i in re.findall(r"\d+", matched.group(1)))
        # Same effect as the assignment "predictions[i]... = value".
        predictions[index] = ast.literal_eval(value.strip())
    predictions = np.argmax(predictions, axis=1)
    return predictions.tolist()

def accuracy_score(y_true, y_pred):
    """Score predictions by delegating to ``sklearn.metrics.accuracy_score``.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, in the same order as ``y_true``.

    Returns:
        Whatever ``sklearn.metrics.accuracy_score(y_true, y_pred)`` returns.
    """
    # Imported lazily so scikit-learn is only needed when scoring is requested.
    from sklearn.metrics import accuracy_score as _sklearn_accuracy_score
    return _sklearn_accuracy_score(y_true, y_pred)

In [302]:
# Online prediction through the gcloud CLI: for each batch of x_test, write
# the instances to a temporary JSON request file, run
# `gcloud ai-platform predict`, and turn the captured text output into
# predicted class ids with post_process().
# Relies on names defined in earlier cells: `json` (imported in the job-status
# cell), REGION, N_CLASSES, x_test and post_process.
PROJECT_ID = "img-seg-3d"
MODEL_NAME = "xgb_model"
VERSION = "elvinzhu_xgb_bst"
JSON_TEMP = 'xgb_test_data.json'
batch_size = 1000
y_pred = []

for ind in range(0, len(x_test), batch_size):
    start = ind
    end = min(ind+batch_size, len(x_test))
    # Request body for this batch; the file is overwritten on every iteration.
    body={'instances': x_test.iloc[start:end].values.tolist()}
    with open(JSON_TEMP, 'w') as fp:
        json.dump(body, fp)
    
    # `predict` receives the command's output as a list of lines.
    predict = !gcloud ai-platform predict \
      --model=$MODEL_NAME \
      --version=$VERSION \
      --format='text' \
      --json-request=$JSON_TEMP \
      --region=$REGION
    
    # NOTE(review): the first line is dropped here AND post_process skips the
    # first element of what it receives, so two leading output lines are
    # discarded in total — confirm the gcloud output really has two header
    # lines, otherwise the first prediction of each batch is lost.
    y_pred += post_process(predict[1:], end-start, N_CLASSES)

In [304]:
# Compare the gcloud-based predictions with the 'TARGET' column of the labels.
accuracy = accuracy_score(y_true=y_test['TARGET'].tolist(), y_pred=y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9807784718885152
