In [1]:
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


In [1]:
!pip uninstall -r requirements-uninstall.txt -y

Found existing installation: xgboost 1.1.1
Uninstalling xgboost-1.1.1:
  Successfully uninstalled xgboost-1.1.1
Found existing installation: google-api-python-client 1.9.3
Uninstalling google-api-python-client-1.9.3:
  Successfully uninstalled google-api-python-client-1.9.3
Found existing installation: gcsfs 0.7.0
Uninstalling gcsfs-0.7.0:
  Successfully uninstalled gcsfs-0.7.0
Found existing installation: cloudml-hypertune 0.1.0.dev6
Uninstalling cloudml-hypertune-0.1.0.dev6:
  Successfully uninstalled cloudml-hypertune-0.1.0.dev6
Found existing installation: google-cloud 0.34.0
Uninstalling google-cloud-0.34.0:
  Successfully uninstalled google-cloud-0.34.0
Found existing installation: google-cloud-storage 1.29.0
Uninstalling google-cloud-storage-1.29.0:
  Successfully uninstalled google-cloud-storage-1.29.0
Found existing installation: numpy 1.18.5
Uninstalling numpy-1.18.5:
  Successfully uninstalled numpy-1.18.5
Found existing installation: pandas 1.0.4
Uninstalling pandas-1.0.4:


In [2]:
# https://cloud.google.com/ai-platform/training/docs/runtime-version-list
!pip install -r requirements.txt


Collecting xgboost==1.1.1
  Using cached xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
Collecting google-api-python-client==1.9.3
  Using cached google_api_python_client-1.9.3-py3-none-any.whl (59 kB)
Collecting gcsfs==0.7.0
  Using cached gcsfs-0.7.0-py2.py3-none-any.whl (20 kB)
Processing /home/jupyter/.cache/pip/wheels/a7/ff/87/e7bed0c2741fe219b3d6da67c2431d7f7fedb183032e00f81e/cloudml_hypertune-0.1.0.dev6-py2.py3-none-any.whl
Collecting google-cloud==0.34.0
  Using cached google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Collecting google-cloud-storage==1.29.0
  Using cached google_cloud_storage-1.29.0-py2.py3-none-any.whl (85 kB)
Collecting numpy==1.18.5
  Using cached numpy-1.18.5-cp37-cp37m-manylinux1_x86_64.whl (20.1 MB)
Collecting pandas==1.0.4
  Using cached pandas-1.0.4-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
Collecting matplotlib==3.2.1
  Using cached

In [3]:
# Import packages
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier


In [4]:
# Notebook-wide settings; later cells build job names, GCS paths and API
# resource names from these values.
PROJECT_ID = "codev-257422"        # GCP project used in API resource names
USER = "cchatterj"                 # embedded in training job names
BUCKET_NAME = "chanchal-sandbox"   # GCS bucket for data, job dir and models
FOLDER_NAME = "ht-xgb-data"        # folder (prefix) inside the bucket
REGION = "us-central1"             # region passed to gcloud


In [5]:
%%writefile ./setup.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = ['xgboost==1.1.1',
                     'pandas==1.0.4',
                     'scikit-learn==0.23.2',
                     'google-cloud-storage==1.29.0',
                     'cloudml-hypertune',
                    ]
setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Trainer package for XGBoost Task'
)


Overwriting ./setup.py


In [6]:
# Create the trainer package directory (no-op if it already exists); the
# training modules are written into it by the following cells
!mkdir -p trainer


In [7]:
%%writefile ./trainer/__init__.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================
# Package marker: this file is intentionally empty apart from the header. It
# makes `trainer` a package so setup.py's find_packages() includes it.



Overwriting ./trainer/__init__.py


In [8]:
# Create the config directory (no-op if it already exists); the job
# configuration YAML files are written into it by the following cells
!mkdir -p config


In [9]:
%%writefile ./config/config.yaml

# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

# Job configuration for the plain (non-tuning) training run.

# Alternative kept for reference: custom tier with a single T4 GPU.
#trainingInput:
#  scaleTier: CUSTOM
#  masterType: n1-highmem-8
#  masterConfig:
#    acceleratorConfig:
#      count: 1
#      type: NVIDIA_TESLA_T4

trainingInput:
  # Scale tier names are spelled with an underscore in the API (STANDARD_1);
  # this is also how the submitted job reports it back.
  scaleTier: STANDARD_1


Overwriting ./config/config.yaml


In [10]:
%%writefile ./trainer/train.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from google.cloud import storage
from datetime import datetime
from pytz import timezone
import hypertune
from sklearn import metrics
from sklearn.model_selection import train_test_split

if __name__ == "__main__":

    INPUT_FILE_NAME = 'Step10_Final_dataset.csv'
    BUCKET_NAME = 'chanchal-sandbox'
    FOLDER_NAME = 'ht-xgb-data'
    _TARGET_COLUMN = 'TARGET'

    input_file = 'gs://' + BUCKET_NAME + '/' + FOLDER_NAME + '/' + INPUT_FILE_NAME

    # Read the data
    try:
        dataset = pd.read_csv(input_file)
    except:
        print("Oops! That is invalid filename. Try again...")

    print(dataset.shape)

    # ---------------------------------------
    # Pre-processing code from customer
    # ---------------------------------------

    # Drop useless columns
    dataset.drop(['LOAN_SEQUENCE_NUMBER'], axis=1, inplace=True)

    # Inputs to an XGBoost model must be numeric. One hot encoding was previously found to yield better results 
    # than label encoding for the particular
    strcols = [col for col in dataset.columns if dataset[col].dtype == 'object']
    dataset = pd.get_dummies(dataset, columns=strcols)

    # Train Test Split and write out the train-test files

    # Split with a small test size so as to allow our model to train on more data
    X_train, X_test, y_train, y_test = \
        train_test_split(dataset.drop(_TARGET_COLUMN, axis=1), 
                                      dataset[_TARGET_COLUMN], stratify=dataset[_TARGET_COLUMN], 
                                      shuffle=True, test_size=0.2
                                     )
    print("X_train shape = ", X_train.shape)
    print("X_test  shape = ", X_test.shape)

    # count number of classes
    values, counts = np.unique(y_train, return_counts=True)
    NUM_CLASSES = len(values)

    # ---------------------------------------
    # Train model
    # ---------------------------------------

    params = {
        'n_estimators': 100,
        'max_depth': 3,
        'booster': 'gbtree',
        'min_child_weight': 1,
        'learning_rate': 0.1,
        'gamma': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'reg_alpha': 0,
        'objective': 'multi:softprob',
        'num_class': NUM_CLASSES
        }
    xgb_model = XGBClassifier(**params)
    #xgb_model.set_params(**params)
    xgb_model.fit(X_train, y_train)

    # ---------------------------------------
    # Save the model to GCS
    # ---------------------------------------

    bst_filename = 'model.bst'
    bst = xgb_model.get_booster()
    bst.save_model(bst_filename)
    bucket = storage.Client().bucket(BUCKET_NAME)
    blob = bucket.blob('{}/{}'.format(
        datetime.now().strftime(FOLDER_NAME+'/models/model_%Y%m%d_%H%M%S'),
        bst_filename))
    blob.upload_from_filename(bst_filename)


Overwriting ./trainer/train.py


------
# Hyperparameter Tuning

In [11]:
%%writefile ./config/hptuning_config.yaml

# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


# hptuning_config.yaml
# Job configuration for the hyperparameter tuning run.
trainingInput:
  # Scale tier names are spelled with an underscore in the API (STANDARD_1);
  # this is also how the submitted job reports it back.
  scaleTier: STANDARD_1
  hyperparameters:
    goal: MAXIMIZE
    maxTrials: 5
    # NOTE(review): with maxParallelTrials equal to maxTrials every trial
    # starts at once, so no trial can build on the results of earlier ones.
    # Lower this value if search quality matters more than wall-clock time.
    maxParallelTrials: 5
    # Must match the tag the trainer reports through hypertune
    hyperparameterMetricTag: roc_auc
    enableTrialEarlyStopping: TRUE
    # Each parameterName is passed to the trainer as a --<parameterName> flag
    params:
      - parameterName: max_depth
        type: INTEGER
        minValue: 3
        maxValue: 8
      - parameterName: n_estimators
        type: INTEGER
        minValue: 50
        maxValue: 200
      - parameterName: booster
        type: CATEGORICAL
        categoricalValues: [
          "gbtree",
          "gblinear",
          "dart"
        ]


Overwriting ./config/hptuning_config.yaml


In [12]:
# Start train_hpt.py as an exact copy of train.py; the next cell appends the
# evaluation / metric-reporting code to the end of it
!cp ./trainer/train.py ./trainer/train_hpt.py


In [13]:
%%writefile -a ./trainer/train_hpt.py

    # predict the model with test file
    y_pred = xgb_model.predict(X_test)

    # Binarize multiclass labels
    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)

    # Define the score we want to use to evaluate the classifier on
    #score = metrics.accuracy_score(y_test, y_pred)
    #score = metrics.average_precision_score(y_test, y_pred, average='macro')
    #score = metrics.f1_score(y_test, y_pred, average='macro')
    #score = metrics.fbeta_score(y_test, y_pred, average='macro', beta=0.5)
    #score = metrics.hamming_loss(y_test, y_pred)
    #score = metrics.log_loss(y_test, y_pred)
    #score = metrics.precision_score(y_test, y_pred, average='macro')
    #score = metrics.recall_score(y_test, y_pred, average='macro')
    score = metrics.roc_auc_score(y_test, y_pred, average='macro')
    #score = metrics.zero_one_loss(y_test, y_pred)

    # The default name of the metric is training/hptuning/metric. 
    # We recommend that you assign a custom name. The only functional difference is that 
    # if you use a custom name, you must set the hyperparameterMetricTag value in the 
    # HyperparameterSpec object in your job request to match your chosen name.
    # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#HyperparameterSpec
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='roc_auc',
        metric_value=score,
        global_step=1000
    )


Appending to ./trainer/train_hpt.py


In [21]:
from datetime import datetime
from pytz import timezone

# Job names carry the user plus a US/Pacific timestamp with minute resolution
_hpt_stamp = datetime.now(timezone('US/Pacific')).strftime("%m%d%y_%H%M")
JOBNAME_HPT = 'xgb_train_{}_{}_HPT'.format(USER, _hpt_stamp)

# GCS staging location and the tuning job configuration file
JOB_DIR = 'gs://{}/{}/{}'.format(BUCKET_NAME, FOLDER_NAME, 'jobdir')
JOB_CONFIG = "./config/hptuning_config.yaml"

print("Job Name = ", JOBNAME_HPT)
print("Job Dir  = ", JOB_DIR)


Job Name =  xgb_train_cchatterj_100520_2155_HPT
Job Dir  =  gs://chanchal-sandbox/ht-xgb-data/jobdir


In [22]:
# Submit the hyperparameter tuning job.
# The ./trainer package is uploaded and its trainer.train_hpt module is run
# with the tuning settings from $JOB_CONFIG; $JOB_DIR is the GCS job directory.
! gcloud ai-platform jobs submit training $JOBNAME_HPT \
  --package-path $(pwd)/trainer \
  --module-name trainer.train_hpt \
  --python-version 3.7 \
  --runtime-version 2.2 \
  --job-dir $JOB_DIR \
  --region $REGION \
  --config $JOB_CONFIG


Job [xgb_train_cchatterj_100520_2155_HPT] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe xgb_train_cchatterj_100520_2155_HPT

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs xgb_train_cchatterj_100520_2155_HPT
jobId: xgb_train_cchatterj_100520_2155_HPT
state: QUEUED


In [26]:
# Check the hyperparameter tuning job status; re-run this cell until the
# reported state is SUCCEEDED before fetching the trial results
! gcloud ai-platform jobs describe $JOBNAME_HPT


createTime: '2020-10-06T04:55:06Z'
etag: cZE056HS38g=
jobId: xgb_train_cchatterj_100520_2155_HPT
startTime: '2020-10-06T04:55:07Z'
state: RUNNING
trainingInput:
  hyperparameters:
    enableTrialEarlyStopping: true
    goal: MAXIMIZE
    hyperparameterMetricTag: roc_auc
    maxParallelTrials: 5
    maxTrials: 5
    params:
    - maxValue: 8.0
      minValue: 3.0
      parameterName: max_depth
      type: INTEGER
    - maxValue: 200.0
      minValue: 50.0
      parameterName: n_estimators
      type: INTEGER
    - categoricalValues:
      - gbtree
      - gblinear
      - dart
      parameterName: booster
      type: CATEGORICAL
  jobDir: gs://chanchal-sandbox/ht-xgb-data/jobdir
  packageUris:
  - gs://chanchal-sandbox/ht-xgb-data/jobdir/packages/21466ad77472afdd348fe852c64a2b49e7b02caa1ada41f96344774dc7e21577/trainer-0.1.tar.gz
  pythonModule: trainer.train_hpt
  pythonVersion: '3.7'
  region: us-central1
  runtimeVersion: '2.2'
  scaleTier: STANDARD_1
trainingOutput:
  completedTrialC

In [27]:
# Get the best hypertuned model

from googleapiclient import discovery
#from google.oauth2 import service_account
import pandas as pd
import json

# Define the credentials for the service account
#credentials = service_account.Credentials.from_service_account_file(<PATH TO CREDENTIALS JSON>)

# Define the project id and the job id and format it for the api request
project_id = 'projects/{}'.format(PROJECT_ID)
job_id = '{}/jobs/{}'.format(project_id, JOBNAME_HPT)

# Build the service
ml = discovery.build('ml', 'v1', cache_discovery=False) #, credentials=credentials)

# Fetch the job resource. (`request` actually holds the API *response*; the
# name is kept so code that already refers to it keeps working.)
request = ml.projects().jobs().get(name=job_id).execute()

# A job that is still queued/running, or whose trials failed, has no (or only
# partial) trial results. Keep only trials that reported an objective value
# and fail with a clear message when there are none.
job_state = request.get('state')
scored_trials = [
    trial for trial in request.get('trainingOutput', {}).get('trials', [])
    if 'objectiveValue' in trial.get('finalMetric', {})
]
if not scored_trials:
    raise RuntimeError(
        'Job {} (state: {}) has no completed trials yet'.format(JOBNAME_HPT, job_state))
if job_state != 'SUCCEEDED':
    print('WARNING: job state is {}; results below may be incomplete'.format(job_state))

# Rank the trials according to the goal the job was submitted with rather than
# relying on the order in which the API happens to return them.
tuning_goal = request.get('trainingInput', {}).get('hyperparameters', {}).get('goal', 'MAXIMIZE')
scored_trials.sort(key=lambda trial: trial['finalMetric']['objectiveValue'],
                   reverse=(tuning_goal != 'MINIMIZE'))

# Get just the best hp values
best_model = scored_trials[0]
print('Best Hyperparameters:')
print(json.dumps(best_model, indent=4))

# Or put all the results into a dataframe, one row per trial (best first)
df = pd.DataFrame(
    [
        {
            'trial_id': trial['trialId'],
            'objective': trial['finalMetric']['objectiveValue'],
            'booster': trial['hyperparameters'].get('booster'),
            'max_depth': trial['hyperparameters'].get('max_depth'),
            'n_estimators': trial['hyperparameters'].get('n_estimators'),
        }
        for trial in scored_trials
    ],
    columns=['trial_id', 'objective', 'booster', 'max_depth', 'n_estimators'],
)

# Display the df
df


Best Hyperparameters:
{
    "trialId": "5",
    "hyperparameters": {
        "n_estimators": "173",
        "booster": "gbtree",
        "max_depth": "5"
    },
    "finalMetric": {
        "trainingStep": "1000",
        "objectiveValue": 0.9044898550139779
    },
    "startTime": "2020-10-06T04:55:45.340243937Z",
    "endTime": "2020-10-06T05:01:32Z",
    "state": "SUCCEEDED"
}
  trial_id objective   booster max_depth n_estimators
0        5   0.90449    gbtree         5          173
1        3  0.904366      dart         7           69
2        1  0.899116  gblinear         6          125
3        2  0.897606    gbtree         8           95
4        4  0.890275      dart         6          137


------
# Training with Tuned Parameters

In [28]:
# Extract the winning trial's hyperparameters. The API returns the numeric
# ones as strings, hence the int() conversions.
_best_hp = best_model['hyperparameters']
BOOSTER = _best_hp['booster']
MAX_DEPTH = int(_best_hp['max_depth'])
N_ESTIMATORS = int(_best_hp['n_estimators'])
print(BOOSTER, MAX_DEPTH, N_ESTIMATORS)


gbtree 5 173


In [29]:
from datetime import datetime
from pytz import timezone

# Job names carry the user plus a US/Pacific timestamp with minute resolution
_train_stamp = datetime.now(timezone('US/Pacific')).strftime("%m%d%y_%H%M")
JOBNAME = 'xgb_train_{}_{}'.format(USER, _train_stamp)

# GCS staging location and the plain training job configuration file
JOB_DIR = 'gs://{}/{}/{}'.format(BUCKET_NAME, FOLDER_NAME, 'jobdir')
JOB_CONFIG = "./config/config.yaml"

print("Job Name = ", JOBNAME)
print("Job Dir  = ", JOB_DIR)


Job Name =  xgb_train_cchatterj_100520_2210
Job Dir  =  gs://chanchal-sandbox/ht-xgb-data/jobdir


In [30]:
# https://cloud.google.com/sdk/gcloud/reference/ai-platform/jobs/submit/training

# Submit the training job.
# Everything after the bare `--` is not interpreted by gcloud; it is forwarded
# as command-line arguments to the trainer.train module. Here those are the
# best hyperparameters found by the tuning job.
! gcloud ai-platform jobs submit training $JOBNAME \
  --package-path $(pwd)/trainer \
  --module-name trainer.train \
  --region $REGION \
  --python-version 3.7 \
  --runtime-version 2.2 \
  --job-dir $JOB_DIR \
  --config $JOB_CONFIG \
  -- \
  --max_depth=$MAX_DEPTH \
  --n_estimators=$N_ESTIMATORS \
  --booster=$BOOSTER


Job [xgb_train_cchatterj_100520_2210] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe xgb_train_cchatterj_100520_2210

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs xgb_train_cchatterj_100520_2210
jobId: xgb_train_cchatterj_100520_2210
state: QUEUED


In [32]:
# Check the training job status; re-run this cell until the reported state is
# SUCCEEDED before deploying the model
! gcloud ai-platform jobs describe $JOBNAME


createTime: '2020-10-06T05:10:06Z'
endTime: '2020-10-06T05:11:39Z'
etag: kusm44RdTdc=
jobId: xgb_train_cchatterj_100520_2210
startTime: '2020-10-06T05:10:38Z'
state: SUCCEEDED
trainingInput:
  args:
  - --max_depth=5
  - --n_estimators=173
  - --booster=gbtree
  jobDir: gs://chanchal-sandbox/ht-xgb-data/jobdir
  packageUris:
  - gs://chanchal-sandbox/ht-xgb-data/jobdir/packages/1217a8da84965ca6bbe79b3857364a0bb8b7ca61cd83a8e5874ab86222be53ed/trainer-0.1.tar.gz
  pythonModule: trainer.train
  pythonVersion: '3.7'
  region: us-central1
  runtimeVersion: '2.2'
  scaleTier: STANDARD_1
trainingOutput:
  consumedMLUnits: 0.07

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/xgb_train_cchatterj_100520_2210?project=codev-257422

View logs at:
https://console.cloud.google.com/logs?resource=ml_job%2Fjob_id%2Fxgb_train_cchatterj_100520_2210&project=codev-257422


--------
# Deploy the Model


In [33]:
MODEL_NAME = "xgb_model"
MODEL_VERSION = "cchatterj_xgb_bst"
MODEL_DIR = 'gs://' + BUCKET_NAME + '/' + FOLDER_NAME + '/xgb_models'
MODEL_FRAMEWORK = "XGBOOST"
MODEL_DESCRIPTION = "SET8_MSE_loss_0.87"

!gcloud ai-platform models list


Using endpoint [https://ml.googleapis.com/]
NAME       DEFAULT_VERSION_NAME
tf_model   cchatterj_tf
xgb_model  cchatterj_xgb_bst


In [34]:
# Optional manual clean-up commands, disabled by default. Uncomment to list
# the models, delete a specific version, or delete the whole model.
#!gcloud ai-platform models list
#!gcloud ai-platform versions delete "Chanchals_Pkl_v1" --model "xgb_model" -q
#!gcloud ai-platform models delete "xgb_model" -q

In [35]:
# create the model if it doesn't already exist
modelname = !gcloud ai-platform models list | grep -w $MODEL_NAME
print(modelname)
if len(modelname) < 1:
    print("Creating model " + MODEL_NAME)
    ! gcloud ai-platform models create $MODEL_NAME --regions $REGION --enable-logging
else:
    print("Model " + MODEL_NAME + " exist")


['Using endpoint [https://ml.googleapis.com/]', 'xgb_model  cchatterj_xgb_bst']
Model xgb_model exist


In [36]:
# delete the model version if it already exists
modelver = !gcloud ai-platform versions list --model $MODEL_NAME | grep -w $MODEL_VERSION
print(modelver)
if modelver[1] == 'Listed 0 items.':
    print("Model version " + MODEL_VERSION + " doesnot exist")
else:
    print("Deleting model " + MODEL_NAME + " version " + MODEL_VERSION)
    !gcloud ai-platform versions delete $MODEL_VERSION --model $MODEL_NAME -q

#List the models
#!gcloud ai-platform models list


['Using endpoint [https://ml.googleapis.com/]', 'cchatterj_xgb_bst  gs://chanchal-sandbox/ht-xgb-data/xgb_models/model_20201005_052148/  READY']
Deleting model xgb_model version cchatterj_xgb_bst
Using endpoint [https://ml.googleapis.com/]
Deleting version [cchatterj_xgb_bst]......done.                                


In [37]:
# List the versions that remain on the model
!gcloud ai-platform versions list --model $MODEL_NAME

Using endpoint [https://ml.googleapis.com/]
Listed 0 items.


In [38]:
# List the models together with their current default version
!gcloud ai-platform models list

Using endpoint [https://ml.googleapis.com/]
NAME       DEFAULT_VERSION_NAME
tf_model   cchatterj_tf
xgb_model


In [39]:
print("Model Directory: ", MODEL_DIR)

# Get a list of model directories
ALL_MODEL_DIRS = ! gsutil ls $MODEL_DIR
# Pick the directory with the latest timestamp, in case you have trained multiple times
if ("CommandException" in ALL_MODEL_DIRS[0]):
    print("Create the model directory first")
else:
    LATEST_MODEL_DIR = ALL_MODEL_DIRS[-1]
print("Latest Model Directory = ", LATEST_MODEL_DIR)

# Deploy the model
! gcloud beta ai-platform versions create $MODEL_VERSION \
  --model=$MODEL_NAME \
  --origin=$LATEST_MODEL_DIR \
  --framework=$MODEL_FRAMEWORK \
  --runtime-version=2.2 \
  --python-version=3.7 \
  --description=$MODEL_DESCRIPTION \
  #--region=$REGION \
  #--labels='some_key'=${'XYZ'},another_key="another_value"


Model Directory:  gs://chanchal-sandbox/ht-xgb-data/xgb_models
Latest Model Directory =  gs://chanchal-sandbox/ht-xgb-data/xgb_models/model_20201005_052148/
Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


In [41]:
# Verify the deployment: the new version should be listed as READY and be the
# model's default version.
# List all models
!gcloud ai-platform models list
# List all versions of the created model
!gcloud ai-platform versions list --model $MODEL_NAME
# Describe the Model
!gcloud ai-platform models describe $MODEL_NAME 


Using endpoint [https://ml.googleapis.com/]
NAME       DEFAULT_VERSION_NAME
tf_model   cchatterj_tf
xgb_model  cchatterj_xgb_bst
Using endpoint [https://ml.googleapis.com/]
NAME               DEPLOYMENT_URI                                                       STATE
cchatterj_xgb_bst  gs://chanchal-sandbox/ht-xgb-data/xgb_models/model_20201005_052148/  READY
Using endpoint [https://ml.googleapis.com/]
defaultVersion:
  createTime: '2020-10-06T05:12:52Z'
  deploymentUri: gs://chanchal-sandbox/ht-xgb-data/xgb_models/model_20201005_052148/
  description: SET8_MSE_loss_0.87
  etag: msePKkMZOjY=
  framework: XGBOOST
  isDefault: true
  machineType: mls1-c1-m2
  name: projects/codev-257422/models/xgb_model/versions/cchatterj_xgb_bst
  pythonVersion: '3.7'
  runtimeVersion: '2.2'
  state: READY
etag: fFEc_OLPHv4=
name: projects/codev-257422/models/xgb_model
onlinePredictionLogging: true
regions:
- us-central1


------
# Predictions from the deployed model with test data

In [42]:
from sklearn.model_selection import train_test_split

INPUT_FILE_NAME = 'Step10_Final_dataset.csv'
BUCKET_NAME = 'chanchal-sandbox'
FOLDER_NAME = 'ht-xgb-data'
_TARGET_COLUMN = 'TARGET'
# Fixed seed so re-running this cell always selects the same test rows
SPLIT_SEED = 42

input_file = 'gs://' + BUCKET_NAME + '/' + FOLDER_NAME + '/' + INPUT_FILE_NAME

# Read the data. Nothing below can work without it, so report and re-raise
# instead of continuing into a NameError (or silently reusing a `dataset`
# left over from an earlier run).
try:
    dataset = pd.read_csv(input_file)
except Exception:
    print("Oops! That is invalid filename. Try again...")
    raise

print(dataset.shape)

# ---------------------------------------
# Pre-processing code from customer
# ---------------------------------------

# Drop useless columns
dataset = dataset.drop(columns=['LOAN_SEQUENCE_NUMBER'])

# Inputs to an XGBoost model must be numeric. One hot encoding was previously
# found to yield better results than label encoding here.
strcols = [col for col in dataset.columns if dataset[col].dtype == 'object']
dataset = pd.get_dummies(dataset, columns=strcols)

# Stratified 80/20 split so class proportions match in both parts
X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop(_TARGET_COLUMN, axis=1),
    dataset[_TARGET_COLUMN],
    stratify=dataset[_TARGET_COLUMN],
    shuffle=True,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
print("X_train shape = ", X_train.shape)
print("X_test  shape = ", X_test.shape)

# Test features with the target as the last column, used by the prediction cells
xgb_test = pd.concat([X_test, y_test], axis=1)



(104044, 47)
X_train shape =  (83235, 148)
X_test  shape =  (20809, 148)


In [49]:
%%time
import googleapiclient.discovery

project_id = 'projects/{}'.format(PROJECT_ID)
model_name = '{}/models/{}'.format(project_id, MODEL_NAME)
service = googleapiclient.discovery.build('ml', 'v1', cache_discovery=False)

#if version is not None:
#    name += '/versions/{}'.format(version)

pprobas = []
batch_size = 1000
n_samples = len(xgb_test) # this upper limit takes several seconds to execute
#for i,j in zip([i for i in range(0, n_samples-batch_size-1, batch_size)],
#               [i for i in range(batch_size, n_samples, batch_size)]):
for i in range(0, n_samples, batch_size):
    j = min(i+batch_size, n_samples)
    print("Processing samples", i, j)
    response1 = service.projects().predict(name=model_name, \
                        body={'instances': xgb_test.drop([_TARGET_COLUMN], axis=1).iloc[i:j].values.tolist()} \
                                          ).execute()
    if 'error' in response1:
        print(response1['error']) #raise RuntimeError(response['error'])
    else:
        pprobas += response1['predictions']
preds = np.argmax(pprobas, axis=1)
print("Predict Proba array size = ", np.array(pprobas).shape)
print("Predict array size = ", np.array(preds).shape)


Processing samples 0 1000
Processing samples 1000 2000
Processing samples 2000 3000
Processing samples 3000 4000
Processing samples 4000 5000
Processing samples 5000 6000
Processing samples 6000 7000
Processing samples 7000 8000
Processing samples 8000 9000
Processing samples 9000 10000
Processing samples 10000 11000
Processing samples 11000 12000
Processing samples 12000 13000
Processing samples 13000 14000
Processing samples 14000 15000
Processing samples 15000 16000
Processing samples 16000 17000
Processing samples 17000 18000
Processing samples 18000 19000
Processing samples 19000 20000
Processing samples 20000 20809
Predict Proba array size =  (20809, 4)
Predict array size =  (20809,)
CPU times: user 1.34 s, sys: 8 ms, total: 1.35 s
Wall time: 28.2 s


In [50]:
import json

# Request payload for `gcloud ai-platform predict`: the first 1000 test rows,
# features only, in the {"instances": [...]} layout
_first_rows = xgb_test.drop([_TARGET_COLUMN], axis=1).iloc[0:1000]
body = {'instances': _first_rows.values.tolist()}
with open('xgb_test_data.json', 'w') as fp:
    fp.write(json.dumps(body))

In [51]:
%%time
# Send the saved JSON request to the deployed model version through gcloud and
# capture the text-formatted output as a list of lines in `predict_results`
predict_results = !gcloud ai-platform predict \
  --model=$MODEL_NAME \
  --version=$MODEL_VERSION \
  --format='text' \
  --json-request='xgb_test_data.json'


CPU times: user 8 ms, sys: 12 ms, total: 20 ms
Wall time: 4.3 s


In [52]:
# Inspect the captured gcloud output: container type and number of lines
print(type(predict_results))
print(np.array(predict_results).shape)
# Show the lines for rows 35-37. Each row contributes 4 lines (one per class);
# the +1 skips element 0, which is not a prediction line (presumably gcloud's
# "Using endpoint" banner -- verify if the output format changes).
predict_results[35*4+1:38*4+1]

<class 'IPython.utils.text.SList'>
(4001,)


['predictions[35][0]:  0.963602',
 'predictions[35][1]:  0.0322716',
 'predictions[35][2]:  0.00266592',
 'predictions[35][3]:  0.00146053',
 'predictions[36][0]:  0.961014',
 'predictions[36][1]:  0.0361663',
 'predictions[36][2]:  0.00148224',
 'predictions[36][3]:  0.00133717',
 'predictions[37][0]:  0.998528',
 'predictions[37][1]:  0.00127727',
 'predictions[37][2]:  9.90401e-05',
 'predictions[37][3]:  9.61582e-05']

In [53]:
import re

# gcloud's text format emits one line per probability, for example
#   "predictions[35][0]:  0.963602"
# Parse by pattern rather than by position, so banner lines are ignored and
# the array shape follows the data instead of hard-coded sizes.
_PREDICTION_LINE = re.compile(r'^predictions\[(\d+)\]\[(\d+)\]:\s*(\S+)\s*$')

_parsed = []
for _line in predict_results:
    _match = _PREDICTION_LINE.match(_line)
    if _match:
        _parsed.append((int(_match.group(1)), int(_match.group(2)), float(_match.group(3))))
if not _parsed:
    raise ValueError("No 'predictions[i][j]: value' lines found in predict_results")

n_samples = 1 + max(row for row, _, _ in _parsed)
n_dim = 1 + max(col for _, col, _ in _parsed)

# Class probabilities as returned by the model
predict_proba_r = np.zeros((n_samples, n_dim), dtype=float)
for _row, _col, _prob in _parsed:
    predict_proba_r[_row, _col] = _prob

# One-hot encoding of the most probable class. Rounding each probability does
# not do this: a row whose largest probability is below 0.5 rounds to all zeros.
predict_r = np.zeros((n_samples, n_dim), dtype=int)
predict_r[np.arange(n_samples), predict_proba_r.argmax(axis=1)] = 1
predict_r[20:40]
    

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])