In [16]:
%%time 
import pickle, gzip, urllib.request, json
import numpy as np
import matplotlib.pyplot as plt
import os
import boto3
import re
import copy
import time
import io
import struct
from time import gmtime, strftime
from sagemaker import get_execution_role

# Execution role obtained from the SageMaker SDK; it is handed to the Estimator below.
role = get_execution_role()

# Region name of the default boto3 session.
region = boto3.Session().region_name

# S3 locations used throughout the notebook.
bucket = 'sagemaker-blackjack'  # Replace with your s3 bucket name
prefix = 'sagemaker/blackjack'  # Key prefix inside the bucket under which data is stored

# HTTPS URL of the bucket.
# NOTE(review): this is the dash-style regional host ("s3-<region>"); confirm it
# resolves for the region in use before relying on this value.
bucket_path = f'https://s3-{region}.amazonaws.com/{bucket}'

CPU times: user 68.2 ms, sys: 8.45 ms, total: 76.6 ms
Wall time: 144 ms


In [31]:
import sagemaker

from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the XGBoost container image (version tag '0.90-1') for the region of
# the current boto3 session.
session_region = boto3.Session().region_name
container = get_image_uri(session_region, 'xgboost', '0.90-1')

In [37]:
# S3 URI of the training data (passed to the training input channel below).
train_data = f's3://{bucket}/{prefix}/train'
print(train_data)

# S3 URI given to the Estimator as its output path.
s3_output_location = f's3://{bucket}/{prefix}/xgboost_model_sdk'
print(s3_output_location)

s3://sagemaker-blackjack/sagemaker/blackjack/train
s3://sagemaker-blackjack/sagemaker/blackjack/xgboost_model_sdk


In [42]:
# Estimator that trains with the XGBoost container image under the notebook's role.
xgb_model = sagemaker.estimator.Estimator(
    container,
    role,
    sagemaker_session=sagemaker.Session(),
    output_path=s3_output_location,
    train_instance_count=1,               # number of ML compute instances to use for training
    train_instance_type='ml.m4.xlarge',   # type of ML compute instance for training
    train_volume_size=1,                  # size of storage to attach to the training instance
)

In [43]:
# Hyperparameter reference:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
xgb_hyperparameters = {
    # (Default: 6) Maximum depth of a tree. Increasing this value makes the model
    # more complex and more likely to overfit. 0 indicates no limit.
    'max_depth': 5,
    # (Default: 0.3) Step size shrinkage used in updates to prevent overfitting.
    # After each boosting step the weights of new features are available; eta
    # shrinks them to make the boosting process more conservative.
    'eta': 0.2,
    # (Default: 0) Minimum loss reduction required to make a further partition on
    # a leaf node of the tree. The larger the value, the more conservative the algorithm.
    'gamma': 4,
    # (Default: 1) Minimum sum of instance weight (hessian) needed in a child. If a
    # partition step would produce a leaf whose sum of instance weight is below
    # min_child_weight, the building process gives up further partitioning. In
    # linear regression models this corresponds to a minimum number of instances
    # per node. The larger the value, the more conservative the algorithm.
    'min_child_weight': 6,
    # (Default: 0) 0 means print running messages, 1 means silent mode.
    'silent': 0,
    # (Default: reg:squarederror) Learning task and corresponding objective, e.g.
    # reg:logistic, multi:softmax, reg:squarederror. Full list of valid inputs:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst
    'objective': "multi:softmax",
    # Number of classes. Required if objective is multi:softmax or multi:softprob.
    'num_class': 12,
    # Number of rounds to run the training. Required.
    'num_round': 10,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [44]:
# Describe the training data location in S3 as CSV input.
train_channel = sagemaker.session.s3_input(
    train_data,
    content_type='text/csv',
)

# Channel name -> input mapping handed to fit(); a validation channel can also be added here.
data_channels = dict(train=train_channel)

In [45]:
# Start model training on the configured data channels, with logs enabled so
# the job's progress appears in the cell output.
xgb_model.fit(
    inputs=data_channels,
    logs=True,
)

2020-03-31 20:14:14 Starting - Starting the training job...
2020-03-31 20:14:15 Starting - Launching requested ML instances......
2020-03-31 20:15:15 Starting - Preparing the instances for training...
2020-03-31 20:16:07 Downloading - Downloading input data...
2020-03-31 20:16:25 Training - Downloading the training image...
2020-03-31 20:16:55 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[20:16:57] 2501256x2 matrix with 5002512 entries load

In [46]:
# Deploy the trained model for testing on a single ml.t2.medium instance;
# the returned predictor is used to send CSV payloads below.
xgb_predictor = xgb_model.deploy(
    instance_type='ml.t2.medium',
    initial_instance_count=1,
    content_type='text/csv',
)

-------------!

In [55]:
# Test the deployed model by sending a few CSV rows and printing each raw response.
test_rows = (
    '803,1108',
    '803,1111',  # this data point accurately predicts a split!
    '203,1007',
)
for row in test_rows:
    result = xgb_predictor.predict(row)
    print(result)

b'6.0'
b'11.0'
b'1.0'
