In [1]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [4]:
bucket_name = 'spk-sagemaker-ncal'
training_file_key = 'biketrain/bike_train.csv'
validation_file_key = 'biketrain/bike_validation.csv'
test_file_key = 'biketrain/bike_test.csv'

s3_model_output_location = r's3://{0}/biketrain/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_file_key)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,validation_file_key)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_file_key)

In [5]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)

s3://spk-sagemaker-ncal/biketrain/model
s3://spk-sagemaker-ncal/biketrain/bike_train.csv
s3://spk-sagemaker-ncal/biketrain/bike_validation.csv
s3://spk-sagemaker-ncal/biketrain/bike_test.csv


In [6]:
# http://boto3.readthedocs.io/en/latest/guide/s3.html
# S3 Naming conventions. 
# ----------------------
# files      = objects in S3.  
# file name  = key name in S3.

def write_to_s3(filename, bucket, key):
    with open(filename,'rb') as f: # Read in binary mode
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [7]:
write_to_s3('bike_train.csv',bucket_name,training_file_key)
write_to_s3('bike_validation.csv',bucket_name,validation_file_key)
write_to_s3('bike_test.csv',bucket_name,test_file_key)

## Training Algorithm Docker Image
### AWS Maintains a separate image for every region and algorithm

In [8]:
# Registry Path for algorithms provided by SageMaker
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

containers = {'us-west-1': '746614075791.dkr.ecr.us-west-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',  # N.Cal
              'us-west-2': '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',  # Oregon 
              'us-east-1': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',  # N.Virginia
              'us-east-2': '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',  # Ohio
              'eu-west-1': '141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3'}  # Ireland 
              #'eu-west-1': '141502667606.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}  # Ireland 

In [9]:
role = get_execution_role()

In [10]:
# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

arn:aws:iam::420656776846:role/service-role/AmazonSageMaker-ExecutionRole-20201103T123604


## Build Model

In [11]:
sess = sagemaker.Session()

In [12]:
#Access appropriate algorithm container image
#Specify how many instances to use for distributed training and what type of machine to use
#Finally, specify where the trained model artifacts needs to be stored
#Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#Optionally, give a name to the training job using base_job_name

#https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-train-model.html

estimator = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                          role, 
                                          train_instance_count=1, 
                                          train_instance_type='ml.m4.xlarge',
                                          output_path=s3_model_output_location,
                                          sagemaker_session=sess,
                                          base_job_name ='xgboost-biketrain-v1')

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [13]:
#Specify hyper parameters that appropriate for the training algorithm
#XGBoost Training Parameter Reference: 
#https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst

#max_depth=5,eta=0.1,subsample=0.7,num_round=150
estimator.set_hyperparameters(max_depth=5,
                              objective="reg:linear",
                              eta=0.1,
                              gamma=4,
                              min_child_weight = 6,
                              subsample=0.7,
                              num_round=150)

### Specify Training Data Location and Optionally, Validation Data Location

In [14]:
# content type can be libsvm or csv for XGBoost
training_input_config = sagemaker.session.s3_input(s3_data=s3_training_file_location,content_type="csv")
validation_input_config = sagemaker.session.s3_input(s3_data=s3_validation_file_location,content_type="csv")

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [21]:
print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://spk-sagemaker-ncal/biketrain/bike_train.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://spk-sagemaker-ncal/biketrain/bike_validation.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


### Train the model

In [17]:
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit({'train':training_input_config, 'validation':validation_input_config})
#xgb_model.fit({'train':training_input_config, 'validation':validation_input_config})

2020-11-03 12:19:28 Starting - Starting the training job...
2020-11-03 12:19:33 Starting - Launching requested ML instances.........
2020-11-03 12:21:02 Starting - Preparing the instances for training...
2020-11-03 12:21:41 Downloading - Downloading input data......
2020-11-03 12:22:42 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[12:23:05] 7620x13 matrix with 99060 entries loaded from /opt/ml/input/data/tra


2020-11-03 12:23:23 Uploading - Uploading generated training model
2020-11-03 12:23:30 Completed - Training job completed
Training seconds: 109
Billable seconds: 109


## Deploy Model

In [22]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type = 'ml.m4.xlarge',
                             endpoint_name = 'xgboost-biketrain-ver1')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: xgboost-biketrain-v1-2020-11-03-12-19-28-634


-------------!

## Run Predictions

In [19]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [20]:
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])

b'47.38355255126953'