In [1]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role
# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [2]:
# S3 bucket and object key of the training data file.
bucket_name = 'sagemaker-gopi'
training_file_key = 'biketrain/bike_train_numeric_columns.recordio'

# Full S3 URIs built from the bucket name: the model output prefix and the training file.
s3_model_output_location = f's3://{bucket_name}/biketrain/model'
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'

In [3]:
# Show both S3 URIs, one per line, as a sanity check.
print(s3_model_output_location, s3_training_file_location, sep='\n')

s3://sagemaker-gopi/biketrain/model
s3://sagemaker-gopi/biketrain/bike_train_numeric_columns.recordio


In [4]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# A file name is referred to as a key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 object.

    Args:
        filename: Path of the local file to upload; it is opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Key (object name) to write inside the bucket.

    Returns:
        Whatever the S3 object's ``upload_fileobj`` call returns.
    """
    # Open the file first so a missing file fails before any AWS session is created.
    with open(filename, 'rb') as local_file:
        s3_resource = boto3.Session().resource('s3')
        target_object = s3_resource.Bucket(bucket).Object(key)
        return target_object.upload_fileobj(local_file)

In [5]:
# Upload the local RecordIO training file to the bucket/key configured above.
write_to_s3(filename='bike_train_numeric_columns.recordio',
            bucket=bucket_name,
            key=training_file_key)

## Training Algorithm Docker Image
### AWS Maintains a separate image for every region and algorithm

In [6]:
# Use Spot Instance - Save up to 90% of training cost by using spot instances when compared to on-demand instances
# Reference: https://github.com/aws-samples/amazon-sagemaker-managed-spot-training/blob/main/xgboost_built_in_managed_spot_training_checkpointing/xgboost_built_in_managed_spot_training_checkpointing.ipynb

# if you are still on two-month free-tier you can use the on-demand instance by setting:
#   use_spot_instances = False

# We will use spot for training
# Toggle for managed spot training.
use_spot_instances = True

# Time limits in seconds; max_wait is only set when spot training is enabled.
max_run = 3600
max_wait = None
if use_spot_instances:
    max_wait = 3600

job_name = 'pca-biketrain-v1'

# A checkpoint location is only configured for spot training; otherwise it stays None.
checkpoint_s3_uri = (
    f's3://{bucket_name}/bikerental/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

print(f'Checkpoint uri: {checkpoint_s3_uri}')

Checkpoint uri: s3://sagemaker-gopi/bikerental/checkpoints/pca-biketrain-v1


In [7]:
# SageMaker session; used below for the region lookup and passed to the estimator.
sess = sagemaker.Session()

In [8]:
# Execution role for this notebook; it is handed to the estimator below.
role = get_execution_role()

In [9]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
# Printed here to confirm which role is in use.
print(role)

arn:aws:iam::512185592969:role/onshore-sagemaker-role


In [10]:
# SDK 2 uses image_uris.retrieve to look up the container image location.
# The lookup is done for the region of the current session.

# PCA
container = sagemaker.image_uris.retrieve(
    framework="pca",
    region=sess.boto_region_name,
)

print(f'Using pca Container {container}')

Using pca Container 382416733822.dkr.ecr.us-east-1.amazonaws.com/pca:1


In [11]:
# Display the resolved image URI as the cell output.
container

'382416733822.dkr.ecr.us-east-1.amazonaws.com/pca:1'

## Build Model

In [12]:
# Configure the training job:
#   - which algorithm container image to run
#   - how many instances to use for training and what type of machine
#   - where the trained model artifacts are stored
#   - optionally, a base name for the training job (base_job_name)
#   - spot-training settings defined earlier (time limits, checkpoint location)
# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

# SDK 2.0
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name=job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [13]:
# Specify the hyperparameters appropriate for the training algorithm.
# They are collected in a dict first so they are easy to read and tweak.
pca_hyperparameters = {
    'feature_dim': 4,
    'num_components': 3,
    'subtract_mean': False,
    'algorithm_mode': 'regular',
    'mini_batch_size': 200,
}
estimator.set_hyperparameters(**pca_hyperparameters)

In [14]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'feature_dim': 4,
 'num_components': 3,
 'subtract_mean': False,
 'algorithm_mode': 'regular',
 'mini_batch_size': 200}

### Train the model

In [15]:
# Start the PCA training job; the training file is supplied on the "train" channel.
# Reference: supported channels by built-in algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/common-info-all-im-models.html
estimator.fit(inputs={'train': s3_training_file_location})

INFO:sagemaker:Creating training-job with name: pca-biketrain-v1-2023-06-08-18-02-57-952


2023-06-08 18:02:58 Starting - Starting the training job.........
2023-06-08 18:04:10 Starting - Preparing the instances for training......
2023-06-08 18:05:30 Downloading - Downloading input data
2023-06-08 18:05:30 Training - Downloading the training image......
2023-06-08 18:06:31 Training - Training image download completed. Training in progress..
Docker entrypoint called with argument(s): train
Running default environment configuration script
[06/08/2023 18:06:40 INFO 140228165941056] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}
[06/08/2023 18:06:40 INFO 140228165941056] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'algorithm_mode': 'r... [log output truncated]


2023-06-08 18:07:42 Uploading - Uploading generated training model
2023-06-08 18:07:42 Completed - Training job completed
Training seconds: 152
Billable seconds: 95
Managed Spot Training savings: 37.5%


## Deploy Model

In [16]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# Deploy the trained model behind an endpoint on a single ml.m5.xlarge instance.
# The endpoint reuses job_name as its name.
# NOTE(review): because the endpoint name is fixed, re-running this cell while the
# endpoint still exists will presumably fail - confirm, and delete the endpoint when done.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.xlarge',
                             endpoint_name = job_name)

INFO:sagemaker:Creating model with name: pca-biketrain-v1-2023-06-08-18-16-22-655
INFO:sagemaker:Creating endpoint-config with name pca-biketrain-v1
INFO:sagemaker:Creating endpoint with name pca-biketrain-v1


-----!

## Run Predictions
The code cell below configures the serialization and deserialization formats for the SageMaker predictor.

Serialization is the process of translating data structures or object states into a format (like JSON or CSV) that can be stored (for example, in a file or memory buffer) or transmitted (for example, across a network connection link) and then reconstructed later (possibly in a different computer environment). When you're sending predictions to the model, the input needs to be serialized from its memory representation to a byte stream.

Deserialization is the reverse of that process, translating from the byte stream back into its memory representation. When the model sends its predictions back, the predictions need to be deserialized from the byte stream to a useful data structure in memory.

Here's what each line is doing:

from sagemaker.serializers import CSVSerializer: This is importing the CSVSerializer from the SageMaker SDK. This is a serializer that prepares input data in the CSV format.

from sagemaker.deserializers import JSONDeserializer: This is importing the JSONDeserializer from the SageMaker SDK. This is a deserializer that interprets the returned prediction results from JSON format.

predictor.serializer = CSVSerializer(): This sets the predictor's serializer to CSVSerializer. This means that when you pass data to the predictor.predict function, it will automatically serialize the data into CSV format before sending the data to the model.

predictor.deserializer = JSONDeserializer(): This sets the predictor's deserializer to JSONDeserializer. This means that when the predictor.predict function gets the prediction results from the model, it will automatically deserialize the JSON-formatted results into a Python data structure.

The serializers and deserializers you use will depend on how your model expects to receive and send data. Some models might expect JSON-formatted input and output data, while others might use CSV or another format.

In [17]:
# SDK 2.0 serializers
# Requests are sent to the endpoint as CSV; responses are parsed from JSON.
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
# Attach the request serializer and the response deserializer to the predictor.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [18]:
# One sample row with four feature values, matching feature_dim=4 set on the estimator.
sample_rows = [[-1.333660693, -1.092736969, 0.993213054, 1.567753667]]

# Send the sample to the endpoint; the response is shown as the cell output.
predictor.predict(sample_rows)

{'projections': [{'projection': [2.083782434463501,
    -1.355332374572754,
    -0.40431487560272217]}]}

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
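7. Run Predictions
8. Delete the endpoint when it is no longer needed (for example with `predictor.delete_endpoint()`) - a deployed endpoint keeps incurring charges until it is deleted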