# Training BYOC on SageMaker
>__Note:__ This assumes a container has already been built and deployed to ECR.

## Prepare the Data

In [22]:
# Load the required libraries
import warnings
import zipfile
import boto3
import os
import json
import urllib.request
import sagemaker
import tempfile
import cv2
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
from sagemaker.estimator import Estimator
from sklearn.model_selection import train_test_split

In [2]:
region = boto3.Session().region_name
sagemaker_session = sagemaker.Session()
sagemaker_client = boto3.client('sagemaker')
bucket = sagemaker.Session().default_bucket()
role = sagemaker.get_execution_role()

## Set Hyperparameters

In [3]:
channels = {'train': 's3://robostig-assets-us-west-2'}

In [4]:
hyperparameters = dict(batch_size=32, learning_rate=.0001, epochs=12)

In [5]:
output_location = "s3://{}".format(bucket)

In [6]:
image_name = '500842391574.dkr.ecr.us-west-2.amazonaws.com/pystig:tf-gpu'
BYOC_estimator = Estimator(
    image_name,
    role=role,
    output_path=output_location,
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    hyperparameters=hyperparameters,
    sagemaker_session=sagemaker_session
)

In [7]:
BYOC_estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: pystig-2018-07-13-03-00-10-948


...................................
[31mUsing TensorFlow backend.[0m
[31mcreating SageMaker trainer environment:[0m
[31mTrainerEnvironment(input_dir='/opt/ml/input', input_config_dir='/opt/ml/input/config', model_dir='/opt/ml/model', output_dir='/opt/ml/output', hyperparameters={'epochs': '12', 'learning_rate': '0.0001', 'batch_size': '32'}, resource_config={'hosts': ['algo-1'], 'network_interface_name': 'ethwe', 'current_host': 'algo-1'}, input_data_config={'train': {'TrainingInputMode': 'File', 'RecordWrapperType': 'None', 'S3DistributionType': 'FullyReplicated'}}, output_data_dir='/opt/ml/output/data', hosts=['algo-1'], channel_dirs={'train': '/opt/ml/input/data/train'}, current_host='algo-1', available_gpus=1, available_cpus=8)[0m
[31mStarting model training ...
[0m
[31mcomma.ai Model Summary
[0m
[31m_________________________________________________________________[0m
[31mLayer (type)                 Output Shape              Param #   [0m
[31mlambda_1 (Lambda)      



[31mEpoch 2/12

  1/226 [..............................] - ETA: 1s - loss: 0.0404
 10/226 [>.............................] - ETA: 1s - loss: 0.0335
 11/226 [>.............................] - ETA: 3s - loss: 0.0338
 12/226 [>.............................] - ETA: 6s - loss: 0.0383[0m
[31m 13/226 [>.............................] - ETA: 9s - loss: 0.0394
 14/226 [>.............................] - ETA: 11s - loss: 0.0384
 15/226 [>.............................] - ETA: 12s - loss: 0.0383
 16/226 [=>............................] - ETA: 14s - loss: 0.0388
 17/226 [=>............................] - ETA: 15s - loss: 0.0409
 18/226 [=>............................] - ETA: 16s - loss: 0.0412[0m
[31m 19/226 [=>............................] - ETA: 17s - loss: 0.0408
 20/226 [=>............................] - ETA: 18s - loss: 0.0419
 21/226 [=>............................] - ETA: 19s - loss: 0.0426
 22/226 [=>............................] - ETA: 20s - loss: 0.0418
 23/226 [==>....................

[31mEpoch 3/12
[0m
[31m  1/226 [..............................] - ETA: 1s - loss: 0.0438
  8/226 [>.............................] - ETA: 1s - loss: 0.0300
 11/226 [>.............................] - ETA: 4s - loss: 0.0283
 12/226 [>.............................] - ETA: 6s - loss: 0.0298
 13/226 [>.............................] - ETA: 9s - loss: 0.0309
 14/226 [>.............................] - ETA: 11s - loss: 0.0313[0m
[31m 15/226 [>.............................] - ETA: 12s - loss: 0.0309
 16/226 [=>............................] - ETA: 14s - loss: 0.0343
 17/226 [=>............................] - ETA: 15s - loss: 0.0372
 18/226 [=>............................] - ETA: 16s - loss: 0.0371
 19/226 [=>............................] - ETA: 17s - loss: 0.0374
 20/226 [=>............................] - ETA: 18s - loss: 0.0370[0m
[31m 21/226 [=>............................] - ETA: 19s - loss: 0.0364
 22/226 [=>............................] - ETA: 19s - loss: 0.0368
 23/226 [==>...........



[31mEpoch 4/12

  1/226 [..............................] - ETA: 1s - loss: 0.0152
  8/226 [>.............................] - ETA: 1s - loss: 0.0128
 11/226 [>.............................] - ETA: 4s - loss: 0.0129
 12/226 [>.............................] - ETA: 6s - loss: 0.0139[0m
[31m 13/226 [>.............................] - ETA: 9s - loss: 0.0167
 14/226 [>.............................] - ETA: 10s - loss: 0.0182
 15/226 [>.............................] - ETA: 12s - loss: 0.0225
 16/226 [=>............................] - ETA: 14s - loss: 0.0224
 17/226 [=>............................] - ETA: 15s - loss: 0.0235
 18/226 [=>............................] - ETA: 16s - loss: 0.0235[0m
[31m 19/226 [=>............................] - ETA: 17s - loss: 0.0233
 20/226 [=>............................] - ETA: 18s - loss: 0.0245
 21/226 [=>............................] - ETA: 19s - loss: 0.0254
 22/226 [=>............................] - ETA: 19s - loss: 0.0271
 23/226 [==>....................

[31mEpoch 5/12

  1/226 [..............................] - ETA: 1s - loss: 0.0264
  9/226 [>.............................] - ETA: 1s - loss: 0.0309[0m
[31m 11/226 [>.............................] - ETA: 4s - loss: 0.0310
 12/226 [>.............................] - ETA: 6s - loss: 0.0306
 13/226 [>.............................] - ETA: 9s - loss: 0.0320
 14/226 [>.............................] - ETA: 11s - loss: 0.0321
 15/226 [>.............................] - ETA: 12s - loss: 0.0312
 16/226 [=>............................] - ETA: 14s - loss: 0.0314[0m
[31m 17/226 [=>............................] - ETA: 15s - loss: 0.0319
 18/226 [=>............................] - ETA: 16s - loss: 0.0318
 19/226 [=>............................] - ETA: 17s - loss: 0.0315
 20/226 [=>............................] - ETA: 18s - loss: 0.0316
 21/226 [=>............................] - ETA: 19s - loss: 0.0319
 22/226 [=>............................] - ETA: 19s - loss: 0.0320[0m
[31m 23/226 [==>...........



[31mEpoch 6/12
[0m
[31m  1/226 [..............................] - ETA: 1s - loss: 0.0318
  8/226 [>.............................] - ETA: 1s - loss: 0.0255
 11/226 [>.............................] - ETA: 4s - loss: 0.0262
 12/226 [>.............................] - ETA: 6s - loss: 0.0255
 13/226 [>.............................] - ETA: 9s - loss: 0.0267
 14/226 [>.............................] - ETA: 11s - loss: 0.0287[0m
[31m 15/226 [>.............................] - ETA: 13s - loss: 0.0291
 16/226 [=>............................] - ETA: 14s - loss: 0.0303
 17/226 [=>............................] - ETA: 15s - loss: 0.0301
 18/226 [=>............................] - ETA: 16s - loss: 0.0297
 19/226 [=>............................] - ETA: 17s - loss: 0.0303
 20/226 [=>............................] - ETA: 18s - loss: 0.0307[0m
[31m 21/226 [=>............................] - ETA: 19s - loss: 0.0307
 22/226 [=>............................] - ETA: 20s - loss: 0.0302
 23/226 [==>...........

[31mEpoch 7/12

  1/226 [..............................] - ETA: 1s - loss: 0.0296
 10/226 [>.............................] - ETA: 1s - loss: 0.0219
 11/226 [>.............................] - ETA: 3s - loss: 0.0210
 12/226 [>.............................] - ETA: 6s - loss: 0.0214
 13/226 [>.............................] - ETA: 9s - loss: 0.0230[0m
[31m 14/226 [>.............................] - ETA: 11s - loss: 0.0234
 15/226 [>.............................] - ETA: 12s - loss: 0.0240
 16/226 [=>............................] - ETA: 14s - loss: 0.0241
 17/226 [=>............................] - ETA: 15s - loss: 0.0248
 18/226 [=>............................] - ETA: 16s - loss: 0.0259[0m
[31m 19/226 [=>............................] - ETA: 17s - loss: 0.0256
 20/226 [=>............................] - ETA: 18s - loss: 0.0258
 21/226 [=>............................] - ETA: 19s - loss: 0.0254
 22/226 [=>............................] - ETA: 19s - loss: 0.0257
 23/226 [==>....................



[31mEpoch 8/12

  1/226 [..............................] - ETA: 1s - loss: 0.0329
  9/226 [>.............................] - ETA: 1s - loss: 0.0217
 11/226 [>.............................] - ETA: 3s - loss: 0.0200[0m
[31m 12/226 [>.............................] - ETA: 6s - loss: 0.0211
 13/226 [>.............................] - ETA: 9s - loss: 0.0208
 14/226 [>.............................] - ETA: 11s - loss: 0.0209
 15/226 [>.............................] - ETA: 12s - loss: 0.0208
 16/226 [=>............................] - ETA: 14s - loss: 0.0211
 17/226 [=>............................] - ETA: 15s - loss: 0.0225[0m
[31m 18/226 [=>............................] - ETA: 16s - loss: 0.0242
 19/226 [=>............................] - ETA: 17s - loss: 0.0243
 20/226 [=>............................] - ETA: 18s - loss: 0.0259
 21/226 [=>............................] - ETA: 19s - loss: 0.0262
 22/226 [=>............................] - ETA: 19s - loss: 0.0266
 23/226 [==>....................

[31mEpoch 9/12

  1/226 [..............................] - ETA: 1s - loss: 0.0373
  9/226 [>.............................] - ETA: 1s - loss: 0.0260[0m
[31m 11/226 [>.............................] - ETA: 3s - loss: 0.0255
 12/226 [>.............................] - ETA: 6s - loss: 0.0279
 13/226 [>.............................] - ETA: 8s - loss: 0.0292
 14/226 [>.............................] - ETA: 10s - loss: 0.0297
 15/226 [>.............................] - ETA: 12s - loss: 0.0307
 16/226 [=>............................] - ETA: 14s - loss: 0.0321[0m
[31m 17/226 [=>............................] - ETA: 15s - loss: 0.0320
 18/226 [=>............................] - ETA: 16s - loss: 0.0316
 19/226 [=>............................] - ETA: 17s - loss: 0.0319
 20/226 [=>............................] - ETA: 18s - loss: 0.0328
 21/226 [=>............................] - ETA: 18s - loss: 0.0328[0m
[31m 22/226 [=>............................] - ETA: 19s - loss: 0.0339
 23/226 [==>...........



[31mEpoch 10/12

  1/226 [..............................] - ETA: 1s - loss: 0.0438
  9/226 [>.............................] - ETA: 1s - loss: 0.0296
 11/226 [>.............................] - ETA: 3s - loss: 0.0295[0m
[31m 12/226 [>.............................] - ETA: 6s - loss: 0.0317
 13/226 [>.............................] - ETA: 9s - loss: 0.0321
 14/226 [>.............................] - ETA: 11s - loss: 0.0343
 15/226 [>.............................] - ETA: 12s - loss: 0.0363
 16/226 [=>............................] - ETA: 14s - loss: 0.0362
 17/226 [=>............................] - ETA: 15s - loss: 0.0363[0m
[31m 18/226 [=>............................] - ETA: 16s - loss: 0.0355
 19/226 [=>............................] - ETA: 17s - loss: 0.0354
 20/226 [=>............................] - ETA: 18s - loss: 0.0360
 21/226 [=>............................] - ETA: 19s - loss: 0.0354
 22/226 [=>............................] - ETA: 19s - loss: 0.0355
 23/226 [==>...................

[31mEpoch 11/12

  1/226 [..............................] - ETA: 1s - loss: 0.0216
  9/226 [>.............................] - ETA: 1s - loss: 0.0164
 11/226 [>.............................] - ETA: 3s - loss: 0.0171[0m
[31m 12/226 [>.............................] - ETA: 6s - loss: 0.0193
 13/226 [>.............................] - ETA: 8s - loss: 0.0208
 14/226 [>.............................] - ETA: 11s - loss: 0.0217
 15/226 [>.............................] - ETA: 12s - loss: 0.0225
 16/226 [=>............................] - ETA: 14s - loss: 0.0234[0m
[31m 17/226 [=>............................] - ETA: 15s - loss: 0.0240
 18/226 [=>............................] - ETA: 16s - loss: 0.0243
 19/226 [=>............................] - ETA: 17s - loss: 0.0240
 20/226 [=>............................] - ETA: 18s - loss: 0.0244
 21/226 [=>............................] - ETA: 19s - loss: 0.0246
 22/226 [=>............................] - ETA: 19s - loss: 0.0245[0m
[31m 23/226 [==>..........



[31mEpoch 12/12
[0m
[31m  1/226 [..............................] - ETA: 1s - loss: 0.0180
  9/226 [>.............................] - ETA: 1s - loss: 0.0198
 11/226 [>.............................] - ETA: 3s - loss: 0.0211
 12/226 [>.............................] - ETA: 6s - loss: 0.0224
 13/226 [>.............................] - ETA: 9s - loss: 0.0228
 14/226 [>.............................] - ETA: 11s - loss: 0.0238[0m
[31m 15/226 [>.............................] - ETA: 12s - loss: 0.0234
 16/226 [=>............................] - ETA: 14s - loss: 0.0241
 17/226 [=>............................] - ETA: 15s - loss: 0.0242
 18/226 [=>............................] - ETA: 16s - loss: 0.0250
 19/226 [=>............................] - ETA: 17s - loss: 0.0256
 20/226 [=>............................] - ETA: 18s - loss: 0.0255[0m
[31m 21/226 [=>............................] - ETA: 19s - loss: 0.0256
 22/226 [=>............................] - ETA: 19s - loss: 0.0264
 23/226 [==>..........

[31mSaving the trained model ...[0m


===== Job Complete =====
Billable seconds: 774


## Training Job Description

In [8]:
response = sagemaker_client.describe_training_job(
    TrainingJobName='pystig-2018-07-13-03-00-10-948'
)
response

{'TrainingJobName': 'pystig-2018-07-13-03-00-10-948',
 'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:722812380636:training-job/pystig-2018-07-13-03-00-10-948',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-west-2-722812380636/pystig-2018-07-13-03-00-10-948/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'batch_size': '32',
  'epochs': '12',
  'learning_rate': '0.0001'},
 'AlgorithmSpecification': {'TrainingImage': '500842391574.dkr.ecr.us-west-2.amazonaws.com/pystig:tf-gpu',
  'TrainingInputMode': 'File'},
 'RoleArn': 'arn:aws:iam::722812380636:role/SageMaker',
 'InputDataConfig': [{'ChannelName': 'train',
   'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://robostig-assets-us-west-2',
     'S3DataDistributionType': 'FullyReplicated'}},
   'CompressionType': 'None',
   'RecordWrapperType': 'None'}],
 'OutputDataConfig': {'KmsKeyId': '',
  'S3OutputPath': 's3://sagemaker-us-west-2-7

---
## Deploy model - Standard
__Use `estimator.deploy()` based on GPU Container training__
>__Note:__ This is not cost effective.

```
predictor = BYOC_estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
```

---
## Deploy Model - CPU Container
__Use separate CPU container and the `sagemaker.Session()` API to specify a different serving container__

### Step 1: Create a new model from the training job, specifying a different container for training, in this case a CPU-based container.

In [14]:
job_name = 'pystig-2018-07-13-03-00-10-948'
BYOC_model = sagemaker_session.create_model_from_job(
    name = job_name.split('-')[0]+'-model',
    training_job_name=job_name,
    role=role,
    primary_container_image='500842391574.dkr.ecr.us-west-2.amazonaws.com/pystig:tf-cpu',
    model_data_url='s3://{}/{}/output/model.tar.gz'.format(bucket, job_name)
)

INFO:sagemaker:Creating model with name: pystig-model


### Step 2: Create a SageMaker Endpoint Configuration

In [15]:
BYOC_endpoint_config_name = sagemaker_session.create_endpoint_config(
    name=job_name.split('-')[0]+'-endpoint-config',
    model_name=BYOC_model,
    initial_instance_count=1,
    instance_type='ml.c4.xlarge'
)

INFO:sagemaker:Creating endpoint-config with name pystig-endpoint-config


### Step 3: Deploy the SageMaker Endpoint

In [16]:
create_endpoint_response = sagemaker_session.create_endpoint(
    endpoint_name=job_name.split('-')[0]+'-endpoint',
    config_name=str(BYOC_endpoint_config_name)
)

INFO:sagemaker:Creating endpoint with name pystig-endpoint


--------------------------------------------------!

In [17]:
sagemaker_client.describe_endpoint(EndpointName=create_endpoint_response)

{'EndpointName': 'pystig-endpoint',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:722812380636:endpoint/pystig-endpoint',
 'EndpointConfigName': 'pystig-endpoint-config',
 'ProductionVariants': [{'VariantName': 'AllTraffic',
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 1,
   'DesiredInstanceCount': 1}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2018, 7, 13, 3, 34, 29, 523000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2018, 7, 13, 3, 38, 40, 737000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '33f9f23d-bc05-40ef-b7ef-e733a795015e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '33f9f23d-bc05-40ef-b7ef-e733a795015e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '676',
   'date': 'Fri, 13 Jul 2018 03:41:09 GMT'},
  'RetryAttempts': 0}}

---
## Test Endpoint (Simuilate pyStig)
### Get Sample Data for predictions

In [18]:
# Helper functions
def download(url):
    """
    Helper function to download individual file from given url.
    
    Arguments:
    url -- full URL of the file to download
    
    Returns:
    filename -- downloaded file name
    """
    filename = url.split("/")[-1]
    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)
    return filename

# To download and extract Sample Data
file = download('https://d17h27t6h515a5.cloudfront.net/topher/2016/December/584f6edd_data/data.zip')

# Extract the file
with zipfile.ZipFile(file) as zf:
    zf.extractall()
    
# Image Transofmrations
def crop(image):
    """
    Crop the image (removing the sky at the top and the car front at the bottom).
    
    Returns:
    Cropped image.
    """
    return image[60:-25, :, :]

def resize(image):
    """
    Resize the image to the input shape used by the network model.
    
    Returns:
    Resized image.
    """
    return cv2.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT), cv2.INTER_AREA)

def rgb2yuv(image):
    """
    Convert the image from RGB to YUV.
    
    Returns:
    YUV image.
    """
    return cv2.cvtColor(image, cv2.COLOR_RGB2YUV)

def load(data_dir, image_file):
    """
    Load RGB images from a file
    """
    return mpimg.imread(os.path.join(data_dir, image_file.strip()))

def transform(image):
    """
    Combine all preprocess functions into one
    """
    image = crop(image)
    image = resize(image)
    image = rgb2yuv(image)
    return image

### Random Sample Image

In [19]:
# Origional 'left' image
IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS = 66, 200, 3
INPUT_SHAPE = (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)
data_df = pd.read_csv('./data/driving_log.csv')
X = data_df[['center', 'left', 'right']].values
y = data_df['steering'].values
random_image = X[100][0]
img = load('data', random_image)

In [20]:
# Simulate pyStig call by first pre-preocessing image and converting to 4D array
endpoint_name = sagemaker_client.describe_endpoint(EndpointName=create_endpoint_response)['EndpointName']
payload = np.array([transform(img)])

In [23]:
runtime_client = boto3.client('sagemaker-runtime')
response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(payload.tolist())
)
prediction = float(json.loads(response['Body'].read().decode('utf-8'))[0])

In [24]:
prediction

-0.06341268122196198

In [25]:
y[100]

-0.05975719