# Serving a Model as a REST Endpoint

![SageMaker Endpoint Architecture](img/sagemaker-architecture.png)

In [4]:
!pip install -q --upgrade pip
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q tensorflow==2.1.0
!pip install -q transformers==2.8.0

In [5]:
!pip install --upgrade sagemaker==1.56.1

Collecting sagemaker==1.56.1
  Downloading sagemaker-1.56.1.tar.gz (303 kB)
[K     |████████████████████████████████| 303 kB 38.2 MB/s eta 0:00:01
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-1.56.1-py2.py3-none-any.whl size=399435 sha256=fa415e78b38ec27b0f0296a0df3894e768143975badc727b3e0441c6d1932c7e
  Stored in directory: /home/ec2-user/.cache/pip/wheels/ac/49/c4/d5409f26f5ad1ba8e68d657d24149f6de5f759dc9b3899ecaf
Successfully built sagemaker
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 1.56.1.post1
    Uninstalling sagemaker-1.56.1.post1:
      Successfully uninstalled sagemaker-1.56.1.post1
Successfully installed sagemaker-1.56.1


In [6]:
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [7]:
%store -r training_job_name

In [8]:
print(training_job_name)

tensorflow-training-2020-04-30-03-09-12-331


# Copy the Model to the Notebook

In [None]:
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

In [None]:
!tar -xvzf ./model.tar.gz

In [None]:
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

In [12]:
from sagemaker.tensorflow.serving import Model

model = Model(name=training_job_name,
              model_data='s3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name),
              role=role,
              framework_version='2.0.0') # Elastic Inference does not yet support TF 2.1.0 as of sagemaker==1.56.1

In [13]:
model.__dict__

{'model_data': 's3://sagemaker-us-east-1-835319576252/tensorflow-training-2020-04-30-03-09-12-331/output/model.tar.gz',
 'image': None,
 'role': 'arn:aws:iam::835319576252:role/service-role/AmazonSageMaker-ExecutionRole-20191006T135881',
 'predictor_cls': sagemaker.tensorflow.serving.Predictor,
 'env': {},
 'name': 'tensorflow-training-2020-04-30-03-09-12-331',
 'vpc_config': None,
 'sagemaker_session': None,
 '_model_name': None,
 'endpoint_name': None,
 '_is_compiled_model': False,
 '_enable_network_isolation': False,
 'model_kms_key': None,
 'entry_point': None,
 'source_dir': None,
 'dependencies': [],
 'git_config': None,
 'enable_cloudwatch_metrics': False,
 'container_log_level': 20,
 'bucket': None,
 'key_prefix': None,
 'uploaded_code': None,
 'repacked_model_data': None,
 '_framework_version': '2.0.0',
 '_container_log_level': None}

In [16]:
# deployed_model = model.deploy(initial_instance_count=2, # Minimum 2 for high(er) availability 
#                               instance_type='ml.m4.xlarge',
#                               accelerator_type='ml.eia2.medium',
#                               wait=False)

# endpoint_name = deployed_model.endpoint

# print('Endpoint name:  {}'.format(endpoint_name))


Endpoint name:  tensorflow-inference-eia-2020-05-05-02-39-04-365


# Canary Rollouts and A/B Testing

Instead of `deploy()`, we can create an `Endpoint Configuration` with multiple variants for canary rollouts and A/B testing.

In [15]:
import boto3
client = boto3.client("sagemaker")
                      
endpoint_config = client.create_endpoint_config(
    EndpointConfigName = training_job_name,
    ProductionVariants=[
        {
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount': 1,
        'ModelName': model.name,
        'VariantName': 'VariantA-WithOutElasticInference',
#        'AcceleratorType':'ml.eia2.medium'
        },
        {
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount': 1,
        'ModelName': model.name,
        'VariantName': 'VariantB-WithElasticInference',
        'AcceleratorType':'ml.eia2.medium'
        }
    ])

print("Endpoint Config Arn: " + endpoint_config['EndpointConfigArn'])

ClientError: An error occurred (ValidationException) when calling the CreateEndpointConfig operation: Could not find model "arn:aws:sagemaker:us-east-1:835319576252:model/tensorflow-training-2020-04-30-03-09-12-331".

In [None]:
#endpoint_name = 'ImageClassificationEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_response = sagemaker.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)

In [17]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">REST Endpoint</a></b>'.format(region, endpoint_name)))


# _Wait Until the ^^ Endpoint ^^ is Deployed_

In [18]:
client = boto3.client('sagemaker')
waiter = client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)

# Simulate a Prediction from an Application

In [19]:
class RequestHandler(object):
    import json
    
    def __init__(self, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, instances):
        transformed_instances = []

        for instance in instances:
            tokens_a = self.tokenizer.tokenize(instance)

            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self.max_seq_length - 2:
                tokens_a = tokens_a[0:(self.max_seq_length - 2)]

            tokens = []  
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in tokens_a:
                tokens.append(token)
                segment_ids.append(0)  
            tokens.append("[SEP]")
            segment_ids.append(0)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length

            instance = {"input_ids": input_ids, 
                        "input_mask": input_mask, 
                        "segment_ids": segment_ids}

            transformed_instances.append(instance)

        transformed_data = {"instances": transformed_instances}

        return json.dumps(transformed_data)

In [20]:
class ResponseHandler(object):
    import json
    import tensorflow as tf
    
    def __init__(self, classes):
        self.classes = classes
    
    def __call__(self, response, accept_header):
        import tensorflow as tf

        response_body = response.read().decode('utf-8')

        response_json = json.loads(response_body)

        log_probabilities = response_json["predictions"]

        predicted_classes = []

        # Convert log_probabilities => softmax (all probabilities add up to 1) => argmax (final prediction)
        for log_probability in log_probabilities:
            softmax = tf.nn.softmax(log_probability)    
            predicted_class_idx = tf.argmax(softmax, axis=-1, output_type=tf.int32)
            predicted_class = self.classes[predicted_class_idx]
            predicted_classes.append(predicted_class)

        return predicted_classes

In [21]:
import json
from sagemaker.tensorflow.serving import Predictor
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

request_handler = RequestHandler(tokenizer=tokenizer,
                                 max_seq_length=128)

response_handler = ResponseHandler(classes=[1, 2, 3, 4, 5])

predictor = Predictor(endpoint_name=endpoint_name,
                      sagemaker_session=sess,
                      serializer=request_handler,
                      deserializer=response_handler,
                      content_type='application/json',
                      model_name='saved_model',
                      model_version=0)

In [24]:
import tensorflow as tf
import json
    
reviews = ["This is great!", 
           "This is terrible."]

predicted_classes = predictor.predict(reviews)

for predicted_class, review in zip(predicted_classes, reviews):
    print('[Predicted Star Rating: {}]'.format(predicted_class), review)

[Predicted Star Rating: 5] This is great!
[Predicted Star Rating: 1] This is terrible.


# Simulate a (Mini-)Load Test

In [37]:
def _predict(idx):
    reviews = ["This is great!", 
               "This is terrible."]

    predicted_classes = predictor.predict(reviews)

    return predicted_classes

In [38]:
import functools
import multiprocessing

load_predict = functools.partial(_predict)

num_cpus = multiprocessing.cpu_count()

p = multiprocessing.Pool(num_cpus * 3)

In [39]:
%%time

results = p.map(load_predict, range(1,1000))

CPU times: user 113 ms, sys: 176 ms, total: 289 ms
Wall time: 33.5 s


# TODO:  Verify that Elastic Inference is working
https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html

In [None]:
# !aws cloudwatch list-metrics --namespace " AWS/ElasticInference "

# Optimize Cost with TensorFlow and Elastic Inference
https://aws.amazon.com/blogs/machine-learning/optimizing-costs-in-amazon-elastic-inference-with-amazon-tensorflow/