# _Please make sure the previous Endpoints are deleted before proceeding.  Otherwise, you will see a ResourceLimitExceeded error._


# Deploy Multi-Model Endpoint Using Two Trained TensorFlow Models

https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/tensorflow/deploying_tensorflow_serving.rst#deploying-more-than-one-model-to-your-endpoint

If you have a large number of similar models that you can serve through a shared serving container - and don’t need to access all the models at the same time - you can use SageMaker’s multi-model endpoint (MME) capability. When there is a long tail of ML models that are infrequently accessed, using one multi-model endpoint can efficiently serve inference traffic and enable significant cost savings. 

Multi-model endpoints can automatically load and unload models based on traffic and resource utilization. For example, if traffic to model1 goes to zero and model2 traffic spikes, SageMaker will dynamically unload model1 and load another instance of model2. 

While MME lets you deploy multiple models to a single endpoint and serve them using a single container, you can invoke a specific model by specifying the target model name as a parameter in your prediction request.

<img src="img/multi_model.png" width="80%" align="left">

In [None]:
!pip install -q --upgrade pip
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q tensorflow==2.1.0
!pip install -q transformers==2.8.0

In [None]:
import boto3
import sagemaker
import pandas as pd

# Region and IAM execution role this notebook runs under.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# SageMaker SDK session and its default S3 bucket (used below to build
# the model artifact URIs).
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Low-level SageMaker API client pinned to the region resolved above.
sm = boto3.Session().client(region_name=region, service_name='sagemaker')

# Configure First Training Job

In [None]:
%store -r training_job_name

In [None]:
print(training_job_name)

In [None]:
# S3 location of the first model artifact: the `output/model.tar.gz` object
# under the restored training job name in the default bucket.
model1_s3_uri = 's3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name)
print(model1_s3_uri)

# Configure Second Training Job
For now, let's re-use the same model.

In [None]:
%store -r training_job_name

In [None]:
print(training_job_name)

In [None]:
# S3 location of the second model artifact.  It is built from the same
# `training_job_name`, so for now it points at the same archive as model1.
model2_s3_uri = 's3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name)
print(model2_s3_uri)

# Create a `multi/` Directory Structure
Place the two models under `./multi/model1/` and `./multi/model2/`, following the directory structure shown here:

```
└── multi
    ├── model1
    │   └── <version number>
    │       ├── saved_model.pb
    │       └── variables
    │           └── ...
    └── model2
        └── <version number>
            ├── saved_model.pb
            └── variables
                └── ...
```
              

In [None]:
!rm -rf ./multi/

In [None]:
!mkdir -p ./multi/model1
!mkdir -p ./multi/model2

In [None]:
!aws s3 cp $model1_s3_uri model1.tar.gz
!aws s3 cp $model2_s3_uri model2.tar.gz

In [None]:
!tar xvf model1.tar.gz -C ./multi/model1
!tar xvf model2.tar.gz -C ./multi/model2

In [None]:
!ls -al ./multi/model1/tensorflow/saved_model

In [None]:
!mv ./multi/model1/tensorflow/saved_model/* multi/model1/
!mv ./multi/model2/tensorflow/saved_model/* multi/model2/

In [None]:
!rm -rf multi/model1/tensorflow/
!rm -rf multi/model1/transformers/
!rm -rf multi/model2/tensorflow/
!rm -rf multi/model2/transformers/

In [None]:
!ls -alR ./multi/

# Package Both Models into a Single multi.tar.gz
Note:  This may take a minute.  The models are large.

In [None]:
!tar -czvf multi.tar.gz multi/

# Upload the New Archive to S3

In [None]:
# S3 prefix (under the training job's folder) that the combined
# multi.tar.gz archive is uploaded to in the next cell.
multi_model_s3_uri = 's3://{}/{}/multi'.format(bucket, training_job_name)


In [None]:
!aws s3 cp multi.tar.gz $multi_model_s3_uri/

In [None]:
!aws s3 ls --recursive $multi_model_s3_uri/

# Create SageMaker Model from the Multi-Model

In [None]:
from sagemaker.tensorflow.serving import Model, Predictor

# A multi-model endpoint should name its default model through this
#    environment variable.  Without it the endpoint still works, but
#    which model it selects as the default is unpredictable.
# The value must match one of the model directories in the archive.
env = {'SAGEMAKER_TFS_DEFAULT_MODEL_NAME': 'model1'}

# The combined archive under the multi-model S3 prefix.
model_data = '{}/multi.tar.gz'.format(multi_model_s3_uri)

model = Model(
    model_data=model_data,
    role=role,
    framework_version='2.1.0',
    env=env,
)

# Deploy the Multi-Model as a SageMaker Endpoint
The predictor returned by `model.deploy()` is only for the default model.

In [None]:
# Start endpoint creation without blocking (wait=False); a later cell
# waits for the endpoint to come into service.
deployed_model = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    wait=False,
)

# Keep the endpoint name around: the predictors and the cleanup cell use it.
multi_model_endpoint_name = deployed_model.endpoint
print('Endpoint name:  {}'.format(multi_model_endpoint_name))

In [None]:
# Use the public `IPython.display` module; importing `display` from the
# internal `IPython.core.display` path is deprecated.
from IPython.display import display, HTML

# Render a clickable link to this endpoint's page in the SageMaker console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">REST Endpoint</a></b>'.format(region, multi_model_endpoint_name)))


In [None]:
# The deploy call above used wait=False, so block here until the
# `endpoint_in_service` waiter reports that the endpoint is ready.
client = boto3.client('sagemaker')
waiter = client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=multi_model_endpoint_name)

# Simulate a Prediction from an Application

In [None]:
class RequestHandler(object):
    """Serialize raw text instances into a JSON prediction request body.

    Instances of this class are callable so they can be passed as a
    predictor's ``serializer``.  Each text instance is encoded with the
    configured tokenizer and turned into a dict with ``input_ids``,
    ``input_mask`` and ``segment_ids`` entries.
    """

    def __init__(self, tokenizer, max_seq_length):
        """Store the tokenizer and the fixed sequence length.

        Args:
            tokenizer: Object exposing ``encode_plus(text, pad_to_max_length=...,
                max_length=...)`` that returns a mapping containing
                ``'input_ids'`` and ``'attention_mask'``.
            max_seq_length: Length passed to the tokenizer as ``max_length``
                and used for the all-zero ``segment_ids`` list.
        """
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, instances):
        """Encode ``instances`` and return the request body as a JSON string.

        Args:
            instances: Iterable of text strings to encode.

        Returns:
            A JSON string of the form ``{"instances": [...]}`` with one
            entry per input, in input order.
        """
        # Imported locally so the handler never depends on a notebook-global
        # ``json`` name being defined before it is called.
        import json

        transformed_instances = []

        for instance in instances:
            # Use the tokenizer handed to the constructor, not whatever
            # global happens to be called ``tokenizer``.
            encode_plus_tokens = self.tokenizer.encode_plus(
                instance,
                pad_to_max_length=True,
                max_length=self.max_seq_length)

            transformed_instances.append({
                "input_ids": encode_plus_tokens['input_ids'],
                "input_mask": encode_plus_tokens['attention_mask'],
                # Single-segment input: every position gets segment id 0.
                "segment_ids": [0] * self.max_seq_length,
            })

        return json.dumps({"instances": transformed_instances})

In [None]:
class ResponseHandler(object):
    """Deserialize a JSON prediction response into class labels.

    Instances of this class are callable so they can be passed as a
    predictor's ``deserializer``.
    """

    def __init__(self, classes):
        """Store the label list.

        Args:
            classes: Sequence of labels; the position of the highest score
                in each prediction is used as an index into it.
        """
        self.classes = classes

    def __call__(self, response, accept_header):
        """Return the predicted label for every entry in the response.

        Args:
            response: Object with a ``read()`` method returning the UTF-8
                encoded JSON body; the body must contain a ``"predictions"``
                list holding one list of scores per instance.
            accept_header: Unused; accepted so the handler matches the
                deserializer call signature.

        Returns:
            A list with one element of ``classes`` per prediction.
        """
        # Imported locally so the handler never depends on a notebook-global
        # ``json`` name being defined before it is called.
        import json

        response_body = response.read().decode('utf-8')
        log_probabilities = json.loads(response_body)["predictions"]

        predicted_classes = []
        for log_probability in log_probabilities:
            # Softmax is strictly increasing, so the argmax of the raw scores
            # is the same index the softmax -> argmax pipeline would pick.
            # Doing it in plain Python avoids loading TensorFlow on the client.
            predicted_class_idx = log_probability.index(max(log_probability))
            predicted_classes.append(self.classes[predicted_class_idx])

        return predicted_classes

In [None]:
import json
from sagemaker.tensorflow.serving import Predictor
from transformers import DistilBertTokenizer

# Tokenizer loaded from the pretrained 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Serializer: encodes each review with the tokenizer (max length 128) and
# produces the JSON request body.
request_handler = RequestHandler(tokenizer=tokenizer,
                                 max_seq_length=128)

# Deserializer: maps each prediction to one of these labels, which the
# prediction cells below print as star ratings.
response_handler = ResponseHandler(classes=[1, 2, 3, 4, 5])

In [None]:
# Predictor that targets `model1` (version 0) on the shared endpoint.
predictor_model1 = Predictor(
    endpoint_name=multi_model_endpoint_name,
    sagemaker_session=sess,
    model_name='model1',
    model_version=0,
    content_type='application/json',
    serializer=request_handler,
    deserializer=response_handler,
)

In [None]:
# Predictor that targets `model2` (version 0) on the shared endpoint.
predictor_model2 = Predictor(
    endpoint_name=multi_model_endpoint_name,
    sagemaker_session=sess,
    model_name='model2',
    model_version=0,
    content_type='application/json',
    serializer=request_handler,
    deserializer=response_handler,
)

In [None]:
import tensorflow as tf
import json
    
# Sample reviews to send to model1.
reviews = ["This is great!", "This is not good."]

predicted_classes_model1 = predictor_model1.predict(reviews)

# Show each predicted star rating next to the review it was computed for.
for predicted_class, review in zip(predicted_classes_model1, reviews):
    print('[Predicted Star Rating: {}]'.format(predicted_class), review)

In [None]:
import tensorflow as tf
import json
    
# Sample reviews to send to model2.
reviews = ["This is great!", "This is not good."]

predicted_classes_model2 = predictor_model2.predict(reviews)

# Show each predicted star rating next to the review it was computed for.
for predicted_class, review in zip(predicted_classes_model2, reviews):
    print('[Predicted Star Rating: {}]'.format(predicted_class), review)

# Delete Endpoint

In [None]:
# Tear down the multi-model endpoint created earlier in this notebook.
client = boto3.client('sagemaker')

client.delete_endpoint(EndpointName=multi_model_endpoint_name)

In [None]:
%%javascript

Jupyter.notebook.session.delete();