# Deploying MAX optimized models at scale with Amazon SageMaker and MAX Serving

In [None]:
# Install and update necessary packages
!pip install -qU pip awscli boto3 sagemaker transformers

In [None]:
import os

# These variables must be set BEFORE tensorflow / transformers are imported:
# both libraries read their logging configuration while being imported, so
# assigning them afterwards does not silence the import-time output.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'          # hide GPUs from TensorFlow
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"           # TensorFlow C++ log level
os.environ["TRANSFORMERS_VERBOSITY"] = "critical"  # transformers log verbosity

import shutil

import boto3
import sagemaker
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForSequenceClassification

In [None]:
# Set up the AWS handles and identifiers used throughout the notebook:
# a boto3 session, a SageMaker session, the execution role, the SageMaker
# default bucket name, the AWS account ID (via STS) and the session's region.
sess = boto3.Session()
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket_name = sagemaker_session.default_bucket()

caller_identity = boto3.client("sts").get_caller_identity()
account = caller_identity.get("Account")
region = sess.region_name

### Step 1: Download a pre-trained RoBERTa model from Hugging Face

In [None]:
def download_and_save_model(hf_model_name, saved_model_dir):
    """Download a pre-trained model and export it as a TensorFlow SavedModel.

    The model is loaded with ``TFRobertaForSequenceClassification.from_pretrained``.
    Any existing directory at ``saved_model_dir`` is then removed (removal errors
    are ignored) and the model is saved under ``<saved_model_dir>/1/saved_model/``.

    Args:
        hf_model_name: Model identifier passed to ``from_pretrained``.
        saved_model_dir: Directory (as a string) that receives the export.
    """
    pretrained = TFRobertaForSequenceClassification.from_pretrained(hf_model_name)

    # Start from a clean directory so stale files from earlier runs do not linger.
    shutil.rmtree(saved_model_dir, ignore_errors=True)

    export_path = saved_model_dir + "/1/saved_model/"
    tf.saved_model.save(pretrained, export_path)

# Directory under model-repository/ that receives the exported model.
saved_model_dir = "model-repository/roberta"
# Model identifier handed to from_pretrained() for both the model and the tokenizer.
hf_model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest"
# Download the model and write the SavedModel into saved_model_dir.
download_and_save_model(hf_model_name, saved_model_dir)

In [None]:
%%sh
# Write the serving configuration for the roberta model: a CPU instance group,
# "saved_model" as the default model filename, and "max" as the backend.
cat > model-repository/roberta/config.pbtxt <<EOL
instance_group {
  kind: KIND_CPU
}
default_model_filename: "saved_model"
backend: "max"
EOL

# Print the resulting directory layout of the model repository.
tree model-repository

### Step 2: Upload the model to Amazon S3 so that Amazon SageMaker and the MAX Serving container have access to it

In [None]:
shutil.rmtree('model.tar.gz', ignore_errors=True)
!tar -C model-repository -czf model.tar.gz roberta

model_uri = sagemaker_session.upload_data(path="model.tar.gz", 
                                          key_prefix="max-serving-models/roberta/")

### Step 3: Pull the latest MAX Serving container image and push it to Amazon Elastic Container Registry (Amazon ECR)

In [None]:
# Name and tag of the private ECR repository/image created in the next cell.
repo_name = 'sagemaker-max-serving'
image_label = 'v1'
# Public MAX Serving image that is pulled and re-tagged in the next cell.
max_serving_image_uri = "public.ecr.aws/modular/max-serving-de"

# Private ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repo>:<tag>.
image = f'{account}.dkr.ecr.{region}.amazonaws.com/{repo_name}:{image_label}'
# Bare expression so the notebook displays the resulting URI.
image

In [None]:
!aws ecr create-repository --repository-name {repo_name}
!docker pull {max_serving_image_uri}
!docker tag {max_serving_image_uri} {image}
!$(aws ecr get-login --no-include-email --region {region})
!docker push {image}

### Step 4: Create an Amazon SageMaker model and deploy it to the specified instance type
We’ll use an Amazon EC2 c6i.4xlarge instance (`ml.c6i.4xlarge` in SageMaker), on which MAX Engine can deliver up to 2.6x faster performance than TensorFlow.

In [None]:
from sagemaker.model import Model
from datetime import datetime

# Timestamp suffix that makes the model name unique per run.
# %M is minutes; the previous format used %m there, which repeats the month.
date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
model_name = f"MAX-model-roberta-{date}"

# SageMaker model definition: the uploaded model archive served by the
# MAX Serving image pushed to ECR, running under the notebook's execution role.
max_model = Model(
    model_data=model_uri,
    name=model_name,
    role=role,
    image_uri=image,
)

In [None]:
# Timestamp suffix that makes the endpoint name unique per run.
# %M is minutes; the previous format used %m there, which repeats the month.
date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
endpoint_name = f"MAX-endpoint-roberta-{date}"

# Deploy the model to a single ml.c6i.4xlarge instance behind the named endpoint.
predictor = max_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c6i.4xlarge",
    endpoint_name=endpoint_name,
)

### Step 5: Invoke the endpoint to test it


In [None]:
import numpy as np
import json

# Local copy of the model; a later cell reads model.config.id2label from it
# to turn the predicted class index into a label.
model = TFRobertaForSequenceClassification.from_pretrained(hf_model_name)
# Runtime client used to invoke the deployed endpoint.
client = boto3.client("sagemaker-runtime")

In [None]:
text = "MAX Serving and Amazon SageMaker are a match made in heaven"

# Tokenize the sentence into NumPy arrays, explicitly requesting token_type_ids
# so that all three tensors listed below are present.
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
inputs = tokenizer(text, return_tensors="np", return_token_type_ids=True)

# Request body: one entry per tensor, each carrying its name, shape,
# declared datatype and the values as nested Python lists.
payload = {
    "inputs": [
        {
            "name": tensor_name,
            "shape": inputs[tensor_name].shape,
            "datatype": "INT32",
            "data": inputs[tensor_name].tolist(),
        }
        for tensor_name in ("input_ids", "attention_mask", "token_type_ids")
    ]
}

In [None]:
# Send the JSON-encoded payload to the deployed endpoint.
http_response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/octet-stream", Body=json.dumps(payload)
)
# The response body is a stream; read it fully and parse it as JSON.
response = json.loads(http_response["Body"].read().decode("utf8"))
outputs = response["outputs"]

# Flatten before argmax and convert to a plain int. With nested output data
# np.argmax(..., axis=-1) returns an array, which cannot be used as a dict key.
# For already-flat data this yields the same index as before.
predicted_class_id = int(np.argmax(np.ravel(outputs[0]['data'])))
classification = model.config.id2label[predicted_class_id]
print(f"The sentiment of the input statement is: {classification}")

### Step 6: Clean up AWS resources

In [None]:
# Resolve the endpoint config and model behind the endpoint so that all
# three resources can be deleted afterwards.
sm = sess.client("sagemaker")

endpoint_description = sm.describe_endpoint(EndpointName=endpoint_name)
endpoint_config_name = endpoint_description["EndpointConfigName"]

endpoint_config = sm.describe_endpoint_config(EndpointConfigName=endpoint_config_name)
# The model name is taken from the first production variant of the config.
model_name = endpoint_config["ProductionVariants"][0]["ModelName"]

#### Delete endpoint and clean up model and endpoint config

In [None]:
# Delete the endpoint first, then its endpoint config, then the model it served.
sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm.delete_model(ModelName=model_name)

#### Delete model artifacts in Amazon S3

In [None]:
# Delete every object under the prefix that the model archive was uploaded to.
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix="max-serving-models/roberta/").all().delete()

#### Delete the Amazon ECR repository and all the images we created

In [None]:
# Delete the ECR repository created earlier; force=True is passed so the
# repository is removed even though it still contains the pushed image.
ecr = boto3.client('ecr')
ecr.delete_repository(registryId=account,
                      repositoryName=repo_name,
                      force=True)