## Creating `model.tar.gz`

In [None]:
import tarfile

# Local artifacts to bundle: the model file and the inference code directory.
model_path = "./inference/lit-bert-for-maskedlm-yymmdd.pth"
code_path = "./inference/code/"

# Destination of the gzip-compressed archive.
zipped_model_path = "./model.tar.gz"

# (source path, name inside the archive) pairs, added in this order.
archive_members = (
    (model_path, "lit-bert-for-maskedlm-yymmdd.pth"),
    (code_path, "code/"),
)

# "w:gz" creates/overwrites the archive with gzip compression; the context
# manager closes it once every member has been written.
with tarfile.open(name=zipped_model_path, mode="w:gz") as archive:
    for source, member_name in archive_members:
        archive.add(source, arcname=member_name)
    
# Upload to S3 Bucket

## Preparation

In [None]:
import boto3
import sagemaker

# Region is read from a freshly created boto3 session.
region = boto3.Session().region_name
# SageMaker API client bound to that region; used by the cells below to
# create the model, the endpoint configuration and the endpoint.
client = boto3.client(service_name="sagemaker", region_name=region)

# --- Placeholders: replace with your own values before running. ---
# Name under which the model is registered in SageMaker.
model_name = "<The_name_of_your_model>"
# Name of the endpoint configuration to create.
endpoint_config_name = "<endpoint-config-name>"
# Name of the endpoint; must be unique within an AWS Region in your AWS account.
endpoint_name = "<endpoint-name>"

## Create a Model

In [None]:
# Role that gives SageMaker permission to access AWS services.
sagemaker_role = sagemaker.get_execution_role()

# Placeholders: replace with the bucket/prefix that holds model.tar.gz.
bucket_name = '<your bucket_name>'
s3_prefix = '<your s3_prefix>'

# S3 URL of the model archive. The f-string already interpolates both values,
# so no extra `.format()` pass is applied (a second pass would raise or
# mis-substitute if either value ever contained braces).
model_url = f"s3://{bucket_name}/{s3_prefix}/model.tar.gz"

# Look up the prebuilt PyTorch inference container image for this region.
from sagemaker import image_uris
image_uri = image_uris.retrieve(
    framework="pytorch",
    region=region,
    version="1.13.1",
    py_version="py39",
    image_scope='inference',
    instance_type="ml.t2.medium"
)

# Register the model: container image + model artifact location + role.
response = client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = sagemaker_role,
    Containers = [{
        "Image": image_uri,
        "Mode": "SingleModel",
        "ModelDataUrl": model_url,
    }]
)

## Create an Endpoint Configuration

Reference:
* [Deploying ML models using SageMaker Serverless Inference](https://aws.amazon.com/blogs/machine-learning/deploying-ml-models-using-sagemaker-serverless-inference-preview/)
* [Docker Registry Paths and Example Code for Asia Pacific (Tokyo) (`image_uri`)](https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-ap-northeast-1.html#pytorch-ap-northeast-1.title)
* [Available Deep Learning Containers Images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md)
* [About create-endpoint-config](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/sagemaker/create-endpoint-config.html)

### Real-time

In [None]:
# instance_type = 'ml.m5.xlarge'  # Example
instance_type = "<instance-type>"

# One ProductionVariant per model hosted at the endpoint; here a single one.
realtime_variant = {
    "VariantName": "mlBert",          # name of the production variant
    "ModelName": model_name,
    "InstanceType": instance_type,    # compute instance type
    "InitialInstanceCount": 1,        # instances launched initially
}

# The config name given here is what a CreateEndpoint request refers to.
endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[realtime_variant],
)

print(f"Created EndpointConfig: {endpoint_config_response['EndpointConfigArn']}")

### Serverless

In [None]:
# NOTE(review): this cell uses the same `endpoint_config_name` as the
# Real-time cell — presumably only one of the two is meant to be run.

# Serverless variant: no instance type/count, only memory size and the
# maximum number of concurrent invocations.
serverless_variant = {
    "VariantName": "mlBert",
    "ModelName": model_name,
    "ServerlessConfig": {
        "MemorySizeInMB": 2048,
        "MaxConcurrency": 1,
    },
}

endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[serverless_variant],
)

print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

## Deploy

In [None]:
# Deploy: create the endpoint from the endpoint configuration created above.
# Uses `endpoint_config_name`, the name passed to create_endpoint_config
# (the previously referenced `mlbert_serverless_epc_name` is never defined).
create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

# Poll describe_endpoint until the endpoint leaves the "Creating" state.
import time
describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
while describe_endpoint_response["EndpointStatus"] == "Creating":
    # Sleep first so no extra 30 s wait happens after creation has finished.
    time.sleep(30)
    describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
    print(describe_endpoint_response["EndpointStatus"])

# Display the final description (last expression of the cell).
describe_endpoint_response

### Make a Prediction

#### SageMaker Studio

In [None]:
import json
import sagemaker

# Runtime client taken from a SageMaker session; used to invoke the endpoint.
sm = sagemaker.Session().sagemaker_runtime_client

# Raw text sent as the request body.
payload = "明日の天気はどうですか。"

# NOTE(review): the body is plain text but is labelled "application/json",
# while the "Locally" cell sends "text/csv" — confirm which content type the
# inference handler expects.
response = sm.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=payload,
)

# Read the streamed response body, decode it and parse it as JSON.
raw_body = response["Body"].read()
result = json.loads(raw_body.decode("utf8"))
result

#### Locally

In [None]:
import json
import boto3

# Runtime client for invoking the endpoint. Bound to its own name so it does
# not overwrite `client`, the SageMaker control-plane client used by the
# create_model / create_endpoint_config / create_endpoint cells.
runtime_client = boto3.client('sagemaker-runtime', region_name='ap-northeast-1')

# Raw text sent as the request body.
payload = "今日の天気はどうですか。"

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name, Body=payload, ContentType="text/csv"
)

# Read the streamed response body, decode it and parse it as JSON.
resp = json.loads(response['Body'].read().decode('utf-8'))
resp

### Test API Gateway

In [None]:
import json
import requests

# Set throttling at 'Stage -> Stage Editor'

# Placeholder: replace with the ID of your API Gateway API.
api_id = '<API Gateway ID>'

# Invoke URL of the deployed stage/resource. The f-string already
# interpolates `api_id`, so no extra `.format()` pass is needed.
url = f"https://{api_id}.execute-api.ap-northeast-1.amazonaws.com/api/bertjsc"

headers = {
    "Content-Type": "application/json",
    "Accept": "application/json"
}
payload = {'text': '今日はいい天気です。'}

# requests has no default timeout; set one so the cell cannot hang forever
# if the API does not answer. (connect timeout, read timeout) in seconds.
resp = requests.post(url, data=json.dumps(payload), headers=headers, timeout=(5, 60))

# Display the parsed JSON response body.
resp.json()