# Load Testing SageMaker Real-time Inference Endpoints with Locust

Use Locust to learn how model size, instance choice, deployment configuration, and inference parameters affect requests per second and latency.

### Install and import required libraries, and set some variables

In [None]:
%pip install sagemaker boto3 -Uqqq

In [None]:
# Restart the kernel by terminating the current Python process immediately
# (run this after the package install cell, then continue with the next cell).
import os

os._exit(0)

In [None]:
import sagemaker
import boto3
import json

In [None]:
# Execution role of this notebook; passed to the SageMaker Model further below.
role = sagemaker.get_execution_role()
sess = sagemaker.session.Session()
# Read the region through the public accessor instead of the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name
# Runtime client used to invoke the endpoint; pinned to the same region that
# is used to build the container image URI.
smr_client = boto3.client("sagemaker-runtime", region_name=region)

### Configure the Large Model Inference (LMI) container

In [None]:
# Fill in a Hugging Face access token before running this cell.
HF_TOKEN = "<YOUR_HUGGING_FACE_TOKEN>"

MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
MAX_MODEL_LEN = "128000"

# All values are strings because the dict is handed to the container as
# environment variables.

# Which model to load and the credentials used to load it.
_model_source = {
    "HF_MODEL_ID": MODEL_ID,
    "HF_TOKEN": HF_TOKEN,
}

# Context length, batch size, and startup behaviour.
_serving_limits = {
    "OPTION_MAX_MODEL_LEN": MAX_MODEL_LEN,
    "OPTION_MAX_ROLLING_BATCH_SIZE": "8",
    "OPTION_MODEL_LOADING_TIMEOUT": "900",
    "SERVING_FAIL_FAST": "true",
}

# Rolling batch is disabled and the async vLLM service entry point is selected.
_async_engine = {
    "OPTION_ROLLING_BATCH": "disable",
    "OPTION_ASYNC_MODE": "true",
    "OPTION_ENTRYPOINT": "djl_python.lmi_vllm.vllm_async_service",
}

vllm_config = {**_model_source, **_serving_limits, **_async_engine}

### Create the Model

In [None]:
from sagemaker import Model

# Instance type that will host the endpoint
instance_type = "ml.g6e.xlarge"

# DJL inference image in the current region, pinned to a specific release tag
CONTAINER_VERSION = "0.33.0-lmi15.0.0-cu128"
container_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com"
    f"/djl-inference:{CONTAINER_VERSION}"
)

# Endpoint name: the part of the model ID after the "/" in lower case with
# dots replaced by dashes, passed through name_from_base to get the final name.
endpoint_base_name = MODEL_ID.split("/")[1].lower().replace(".", "-")
endpoint_name = sagemaker.utils.name_from_base(endpoint_base_name)

# The container is configured entirely through the vllm_config environment.
model = Model(
    image_uri=container_uri,
    role=role,
    env=vllm_config,
)

# Deploy a single instance with a 900-second container startup health check timeout.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=900,
)

### Invoke the endpoint with a sample prompt

In [None]:
# Add your endpoint here.
# NOTE: this assignment replaces any `endpoint_name` set earlier in the
# notebook (e.g. by the deployment cell), so the placeholder must be replaced
# with the name of a real endpoint before invoking.
endpoint_name = "<YOUR_ENDPOINT_NAME>"

In [None]:
# Chat messages sent to the endpoint: a system instruction that sets the
# response style, followed by the user's question.
messages = [
    {
        "role": "system",
        "content": "Speak in a Medieval British style. Just provide the answer without any preamble or explanation.",
    },
    {
        "role": "user",
        "content": "List at least three popular tourist destinations to visit in England, each with a description.",
    },
]

In [None]:
%%time

# Request payload: the chat messages plus sampling settings, non-streaming.
body = {
    "messages": messages,
    "temperature": 0.5,
    "max_tokens": 512,
    "stream": False,
}

# Show the user's prompt. The first message is the system instruction, so pick
# the most recent message whose role is "user" instead of indexing position 0.
user_prompt = next(
    (m["content"] for m in reversed(body["messages"]) if m["role"] == "user"),
    "",
)
print(f"Prompt: {user_prompt}\n")
print("Response:", end=" ", flush=True)

response = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(body),
    ContentType="application/json",
)

# The response body is a stream of JSON bytes; read it and extract the text of
# the first choice.
response_body = json.loads(response["Body"].read().decode("utf8"))
content = response_body["choices"][0]["message"]["content"]
print(content)