In [1]:
%pip install sagemaker --upgrade  --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
region = sess._region_name  # region name of the current SageMaker Studio environment
account_id = sess.account_id()  # account_id of the current SageMaker Studio environment

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Step 2: Start preparing model artifacts
In LMI contianer, we expect some artifacts to help setting up the model

serving.properties (required): Defines the model server settings
model.py (optional): A python file to define the core inference logic
requirements.txt (optional): Any additional pip wheel need to install

In [3]:
%%writefile serving.properties
engine=Python
option.model_id=mistralai/Mistral-7B-v0.2
option.dtype=fp16
option.task=text-generation
option.tensor_parallel_degree=1
option.device_map=auto

Writing serving.properties


In [4]:
%%writefile requirements.txt
git+https://github.com/huggingface/transformers
accelerate==0.23.0

Writing requirements.txt


In [5]:
%%sh
mkdir mymodel
mv serving.properties mymodel/
mv requirements.txt mymodel/
tar czvf mymodel.tar.gz mymodel/
rm -rf mymodel

mymodel/
mymodel/requirements.txt
mymodel/serving.properties


Step 3: Start building SageMaker endpoint
In this step, we will build SageMaker endpoint from scratch

Getting the container image URI
Large Model Inference available DLC

In [6]:
image_uri = image_uris.retrieve(
        framework="djl-deepspeed",
        region=sess.boto_session.region_name,
        version="0.27.0"
    )

In [7]:
s3_code_prefix = "large-model-lmi/code"
bucket = sess.default_bucket()  # bucket to house artifacts
code_artifact = sess.upload_data("mymodel.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

model = Model(image_uri=image_uri, model_data=code_artifact, role=role)

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-eu-north-1-254455524940/large-model-lmi/code/mymodel.tar.gz


4.2 Create SageMaker endpoint
You need to specify the instance to use and endpoint names

In [9]:
instance_type = "ml.g4dn.12xlarge"
endpoint_name = sagemaker.utils.name_from_base("lmi-model")

model.deploy(initial_instance_count=1,
             instance_type=instance_type,
             endpoint_name=endpoint_name
            )

# our requests and responses will be in json format so we specify the serializer and the deserializer
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
)

--------------------------------------------------------*

UnexpectedStatusException: Error hosting endpoint lmi-model-2024-04-14-13-59-52-545: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html