In [2]:
!pip install -r deployment_requirements.txt

Collecting sagemaker
  Downloading sagemaker-2.173.0.tar.gz (854 kB)
[K     |████████████████████████████████| 854 kB 711 kB/s eta 0:00:01
Collecting boto3<2.0,>=1.26.131
  Downloading boto3-1.28.17-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 27.9 MB/s eta 0:00:01
[?25hCollecting cloudpickle==2.2.1
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting google-pasta
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 7.0 MB/s  eta 0:00:01
Collecting pathos
  Downloading pathos-0.3.1-py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 314 kB/s  eta 0:00:01
Collecting protobuf<5.0,>=3.12
  Downloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)
[K     |████████████████████████████████| 304 kB 13.2 MB/s eta 0:00:01
[?25hCollecting schema
  Downloading schema-0.7.5-py2.py3-none-any.whl (17 kB)
Collecting smdebug_rulesconfig==1.0.1
  Downloading smdebu

In [2]:
# List the S3 buckets visible to the current AWS credentials (via the AWS CLI),
# to find the bucket name used for the SageMaker session below.
!aws s3 ls

2023-07-21 11:14:13 aws-sam-cli-managed-default-samclisourcebucket-1wvml9o4wtjkt
2022-10-06 10:47:47 sagemaker-studio-z8z2mmtoy7b
2022-10-06 10:53:01 sagemaker-us-east-1-927437237063


In [19]:
import sagemaker
import boto3
import os

# Bucket the SageMaker session uses for uploading data, models and logs.
# Defaults to this project's bucket; set SAGEMAKER_SESSION_BUCKET to point the
# notebook at a different bucket/account without editing the code.
sagemaker_session_bucket = os.environ.get(
    "SAGEMAKER_SESSION_BUCKET",
    "sagemaker-us-east-1-927437237063",
)

# IAM role ARN passed to SageMaker when creating the model.
# Defaults to this project's role; set SAGEMAKER_EXECUTION_ROLE_ARN to override.
role = os.environ.get(
    "SAGEMAKER_EXECUTION_ROLE_ARN",
    "arn:aws:iam::927437237063:role/my-sagemaker-exec-role",
)

# Session bound to the chosen default bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved settings so a mis-configured run is easy to spot.
print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker role arn: arn:aws:iam::927437237063:role/my-sagemaker-exec-role
sagemaker session region: us-east-1


In [12]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the ECR URI of the Hugging Face LLM inference container, pinned to 0.8.2.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Show which container image was resolved.
print(f"llm image uri: {llm_image}")


llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04


In [16]:
import json
from sagemaker.huggingface import HuggingFaceModel

# Hosting settings for the endpoint.
# (A larger alternative that was tried: instance_type = "ml.g5.12xlarge")
instance_type = "ml.g5.xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Environment variables handed to the inference container.
config = dict(
  HF_MODEL_ID="google/flan-t5-small",  # model_id from hf.co/models
  SM_NUM_GPUS=json.dumps(number_of_gpu),  # Number of GPU used per replica
  # Optional settings, currently disabled:
  #   MAX_INPUT_LENGTH=json.dumps(1024)   -> Max length of input text
  #   MAX_TOTAL_TOKENS=json.dumps(2048)   -> Max length of the generation (including input text)
  #   HF_MODEL_QUANTIZE="bitsandbytes"    -> enable to quantize
)

# Model object that pairs the container image with the role and environment above.
llm_model = HuggingFaceModel(image_uri=llm_image, env=config, role=role)


In [17]:
# Deploy the model to a real-time endpoint and keep the returned predictor in `llm`.
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  # Block until the deployment has finished: the next cell calls llm.predict(),
  # which fails on a "Run All" if the endpoint is still being created.
  wait=True,
  # volume_size=400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
  # Startup health-check budget for the container to load the model
  # (health_check_timeout is set to 300 in the config cell).
  container_startup_health_check_timeout=health_check_timeout,
)


In [18]:
# Smoke-test the deployed endpoint with a single translation prompt.
llm.predict({"inputs": "Translate to German:  My name is Arthur"})

[{'generated_text': 'Meine Namen ist Arthur.'}]