# Deploy a Pre-uploaded Hugging Face Model from S3 to SageMaker

From Github: https://github.com/cloudswb/llm-on-aws.git 

## Step 1: Initialize the Deployment Environment

### 1.1 Install Python Packages

In [1]:
!pip install huggingface_hub -U -q -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install -U sagemaker -q -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install --upgrade sagemaker -q -i https://pypi.tuna.tsinghua.edu.cn/simple

### 1.2 Initialize Python Code

In [2]:
from huggingface_hub import snapshot_download
from pathlib import Path
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# Execution role for the endpoint.
role = sagemaker.get_execution_role()
# SageMaker session for interacting with the different AWS APIs.
sess = sagemaker.session.Session()
# Bucket to house the packaged artifacts.
bucket = sess.default_bucket()

# Read the region through the session's public attribute instead of the
# private `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane, and the SageMaker runtime.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Step 2: Prepare the parameters for deployment
- model_name：HuggingFace 中的模型名称
- s3_model_prefix：模型文件在 S3 中所在的文件夹路径（不包含 bucket name 和文件名称）【需要提前准备】
- s3_code_prefix：模型执行代码在 S3 中所在的文件夹路径（不包含 bucket name 和文件名称）
- endpoint_config_name：要创建的 SageMaker Endpoint Configuration 的名称
- endpoint_name：要创建的 SageMaker Endpoint 的名称
- deploy_cache_location：部署时生成的代码文件所在的本地路径
- inference_image_uri：部署所使用的推理容器镜像 URI

In [3]:
# Hugging Face model id; the S3 prefixes and the local cache path are derived from it.
model_name = "BAAI/bge-large-en-v1.5"
s3_model_prefix = f"llm/model/{model_name}"
s3_code_prefix = f"llm/code/{model_name}"
deploy_cache_location = f"../cache/{model_name}"

# '/' and '.' are replaced with '-' to form the SageMaker model name.
endpoint_model_name = model_name.replace("/", "-").replace(".", "-")
endpoint_config_name = "huggingface-inference-eb-config"
endpoint_name = "huggingface-inference-eb"

# Let the SDK resolve the DJL DeepSpeed 0.23.0 inference image for the current
# region, so the registry account and DNS suffix are not tied to the China
# partition (`amazonaws.com.cn`) as a hand-built URI would be.
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    region=region,
    version="0.23.0",
)

!mkdir -p $deploy_cache_location/code

## Step 3: Prepare code of Model

### 3.1 Prepare the Model Entry Script

In [4]:
%%writefile $deploy_cache_location/code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from FlagEmbedding import FlagModel

# Select the first CUDA device when one is available, otherwise the CPU.
# The value is only printed for diagnostics; it is not passed to FlagModel.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'--device={device}')

def load_model(properties):
    """Build the FlagEmbedding model described by the serving properties.

    Args:
        properties: Mapping of serving properties. ``model_dir`` must be
            present; when a ``model_id`` key is also present its value is
            used as the model location instead.

    Returns:
        A ``FlagModel`` constructed from the resolved model location.

    Raises:
        KeyError: If ``model_dir`` is missing from ``properties``.
    """
    model_location = properties["model_dir"]
    if "model_id" in properties:
        model_location = properties["model_id"]
    logging.info(f"Loading model in {model_location}")

    return FlagModel(model_location)

# Model instance shared across calls to handle(); created on the first call.
model = None


def handle(inputs: Input):
    """Encode the sentences in a request and return their embeddings.

    The request JSON must contain ``inputs`` (a single value or a list of
    sentences). Optional keys: ``is_query`` (default ``False``) and
    ``instruction`` (default ``""``). When both are truthy, the instruction
    is prepended to every sentence before encoding.

    Args:
        inputs: The incoming request object.

    Returns:
        ``None`` when ``inputs.is_empty()`` is true; otherwise an ``Output``
        with the result of ``model.encode`` added as JSON.

    Raises:
        KeyError: If the request JSON has no ``inputs`` key.
    """
    global model
    if model is None:
        model = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()
    logging.info(f"inputs data: {data}")

    # Keep the request object and the payload under distinct names, and
    # normalise a single sentence to a one-element list.
    raw_inputs = data["inputs"]
    input_sentences = raw_inputs if isinstance(raw_inputs, list) else [raw_inputs]

    is_query = data.get("is_query", False)
    instruction = data.get("instruction", "")
    logging.info(f"inputs: {input_sentences}")
    logging.info(f"is_query: {is_query}")
    logging.info(f"instruction: {instruction}")

    if is_query and instruction:
        input_sentences = [instruction + sent for sent in input_sentences]

    sentence_embeddings = model.encode(input_sentences)
    return Output().add_as_json(sentence_embeddings)

Writing ../cache/BAAI/bge-large-en-v1.5/code/model.py


### 3.2 Prepare the model metadata

In [5]:
# Contents of serving.properties: Python engine, tensor-parallel degree 1,
# and the S3 location of the pre-uploaded model files.
submission_ending = (
    "engine=Python\n"
    "option.tensor_parallel_degree=1\n"
    f"option.s3url = s3://{bucket}/{s3_model_prefix}/\n"
)

# Write the file into the local code directory that gets packaged later.
Path(deploy_cache_location, "code", "serving.properties").write_text(submission_ending)

### 3.3 Prepare the Python packages the model depends on

In [6]:
%%writefile $deploy_cache_location/code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.28.1
FlagEmbedding

Writing ../cache/BAAI/bge-large-en-v1.5/code/requirements.txt


### 3.4 Package all the model required resources and upload to S3

In [7]:
!rm $deploy_cache_location/code/model.tar.gz
!cd $deploy_cache_location/code && rm -rf ".ipynb_checkpoints"
!tar czvf $deploy_cache_location/code/model.tar.gz -C $deploy_cache_location code

s3_code_artifact = sess.upload_data(f"{deploy_cache_location}/code/model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

rm: cannot remove ‘../cache/BAAI/bge-large-en-v1.5/code/model.tar.gz’: No such file or directory
code/
code/model.py
code/serving.properties
code/requirements.txt
tar: code: file changed as we read it
S3 Code or Model tar ball uploaded to --- > s3://sagemaker-cn-northwest-1-768219110428/llm/code/BAAI/bge-large-en-v1.5/model.tar.gz


## Step 4: Start Deployment

### 4.1 Create SageMaker Model

In [8]:
from sagemaker.utils import name_from_base
import boto3

# Container definition: the inference image plus the uploaded code tarball.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}

# Register the model with SageMaker under the sanitised model name.
create_model_response = sm_client.create_model(
    ModelName=endpoint_model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

Created Model: arn:aws-cn:sagemaker:cn-northwest-1:768219110428:model/baai-bge-large-en-v1-5


### 4.2 Create Sagemaker Endpoint Configuration

In [9]:
# Single production variant: one ml.g4dn.xlarge instance serving the model.
# Optional settings left disabled: VolumeSizeInGB, ModelDataDownloadTimeoutInSeconds.
production_variant = {
    "VariantName": "variant1",
    "ModelName": endpoint_model_name,
    "InstanceType": "ml.g4dn.xlarge",
    "InitialInstanceCount": 1,
    # 15 minutes, expressed in seconds.
    "ContainerStartupHealthCheckTimeoutInSeconds": 900,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
# Last expression of the cell, so the notebook displays the response.
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-northwest-1:768219110428:endpoint-config/huggingface-inference-eb-config',
 'ResponseMetadata': {'RequestId': 'e28c8f4a-b719-4265-84ea-d93162c62656',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e28c8f4a-b719-4265-84ea-d93162c62656',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '120',
   'date': 'Mon, 13 Nov 2023 08:10:32 GMT'},
  'RetryAttempts': 0}}

### 4.3 Create Sagemaker Endpoint

In [10]:
# Create the endpoint from the endpoint configuration defined earlier.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Created Endpoint: {endpoint_arn}")

Created Endpoint: arn:aws-cn:sagemaker:cn-northwest-1:768219110428:endpoint/huggingface-inference-eb


### 4.4 Monitor the SageMaker Endpoint Creation Progress

In [11]:
import time

# Poll the endpoint once a minute for as long as it reports "Creating",
# printing every observed status along the way.
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

# Report the final state once the endpoint has left "Creating".
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws-cn:sagemaker:cn-northwest-1:768219110428:endpoint/huggingface-inference-eb
Status: InService


## Step 5: (Optional) Configure SageMaker Endpoint Autoscaling

In [12]:
# Application Auto Scaling client.
asg = boto3.client("application-autoscaling")

# The scalable resource is the endpoint variant, addressed by this resource ID.
resource_id = "endpoint/" + endpoint_name + "/variant/variant1"

# Allow the variant's desired instance count to range from 1 to 4.
scalable_target = {
    "ServiceNamespace": "sagemaker",
    "ResourceId": resource_id,
    "ScalableDimension": "sagemaker:variant:DesiredInstanceCount",
    "MinCapacity": 1,
    "MaxCapacity": 4,
}
response = asg.register_scalable_target(**scalable_target)

In [13]:
# Target-tracking settings keyed on the predefined invocations-per-instance metric.
target_tracking_config = {
    "TargetValue": 10.0,  # threshold
    "PredefinedMetricSpecification": {
        "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
    },
    "ScaleInCooldown": 300,  # duration until scale in
    "ScaleOutCooldown": 60,  # duration between scale out
}

# Attach the policy to the endpoint variant's desired instance count.
response = asg.put_scaling_policy(
    PolicyName="Request-ScalingPolicy-" + endpoint_name,
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration=target_tracking_config,
)

## Step 6: (Optional) Testing Model Endpoint

### 6.1 Prepare the Testing method

In [14]:
def get_vector_by_sm_endpoint(questions, sm_client, endpoint_name, *, is_query=False, instruction=""):
    """Request embeddings for ``questions`` from a deployed endpoint.

    Args:
        questions: A sentence or a list of sentences, sent as the ``inputs``
            field of the JSON request body.
        sm_client: A client object exposing ``invoke_endpoint``.
        endpoint_name: Name of the endpoint to invoke.
        is_query: When true, ``is_query`` and ``instruction`` are added to the
            request body. By default neither key is sent.
        instruction: Instruction text sent with a query, for example
            "Represent this sentence for searching relevant passages:".
            For Chinese models the equivalent instruction is
            "为这个句子生成表示以用于检索相关文章：".

    Returns:
        The JSON-decoded response body.
    """
    payload = {"inputs": questions}
    if is_query:
        payload["is_query"] = True
        payload["instruction"] = instruction

    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )
    # The body is a stream: read it once, decode as UTF-8, then parse the JSON.
    return json.loads(response_model["Body"].read().decode("utf8"))

### 6.2 Demo 1: Generate the Embedding Value By Input Text Value

In [15]:
# Two sample sentences, embedded in a single request.
prompts1 = [
    "what is the default brightness setting on this device?",
    "how are you going",
]

# Invoke the endpoint through the SageMaker runtime client and show the vectors.
emb = get_vector_by_sm_endpoint(
    questions=prompts1,
    sm_client=smr_client,
    endpoint_name=endpoint_name,
)
print(emb)

[[-0.02197265625, -0.0224609375, 0.0285491943359375, 0.001750946044921875, -0.05023193359375, -0.0261383056640625, 0.029205322265625, -0.00362396240234375, 0.01363372802734375, 0.0438232421875, -0.021759033203125, -0.01010894775390625, 0.029632568359375, -3.3974647521972656e-06, 0.002918243408203125, -0.033843994140625, -0.041229248046875, -0.003833770751953125, -0.01236724853515625, 0.005950927734375, -0.0023097991943359375, 0.01476287841796875, -0.043731689453125, -0.020294189453125, -0.040435791015625, 0.0377197265625, -0.024688720703125, 0.02069091796875, 0.041961669921875, 0.059326171875, -0.028411865234375, -0.0025043487548828125, 0.0033473968505859375, -0.01641845703125, -0.0016613006591796875, -0.01277923583984375, -0.003093719482421875, 0.031463623046875, -0.037445068359375, -0.0092620849609375, 0.0716552734375, 0.002994537353515625, 0.01145172119140625, -0.067626953125, -0.0181427001953125, -0.0240631103515625, -0.0262603759765625, -0.045013427734375, 0.04327392578125, -0.048

## Step 7: (Optional) Delete All Resources (SageMaker Model, Endpoint, Endpoint Configuration)

In [17]:
# !aws sagemaker delete-endpoint-config --endpoint-config-name $endpoint_config_name
# !aws sagemaker delete-endpoint --endpoint-name $endpoint_name
# !aws sagemaker delete-model --model-name $endpoint_model_name