### 1. 安装 huggingface-hub 与 SageMaker SDK，并下载模型到本地

In [1]:
!pip install huggingface-hub -Uqq -i https://pypi.tuna.tsinghua.edu.cn/simple/
!pip install -U sagemaker -i https://pypi.tuna.tsinghua.edu.cn/simple/
!pip install --upgrade sagemaker -i https://pypi.tuna.tsinghua.edu.cn/simple/

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/, https://pip.repos.neuron.amazonaws.com


In [5]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository to download, pinned to one exact commit.
model_name = "shibing624/text2vec-base-chinese"
commit_hash = "26420fdf61ddfd92fafbaf3bc21a7c06b1812248"

# Local directory used as the download cache; created if it does not exist yet.
local_model_path = Path("./sentence2emb-model")
local_model_path.mkdir(exist_ok=True)

In [6]:
# Download the pinned model snapshot. The download can fail intermittently because of
# network problems, so it is retried a bounded number of times. Each failure is printed
# (instead of being silently swallowed) and the last error is re-raised, so a permanent
# problem such as a wrong repo id does not loop forever.
import time

MAX_DOWNLOAD_ATTEMPTS = 10

for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
    try:
        snapshot_download(repo_id=model_name, revision=commit_hash, cache_dir=local_model_path)
        break
    except Exception as err:
        print(f"Download attempt {attempt}/{MAX_DOWNLOAD_ATTEMPTS} failed: {err!r}")
        if attempt == MAX_DOWNLOAD_ATTEMPTS:
            raise
        # Short, capped backoff before the next attempt.
        time.sleep(min(2 ** attempt, 30))

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading (…)12248/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading (…)06b1812248/README.md:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading (…)c06b1812248/logs.txt:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading (…)b1812248/config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading (…)1812248/modules.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Downloading (…)06b1812248/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

### 2. 把模型拷贝到S3为后续部署做准备

In [7]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Use the session's public region attribute rather than the private `_region_name`.
region = sess.boto_region_name
account_id = sess.account_id()

# Low-level clients: S3, the SageMaker control plane, and the SageMaker runtime (invocations).
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")



In [8]:
s3_model_prefix = "LLM-RAG/workshop/sentence2emb-model"  # folder where model checkpoint will go
s3_code_prefix = "LLM-RAG/workshop/sentence2emb_deploy_code"  # folder where the code tarball will go

# Locate the snapshot directory of the pinned revision explicitly, instead of taking
# whichever snapshot happens to be listed first, and fail with a clear message when the
# download has not completed.
model_snapshot_path = next(local_model_path.glob(f"**/snapshots/{commit_hash}"), None)
if model_snapshot_path is None:
    raise FileNotFoundError(
        f"No snapshot for revision {commit_hash} found under {local_model_path}; "
        "run the download cell first."
    )

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: LLM-RAG/workshop/sentence2emb_deploy_code
model_snapshot_path: sentence2emb-model/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248


In [9]:
# Upload the downloaded model snapshot to S3.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: sentence2emb-model/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248/1_Pooling/config.json to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/sentence2emb-model/1_Pooling/config.json
upload: sentence2emb-model/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248/sentence_bert_config.json to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/sentence2emb-model/sentence_bert_config.json
upload: sentence2emb-model/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248/config.json to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/sentence2emb-model/config.json
upload: sentence2emb-model/models--shibing624--text2vec-base-chinese/snapshots/26420fdf61ddfd92fafbaf3bc21a7c06b1812248/modules.json to s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/sentence2emb-model/modules.json
upload: sentence2emb-model/models--shibing624--text2vec-base-c

### 3. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [10]:
# DJL inference container image used for the deployment.
DJL_IMAGE_TAG = "djl-inference:0.21.0-deepspeed0.8.3-cu117"

# Pick the registry from the session region instead of toggling comments by hand:
# China regions use a dedicated account and the `.amazonaws.com.cn` domain, other
# regions use the global registry.
# NOTE(review): confirm the registry account for your region if it is not one of the
# standard commercial or China regions.
if region.startswith("cn-"):
    inference_image_uri = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/{DJL_IMAGE_TAG}"
else:
    inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/{DJL_IMAGE_TAG}"

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117


In [11]:
# Create the local directory that holds the deployment code (model.py, serving.properties).
!mkdir -p sentence2emb_deploy_code

In [49]:
%%writefile sentence2emb_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import BertTokenizer, BertModel

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'--device={device}')


def load_model(properties):
    """Load the BERT encoder and its tokenizer for inference.

    Args:
        properties: Mapping of serving properties handed over by the handler.
            ``model_id`` is used as the model location when present,
            otherwise ``model_dir`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to the
        module-level ``device``, has gradients disabled and is in eval mode.

    Raises:
        KeyError: If neither ``model_id`` nor ``model_dir`` is present.
    """
    # The previous version also read ``tensor_parallel_degree`` into an unused local,
    # which only served to raise KeyError when that option was not configured.
    if "model_id" in properties:
        model_location = properties["model_id"]
    else:
        model_location = properties["model_dir"]
    logging.info(f"Loading model in {model_location}")

    tokenizer = BertTokenizer.from_pretrained(model_location)
    model = BertModel.from_pretrained(model_location)

    # Inference only: place the model on the selected device and freeze it.
    model.to(device)
    model.requires_grad_(False)
    model.eval()

    return model, tokenizer


# Lazily initialised module state: `handle` fills in `model` and `tokenizer` on the
# first request. `generator` is initialised here but not referenced elsewhere in this script.
model = tokenizer = generator = None

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; it is expanded over the last axis of the
            token embeddings, so positions with 0 do not contribute to the sum.

    Returns:
        The mask-weighted sum over axis 1 divided by the mask count over
        axis 1 (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

def handle(inputs: Input):
    """Request entry point: embed the sentences contained in a JSON request.

    The request body is ``{"inputs": <str or list of str>}``. The response body
    is ``{"sentence_embeddings": [<embeddings>]}``, where ``<embeddings>`` is a
    list with one mean-pooled vector per input sentence. Clients therefore read
    the result as ``response["sentence_embeddings"][0]``.

    The previous implementation looped over the input but encoded the *whole*
    input on every iteration (once per character for a plain string), so it
    returned the same batch result repeatedly. The batch is now encoded once
    and wrapped in a single-element list, so element ``[0]`` of the response is
    unchanged.

    Args:
        inputs: Request object; the model is loaded from its properties on the
            first call.

    Returns:
        ``None`` for an empty request, otherwise an ``Output`` carrying the
        JSON result.
    """
    global model, tokenizer
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    logging.info(f"inputs: {input_sentences}")

    # A bare string is a one-sentence batch; never iterate over its characters.
    if isinstance(input_sentences, str):
        input_sentences = [input_sentences]
    else:
        input_sentences = list(input_sentences)

    # Nothing to embed: keep returning an empty result rather than calling the tokenizer.
    if not input_sentences:
        return Output().add_as_json({"sentence_embeddings": []})

    encoded_input = tokenizer(input_sentences, padding=True, truncation=True, return_tensors='pt')
    # The model lives on `device`; its inputs must be on the same device.
    encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}

    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform pooling. In this case, mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    result = {"sentence_embeddings": [sentence_embeddings.tolist()]}
    return Output().add_as_json(result)

Overwriting sentence2emb_deploy_code/model.py


In [51]:
# Print the S3 location of the uploaded model so it can be copied into serving.properties.
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")

# Use the URL printed above to replace the S3 URL in the next code cell.

option.s3url ==> s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/sentence2emb-model/


#### Note: option.s3url 需要按照自己的账号进行修改, 可以拷贝上一个cell的输出

In [52]:
# Generate serving.properties from the bucket and prefix of the current session, so the
# S3 URL never has to be edited by hand and no account-specific bucket name is hard-coded.
serving_properties = "\n".join(
    [
        "engine=Python",
        "option.tensor_parallel_degree=1",
        f"option.s3url = s3://{bucket}/{s3_model_prefix}/",
    ]
) + "\n"

with open("sentence2emb_deploy_code/serving.properties", "w") as properties_file:
    properties_file.write(serving_properties)

print(serving_properties)

Overwriting sentence2emb_deploy_code/serving.properties


In [53]:
!rm s2e_model.tar.gz
!cd sentence2emb_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf s2e_model.tar.gz sentence2emb_deploy_code

sentence2emb_deploy_code/
sentence2emb_deploy_code/serving.properties
sentence2emb_deploy_code/model.py


In [54]:
# Upload the packaged deployment code; the returned S3 URI is used as the model data URL.
s3_code_artifact = sess.upload_data(
    path="s2e_model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-cn-north-1-507392672631/LLM-RAG/workshop/sentence2emb_deploy_code/s2e_model.tar.gz


### 4. 创建模型 & 创建endpoint

In [55]:
from sagemaker.utils import name_from_base
import boto3

# Fixed SageMaker model name; the endpoint config and endpoint below reuse it.
# NOTE(review): this rebinds `model_name`, which earlier held the Hugging Face repo id,
# so re-running the download cell after this one would use the wrong value.
model_name = 'huggingface-inference-eb'
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Container definition: the inference image plus the code tarball uploaded to S3.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

huggingface-inference-eb
Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.21.0-deepspeed0.8.3-cu117
Created Model: arn:aws-cn:sagemaker:cn-north-1:507392672631:model/huggingface-inference-eb


In [56]:
# The endpoint config and the endpoint share the model's name.
endpoint_config_name = model_name
endpoint_name = model_name

# Single production variant serving the model on one instance.
# Optional settings left unset here: VolumeSizeInGB, ModelDataDownloadTimeoutInSeconds,
# ContainerStartupHealthCheckTimeoutInSeconds.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.m5.2xlarge",
    "InitialInstanceCount": 1,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-north-1:507392672631:endpoint-config/huggingface-inference-eb',
 'ResponseMetadata': {'RequestId': '3f9cd9bb-47b3-4027-81f6-a365622f21f2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3f9cd9bb-47b3-4027-81f6-a365622f21f2',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '109',
   'date': 'Wed, 26 Jul 2023 06:12:39 GMT'},
  'RetryAttempts': 0}}

In [57]:
# Start creating the endpoint from the config defined above; creation is asynchronous.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws-cn:sagemaker:cn-north-1:507392672631:endpoint/huggingface-inference-eb


In [58]:
import time

# Poll the endpoint once a minute until it leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here with the reported reason if the endpoint did not come up, instead of
# letting the test cells below fail with a less helpful error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}")

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws-cn:sagemaker:cn-north-1:507392672631:endpoint/huggingface-inference-eb
Status: InService


### 5. 模型测试

In [72]:
import json
import boto3

# Runtime client used to invoke the deployed endpoint.
client = boto3.client('runtime.sagemaker')
# Name of the endpoint to query; same value as the endpoint name used at creation time.
sagemaker_endpoint_name='huggingface-inference-eb'
def query_endpoint_with_json_payload(encoded_json):
    """Invoke the endpoint with an already-encoded JSON request body.

    Args:
        encoded_json: Request body, sent with content type ``application/json``.

    Returns:
        The raw ``invoke_endpoint`` response.
    """
    return client.invoke_endpoint(
        EndpointName=sagemaker_endpoint_name,
        ContentType='application/json',
        Body=encoded_json,
    )

def parse_response_texts(query_response):
    """Extract the first entry of ``sentence_embeddings`` from an endpoint response.

    Args:
        query_response: Mapping whose ``'Body'`` value has a ``read()`` method
            returning the JSON response as bytes.

    Returns:
        Element 0 of the ``"sentence_embeddings"`` list in the decoded JSON.
    """
    raw_body = query_response['Body'].read()
    decoded = json.loads(raw_body.decode())
    return decoded["sentence_embeddings"][0]

In [73]:
# Smoke test: send one request and print the embeddings parsed from the response.
data = {"inputs": "123"}
request_body = json.dumps(data).encode('utf-8')

query_response = query_endpoint_with_json_payload(request_body)
generated_texts = parse_response_texts(query_response)
print(generated_texts)

[[-0.6130953431129456, 0.5462226271629333, 0.5804893374443054, 1.2506815195083618, 0.4986412823200226, -0.7086697220802307, 1.4623937606811523, -0.477876752614975, -0.757655918598175, 0.3172924220561981, 0.21863271296024323, -0.5148187279701233, -0.21170049905776978, -0.6646494269371033, -1.196014165878296, -0.0399162657558918, 0.4442327916622162, 0.10436812043190002, -0.348470538854599, -0.4285512864589691, 0.40278175473213196, -0.3978886604309082, -0.9368495941162109, -0.5383326411247253, -0.15362709760665894, -1.062376856803894, -0.7930343747138977, -0.23352432250976562, 1.127079963684082, 0.7262923121452332, 0.1532280594110489, 0.08634226769208908, -0.522877037525177, 0.8900917172431946, -0.05130075290799141, 0.22277270257472992, 0.6843562722206116, -0.11561364680528641, 0.446622759103775, 0.09158467501401901, 0.4532444477081299, -0.0007762511377222836, -2.1346824169158936, -1.009116768836975, -0.17754371464252472, -0.3280589282512665, 0.8034448623657227, 1.4298042058944702, -1.038

In [61]:
# Inference testing

import time
from sagemaker.huggingface import HuggingFaceModel
# Call the same endpoint through the SDK predictor and time the round trip.
hfp = sagemaker.huggingface.model.HuggingFacePredictor('huggingface-inference-eb')
data = {"inputs": "123"}

t0 = time.time()
for _ in range(1):
    x = hfp.predict(data)
elapsed = time.time() - t0
print(elapsed)

# Full response as returned by the predictor.
print(x)

0.18636798858642578
{'sentence_embeddings': [[[-0.6130953431129456, 0.5462226271629333, 0.5804893374443054, 1.2506815195083618, 0.4986412823200226, -0.7086697220802307, 1.4623937606811523, -0.477876752614975, -0.757655918598175, 0.3172924220561981, 0.21863271296024323, -0.5148187279701233, -0.21170049905776978, -0.6646494269371033, -1.196014165878296, -0.0399162657558918, 0.4442327916622162, 0.10436812043190002, -0.348470538854599, -0.4285512864589691, 0.40278175473213196, -0.3978886604309082, -0.9368495941162109, -0.5383326411247253, -0.15362709760665894, -1.062376856803894, -0.7930343747138977, -0.23352432250976562, 1.127079963684082, 0.7262923121452332, 0.1532280594110489, 0.08634226769208908, -0.522877037525177, 0.8900917172431946, -0.05130075290799141, 0.22277270257472992, 0.6843562722206116, -0.11561364680528641, 0.446622759103775, 0.09158467501401901, 0.4532444477081299, -0.0007762511377222836, -2.1346824169158936, -1.009116768836975, -0.17754371464252472, -0.3280589282512665, 0

In [62]:
# Cleanup: uncomment and run these commands to delete the endpoint, its endpoint config
# and the model created by this notebook once they are no longer needed.
#!aws sagemaker delete-endpoint --endpoint-name huggingface-inference-eb
#!aws sagemaker delete-endpoint-config --endpoint-config-name huggingface-inference-eb
#!aws sagemaker delete-model --model-name huggingface-inference-eb