# Korean Embedding 모델의 SageMaker 배포 및 추론
- 이 노트북은 SageMaker Notebook Instance의 conda_pytorch_p39 커널에서 테스트되었습니다.

Model Ref:
- BM-K/KoSimCSE-roberta
    - https://huggingface.co/BM-K/KoSimCSE-roberta
Inference Code Ref:    
- Huggingface Sagemaker-sdk - Deploy 🤗 Transformers for inference
    - https://github.com/huggingface/notebooks/blob/main/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb
- Sentence Embeddings with Hugging Face Transformers, Sentence Transformers and Amazon SageMaker - Custom Inference for creating document embeddings with Hugging Face's Transformers
    - https://github.com/huggingface/notebooks/blob/main/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb
    

# 0. 기본 환경 설정

In [23]:
%load_ext autoreload
%autoreload 2

# Add '../common_code' to the module search path.
# NOTE(review): the previous comment called this the "src" folder, although the path added is '../common_code'.
import sys
sys.path.append('../common_code')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import time
import boto3
import logging
import sagemaker
from datetime import datetime
from sagemaker.model import Model
from sagemaker.session import Session
from sagemaker.utils import name_from_base
from sagemaker import image_uris, model_uris, script_uris, hyperparameters

In [9]:
# Root logger; the cells below use it for progress messages.
logger = logging.getLogger()
# Write INFO and above to stderr as comma-separated fields:
# timestamp, module, process name, level, message.
logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr)

In [25]:
# SageMaker session, used here to look up the ARN of the calling identity.
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()
# Region name taken from the default boto3 session.
aws_region = boto3.Session().region_name
# NOTE(review): `sess` and `model_version` are not referenced by any other cell
# visible in this notebook — confirm before removing them.
sess = sagemaker.Session()
model_version = "*"
logger.info(f"aws_role={aws_role}, aws_region={aws_region}")

2023-06-14 05:48:43,518,232507896,MainProcess,INFO,aws_role=arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436, aws_region=us-east-1


In [26]:
# Models to deploy. The deployment loop reads these keys from each entry:
# "model_id", "model_version", "instance_type", "instance_count" and "env",
# plus an optional "predictor_cls" (looked up with .get()).
MODEL_CONFIG_LIST = [
    # Disabled entry, kept for reference.
    # NOTE(review): it references `Predictor`, which none of the visible cells import.
    # {
    #     "model_id": "huggingface-text2text-flan-t5-xxl",
    #     "model_version": "*",
    #     "instance_type": "ml.g5.12xlarge",
    #     "instance_count": 1,
    #     "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
    #     "predictor_cls": Predictor
    # },
    {
        "model_id": "huggingface-textembedding-gpt-j-6b",
        "model_version": "*",
        "instance_type": "ml.g5.12xlarge",
        "instance_count": 1,
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "2"},
    }
]

In [None]:
# ANSI escape sequences used to print the final log message in bold.
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

# Deploy each configured model to its own endpoint, one at a time.
for model in MODEL_CONFIG_LIST: 
    start = time.time()
    # NOTE(review): the base name already ends in a timestamp and name_from_base
    # seems to append one of its own — the endpoint name logged by an earlier run
    # reads "...-gpt-j-6b-2023-2023-06-14-05-33-52-880". Confirm this is intended.
    time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    endpoint_name = name_from_base(f"{model['model_id']}-{time_stamp}")
    logger.info(f"going to deploy model={model}, endpoint_name={endpoint_name}")    
    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model['model_id'],
        model_version=model['model_version'],
        instance_type=model['instance_type'],
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=model['model_id'], model_version=model['model_version'], model_scope="inference"
    )
    logger.info(f"deploy_image_uri={deploy_image_uri}, model_uri={model_uri}")
    # The model object and the endpoint are both given the same name (endpoint_name).
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=model.get("predictor_cls"),
        name=endpoint_name,
        env=model['env'],
    )
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=model['instance_count'],
        instance_type=model['instance_type'],
        predictor_cls=model.get("predictor_cls"),
        endpoint_name=endpoint_name,
    )
    time_taken = time.time() - start
    logger.info(f"{bold}model={model['model_id']} has been deployed successfully at endpoint={endpoint_name}, took {time_taken}seconds{unbold}{newline}")
    # Record the endpoint name on the config entry itself.
    model["endpoint_name"] = endpoint_name

2023-06-14 05:33:52,880,3445534480,MainProcess,INFO,going to deploy model={'model_id': 'huggingface-textembedding-gpt-j-6b', 'model_version': '*', 'instance_type': 'ml.g5.12xlarge', 'instance_count': 1, 'env': {'TS_DEFAULT_WORKERS_PER_MODEL': '2'}}, endpoint_name=huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880
2023-06-14 05:33:52,892,3445534480,MainProcess,INFO,deploy_image_uri=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38, model_uri=s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.1/infer-prepack-huggingface-textembedding-gpt-j-6b.tar.gz
2023-06-14 05:33:52,917,session,MainProcess,INFO,Creating model with name: huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880
2023-06-14 05:33:53,789,session,MainProcess,INFO,Creating endpoint-config with name huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880
2023-06-14 05:33:54,266,session,MainProcess,INFO,Creating endpoint with name huggingface-textembeddin

--------

In [27]:
text1 = "How cute your dog is!"
text2 = "Your dog is so cute."
text3 = "The mitochondria is the powerhouse of the cell."

### Query endpoint that you have created
You can query the endpoint with a batch of input texts within a json payload. Here, we send a single request to the endpoint and the parsed response is a list of the embedding vectors.

In [28]:
# ANSI escape sequences (same values as in the deployment cell).
newline, bold, unbold = '\n', '\033[1m', '\033[0m'
# NOTE(review): hard-coded endpoint name that matches the deployment log of an
# earlier run; update it to the endpoint that is actually in service.
endpoint_name = 'huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880'


def query_endpoint_with_json_payload(encoded_json):
    """Invoke the endpoint named by the global ``endpoint_name`` with a JSON body.

    Args:
        encoded_json: Request body, passed unchanged as ``Body`` together with
            the content type ``application/json``.

    Returns:
        The response object returned by ``invoke_endpoint``.
    """
    runtime = boto3.client('runtime.sagemaker')
    return runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=encoded_json,
    )


def parse_response_multiple_texts(query_response):
    """Extract the ``embedding`` entry from an endpoint response.

    Args:
        query_response: Mapping whose ``'Body'`` value is a readable stream
            containing a JSON document.

    Returns:
        The value stored under the ``'embedding'`` key of the decoded JSON.
    """
    raw_body = query_response['Body'].read()
    return json.loads(raw_body)['embedding']


# `json` is used by this cell and by parse_response_multiple_texts, but no earlier
# cell imports it; import it here so a top-to-bottom run of the notebook works.
import json

# Send all three sentences in a single request and keep the returned embeddings.
payload = {"text_inputs": [text1, text2, text3]}
query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
embeddings = parse_response_multiple_texts(query_response)

In [36]:
embeddings[0]

[0.0013489107368513942,
 -0.018385330215096474,
 -0.016965681686997414,
 -0.009253906086087227,
 0.015878183767199516,
 0.029750213027000427,
 -0.018147243186831474,
 0.0013538436032831669,
 0.015019044280052185,
 -0.008157066069543362,
 -0.0032542694825679064,
 0.024895988404750824,
 -0.0044600097462534904,
 0.01345666404813528,
 0.012466981075704098,
 -0.005835961550474167,
 -0.0019635267090052366,
 -0.015397023409605026,
 -0.010526093654334545,
 0.01141903642565012,
 0.010193578898906708,
 -0.012928756885230541,
 -0.006018274463713169,
 -0.003646490629762411,
 -0.020154835656285286,
 -0.01177288219332695,
 0.006410220172256231,
 0.014053774066269398,
 0.012873170897364616,
 0.004196937195956707,
 -0.015345822088420391,
 -0.0025690183974802494,
 0.010213232599198818,
 -0.010201680473983288,
 0.001645726733841002,
 0.00023351395793724805,
 6.585427763639018e-05,
 -0.002791732084006071,
 0.008848433382809162,
 -0.009319859556853771,
 0.008111496455967426,
 -0.02300303615629673,
 0.0189

# 5. 엔드포인트 삭제

In [31]:
class clean_up():
    """Helper for tearing down a SageMaker endpoint and its related resources."""

    def delete_endpoint(self, client, endpoint_name, is_del_model=True):
        """Delete an endpoint, its endpoint config and, optionally, its model.

        The endpoint config name is read from ``describe_endpoint`` and the
        model name from the first entry of ``ProductionVariants`` in
        ``describe_endpoint_config``; models of any further variants are not
        touched.

        Args:
            client: Object exposing ``describe_endpoint``,
                ``describe_endpoint_config``, ``delete_model``,
                ``delete_endpoint`` and ``delete_endpoint_config`` (the
                notebook passes ``boto3.client('sagemaker')``).
            endpoint_name: Name of the endpoint to delete.
            is_del_model: When true, the model is deleted as well.
        """
        endpoint_info = client.describe_endpoint(EndpointName=endpoint_name)
        endpoint_config_name = endpoint_info['EndpointConfigName']

        config_info = client.describe_endpoint_config(
            EndpointConfigName=endpoint_config_name
        )
        model_name = config_info['ProductionVariants'][0]['ModelName']

        # Each message is printed right after its deletion call returns, so the
        # output lists only what was really deleted — in particular, no
        # "Deleted model" line when the model is kept.
        if is_del_model:
            client.delete_model(ModelName=model_name)
            print(f'--- Deleted model: {model_name}')

        client.delete_endpoint(EndpointName=endpoint_name)
        print(f'--- Deleted endpoint: {endpoint_name}')

        client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
        print(f'--- Deleted endpoint_config: {endpoint_config_name}')

In [32]:
clean = clean_up()
sm_client = boto3.client('sagemaker')

# Delete the endpoint and its endpoint config; is_del_model=True also deletes the model.
clean.delete_endpoint(sm_client, endpoint_name ,is_del_model=True)

--- Deleted model: huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880
--- Deleted endpoint: huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880
--- Deleted endpoint_config: huggingface-textembedding-gpt-j-6b-2023-2023-06-14-05-33-52-880


## 추론 테스트 및 문장 유사도 측정
- 아래 첫 번째 문장과 두 번째 문장의 유사도를 구합니다.
- 아래 첫 번째 문장과 세 번째 문장의 유사도를 구합니다.
- 최종적으로 두 유사도 수치를 비교합니다.

In [5]:
def show_embedding_score(tokenizer, model, sentences):
    """Print how similar the first sentence is to the second and to the third.

    Args:
        tokenizer: Callable that turns ``sentences`` into keyword inputs for
            ``model`` (called with padding, truncation and PyTorch tensors).
        model: Callable returning a 2-tuple when called with
            ``return_dict=False``; only the first element is used.
        sentences: At least three sentences.

    The vector at index 0 of each sentence's output is passed to ``cal_score``,
    and the two resulting scores are printed on one line.
    """
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    token_embeddings, _ = model(**encoded, return_dict=False)

    # Index 0 along the second axis selects each sentence's first vector.
    anchor = token_embeddings[0][0]
    first_vs_second = cal_score(anchor, token_embeddings[1][0])
    first_vs_third = cal_score(anchor, token_embeddings[2][0])

    print(first_vs_second, first_vs_third)

def cal_score(a, b):
    """Return the cosine similarity of ``a`` and ``b`` multiplied by 100.

    A 1-D tensor is treated as a single row. For 2-D inputs the result has one
    row per row of ``a`` and one column per row of ``b``.
    """
    # Promote plain vectors to a one-row matrix.
    a = a.unsqueeze(0) if a.dim() == 1 else a
    b = b.unsqueeze(0) if b.dim() == 1 else b

    # Scale every row to unit length; the matrix product then yields cosines.
    unit_a = a / a.norm(dim=1, keepdim=True)
    unit_b = b / b.norm(dim=1, keepdim=True)
    return torch.mm(unit_a, unit_b.transpose(0, 1)) * 100
    

In [6]:
# Three Korean test questions about the Bundang E-mart store: the first two both ask
# whether it is open on Sunday; the third asks how late it is open on Saturday.
sentences1 = ['이번 주 일요일에 분당 이마트 점은 문을 여나요',
             '일요일에 분당 이마트는 문 열어요?',
             '분당 이마트 점은 토요일에 몇 시까지 하나요']

# NOTE(review): `tokenizer` and `model` are not defined in any cell visible in this
# notebook — they have to be loaded before this call.
show_embedding_score(tokenizer, model, sentences1)    

tensor([[92.7287]], grad_fn=<MulBackward0>) tensor([[79.8030]], grad_fn=<MulBackward0>)


# 2. 세이지 메이커로 모델 배포

In [7]:
import sagemaker
import boto3

# Try the SageMaker execution role first; if that raises ValueError, fall back to
# looking up the IAM role named 'sagemaker_execution_role'.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

print(f"sagemaker role arn: {role}")

sagemaker role arn: arn:aws:iam::057716757052:role/mlops-blog-ncf-gsmoon


## HF Model ID, HF_TASK 정의

In [8]:
from sagemaker.huggingface import HuggingFaceModel

# Hub Model configuration. https://huggingface.co/models
# This dict is handed to the model below through its `env` argument.
hub = {
  'HF_MODEL_ID':'BM-K/KoSimCSE-roberta', # model_id from hf.co/models
  'HF_TASK':'feature-extraction'
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
   env=hub,
   role=role, # iam role with permissions to create an Endpoint
   transformers_version="4.26", # transformers version used
   pytorch_version="1.13", # pytorch version used
   py_version="py39", # python version of the DLC
)

## 모델 배포

In [9]:
from datetime import datetime

# Timestamp suffix (second resolution) for the endpoint name.
time_stamp = f"{datetime.now():%Y-%m-%d-%H-%M-%S}"

endpoint_name = f"KoSimCSE-roberta-{time_stamp}"

In [10]:
%%time

# Deploy the model to a SageMaker endpoint named `endpoint_name`,
# backed by a single ml.m5.xlarge instance.
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   endpoint_name = endpoint_name,
   instance_type="ml.m5.xlarge"
)

----!CPU times: user 246 ms, sys: 5.36 ms, total: 251 ms
Wall time: 2min 32s


# 3. 추론

In [11]:
# Second set of test questions; wrapped into payload_2 in the next cell.
# NOTE(review): '아미트' in the third sentence looks like a typo for '이마트'. It is left
# as is because changing the text would change the similarity scores recorded below.
sentences2 = ['분당 이마트점에 KT 대리점이 있나요?',
             '거기 이마트점에 KT 대리점이 있나요?',
             '분당 아미트 점은 지하 주차장이 있나요?']


In [12]:
import numpy as np

# Request bodies for the endpoint: each wraps a list of sentences under the "inputs" key.
payload_1 = {
    "inputs" : sentences1
}

payload_2 = {
    "inputs" : sentences2
}

def predict_payload(data):
    """Send ``data`` to the global ``predictor`` and return the result as an array.

    Args:
        data: Request payload, passed unchanged to ``predictor.predict``.

    Returns:
        ``numpy.ndarray`` built from the response. When the response is a
        ragged nested list (sub-lists of different lengths), the result is an
        object array whose elements are the original Python lists.
    """
    res = predictor.predict(data=data)
    try:
        return np.array(res)
    except ValueError:
        # NumPy 1.24+ refuses to build an array from ragged nested lists
        # unless dtype=object is given; older versions only warned and
        # produced the same object array.
        return np.array(res, dtype=object)



## Sample Test (한개의 문장 임베딩 보여 주기)

In [13]:
# Single-sentence request: "inputs" is one string rather than a list.
payload_0 = {
    "inputs" : "이번 주 일요일에 분당 이마트 점은 문을 여나요"
}


response = predict_payload(payload_0)
# The length of response[0][0] is reported as the embedding size
# (presumably this is the first token's vector — confirm against the model output format).
emb_len = len(response[0][0])
print("payload_0 : \n", payload_0)
print("embeding size: ", emb_len)
print(f"embeding content from 0 to 10 out of {emb_len}: ", response[0][0][0:10])

payload_0 : 
 {'inputs': '이번 주 일요일에 분당 이마트 점은 문을 여나요'}
embeding size:  768
embeding content from 0 to 10 out of 768:  [-0.2569178  -0.19823857  0.89699048 -1.70428038 -0.11973442  0.28725004
  0.39328021 -0.48056296 -0.17160198 -0.66424179]


In [14]:
def show_embedding_score2(payload):
    """Query the endpoint and print first-vs-second and first-vs-third scores.

    Layout of ``res`` as returned by ``predict_payload`` (author's notes,
    translated):
        * 1st dimension: samples; 2nd dimension: placeholder;
          3rd dimension: tokens (CLS first, then the other tokens).
        * ``res.shape`` is ``(3, 1)`` for a three-sentence payload.
        * ``len(res[1][0])`` is the token count of the second sample
          (11 in the author's example).
        * ``res[1][0][0]`` is the first-token embedding of the second sample
          (size 768 according to the single-sentence sample output in this
          notebook; the original note said 764 — confirm).

    Args:
        payload: Request payload with at least three input sentences.
    """
    res = predict_payload(payload)
    # First-token vector of each of the first three samples.
    cls_0, cls_1, cls_2 = (torch.Tensor(res[i][0][0]) for i in range(3))

    first_vs_second = cal_score(cls_0, cls_1)
    first_vs_third = cal_score(cls_0, cls_2)
    print(first_vs_second, first_vs_third)
    


In [15]:
print("payload_1: \n", payload_1)
show_embedding_score2(payload_1)

payload_1: 
 {'inputs': ['이번 주 일요일에 분당 이마트 점은 문을 여나요', '일요일에 분당 이마트는 문 열어요?', '분당 이마트 점은 토요일에 몇 시까지 하나요']}
tensor([[92.7287]]) tensor([[79.8030]])


  res = np.array(res) # .squeeze().squeeze()


In [16]:
print("payload_2: \n", payload_2)
show_embedding_score2(payload_2)

payload_2: 
 {'inputs': ['분당 이마트점에 KT 대리점이 있나요?', '거기 이마트점에 KT 대리점이 있나요?', '분당 아미트 점은 지하 주차장이 있나요?']}
tensor([[89.2611]]) tensor([[53.1729]])


  res = np.array(res) # .squeeze().squeeze()


# 4. Boto3 invoke_endpoint() 사용하여 추론

In [17]:
endpoint_name = predictor.endpoint_name

In [18]:
import boto3
import json

def query_endpoint_embedding_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
    """Invoke ``endpoint_name`` with an already-encoded request body.

    Args:
        encoded_json: Request body, passed unchanged as ``Body``.
        endpoint_name: Name of the endpoint to invoke.
        content_type: Value sent as ``ContentType``.

    Returns:
        The response object returned by ``invoke_endpoint``.
    """
    runtime = boto3.client("runtime.sagemaker")
    return runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=encoded_json,
    )

def transform_output(output) -> list:
    """Decode an endpoint response body and return its ``[0][0]`` element.

    Args:
        output: Readable binary stream holding UTF-8 encoded JSON (the
            notebook passes the ``Body`` entry of an ``invoke_endpoint``
            response). Only ``read()`` is called on it, so it is a stream
            rather than a ``bytes`` object.

    Returns:
        ``response_json[0][0]`` of the decoded JSON. For the single-sentence
        requests in this notebook that is a list of floats (presumably the
        first token's embedding — confirm against the model's output format).
    """
    response_json = json.loads(output.read().decode("utf-8"))
    return response_json[0][0]


In [19]:
# Two questions, each sent to the endpoint in its own single-sentence request.
sentences2_1 = "분당 이마트점에 KT 대리점이 있나요?"
sentences2_2 = "거기 이마트점에 KT 대리점이 있나요?"

payload_2_1 = {
    "inputs" : sentences2_1
}

payload_2_2 = {
    "inputs" : sentences2_2
}

# First sentence
query_response = query_endpoint_embedding_with_json_payload(
    json.dumps(payload_2_1).encode("utf-8"), endpoint_name=endpoint_name
)

emb_1 = transform_output(query_response['Body'])
print("첫문장 임베딩 사이즈: ", len(emb_1))

# Second sentence
query_response = query_endpoint_embedding_with_json_payload(
    json.dumps(payload_2_2).encode("utf-8"), endpoint_name=endpoint_name
)
 
emb_2 = transform_output(query_response['Body'])
print("두번째 문장 임베딩 사이즈: ", len(emb_2))

첫문장 임베딩 사이즈:  768
두번째 문장 임베딩 사이즈:  768


In [20]:
def show_embedding_score3(emb1, emb2):
    """Print the ``cal_score`` result for two embedding vectors.

    Args:
        emb1: First embedding, convertible with ``torch.Tensor``.
        emb2: Second embedding, convertible with ``torch.Tensor``.
    """
    first, second = torch.Tensor(emb1), torch.Tensor(emb2)
    print(cal_score(first, second))

show_embedding_score3(emb_1, emb_2)    

tensor([[89.2611]])


# 5. 엔드포인트 삭제

In [21]:
# # delete endpoint
# predictor.delete_model()
# predictor.delete_endpoint()

In [22]:
%store  endpoint_name

Stored 'endpoint_name' (str)
