# embedding-serving - KoSimCSE-roberta
* Container: `Data Science 3.0` (studio, python 3.10), `conda_pytorch_p310` (notebook)

Model Ref:
- BM-K/KoSimCSE-roberta
    - https://huggingface.co/BM-K/KoSimCSE-roberta
Inference Code Ref:    
- Huggingface Sagemaker-sdk - Deploy 🤗 Transformers for inference
    - https://github.com/huggingface/notebooks/blob/main/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb
- Sentence Embeddings with Hugging Face Transformers, Sentence Transformers and Amazon SageMaker - Custom Inference for creating document embeddings with Hugging Face's Transformers
    - https://github.com/huggingface/notebooks/blob/main/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb

## 0. Install packages

In [1]:
# Gate for the one-time setup cell below: when True, that cell upgrades the
# dependencies with pip and restarts the kernel. Set to False afterwards.
install_needed = True  # should only be True once

In [2]:
import sys
import IPython

# One-time environment setup, guarded by the `install_needed` flag.
if install_needed:
    print("installing deps and restarting kernel")
    # Run pip through the interpreter that backs this kernel
    # (sys.executable) so the upgrades land in the kernel's own environment.
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U sagemaker
    !{sys.executable} -m pip install -U torch
    !{sys.executable} -m pip install -U transformers
    
    # Restart the kernel (do_shutdown(True)) so that later imports pick up
    # the freshly upgraded packages.
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting torch
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 1. loading model and tokenizer from HF 

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

In [3]:
# Load the tokenizer and the encoder from the same Hugging Face Hub
# checkpoint so that vocabulary and weights match.
model_id = 'BM-K/KoSimCSE-roberta'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

## 2. test

### 2.1 sentence to embedding

In [4]:
# Sample Korean query; roughly "Is the Bundang E-Mart branch open this Sunday?"
sample = "이번 주 일요일에 분당 이마트 점은 문을 여나요"

In [5]:
# Tokenize the sample into PyTorch tensors and run it through the encoder.
# With return_dict=False the model returns a tuple; its first element is
# kept as `embeddings` and the second one is discarded.
# (presumably `embeddings` is the per-token hidden state — TODO confirm)
inputs = tokenizer(sample, padding=True, truncation=True, return_tensors="pt")
embeddings, _ = model(**inputs, return_dict=False)

In [6]:
# Inspect the vector stored at index [0][0] of the model output
# (presumably first sequence, first token — confirm against the model docs).
first_vector = embeddings[0][0]
emb_len = len(first_vector)

print("sample : \n", sample)
print("embeding size: ", emb_len)
print(f"embeding content from 0 to 10 out of {emb_len}: \n", first_vector[0:10])

sample : 
 이번 주 일요일에 분당 이마트 점은 문을 여나요
embeding size:  768
embeding content from 0 to 10 out of 768: 
 tensor([-0.2569, -0.1982,  0.8970, -1.7043, -0.1197,  0.2872,  0.3933, -0.4806,
        -0.1716, -0.6642], grad_fn=<SliceBackward0>)


### 2.2 similarity
- 아래 첫문장, 두번째 문장의 유사도를 구함
- 아래 첫문장, 세째 문장의 유사도를 구함
- 최종적으로 유사도 수치를 비교 함

In [66]:
def show_embedding_score(tokenizer, model, sentences):
    """Print how similar the first sentence is to the second and the third.

    Args:
        tokenizer: Tokenizer callable; invoked with padding, truncation and
            ``return_tensors="pt"``.
        model: Encoder invoked as ``model(**inputs, return_dict=False)``; the
            first element of the returned tuple is used as the embeddings.
        sentences: Sequence of at least three sentences. Only the first
            three are compared.

    Raises:
        ValueError: If ``sentences`` is a bare string or holds fewer than
            three sentences.
    """
    # Fail early with a clear message instead of an IndexError after the
    # (expensive) forward pass.
    if isinstance(sentences, str) or len(sentences) < 3:
        raise ValueError("sentences must be a sequence of at least three sentences")

    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        embeddings, _ = model(**inputs, return_dict=False)

    # embeddings[i][0] is the first-position vector of sentence i.
    score01 = cal_score(embeddings[0][0], embeddings[1][0])
    score02 = cal_score(embeddings[0][0], embeddings[2][0])

    print(f'similarity between first and second: {score01}')
    print(f'similarity between first and third: {score02}')

In [67]:
def cal_score(a, b, eps=1e-8):
    """Compute cosine similarity between two sets of vectors, scaled by 100.

    Args:
        a: Tensor of shape ``(d,)`` or ``(n, d)``.
        b: Tensor of shape ``(d,)`` or ``(m, d)``.
        eps: Lower bound applied to each row norm before dividing, so that an
            all-zero row yields a score of 0 instead of NaN.

    Returns:
        Tensor of shape ``(n, m)`` whose entry ``[i, j]`` is the cosine
        similarity of ``a[i]`` and ``b[j]`` multiplied by 100. 1-D inputs are
        treated as a single row.
    """
    # Promote 1-D vectors to a single-row matrix so torch.mm can be used.
    if a.dim() == 1:
        a = a.unsqueeze(0)
    if b.dim() == 1:
        b = b.unsqueeze(0)

    # Row-wise L2 normalisation; clamping the norm avoids division by zero.
    a_norm = a / a.norm(dim=1, keepdim=True).clamp_min(eps)
    b_norm = b / b.norm(dim=1, keepdim=True).clamp_min(eps)
    return torch.mm(a_norm, b_norm.transpose(0, 1)) * 100

In [68]:
# Three Korean queries; the first is compared against the other two:
#   1. "Is the Bundang E-Mart branch open this Sunday?"
#   2. "Is Bundang E-Mart open on Sunday?"
#   3. "Until what time is the Bundang E-Mart branch open on Saturday?"
sentences1 = ['이번 주 일요일에 분당 이마트 점은 문을 여나요',
              '일요일에 분당 이마트는 문 열어요?',
              '분당 이마트 점은 토요일에 몇 시까지 하나요']

# Prints similarity(1, 2) and similarity(1, 3).
show_embedding_score(tokenizer, model, sentences1)    

similarity betwwen first and sescond: tensor([[92.7287]], grad_fn=<MulBackward0>)
similarity betwwen first and third": tensor([[79.8030]], grad_fn=<MulBackward0>)


## 3. deploy embedding model

In [60]:
import boto3
import sagemaker
from datetime import datetime
from sagemaker.huggingface import HuggingFaceModel

In [61]:
# Ask the SageMaker SDK for the execution role first; if that raises
# ValueError, fall back to looking up the role named
# 'sagemaker_execution_role' through IAM and use its ARN.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

print(f"sagemaker role arn: {role}")

sagemaker role arn: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436


### 3.1 define HF Model ID and HF_TASK

In [12]:
# Hub model configuration (see https://huggingface.co/models); handed to the
# model object through its `env` argument.
hub = dict(
    HF_MODEL_ID='BM-K/KoSimCSE-roberta',  # model_id from hf.co/models
    HF_TASK='feature-extraction',
)

# Build the Hugging Face model object with the framework versions to use.
huggingface_model = HuggingFaceModel(
    role=role,                    # IAM role with permissions to create an Endpoint
    env=hub,
    py_version="py39",            # python version of the DLC
    pytorch_version="1.13",       # pytorch version used
    transformers_version="4.26",  # transformers version used
)

### 3.2 deploy

In [13]:
# Suffix the endpoint name with the current timestamp so that each
# deployment gets its own name.
time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
endpoint_name = f"KoSimCSE-roberta-{time_stamp}"

In [14]:
%%time
# deploy model to SageMaker Inference
# Creates an endpoint named `endpoint_name` backed by a single instance and
# keeps the returned object in `predictor`.
# NOTE(review): ml.p3.8xlarge is a large GPU instance type; confirm that a
# smaller instance is not sufficient for this model before reusing the cell.
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   endpoint_name = endpoint_name,
   instance_type="ml.p3.8xlarge"
)

------------!CPU times: user 141 ms, sys: 5.37 ms, total: 147 ms
Wall time: 6min 32s


In [100]:
# Display the endpoint name; the inference and cleanup cells below use it.
endpoint_name

'KoSimCSE-roberta-2023-07-10-05-26-01'

## 4. inference

In [15]:
import numpy as np

### 4.1. Boto3 invoke_endpoint() 사용하여 추론

In [96]:
import json
import boto3

In [97]:
def query_endpoint_embedding_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
    """Invoke a SageMaker endpoint with an already-encoded payload.

    Args:
        encoded_json: Request body, sent as ``Body``.
        endpoint_name: Name of the endpoint to invoke.
        content_type: Value sent as ``ContentType``.

    Returns:
        The response returned by ``invoke_endpoint``, unmodified.
    """
    runtime = boto3.client("runtime.sagemaker")
    return runtime.invoke_endpoint(
        Body=encoded_json,
        ContentType=content_type,
        EndpointName=endpoint_name,
    )

def transform_output(output) -> list[float]:
    """Extract the first vector from an endpoint response body.

    Args:
        output: Readable binary stream, i.e. any object whose ``read()``
            returns UTF-8 encoded JSON bytes (such as the ``Body`` entry of
            an ``invoke_endpoint`` response). It is not a ``bytes`` object.

    Returns:
        The element at index ``[0][0]`` of the decoded JSON document. For the
        endpoint used in this notebook that is a single embedding vector
        given as a list of floats.
    """
    response_json = json.loads(output.read().decode("utf-8"))
    return response_json[0][0]

In [98]:
# Two near-identical Korean questions, roughly "Is there a KT store at the
# Bundang E-Mart branch?" and "Is there a KT store at that E-Mart branch?".
sentences2_1 = "분당 이마트점에 KT 대리점이 있나요?"
sentences2_2 = "거기 이마트점에 KT 대리점이 있나요?"

# Request bodies in the {"inputs": ...} form sent to the endpoint.
payload_2_1 = {"inputs": sentences2_1}
payload_2_2 = {"inputs": sentences2_2}

# First sentence
body_2_1 = json.dumps(payload_2_1).encode("utf-8")
query_response = query_endpoint_embedding_with_json_payload(body_2_1, endpoint_name=endpoint_name)
emb_1 = transform_output(query_response['Body'])
print("첫문장 임베딩 사이즈: ", len(emb_1))

# Second sentence
body_2_2 = json.dumps(payload_2_2).encode("utf-8")
query_response = query_endpoint_embedding_with_json_payload(body_2_2, endpoint_name=endpoint_name)
emb_2 = transform_output(query_response['Body'])
print("두번째 문장 임베딩 사이즈: ", len(emb_2))

첫문장 임베딩 사이즈:  768
두번째 문장 임베딩 사이즈:  768


In [99]:
def show_embedding_score3(emb1, emb2):
    """Print the similarity score between two embedding vectors.

    Args:
        emb1: First embedding, e.g. a list of floats.
        emb2: Second embedding, e.g. a list of floats.
    """
    # torch.as_tensor with an explicit dtype replaces the legacy
    # torch.Tensor(...) constructor and also accepts existing tensors/arrays.
    embeddings_0 = torch.as_tensor(emb1, dtype=torch.float32)
    embeddings_1 = torch.as_tensor(emb2, dtype=torch.float32)

    score01 = cal_score(embeddings_0, embeddings_1)

    print(f'similarity: {score01}')

# Compare the two embeddings obtained from the endpoint.
show_embedding_score3(emb_1, emb_2)  

simiarity: tensor([[89.2611]])


## 5. Delete endpoint

In [None]:
class clean_up():
    """Helper for tearing down a SageMaker endpoint and related resources."""

    def delete_endpoint(self, client, endpoint_name ,is_del_model=True):
        """Delete an endpoint, its endpoint config and, optionally, its model.

        Args:
            client: Client object providing ``describe_endpoint``,
                ``describe_endpoint_config``, ``delete_model``,
                ``delete_endpoint`` and ``delete_endpoint_config``.
            endpoint_name: Name of the endpoint to delete.
            is_del_model: When True, the model referenced by the endpoint
                config's first production variant is deleted as well.
        """
        # Resolve the endpoint config and model names before deleting
        # anything; they cannot be looked up once the endpoint is gone.
        endpoint_description = client.describe_endpoint(EndpointName=endpoint_name)
        endpoint_config_name = endpoint_description['EndpointConfigName']

        config_description = client.describe_endpoint_config(
            EndpointConfigName=endpoint_config_name
        )
        # Only the first production variant's model is considered.
        model_name = config_description['ProductionVariants'][0]['ModelName']

        if is_del_model:
            client.delete_model(ModelName=model_name)

        client.delete_endpoint(EndpointName=endpoint_name)
        client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)

        # Report the model as deleted only when it actually was.
        if is_del_model:
            print(f'--- Deleted model: {model_name}')
        print(f'--- Deleted endpoint: {endpoint_name}')
        print(f'--- Deleted endpoint_config: {endpoint_config_name}')

In [None]:
# Delete the endpoint named `endpoint_name` together with its endpoint
# config and model (is_del_model=True).
clean = clean_up()
sm_client = boto3.client('sagemaker')
clean.delete_endpoint(sm_client, endpoint_name ,is_del_model=True)