# embedding-serving
* Container: Data Science 3.0 (studio, python 3.10)

## 0. Install packages

In [9]:
install_needed = True  # should only be True once

In [10]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U sagemaker
    
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Collecting torch
  Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch)
  Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
Collecting nvidia-cufft-cu11==10.9.0.58 (from torch)
  Downloading nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-curand-cu11==10.2.10.91 (from torch)
  Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cusolver-cu11==11.4.0.1 (from torch)
  Downloading nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)
[2K     [90m━━━━

## 1. deploy embedding model

In [12]:
import time
import boto3
import sagemaker
from datetime import datetime
from sagemaker.model import Model
from sagemaker.session import Session
from sagemaker.utils import name_from_base
from sagemaker import image_uris, model_uris#, script_uris, hyperparameters

In [13]:
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()
aws_region = boto3.Session().region_name
sess = sagemaker.Session()
model_version = "*"
print(f'aws_role={aws_role}, aws_region={aws_region}')

aws_role=arn:aws:iam::843077212694:role/PepStack-CrossAccountRole55335AA5-1AMOSWE2L94A, aws_region=us-east-1


In [14]:
MODEL_CONFIG_LIST = [
    {
        "model_id": "huggingface-textembedding-gpt-j-6b",
        "model_version": "*",
        "instance_type": "ml.g5.12xlarge",
        "instance_count": 1,
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "2"},
    }
]

In [15]:
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

for model in MODEL_CONFIG_LIST: 
    start = time.time()
    time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    endpoint_name = name_from_base(f"{model['model_id']}-{time_stamp}")
    print(f"going to deploy model={model}, endpoint_name={endpoint_name}")    
    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model['model_id'],
        model_version=model['model_version'],
        instance_type=model['instance_type'],
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=model['model_id'], model_version=model['model_version'], model_scope="inference"
    )
    print(f'deploy_image_uri={deploy_image_uri}, model_uri={model_uri}')
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=model.get("predictor_cls"),
        name=endpoint_name,
        env=model['env'],
    )
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=model['instance_count'],
        instance_type=model['instance_type'],
        predictor_cls=model.get("predictor_cls"),
        endpoint_name=endpoint_name,
    )
    time_taken = time.time() - start
    print (f"{bold}model={model['model_id']} has been deployed successfully at endpoint={endpoint_name}, took {time_taken}seconds{unbold}{newline}")
    model["endpoint_name"] = endpoint_name

going to deploy model={'model_id': 'huggingface-textembedding-gpt-j-6b', 'model_version': '*', 'instance_type': 'ml.g5.12xlarge', 'instance_count': 1, 'env': {'TS_DEFAULT_WORKERS_PER_MODEL': '2'}}, endpoint_name=huggingface-textembedding-gpt-j-6b-2023-2023-06-23-02-24-58-218
deploy_image_uri=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38, model_uri=s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.1/infer-prepack-huggingface-textembedding-gpt-j-6b.tar.gz
-----------![1mmodel=huggingface-textembedding-gpt-j-6b has been deployed successfully at endpoint=huggingface-textembedding-gpt-j-6b-2023-2023-06-23-02-24-58-218, took 363.45546865463257seconds[0m



In [366]:
endpoint_name

'huggingface-textembedding-gpt-j-6b-2023-2023-06-16-23-07-40-190'

### 2.1 Invocation

In [443]:
text1 = "How cute your dog is!"
text2 = "Your dog is so cute."
text3 = "The mitochondria is the powerhouse of the cell."

### Query endpoint that you have created
You can query the endpoint with a batch of input texts within a json payload. Here, we send a single request to the endpoint and the parsed response is a list of the embedding vectors.

In [444]:
newline, bold, unbold = '\n', '\033[1m', '\033[0m'
#endpoint_name = 'huggingface-textembedding-gpt-j-6b-2023-2023-06-16-14-22-17-401'


def query_endpoint_with_json_payload(encoded_json):
    client = boto3.client('runtime.sagemaker')
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=encoded_json
)
    return response


def parse_response_multiple_texts(query_response):
    model_predictions = json.loads(query_response['Body'].read())
    embeddings = model_predictions['embedding']
    return embeddings


payload = {"text_inputs": [text1, text2, text3]}
query_response = query_endpoint_with_json_payload(json.dumps(payload).encode('utf-8'))
embeddings = parse_response_multiple_texts(query_response)

In [445]:
embeddings

[[0.0013489107368513942,
  -0.018385330215096474,
  -0.016965681686997414,
  -0.009253906086087227,
  0.015878183767199516,
  0.029750213027000427,
  -0.018147243186831474,
  0.0013538436032831669,
  0.015019044280052185,
  -0.008157066069543362,
  -0.0032542694825679064,
  0.024895988404750824,
  -0.0044600097462534904,
  0.01345666404813528,
  0.012466981075704098,
  -0.005835961550474167,
  -0.0019635267090052366,
  -0.015397023409605026,
  -0.010526093654334545,
  0.01141903642565012,
  0.010193578898906708,
  -0.012928756885230541,
  -0.006018274463713169,
  -0.003646490629762411,
  -0.020154835656285286,
  -0.01177288219332695,
  0.006410220172256231,
  0.014053774066269398,
  0.012873170897364616,
  0.004196937195956707,
  -0.015345822088420391,
  -0.0025690183974802494,
  0.010213232599198818,
  -0.010201680473983288,
  0.001645726733841002,
  0.00023351395793724805,
  6.585427763639018e-05,
  -0.002791732084006071,
  0.008848433382809162,
  -0.009319859556853771,
  0.008111496

## 3. Delete endpoint

In [344]:
class clean_up():
    
    def __init__(self, ):    
        pass
    
    def delete_endpoint(self, client, endpoint_name ,is_del_model=True):
        
        response = client.describe_endpoint(EndpointName=endpoint_name)
        EndpointConfigName = response['EndpointConfigName']

        response = client.describe_endpoint_config(EndpointConfigName=EndpointConfigName)
        model_name = response['ProductionVariants'][0]['ModelName']    

        if is_del_model: # 모델도 삭제 여부 임.
            client.delete_model(ModelName=model_name)    

        client.delete_endpoint(EndpointName=endpoint_name)
        client.delete_endpoint_config(EndpointConfigName=EndpointConfigName)    

        print(f'--- Deleted model: {model_name}')
        print(f'--- Deleted endpoint: {endpoint_name}')
        print(f'--- Deleted endpoint_config: {EndpointConfigName}')  

In [345]:
clean = clean_up()
sm_client = boto3.client('sagemaker')

## 2.training 
clean.delete_endpoint(sm_client, endpoint_name ,is_del_model=True)

--- Deleted model: huggingface-textembedding-gpt-j-6b-2023-2023-06-16-14-22-17-401
--- Deleted endpoint: huggingface-textembedding-gpt-j-6b-2023-2023-06-16-14-22-17-401
--- Deleted endpoint_config: huggingface-textembedding-gpt-j-6b-2023-2023-06-16-14-22-17-401
