# Deploying an LLM in SageMaker

Setting up the role and S3 bucket that we will need later

In [None]:
import sagemaker

# Model to deploy. Other options: "mpt-7b-instruct" "dolly-v2-12b" "flan-t5-xxl" "all-MiniLM-L6-v2"
model_name = "mpt-7b-instruct"

# Execution role that is handed to the model when it is created further down.
role = sagemaker.get_execution_role()

# SageMaker session and its default S3 bucket (the model archive is read from this bucket).
sess = sagemaker.Session()
sagemaker_session_bucket = sess.default_bucket()

Creating the Hugging Face Model, indicating the package versions we want to use and the S3 location with the inference code

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Location of the packaged archive: s3://<default bucket>/<model_name>/model.tar.gz
model_artifact_uri = f"s3://{sess.default_bucket()}/{model_name}/model.tar.gz"

huggingface_model = HuggingFaceModel(
    role=role,
    model_data=model_artifact_uri,
    # Framework versions used to select the Hugging Face inference container.
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
    # A single model-server worker.
    model_server_workers=1,
)

Deploying the model to an endpoint

In [None]:
# Create a real-time endpoint named after the model and keep a predictor handle to it.
# The former `update_endpoint=True` argument is intentionally omitted: that flag was
# removed from `deploy()` in SageMaker Python SDK v2 and no longer has any effect.
# To change an existing endpoint, call `predictor.update_endpoint(...)` instead.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.4xlarge",
    endpoint_name=model_name,
    # Both timeouts are in seconds (one hour each) to leave room for a large model
    # to be downloaded and loaded before the health check gives up.
    model_data_download_timeout=3600,
    container_startup_health_check_timeout=3600,
)

NOTE: Even after the endpoint has been deployed, we still need to wait 1-2 minutes before we can start using it. That's because the model is downloading from the HF Model Hub and due to its size it won't be quite finished when the endpoint is deployed.

In [None]:
# Display the name of the endpoint the predictor is bound to.
predictor.endpoint_name

In [None]:
# Demo prompt: an instruction followed by a small key/value record to convert to JSON.
prompt = """Convert the following input to json:

name: John
last_name: Doe
age: 30
address:
  street: 123 Main St
  city: New York
  state: NY
  zip: 10001
"""

In [None]:
# Request payload: the prompt plus generation settings, all sent as top-level keys.
data = dict(
    inputs=prompt,
    min_length=20,
    max_length=50,
    do_sample=True,
    temperature=0.6,
)

# Invoke the endpoint and show the generated text of the first returned item.
response = predictor.predict(data)
response[0]["generated_text"]

# Bonus: More sophisticated workflows

You can build more sophisticated workflows by templating the prompts using [langchain](https://python.langchain.com/en/latest/). The following are just a few examples of what you can do by combining `langchain` and `SageMaker`:

In [None]:
!pip install langchain
!aws configure set aws_access_key_id [REDACTED]
!aws configure set aws_secret_access_key [REDACTED]
!aws configure set default.region us-east-1

In [None]:
import json
from typing import Dict
from langchain import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler


class ContentHandlerMPT(LLMContentHandler):
    """Content handler that sends the prompt under ``"inputs"`` and returns the decoded JSON body unchanged."""

    # MIME types for the request body and the accepted response.
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: str, model_kwargs: Dict) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            inputs: The prompt text to send to the endpoint.
            model_kwargs: Extra generation settings, merged in as top-level keys.

        Returns:
            The JSON document ``{"inputs": <prompt>, **model_kwargs}`` as bytes.
        """
        payload = {"inputs": inputs, **model_kwargs}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Decode the endpoint response.

        Args:
            output: The response body; it is consumed via ``.read()``, so a
                file-like object is expected here despite the annotation.

        Returns:
            The parsed JSON value, returned as-is.
            NOTE(review): presumably a plain string for this model's inference
            script - confirm against the deployed handler.
        """
        return json.loads(output.read().decode("utf-8"))


class ContentHandler(LLMContentHandler):
    """Content handler for endpoints that answer with ``[{"generated_text": ...}]``."""

    # MIME types for the request body and the accepted response.
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompt: The prompt text to send to the endpoint.
            model_kwargs: Extra generation settings, merged in as top-level keys.

        Returns:
            The JSON document ``{"inputs": <prompt>, **model_kwargs}`` as bytes.
        """
        # The key must be the literal string "inputs" (as in the other request
        # payloads in this notebook); previously the prompt text itself was used
        # as the dictionary key, so the endpoint never saw an "inputs" field.
        payload = {"inputs": prompt, **model_kwargs}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Extract the generated text from the endpoint response.

        Args:
            output: The response body; it is consumed via ``.read()``, so a
                file-like object is expected here despite the annotation.

        Returns:
            The ``"generated_text"`` field of the first item in the JSON list.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json[0]["generated_text"]


# LangChain LLM wrapper around the deployed endpoint. It authenticates with the
# "default" credentials profile and uses the MPT content handler for (de)serialization.
se = SagemakerEndpoint(
    content_handler=ContentHandlerMPT(),
    credentials_profile_name="default",
    region_name="us-east-1",
    endpoint_name=predictor.endpoint_name,
)

# Quick smoke test: send one prompt straight through the wrapper.
se("Tell me a joke")

In [None]:
from langchain import PromptTemplate, LLMChain

# Single-variable prompt template; `{vegetable}` is filled in when the chain is called.
prompt_template = "Why is {vegetable} good for you?"
llm_chain = LLMChain(llm=se, prompt=PromptTemplate.from_template(prompt_template))

# Pass the input by variable name (same calling style as the joke example further
# down) and spell the vegetable correctly so the model is not handed a typo.
llm_chain(inputs={"vegetable": "broccoli"})["text"]


In [None]:
import time

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain


def generate_serially(n_runs: int = 5, product: str = "underwear") -> None:
    """Ask the endpoint for company-name suggestions, one request after another.

    Args:
        n_runs: Number of sequential requests to send (defaults to 5).
        product: Product inserted into the prompt template (defaults to "underwear").

    Each response is printed as soon as it arrives; nothing is returned.
    Uses the notebook-level ``se`` LLM wrapper.
    """
    prompt = PromptTemplate(
        input_variables=["product"],
        template="What is a good name for a company that makes {product}?",
    )
    chain = LLMChain(llm=se, prompt=prompt)
    for _ in range(n_runs):
        print(chain.run(product=product))


# Measure the wall-clock time of the sequential run.
start_time = time.perf_counter()
generate_serially()
elapsed = time.perf_counter() - start_time

# Wrap the summary in ANSI escape sequences so it is rendered in bold.
BOLD, RESET = '\033[1m', '\033[0m'
print(BOLD + f"Serial executed in {elapsed:0.2f} seconds." + RESET)

In [None]:
# Template with a single `{adjective}` placeholder, compiled into a PromptTemplate first.
prompt_template = "Tell me a {adjective} joke"
joke_prompt = PromptTemplate.from_template(prompt_template)
llm_chain = LLMChain(prompt=joke_prompt, llm=se)

# Run the chain with the placeholder filled in; the cell output is the chain's result.
llm_chain(inputs=dict(adjective="corny"))

# Deleting the endpoint

In [None]:
# Tear down everything the deployment created so the instance stops incurring cost.
# Delete the model first, while the endpoint still exists.
predictor.delete_model()
# `delete_endpoint` also removes the endpoint configuration (made explicit here), so
# no separate configuration-deletion call is needed; a follow-up call would fail
# because the configuration is already gone.
predictor.delete_endpoint(delete_endpoint_config=True)