# Deploy Fine Tuned Model
This lab builds on the previous model training lab and deploys a fine-tuned model. Because the training job takes ~6 hours, we've uploaded a model trained with the same script to the Hugging Face model hub so that we can pull it into our inference container and create a SageMaker endpoint.

## Steps
1. Install dependencies and create our SageMaker session
2. Load our model into a Hugging Face inference container (based on TGI)
3. Do a "vibe check" to make sure the model can generate SQL

In [None]:
import sagemaker
import boto3
import os
from dotenv import load_dotenv, find_dotenv, set_key
import dotenv

# Local dotenv file that holds the lab settings (HF_TOKEN, ENDPOINT_NAME).
local_env_filename = 'dev.env'
load_dotenv(find_dotenv(local_env_filename), override=True)

# load_dotenv has already copied the file's values into os.environ, so only
# read them here. Writing os.getenv(...) back into os.environ raises TypeError
# when a key is missing, and ENDPOINT_NAME is only persisted to the dotenv
# file after the endpoint has been deployed further down in this notebook.
HF_TOKEN = os.getenv('HF_TOKEN')
ENDPOINT_NAME = os.getenv('ENDPOINT_NAME')

sess = sagemaker.Session()

# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fall back to looking the role up by name through IAM when the SDK
    # cannot resolve an execution role on its own.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


# Deploy Model Endpoint

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri
 
# Look up the URI of the Hugging Face LLM inference container image
llm_image = get_huggingface_llm_image_uri(
    "huggingface",
    session=sess,
    version="1.4.0",
)

# Show the ECR image URI that will back the endpoint
print(f"llm image uri: {llm_image}")

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300  # seconds allowed for the container startup health check

# Define Model and Endpoint configuration parameter.
# The model is pulled from the Hugging Face hub by its model id. To serve
# weights from the container's local model directory instead, set
# 'HF_MODEL_ID' to "/opt/ml/model" (path to where sagemaker stores the model).
# json.dumps turns the integer settings into the strings the env dict carries.
config = {
  'HF_MODEL_ID': "tannermcrae/Mistral-7B-v0.1-Text2SQL-Instruct", # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [None]:
# Deploy model to an endpoint

# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
deploy_kwargs = {
    "initial_instance_count": 1,
    "instance_type": instance_type,
    # startup health-check window, giving SageMaker the time to download the model
    "container_startup_health_check_timeout": health_check_timeout,
}
llm = llm_model.deploy(**deploy_kwargs)

In [None]:
# `endpoint_name` is the current Predictor attribute; `endpoint` is the
# deprecated SageMaker SDK v1 spelling.
ENDPOINT_NAME = llm.endpoint_name

print(f'Endpoint name is: {ENDPOINT_NAME}')
os.environ['ENDPOINT_NAME'] = ENDPOINT_NAME

# Persist the name to the same dotenv file that was loaded at the top of the
# notebook: find_dotenv may resolve it in a parent directory, so writing to the
# bare relative filename could target a different file. Fall back to the
# relative name when no existing file is found.
dotenv.set_key(
    find_dotenv(local_env_filename) or local_env_filename,
    "ENDPOINT_NAME",
    ENDPOINT_NAME,
)


# Inference Vibe Check - Text to SQL

In [None]:
# ensure we have the latest transformers and tokenizers version
!pip install --upgrade transformers tokenizers

In [None]:
from transformers import AutoTokenizer
import boto3
import json

# Tokenizer whose chat template call_sagemaker_endpoint uses to turn a list of
# chat messages into a single prompt string.
# NOTE(review): the endpoint is configured with a different, fine-tuned model id
# and requests stop on "<|im_end|>" - confirm this tokenizer's chat template
# matches the prompt format the deployed model was trained with.
tokenizer_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

def call_sagemaker_endpoint(endpoint_name, sample):
    """Send chat messages to a SageMaker endpoint and return the parsed reply.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        sample: Chat messages handed to the tokenizer's chat template.

    Returns:
        The JSON-decoded response body, or None when invoking the endpoint
        or decoding its response raised an exception (the error is printed).
    """
    smr_client = boto3.client('sagemaker-runtime')

    # Render the incoming messages into one prompt string via the tokenizer's
    # chat template, ending with the generation prompt.
    rendered_prompt = tokenizer.apply_chat_template(
        sample,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Generation settings sent alongside the prompt.
    generation_params = {
        "max_new_tokens": 512,
        "do_sample": False,
        "return_full_text": False,
        "stop": ["<|im_end|>"],
    }
    payload = json.dumps({"inputs": rendered_prompt, "parameters": generation_params})

    try:
        reply = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType='application/json',
            Body=payload,
        )
        # The body is a stream: read it, decode the bytes, then parse the JSON.
        raw_body = reply['Body'].read()
        return json.loads(raw_body.decode())
    except Exception as e:
        print(f"Error calling SageMaker endpoint: {e}")
        return None

In [None]:
# Reference conversation: a system instruction, a user question with schema
# context, and the expected assistant SQL answer.
system_turn = {
    'content': 'You are an AI assistant that generates SQL queries from natural language and given schema information. Create accurate SQL queries based on the user\'s request and the provided table structures.',
    'role': 'system',
}
user_turn = {
    'content': "How many books has each author published? List the author names and book counts, but only for authors who have published more than 5 books. Order the results by the number of books in descending order.\n\n### Context\nCREATE TABLE authors (author_id INT PRIMARY KEY, author_name VARCHAR(100));\nCREATE TABLE books (book_id INT PRIMARY KEY, title VARCHAR(255), author_id INT);",
    'role': 'user',
}
expected_turn = {
    'content': "SELECT a.author_name, COUNT(b.book_id) AS book_count\nFROM authors a\nJOIN books b ON a.author_id = b.author_id\nGROUP BY a.author_id, a.author_name\nHAVING COUNT(b.book_id) > 5\nORDER BY book_count DESC;",
    'role': 'assistant',
}
sample_input = {'messages': [system_turn, user_turn, expected_turn]}

# Send only the system and user turns; the assistant turn stays behind as the
# reference answer to compare against.
prompt_turns = sample_input['messages'][:2]
response = call_sagemaker_endpoint(ENDPOINT_NAME, prompt_turns)

In [None]:
# Show the reference answer next to what the endpoint generated.
print(f"Correct Response:\n{sample_input['messages'][2]['content']}\n\n*******\n\n")
if response:
    print(f"Model Response:\n{response[0]['generated_text']}")
else:
    # call_sagemaker_endpoint returns None after printing the error, so
    # indexing the response here would raise a TypeError.
    print("Model Response:\n<no response - the endpoint call failed, see the error printed above>")

# Conclusion
In this lab, we took a model fine-tuned with our training script, loaded it into an inference container, and deployed it to a SageMaker endpoint. We then verified that it generates valid SQL for a basic instruction.