# 1. Install necessary libraries

In [46]:
!pip install "sagemaker==2.163.0" --upgrade --quiet
!pip install transformers --quiet

[0m

# 2. Deploy Falcon 40B model on AWS

In [None]:
import json
import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Bucket SageMaker uses for uploaded data, models and logs.
# Leave it as None to fall back to the session's default bucket, which
# SageMaker creates automatically if it does not exist yet.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Prefer the execution role reported by SageMaker; when that lookup raises
# ValueError, resolve the role ARN by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Rebuild the session so it is bound to the bucket chosen above
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
region = sess.boto_region_name

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {region}")

# Look up the container image URI for the Hugging Face LLM backend, pinned to version 0.8.2
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Show the resolved ECR image URI
print(f"llm image uri: {llm_image}")

# Model and endpoint configuration
hf_model_id = "tiiuae/falcon-40b-instruct" # model id from huggingface.co/models
instance_type = "ml.g5.12xlarge" # instance type to use for deployment
number_of_gpu = 4 # number of gpus to use for inference and tensor parallelism
health_check_timeout = 300 # Increase the timeout for the health check to 5 minutes for downloading the model

# Create the HuggingFaceModel from the image uri.
# The session built earlier in this cell is passed explicitly so that the
# bucket configured on it is the one actually used by the model.
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  sagemaker_session=sess,
  env={
    'HF_MODEL_ID': hf_model_id,
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    # 'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  }
)

# Derive the endpoint name from the model id: keep the part after the last
# "/" and replace "." with "-".
model_name = hf_model_id.split("/")[-1].replace(".", "-")
endpoint_name = model_name
# A bare expression in the middle of a cell is never displayed, so print it.
print(f"endpoint name: {endpoint_name}")

# Deploy the model to a single-instance endpoint
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  endpoint_name=endpoint_name,
)

sagemaker role arn: arn:aws:iam::069230569860:role/service-role/AmazonSageMaker-ExecutionRole-20220729T131670
sagemaker session region: eu-west-1
llm image uri: 763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04
-

# 3. Invoke model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "tiiuae/falcon-40b-instruct"

# The tokenizer is used below to count prompt tokens
tokenizer = AutoTokenizer.from_pretrained(model)

# Name of the deployed endpoint; must match the name used at deployment time
ENDPOINT_NAME = "falcon-40b-instruct"
# Total token budget (prompt + generation); must match MAX_TOTAL_TOKENS set on the endpoint
MAX_TOTAL_TOKENS = 2048
# Upper bound on generated tokens when the prompt leaves enough room
MAX_NEW_TOKENS = 2000

runtime = boto3.client('runtime.sagemaker')
message = "Write a poem about Valencia"
# The message is sent as-is, without any prompt template around it
prompt = f"{message}"

# prompt tokens + max_new_tokens must not exceed MAX_TOTAL_TOKENS, so shrink the
# generation budget when the prompt is long instead of sending an invalid request.
prompt_token_count = len(tokenizer(prompt)["input_ids"])
max_new_tokens = min(MAX_NEW_TOKENS, MAX_TOTAL_TOKENS - prompt_token_count)
if max_new_tokens < 1:
    raise ValueError(
        f"Prompt uses {prompt_token_count} tokens, leaving no room to generate "
        f"within the {MAX_TOTAL_TOKENS}-token budget"
    )

input_data = {
  "inputs": prompt,
  "parameters": {
    "best_of": None,
    "temperature": .7,
    "repetition_penalty": None,
    "top_k": None,
    "top_p": None,
    "typical_p": None,
    "do_sample": True,
    "max_new_tokens": max_new_tokens,  # max number of tokens to return; be frugal
    "return_full_text": False, # do not return the prompt as part of the output
    "stop": [],
    "truncate": None,
    "watermark": True,
    "details": False,
    "seed": 5,
  }
}

# Send the JSON payload to the endpoint and decode the JSON response
response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                   ContentType='application/json',
                                   Body=json.dumps(input_data).encode('utf-8'))
response_json = json.loads(response['Body'].read().decode("utf-8"))

print(response_json[0]['generated_text'])

# 4. Keep in mind the special tokens

In [None]:
# Reference files for the model's special tokens:
# https://huggingface.co/tiiuae/falcon-40b-instruct/blob/main/special_tokens_map.json
# https://huggingface.co/tiiuae/falcon-40b-instruct/blob/main/tokenizer_config.json

# Show the tokens behind vocabulary ids 0..19
tokenizer.convert_ids_to_tokens(list(range(20)))

In [None]:
# Order the vocabulary (token -> id) by ascending id, then keep the first 20 entries
vocab_by_id = sorted(tokenizer.get_vocab().items(), key=lambda entry: entry[1])
sorted_dict = dict(vocab_by_id)
first_20_elements = dict(vocab_by_id[:20])

print(first_20_elements)

# 5. Delete all models, endpoint configs & endpoints

In [42]:
import boto3

def delete_resources(resource_type, exclude=()):
    """Delete all SageMaker resources of one type.

    The resource type is given in snake_case (e.g. 'model', 'endpoint',
    'endpoint_config'); the matching ``list_<type>s`` / ``delete_<type>``
    client operations and the CamelCase response keys are derived from it.

    Args:
        resource_type: snake_case name of the resource type to delete.
        exclude: optional iterable of resource names to keep. Defaults to
            an empty tuple, i.e. every listed resource is deleted.
    """
    client = boto3.client('sagemaker')
    delete_method = getattr(client, f"delete_{resource_type}")
    # 'endpoint_config' -> 'EndpointConfig'
    resource_type_name = resource_type.replace('_', ' ').title().replace(' ', '')
    name_key = f"{resource_type_name}Name"

    # A single list_* call only returns the first page of results, so walk
    # every page. Names are collected up front so that deleting does not
    # interfere with the pagination that is still in progress.
    paginator = client.get_paginator(f"list_{resource_type}s")
    resource_names = [
        resource[name_key]
        for page in paginator.paginate()
        for resource in page[f"{resource_type_name}s"]
    ]

    names_to_keep = set(exclude)
    for resource_name in resource_names:
        if resource_name in names_to_keep:
            print(f"Skipping {resource_type}: {resource_name}")
            continue
        print(f"Deleting {resource_type}: {resource_name}")
        delete_method(**{name_key: resource_name})

def main():
    """Run delete_resources for models, endpoints and endpoint configs, in that order."""
    # Extend this tuple to clean up further resource types
    for resource_type in ('model', 'endpoint', 'endpoint_config'):
        delete_resources(resource_type)


if __name__ == "__main__":
    main()