# 1. Install necessary libraries

In [1]:
!pip install "sagemaker==2.163.0" --upgrade --quiet
!pip install transformers --quiet
!pip install sentencepiece --quiet

[0m

In [2]:
import sagemaker

# Fail fast, with an actionable message, if the kernel is not running the SDK
# release that the install cell pins.
assert sagemaker.__version__ == '2.163.0', (
    f"Expected sagemaker 2.163.0 but found {sagemaker.__version__}; "
    "re-run the install cell and restart the kernel."
)

# 2. Deploy the Open Llama 13b model to an Amazon SageMaker endpoint

In [3]:
import json
import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Deployment cell: resolve the session bucket and execution role, look up the
# Hugging Face LLM container image, and deploy `hf_model_id` to a SageMaker
# endpoint. `endpoint_name` defined here is reused by the invocation cell.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # fall back to the session's default bucket when no bucket name is given
    sagemaker_session_bucket = sess.default_bucket()

# Prefer the execution role reported by the SDK; if that lookup raises
# ValueError, fall back to the IAM role named 'sagemaker_execution_role'
# (presumably the case when running outside SageMaker -- confirm, and adjust
# the role name to your account).
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session so that it is bound to the bucket resolved above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
region = sess.boto_region_name
# Uncomment to inspect the resolved role and region:
# print(f"sagemaker role arn: {role}")
# print(f"sagemaker session region: {region}")

# retrieve the URI of the Hugging Face LLM container image, version 0.8.2
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="0.8.2"
)

# print the ECR image URI
print(f"llm image uri: {llm_image}")

# Define model and endpoint configuration parameters
hf_model_id =  "openlm-research/open_llama_13b" # model id from huggingface.co/models
instance_type = "ml.g5.12xlarge" # instance type to use for deployment -> https://aws.amazon.com/ec2/instance-types/g5/
number_of_gpu = 4 # number of GPUs to use for inference and tensor parallelism
health_check_timeout = 300 # health check timeout in seconds (5 minutes), raised to allow for downloading the model

# create the HuggingFaceModel with the image URI; json.dumps turns the integer
# settings into the strings that are placed in the container environment
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env={
    'HF_MODEL_ID': hf_model_id,
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  }
)

# Derive the endpoint name from the model id: keep the part after the "/" and
# replace underscores with hyphens ("open_llama_13b" -> "open-llama-13b").
# NOTE(review): presumably because endpoint names may not contain underscores -- confirm.
endpoint_name = hf_model_id.split("/")[-1].replace("_", "-")
print(endpoint_name)

# Deploy the model to an endpoint called `endpoint_name`, backed by a single
# `instance_type` instance.
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  endpoint_name=endpoint_name,
)

llm image uri: 763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04
open-llama-13b
-------------!

# 3. Invoke model

In [4]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

# Only the tokenizer is loaded locally (to inspect the vocabulary and special
# tokens); text generation itself is performed by the deployed endpoint.
model_path = 'openlm-research/open_llama_13b'
tokenizer = LlamaTokenizer.from_pretrained(model_path)

print(f"Vocab size:{tokenizer.vocab_size}")
print(f"Special tokens:{tokenizer.all_special_tokens}")


# Client used to invoke the deployed endpoint; 'sagemaker-runtime' is the
# canonical boto3 service name ('runtime.sagemaker' is a legacy alias).
runtime = boto3.client('sagemaker-runtime')

# Build a conversation-style prompt: instructions, the chat history so far,
# the new human message, and an open "AI:" turn for the model to complete.
history = "Human: Hi\nAI: Hi. How can I help you today?"
message = "What is the largest animal?"
prompt = f"The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n\nCurrent conversation:\n{history}\nHuman: {message}\nAI:"
print(prompt)

# Request payload: the prompt plus generation parameters. Parameters set to
# None are sent as JSON null.
input_data = {
  "inputs": prompt,
  "parameters": {
    "best_of": None,
    "temperature": 0.7,
    "repetition_penalty": None,
    "top_k": None,
    "top_p": None,
    "typical_p": None,
    "do_sample": True,
    "max_new_tokens": 1000, # max number of tokens to return; prompt + output must fit in the 2048 tokens configured at deployment, so be frugal
    "return_full_text": False, # do not return the prompt as part of the output
    "stop": ["Human:"], # stop once the model starts writing the next human turn
    "truncate": None,
    "watermark": False,
    "details": False,
    "seed": None
  }
}

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(input_data).encode('utf-8'),
)
response_json = json.loads(response['Body'].read().decode("utf-8"))

# The response is a list with one result; strip the "Human:" stop sequence in
# case it is included at the end of the generated text.
print(response_json[0]['generated_text'].replace("Human:", ""))

  from .autonotebook import tqdm as notebook_tqdm


Vocab size:32000
Special tokens:['<s>', '</s>', '<unk>']
The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: Hi
AI: Hi. How can I help you today?
Human: What is the largest animal?
AI:
The largest animal is the blue whale. It can grow up to 30 metres (100 feet) long, weigh 180 metric tonnes (180,000kg) and have a heart the size of a small car.



# 4. Keep in mind the special tokens

In [5]:
# Show which tokens occupy ids 0-19; as the cell's last expression, the
# resulting list is rendered as the cell output.
first_token_ids = range(20)
tokenizer.convert_ids_to_tokens(first_token_ids)

['<unk>',
 '<s>',
 '</s>',
 '<0x00>',
 '<0x01>',
 '<0x02>',
 '<0x03>',
 '<0x04>',
 '<0x05>',
 '<0x06>',
 '<0x07>',
 '<0x08>',
 '<0x09>',
 '<0x0A>',
 '<0x0B>',
 '<0x0C>',
 '<0x0D>',
 '<0x0E>',
 '<0x0F>',
 '<0x10>']

In [6]:
# Order the (token, id) pairs of the vocabulary by id, then keep both the full
# ordered mapping and a mapping holding just the 20 lowest ids.
vocab_items_by_id = sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])
sorted_dict = dict(vocab_items_by_id)
first_20_elements = dict(vocab_items_by_id[:20])

print(first_20_elements)

{'<unk>': 0, '<s>': 1, '</s>': 2, '<0x00>': 3, '<0x01>': 4, '<0x02>': 5, '<0x03>': 6, '<0x04>': 7, '<0x05>': 8, '<0x06>': 9, '<0x07>': 10, '<0x08>': 11, '<0x09>': 12, '<0x0A>': 13, '<0x0B>': 14, '<0x0C>': 15, '<0x0D>': 16, '<0x0E>': 17, '<0x0F>': 18, '<0x10>': 19}


# 5. Delete all models, endpoint_configs & endpoints

In [7]:
def delete_resources(resource_type, exclude=()):
    """Delete all SageMaker resources of one type, optionally keeping some.

    WARNING: this is destructive. Every resource of the given type that the
    default boto3 SageMaker client can list is deleted, not only the ones
    created by this notebook. Use ``exclude`` to protect resources.

    Args:
        resource_type: snake_case resource kind for which the SageMaker client
            provides ``list_<resource_type>s`` and ``delete_<resource_type>``
            operations, e.g. ``'model'``, ``'endpoint'``, ``'endpoint_config'``.
        exclude: a resource name, or an iterable of resource names, that must
            not be deleted. Defaults to deleting everything.

    Raises:
        AttributeError: if the client has no list/delete operation for
            ``resource_type``.
    """
    client = boto3.client('sagemaker')
    list_operation = f"list_{resource_type}s"
    list_method = getattr(client, list_operation)
    delete_method = getattr(client, f"delete_{resource_type}")
    # 'endpoint_config' -> 'EndpointConfig': the CamelCase form used in the
    # response key ('EndpointConfigs') and name key ('EndpointConfigName').
    resource_type_name = resource_type.replace('_', ' ').title().replace(' ', '')
    collection_key = f"{resource_type_name}s"
    name_key = f"{resource_type_name}Name"
    # A bare string is one name, not an iterable of single characters.
    keep = {exclude} if isinstance(exclude, str) else set(exclude)

    # A single list_* call returns only one page of results, so walk all pages
    # when the operation supports pagination; otherwise fall back to one call.
    if client.can_paginate(list_operation):
        pages = client.get_paginator(list_operation).paginate()
    else:
        pages = [list_method()]

    # Collect every name before deleting anything so that deletions cannot
    # disturb the listing while it is still being paged through.
    resource_names = [
        resource[name_key]
        for page in pages
        for resource in page[collection_key]
    ]

    for resource_name in resource_names:
        if resource_name in keep:
            print(f"Skipping {resource_type}: {resource_name}")
            continue
        print(f"Deleting {resource_type}: {resource_name}")
        delete_method(**{name_key: resource_name})

def main():
    """Delete all models, endpoints and endpoint configs, in that order."""
    # Append further resource types to this tuple if needed.
    for kind in ('model', 'endpoint', 'endpoint_config'):
        delete_resources(kind)

# Run the clean-up when executed as the main module, which is the case when
# this cell is run in the notebook. WARNING: this deletes resources immediately.
if __name__ == "__main__":
    main()

Deleting model: huggingface-pytorch-tgi-inference-2023-06-19-22-20-01-036
Deleting endpoint: open-llama-13b
Deleting endpoint_config: open-llama-13b
