# Day 2: Next Steps: Deploying Tuned model on SageMaker 

You need an AWS account to run this tutorial. Let's start by importing the AWS keys.

In [1]:
# Uncomment the lines below and substitute your own values; `%env` sets them as
# environment variables for this kernel. Never commit real keys to version control.
# %env AWS_ACCESS_KEY_ID=******
# %env AWS_SECRET_ACCESS_KEY=******
# %env AWS_DEFAULT_REGION=us-east-1

To deploy an existing Hugging Face model, such as Falcon 40B, you can follow this tutorial [Deploy Falcon 7B & 40B on Amazon SageMaker](https://www.philschmid.de/sagemaker-falcon-llm). But today we are going to deploy our own tuned model with LoRA.

## Let's install deps

In [2]:
# Install the required libraries
!pip install -q git+https://github.com/huggingface/peft
!pip install -q datasets==2.12.0 evaluate==0.4.0 numpy==1.24.3 torch==2.0.1 tqdm==4.65.0 transformers==4.29.2 ipykernel ipywidgets sagemaker

# Import libs

In [3]:
from transformers import AutoModelForSeq2SeqLM, AdamW, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor
from peft import get_peft_model, TaskType, LoraConfig
import functools
import torch
import datasets
import os
import evaluate
import time
import tarfile
import shutil
from pathlib import Path
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm import tqdm

## Train LoRA model with Trainer. 
For more details look at [Day 2: Next Steps: Finetuning](https://github.com/georgian-io/genai-bootcamp/blob/main/notebooks/day-2/06-fine-tuning.ipynb). 
You can use your own checkpoint as an alternative.

In [4]:
# --- Model identifiers -------------------------------------------------------
# Base checkpoint on the Hugging Face Hub: https://huggingface.co/google/flan-t5-large
model_name_or_path = "google/flan-t5-large"
# Local folder used as the Trainer output directory and for the saved model files
peft_model_id = "model_for_sagemaker"

# --- LoRA hyper-parameters ---------------------------------------------------
# r: LoRA attention dimension parameter; dropout_rate: passed to LoraConfig as lora_dropout
r, dropout_rate = 4, 0.1

# --- Optimisation settings ---------------------------------------------------
n_epochs, batch_size, grad_accumulation_steps = 3, 8, 1
lr = 3e-4
max_length = 128  # max_length handed to the tokenizer for the input sentences

# --- Runtime -----------------------------------------------------------------
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def preprocess_function(examples, tokenizer, text_column, label_column, max_length):
    """Tokenize a batch of examples into model inputs plus label ids.

    Args:
        examples: Batch mapping column names to lists. Must contain ``text_column``
            and a ``"text_label"`` column with the target strings.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        text_column: Name of the column holding the input sentences.
        label_column: Accepted for interface compatibility; not read here (targets
            always come from the hard-coded ``"text_label"`` column).
        max_length: Padding/truncation length for the input sentences.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"`` entry in
        which every pad token id has been replaced by -100.
    """
    # Newlines inside a sentence are flattened to single spaces before tokenizing
    flattened = []
    for raw_text in examples[text_column]:
        flattened.append(raw_text.replace('\n', ' '))

    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")
    batch = tokenizer(flattened, max_length=max_length, **shared_kwargs)

    # Targets are padded/truncated to a fixed length of 2 tokens
    label_ids = tokenizer(examples["text_label"], max_length=2, **shared_kwargs)["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    batch["labels"] = label_ids

    return batch

# Load Financial PhraseBank; "sentences_allagree" means all annotators agreed on the label
text_column, label_column = "sentence", "label"
dataset = datasets.load_dataset("financial_phrasebank", "sentences_allagree")

# Hold out 200 shuffled examples (fixed seed) and expose them under "validation"
# instead of the "test" key produced by train_test_split.
dataset = dataset["train"].train_test_split(test_size=200, seed=42, shuffle=True)
dataset["validation"] = dataset.pop("test")

# Add a "text_label" column holding the class name of every integer label
classes = dataset["train"].features[label_column].names


def _add_text_label(batch):
    """Map a batch of integer labels to their class-name strings."""
    return {"text_label": [classes[label_idx] for label_idx in batch[label_column]]}


dataset = dataset.map(_add_text_label, batched=True, num_proc=1)


# Build Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

# Use LoRA.
# The base model is an encoder-decoder model loaded through AutoModelForSeq2SeqLM, so the
# adapter has to be declared as a sequence-to-sequence task. TaskType.SEQ_2_SEQ_LM makes
# get_peft_model build the seq2seq PEFT wrapper and stores the matching task type in the
# saved adapter config (TaskType.CAUSAL_LM is intended for decoder-only models).
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=r, lora_alpha=32, lora_dropout=dropout_rate
)
model = get_peft_model(model, peft_config)

# Report how many parameters remain trainable, then move the model to the selected device
model.print_trainable_parameters()
model = model.to(device)


# Preprocess dataset
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Bind the fixed arguments once so that `map` only has to supply the batch of examples
tokenize_batch = functools.partial(
    preprocess_function,
    tokenizer=tokenizer,
    text_column=text_column,
    label_column=label_column,
    max_length=max_length,
)
raw_column_names = dataset["train"].column_names
processed_datasets = dataset.map(
    tokenize_batch,
    batched=True,
    num_proc=1,
    remove_columns=raw_column_names,  # keep only the tokenizer outputs
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["validation"]

# Collator: labels are padded with -100, batches are padded to a multiple of 8
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)
# Define training args: evaluate, log and checkpoint once per epoch
training_args = Seq2SeqTrainingArguments(
    output_dir=peft_model_id,
    logging_dir=f"{peft_model_id}/logs",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=n_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accumulation_steps,
    auto_find_batch_size=True,
)
print(f"training_args = {training_args}")

# Create Trainer instance
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

Found cached dataset financial_phrasebank (/home/ubuntu/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/ubuntu/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141/cache-70d5ee610c33551c.arrow and /home/ubuntu/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141/cache-d30fb0cf1bd2beb4.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141/cache-f788c6abf00c795b.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141/cache-a5b1561fc5e1eba2.arrow


trainable params: 1,179,648 || all params: 784,329,728 || trainable%: 0.15040205131686657


Running tokenizer on dataset:   0%|          | 0/2064 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

training_args = Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=True,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_

## Run training 

In [5]:
# Fine-tune with the settings above, then compute metrics on the evaluation dataset
train_output = trainer.train()
eval_metrics = trainer.evaluate()
eval_metrics

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6076,0.075982
2,0.0421,0.04155
3,0.0291,0.054657


{'eval_loss': 0.054656725376844406,
 'eval_runtime': 2.1386,
 'eval_samples_per_second': 93.518,
 'eval_steps_per_second': 11.69,
 'epoch': 3.0}

## Save the trained model

In [6]:
# Save our LoRA model & tokenizer results into the `peft_model_id` folder
trained_model = trainer.model
trained_model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# Also save the wrapped base model into the same folder.
# NOTE(review): presumably the source of the large pytorch_model.bin and of config.json /
# generation_config.json seen in the folder listing — confirm.
trained_model.base_model.save_pretrained(peft_model_id)

## Check model files

In [7]:
# Show what was written to the model folder
!ls {peft_model_id}

README.md	     checkpoint-516	     pytorch_model.bin
adapter_config.json  checkpoint-774	     special_tokens_map.json
adapter_model.bin    config.json	     tokenizer.json
checkpoint-258	     generation_config.json  tokenizer_config.json


# Packaging model for inference

We are going to use [sagemaker-huggingface-inference-toolkit](https://github.com/aws/sagemaker-huggingface-inference-toolkit/) for inference. To customize the libraries and the inference function, we need to define custom `inference.py` and `requirements.txt` files.


### Let's define inference.py with a custom model_fn (to load the LoRA model) and predict_fn

In [8]:
%%writefile inference.py

from typing import Dict, Tuple

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Upper bound on generated tokens per request (passed to generate() as max_new_tokens)
MAX_NEW_TOKEN = 256
# Passed to generate() as `temperature`.
# NOTE(review): generate() is called without do_sample, so this value may have no effect — confirm
TEMPERATURE = 0.001


def model_fn(model_dir: str) -> Tuple[PeftModel, AutoTokenizer]:
    """Load the LoRA-tuned model and its tokenizer for inference.

    The adapter configuration stored in ``model_dir`` names the base checkpoint. The
    tokenizer and base model are loaded from that name, the adapter from ``model_dir``
    is applied on top, and ``merge_and_unload`` is called on the result.

    Args:
        model_dir: Directory containing the saved adapter configuration and weights.

    Returns:
        ``(model, tokenizer)``; the model is in eval mode and placed on the GPU when
        CUDA is available, otherwise on the CPU.
    """
    peft_model_id = model_dir
    config = PeftConfig.from_pretrained(peft_model_id)
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    model_lora = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
    model_lora = PeftModel.from_pretrained(model_lora, peft_model_id)
    model_lora = model_lora.merge_and_unload()
    model_lora.eval()
    # Fall back to CPU instead of failing on a hard-coded .cuda() when no GPU is present
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_lora.to(device)
    return model_lora, tokenizer


def predict_fn(data: Dict[str, str], model_and_tokenizer: Tuple[PeftModel, AutoTokenizer]):
    """Generate text for the prompt contained in ``data``.

    Args:
        data: Request payload. The ``"prompt"`` entry is popped and used as the model
            input; when the key is absent the whole payload is handed to the tokenizer.
        model_and_tokenizer: ``(model, tokenizer)`` pair as returned by ``model_fn``.

    Returns:
        ``{"result": text}`` where ``text`` is the decoded first generated sequence.
    """
    # destruct model and tokenizer
    model, tokenizer = model_and_tokenizer

    # Tokenize sentences
    prompt = data.pop("prompt", data)
    # Place the inputs on whatever device the model's weights live on instead of assuming CUDA
    model_device = next(model.parameters()).device
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model_device)
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKEN, temperature=TEMPERATURE)
    print(f"input prompt: {prompt}\n{'---'* 20}")
    result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    print(f"model output:\n{result}")

    return {"result": result}


Overwriting inference.py


In [9]:
%%writefile requirements.txt

transformers==4.29.1
peft==0.3.0
bitsandbytes==0.38.1

Overwriting requirements.txt


### Now we need to create a model.tar.gz archive containing the model files and the code, using the package_model function.

In [10]:
def package_model(path_to_model_folder: Path = Path(""), path_to_result: Path = Path("model.tar.gz")) -> None:
    """Bundle a saved model folder and the inference code into a ``.tar.gz`` archive.

    The archive contains a ``code/`` directory (``inference.py`` and ``requirements.txt``,
    copied from the current working directory) plus a fixed list of model and tokenizer
    files taken from ``path_to_model_folder`` and stored at the archive root.

    Side effect: a ``code`` sub-folder is created (or reused) inside
    ``path_to_model_folder`` and the two code files are copied into it.

    Args:
        path_to_model_folder: Folder holding the saved model files.
        path_to_result: Destination archive; its name must end with ``.tar.gz``.

    Raises:
        NotADirectoryError: ``path_to_model_folder`` is not a directory.
        ValueError: ``path_to_result`` does not end with ``.tar.gz``.
        FileNotFoundError: A required model or code file is missing. This is checked
            before the archive is opened, so no truncated archive is left behind.
    """
    path_to_model_folder = Path(path_to_model_folder)
    path_to_result = Path(path_to_result)

    # Check if path_to_model_folder is a directory
    if not path_to_model_folder.is_dir():
        raise NotADirectoryError(f"{path_to_model_folder} is not a directory")

    # Ensure the result path is a tar.gz file
    if not str(path_to_result).endswith(".tar.gz"):
        raise ValueError(f"{path_to_result} should end with .tar.gz")

    list_files_to_save = [
        "adapter_config.json",
        "adapter_model.bin",
        "config.json",
        "generation_config.json",
        "special_tokens_map.json",
        "tokenizer.json",
        "tokenizer_config.json",
        "pytorch_model.bin",
    ]
    # Code files are read from the current working directory
    code_files = ["requirements.txt", "inference.py"]

    # Validate every input up-front: failing inside the `with` block below would leave
    # a partially written archive at path_to_result.
    missing = [
        str(path_to_model_folder / file_name)
        for file_name in list_files_to_save
        if not (path_to_model_folder / file_name).exists()
    ]
    missing.extend(file_name for file_name in code_files if not Path(file_name).exists())
    if missing:
        raise FileNotFoundError(f"Cannot package model, missing files: {missing}")

    with tarfile.open(path_to_result, "w:gz") as tar:
        print("Adding code folder with inference and requirements.txt.")
        code_folder = path_to_model_folder / "code"
        code_folder.mkdir(exist_ok=True)
        for file_name in code_files:
            shutil.copy(file_name, code_folder / file_name)
        tar.add(code_folder, arcname="code")

        print(f"Adding {list_files_to_save} model files to model tar.gz.")
        for file_name in list_files_to_save:
            tar.add(path_to_model_folder / file_name, arcname=file_name)

In [11]:
# Build model.tar.gz in the working directory from the folder the model was saved to
package_model(path_to_model_folder=peft_model_id, path_to_result=Path("model.tar.gz"))    

Adding code folder with inference and requirements.txt.
Adding ['adapter_config.json', 'adapter_model.bin', 'config.json', 'generation_config.json', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'pytorch_model.bin'] model files to model tar.gz.


In [12]:
# Report the size of the packaged archive
!du -sh model.tar.gz

2.8G	model.tar.gz


### Upload this tar file to S3, where SageMaker can find it

In [13]:
# S3 location the archive is uploaded to, and the IAM role handed to HuggingFaceModel.
# Both values are specific to one AWS account, so they can be overridden through the
# environment variables MODEL_DATA_S3_PATH / SAGEMAKER_ROLE_ARN without editing the
# notebook. The previous literals remain the defaults.
MODEL_DATA_S3_PATH = os.environ.get("MODEL_DATA_S3_PATH", "s3://genai-bootcamp/test-example/model.tar.gz")
ROLE = os.environ.get(
    "SAGEMAKER_ROLE_ARN",
    "arn:aws:iam::823217009914:role/service-role/AmazonSageMaker-ExecutionRole-20180305T161813",
)


In [14]:
# Copy the local archive to the S3 location stored in MODEL_DATA_S3_PATH
!aws s3 cp model.tar.gz {MODEL_DATA_S3_PATH}

upload: ./model.tar.gz to s3://genai-bootcamp/test-example/model.tar.gz


# Deploy model: Create model, endpoint configuration and endpoint

In [16]:
endpoint_name = "genai-bootcamp-flan"
instance_type = "ml.g5.2xlarge"

# NOTE: `name` sets the SageMaker *model* name. deploy() is not given an endpoint name,
# so the endpoint receives a generated one (see the "Check endpoint" output).
huggingface_model = HuggingFaceModel(
    model_data=MODEL_DATA_S3_PATH,
    role=ROLE,
    name=endpoint_name,
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
)

# Keep the predictor returned by deploy() instead of discarding it: it is bound to the
# endpoint that was just created, so the generated endpoint name can be read back from
# it rather than copied by hand from the CLI output.
deployed_predictor = huggingface_model.deploy(initial_instance_count=1, instance_type=instance_type)
print(f"endpoint name: {deployed_predictor.endpoint_name}")
deployed_predictor

---------!

<sagemaker.huggingface.model.HuggingFacePredictor at 0x7feb765938b0>

## Check model 

In [17]:
# List registered model names and keep only the ones containing "genai-bootcamp"
!aws sagemaker list-models --query 'Models[].ModelName' --output table | grep genai-bootcamp


|  genai-bootcamp-flan                                              |


## Check endpoint

In [18]:
# List endpoints and keep only the lines containing "genai-bootcamp"; the EndpointName
# shown here is the one hard-coded in the query and clean-up cells
!aws sagemaker list-endpoints --output table | grep genai-bootcamp



||  EndpointArn     |  arn:aws:sagemaker:us-east-1:823217009914:endpoint/genai-bootcamp-flan-2023-06-18-22-57-23-846   ||
||  EndpointName    |  genai-bootcamp-flan-2023-06-18-22-57-23-846                                                     ||


## Query model 

In [20]:

# Time a single request against the endpoint (name copied from the "Check endpoint" output)
predictor = HuggingFacePredictor("genai-bootcamp-flan-2023-06-18-22-57-23-846")
prompt = "test"
started_at = time.monotonic()
response = predictor.predict({"prompt": prompt})
elapsed_seconds = time.monotonic() - started_at
print(f"response = {response}; time = {elapsed_seconds}")


response = {'result': 'neutral'}; time = 1.5374922808259726


## Clean resources

In [22]:
# Tear down the model, endpoint configuration and endpoint.
# The endpoint-config and endpoint names below are the generated names from one specific
# deployment; substitute the names reported for your own run.
# NOTE(review): the recorded output shows "Could not find ..." errors, presumably because
# the resources had already been deleted when this cell was executed — confirm.
!aws sagemaker delete-model --model-name {endpoint_name}
!aws sagemaker delete-endpoint-config --endpoint-config-name genai-bootcamp-flan-2023-06-18-22-57-23-846
!aws sagemaker delete-endpoint --endpoint-name genai-bootcamp-flan-2023-06-18-22-57-23-846


An error occurred (ValidationException) when calling the DeleteModel operation: Could not find model "arn:aws:sagemaker:us-east-1:823217009914:model/genai-bootcamp-flan".

An error occurred (ValidationException) when calling the DeleteEndpoint operation: Could not find endpoint "arn:aws:sagemaker:us-east-1:823217009914:endpoint/genai-bootcamp-flan-2023-06-18-22-57-23-846".
