In [None]:
!pip install -q transformers datasets sentencepiece
!pip install -U -q sagemaker

### Deploy Flan T5 on SageMaker and test with hosting services

This blog post explains the approach in detail: https://www.philschmid.de/deploy-flan-t5-sagemaker

In [None]:
import sagemaker
import boto3
# Bootstrap session: only needed to look up the default bucket.
sess = sagemaker.Session()

# Bucket SageMaker uses for data, model artifacts and logs. Leave as None to
# fall back to the session default (created automatically if it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Prefer the execution role attached to the environment; when that lookup
# raises ValueError, resolve the role ARN from IAM by its name instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Rebuild the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

#### Download the tokenizer files from the Hugging Face Hub

In [None]:
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory

from huggingface_hub import snapshot_download

HF_MODEL_ID = "google/flan-t5-small"

# Local directory that receives the files downloaded from the Hub.
model_tar_dir = Path('fine-tuned-' + HF_MODEL_ID.split("/")[-1])

# Download into a throwaway cache directory, then copy out what we keep.
with TemporaryDirectory() as tmpdir:
    # Every file matching these patterns is skipped, so the Hub checkpoint
    # weights are not downloaded.
    snapshot_dir = snapshot_download(
        repo_id=HF_MODEL_ID,
        cache_dir=tmpdir,
        ignore_patterns=["*.msgpack", "*.h5", "*model*"],
    )
    # shutil.copytree replaces distutils' copy_tree: distutils is deprecated
    # (PEP 632, removed in Python 3.12) and copy_tree caches the directories
    # it created, which breaks a re-run after the target was deleted.
    # dirs_exist_ok (Python 3.8+) keeps the cell re-runnable.
    shutil.copytree(snapshot_dir, str(model_tar_dir), dirs_exist_ok=True)


In [None]:
# Local folder the training artifact is downloaded into.
PYTORCH_MODEL_LOCATION = "./model"
# File name of the artifact under the training job's output prefix.
MDDEL_TAR_NAME = "model.tar.gz"
# Training job whose output is deployed.
training_job_name = "huggingface-finetune-twitter2023-04-21--2023-04-21-02-02-26-020"
s3_model_tar_gz_uri = f"s3://{sess.default_bucket()}/{training_job_name}/output/{MDDEL_TAR_NAME}"
print("Fine tuned model artifact is located at ", s3_model_tar_gz_uri)

In [None]:
from sagemaker.s3 import S3Downloader

# Download the training job's model.tar.gz from S3 into the local model folder.
S3Downloader.download(
    local_path=PYTORCH_MODEL_LOCATION,
    s3_uri=s3_model_tar_gz_uri,
)

print(f"model downloaded from {s3_model_tar_gz_uri} and saved locally at {PYTORCH_MODEL_LOCATION}")

In [None]:
import tarfile
import os

# Name of the sub-directory that receives the unpacked archive.
OUTPUT_MODEL_DIR = "sagemaker_finetuned_model"

def extraction(tar_dir, tar_file, output_dir):
    """Unpack a gzip-compressed tarball into a sub-directory next to it.

    Args:
        tar_dir: Directory that contains the archive.
        tar_file: File name of the ``.tar.gz`` archive inside ``tar_dir``.
        output_dir: Name of the sub-directory of ``tar_dir`` to extract into.

    Raises:
        tarfile.TarError: If the archive is unreadable or, on interpreters
            with extraction filters, a member is rejected by the "data" filter.
        OSError: If the archive cannot be opened or files cannot be written.
    """
    tar_location = os.path.join(tar_dir, tar_file)
    destination = os.path.join(tar_dir, output_dir)
    with tarfile.open(tar_location, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Refuse members that would land outside `destination`
            # (absolute paths, "..", links pointing elsewhere).
            tar.extractall(destination, filter="data")
        else:
            # Older interpreters have no extraction filters.
            tar.extractall(destination)
        print("Extracted to ", destination)

# Unpack the downloaded archive into PYTORCH_MODEL_LOCATION/OUTPUT_MODEL_DIR.
extraction(PYTORCH_MODEL_LOCATION, MDDEL_TAR_NAME, OUTPUT_MODEL_DIR)

#### Copy the fine-tuned PyTorch model into the tokenizer directory

In [None]:
# Checkpoint folder to evaluate; point this at a different checkpoint
# to compare model performance.
check_point = "checkpoint-5500"
PYTORCH_BIN_LOCATION = os.path.join(
    PYTORCH_MODEL_LOCATION,
    OUTPUT_MODEL_DIR,
    check_point,
    "pytorch_model.bin",
)
PYTORCH_BIN_LOCATION

In [None]:
model_tar_dir

#### Replace the pytorch_model.bin with fine tuned model on SageMaker

In [None]:
# NOTE(review): distutils is deprecated (PEP 632) and no longer ships with
# Python 3.12+. `copy_file` is also called by a later cell, so a migration to
# shutil has to cover both call sites.
from distutils.file_util import copy_file
# Copy the fine-tuned checkpoint weights into the model directory.
copy_file(PYTORCH_BIN_LOCATION, str(model_tar_dir))

#### Let's test our fine-tuned model locally to make sure everything works

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def model_fn(model_dir):
    """Load the seq2seq model and its tokenizer from ``model_dir``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is put in eval mode and
        moved to the module-level ``device``.
    """
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    tok = AutoTokenizer.from_pretrained(model_dir)
    # eval() returns the module itself, so it can be chained with .to().
    return seq2seq.eval().to(device), tok


def predict_fn(data, model_and_tokenizer):
    """Generate text for one request payload.

    Args:
        data: Mapping with the prompt under ``"inputs"`` and, optionally,
            keyword arguments for ``model.generate`` under ``"parameters"``.
            The mapping is left unmodified, so the same payload can be
            passed again.
        model_and_tokenizer: ``(model, tokenizer)`` tuple as returned by
            ``model_fn``.

    Returns:
        A one-element list ``[{"generated_text": <str>}]`` holding the decoded
        first generated sequence.
    """
    model, tokenizer = model_and_tokenizer

    # Pop from a shallow copy: popping from `data` itself would strip the
    # caller's dict and make a second call with the same payload fail.
    payload = dict(data)
    inputs = payload.pop("inputs", payload)
    parameters = payload.pop("parameters", None)

    # Tokenize and move the ids to the same device as the model.
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids
    input_ids = input_ids.to(device)

    with torch.no_grad():
        # Forward user-supplied generation kwargs only when present.
        if parameters is not None:
            outputs = model.generate(input_ids, **parameters)
        else:
            outputs = model.generate(input_ids)

    # Only the first generated sequence is decoded and returned.
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return [{"generated_text": prediction}]

In [None]:
# Load model and tokenizer from the locally assembled model directory.
model_and_tokenizer = model_fn(model_tar_dir)

#### Let's test summarization with our fine-tuned model

In [None]:
text = "summarize: Lenny: Babe, can you help me with something?\r\nBob: Sure, what's up?\r\nLenny: Which one should I pick?\r\nBob: Send me photos\r\nLenny:  <file_photo>\r\nLenny:  <file_photo>\r\nLenny:  <file_photo>\r\nBob: I like the first ones best\r\nLenny: But I already have purple trousers. Does it make sense to have two pairs?\r\nBob: I have four black pairs :D :D\r\nLenny: yeah, but shouldn't I pick a different color?\r\nBob: what matters is what you'll give you the most outfit options\r\nLenny: So I guess I'll buy the first or the third pair then\r\nBob: Pick the best quality then\r\nLenny: ur right, thx\r\nBob: no prob :"
# Request payload in the {"inputs": ...} shape that predict_fn unpacks.
input_json = {"inputs": text}
print(f"{text}\n---------------")
results = predict_fn(input_json, model_and_tokenizer)
print(f"{results}\n---------------")

In [None]:
!pygmentize ./scripts/inference_flan_t5_model_hub.py

#### Let's copy our inference code alongside the model artifact and prepare to deploy on SageMaker Endpoint 

In [None]:
from os import path
# The inference script goes into a "code" sub-folder of the model directory,
# under the file name inference.py.
inference_code_dir = Path(model_tar_dir, "code")
if not path.exists(inference_code_dir):
    inference_code_dir.mkdir()
copy_file(
    src="./scripts/inference_flan_t5_model_hub.py",
    dst=str(inference_code_dir / "inference.py"),
)

In [None]:
import tarfile
import os

# helper to create the model.tar.gz
def compress(tar_dir=None, output_file="model.tar.gz"):
    """Pack the contents of ``tar_dir`` into a gzip-compressed tarball.

    Each top-level entry of ``tar_dir`` is stored under its own name, so the
    archive has no leading directory. The process working directory is never
    changed, so a failure cannot leave the kernel in a different directory.

    Args:
        tar_dir: Directory whose contents are archived.
        output_file: Archive path; a relative path is resolved against the
            current working directory.

    Raises:
        TypeError: If ``tar_dir`` is None.
        OSError: If ``tar_dir`` cannot be listed or the archive cannot be
            written.
    """
    if tar_dir is None:
        raise TypeError("tar_dir must be a directory path, not None")

    archive_path = os.path.join(os.getcwd(), output_file)
    # List the directory before creating the archive so a bad `tar_dir` fails
    # without leaving an empty archive behind.
    entries = os.listdir(tar_dir)
    with tarfile.open(archive_path, "w:gz") as tar:
        for item in entries:
            print(item)
            tar.add(os.path.join(tar_dir, item), arcname=item)

# Package the model directory as model.tar.gz in the current working directory.
compress(str(model_tar_dir))

In [None]:
from sagemaker.s3 import S3Uploader

# Push the repackaged model.tar.gz to the session's default bucket.
s3_model_uri = S3Uploader.upload(
    local_path="model.tar.gz",
    desired_s3_uri=f"s3://{sess.default_bucket()}/flan-t5-small",
)

print(f"model uploaded to: {s3_model_uri}")

In [None]:
#s3_model_uri = "s3://sagemaker-eu-west-1-707684582322/flan-t5-small/model.tar.gz"

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Describe the model for SageMaker: artifact location, IAM role and the
# framework versions to run it with.
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,       # S3 URI of model.tar.gz (weights + inference script)
    role=role,                     # IAM role with permission to create an endpoint
    transformers_version="4.17",   # transformers version
    pytorch_version="1.10",        # PyTorch version
    py_version="py38",             # Python version
)

# Deploy the model to an endpoint backed by a single instance.
predictor = huggingface_model.deploy(
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
)

#### Run inference against the deployed endpoint with the AWS SDK for Python (boto3)

In [None]:
import boto3
import json

In [None]:
# SageMaker runtime client used below to invoke the endpoint.
runtime = boto3.client("sagemaker-runtime")


**Set `endpoint_name` to the name of your deployed endpoint**

In [None]:
# Hard-coded name of an already deployed endpoint; replace it with your own.
# NOTE(review): if the deploy cell ran in this kernel, `predictor.endpoint_name`
# presumably carries the name of the endpoint it created — confirm before use.
endpoint_name = "huggingface-pytorch-inference-2023-05-03-02-47-34-073"

In [None]:
# NOTE(review): this prompt is overwritten by the assignment on the next line,
# so only the summarization prompt is actually sent to the endpoint.
text = "Sentiment classification: The house is wonderful"
text = "summarize: Lenny: Babe, can you help me with something?\r\nBob: Sure, what's up?\r\nLenny: Which one should I pick?\r\nBob: Send me photos\r\nLenny:  <file_photo>\r\nLenny:  <file_photo>\r\nLenny:  <file_photo>\r\nBob: I like the first ones best\r\nLenny: But I already have purple trousers. Does it make sense to have two pairs?\r\nBob: I have four black pairs :D :D\r\nLenny: yeah, but shouldn't I pick a different color?\r\nBob: what matters is what you'll give you the most outfit options\r\nLenny: So I guess I'll buy the first or the third pair then\r\nBob: Pick the best quality then\r\nLenny: ur right, thx\r\nBob: no prob :"
# Request payload that gets JSON-serialized and sent to the endpoint.
input_json = {"inputs": text}
print(f"{text}\n---------------")

In [None]:
# Send the JSON-encoded payload to the endpoint.
response = runtime.invoke_endpoint(
    ContentType="application/json",
    Body=json.dumps(input_json),
    EndpointName=endpoint_name,
)

# The response body is a stream; read() returns its raw content.
print(response["Body"].read())

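#### Don't forget to delete your endpoint once you have finished testing (for example with `predictor.delete_model()` followed by `predictor.delete_endpoint()`), since a running endpoint keeps incurring charges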