### Local test with Hugging Face and PyTorch

#### This notebook was tested with the Python 3.9 / PyTorch 1.13 GPU-optimized container on SageMaker Studio

In [None]:
!pip install -q transformers==4.26 datasets sentencepiece
!pip install -U -q sagemaker

#### HuggingFace FLAN-T5
https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/flan-t5#overview

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# The model weights and the tokenizer are both loaded from the same checkpoint id.
checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

#### Task: text generation

In [None]:
# Tokenize the prompt into PyTorch tensors.
inputs = tokenizer("A step by step recipe to make bolognese pasta:", return_tensors="pt")
# Set an explicit generation budget: without it, generate() falls back to the
# model's default maximum length, which cuts a multi-step recipe short.
outputs = model.generate(**inputs, max_new_tokens=128)
# batch_decode returns one decoded string per generated sequence.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

#### Task: translation English to French

In [None]:
# Ask the model to translate a short English sentence into French.
prompt = "Translate English to French: The house is wonderful."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs)
# Only the first generated sequence is decoded and shown.
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translation)

#### Task: translation English to German

In [None]:
# Ask the model to translate a short English sentence into German.
prompt = "Translate English to German: The house is wonderful."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs)
# Only the first generated sequence is decoded and shown.
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translation)

#### Task: sentiment classification (positive / negative)

In [None]:
# Phrase sentiment classification as a text-to-text prompt.
prompt = "Put below sentences into positive and negative: The house is wonderful"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs)
# Only the first generated sequence is decoded and shown.
sentiment = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(sentiment)

### Deploy Flan T5 on SageMaker and test with hosting services

This blog post explains the approach in detail: https://www.philschmid.de/deploy-flan-t5-sagemaker

In [None]:
import sagemaker
import boto3
# Start from a default session so the default bucket can be looked up.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs; SageMaker creates it
# automatically if it does not exist yet. Set a name here to override the default.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    # No bucket name given: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role. If get_execution_role() raises ValueError (presumably
# when running outside SageMaker -- confirm), look the role up by name via IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Re-create the session bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from distutils.dir_util import copy_tree
from pathlib import Path
from tempfile import TemporaryDirectory
from huggingface_hub import snapshot_download

HF_MODEL_ID = "google/flan-t5-small"
# Local directory named after the last part of the model id; its contents are
# packaged into model.tar.gz later in the notebook.
model_tar_dir = Path(HF_MODEL_ID.split("/")[-1])
# exist_ok=True keeps this cell re-runnable: a plain mkdir() raises
# FileExistsError once the directory exists from a previous run.
model_tar_dir.mkdir(exist_ok=True)

# Download into a temporary cache directory that is removed when the block exits.
with TemporaryDirectory() as tmpdir:
    # Skip *.msgpack and *.h5 files (presumably the Flax / TensorFlow weights,
    # which the PyTorch container does not need -- confirm).
    snapshot_dir = snapshot_download(
        repo_id=HF_MODEL_ID,
        cache_dir=tmpdir,
        ignore_patterns=["*.msgpack", "*.h5"],
    )
    # Copy the snapshot into the model dir before the temporary cache disappears.
    copy_tree(snapshot_dir, str(model_tar_dir))


In [None]:
from distutils.dir_util import copy_tree
from distutils.file_util import copy_file
from pathlib import Path
from tempfile import TemporaryDirectory
from os import path


# Copying scripts/ into the model dir is not necessary with SageMaker HuggingFace.
HF_MODEL_ID = "google/flan-t5-small"
# Make sure the model directory (named after the last part of the model id) exists.
model_tar_dir = Path(HF_MODEL_ID.rsplit("/", 1)[-1])
if not model_tar_dir.exists():
    model_tar_dir.mkdir()

In [None]:
# Print the inference script with syntax highlighting; the same file is later
# copied into the model archive as code/inference.py.
!pygmentize ./scripts/inference_flan_t5_model_hub.py

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


def model_fn(model_dir):
    """Load the seq2seq model and its tokenizer from ``model_dir``.

    Args:
        model_dir: Directory containing the saved model and tokenizer files.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is in eval mode and has been
        moved to the module-level ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    # Switch to inference mode before handing the model out.
    seq2seq_model.eval()
    seq2seq_model = seq2seq_model.to(device)
    return seq2seq_model, tokenizer


def predict_fn(data, model_and_tokenizer):
    """Generate text for a single request payload.

    Args:
        data: Request dict. ``data["inputs"]`` is passed to the tokenizer (when
            the key is absent, the remaining payload itself is passed instead).
            The optional ``data["parameters"]`` is a dict of keyword arguments
            forwarded to ``model.generate``.
        model_and_tokenizer: The ``(model, tokenizer)`` pair returned by
            ``model_fn``.

    Returns:
        A one-element list ``[{"generated_text": <text>}]`` where ``<text>`` is
        the decoded first generated sequence.
    """
    # unpack model and tokenizer
    model, tokenizer = model_and_tokenizer

    # Work on a shallow copy: popping keys from the caller's dict would empty
    # it, so calling predict_fn twice with the same payload would break.
    request = dict(data)
    inputs = request.pop("inputs", request)
    parameters = request.pop("parameters", None)

    # preprocess: tokenize and move the ids to the same device as the model
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids
    input_ids = input_ids.to(device)

    # forward any caller-supplied generation kwargs; none means library defaults
    generate_kwargs = parameters if parameters is not None else {}
    outputs = model.generate(input_ids, **generate_kwargs)

    # postprocess the prediction (only the first generated sequence is returned)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return [{"generated_text": prediction}]


#### Test the inference code locally

In [None]:
# Sample request: the prompt goes under the "inputs" key.
text = "Put below sentences into positive and negative: The house is wonderful"
input_json = dict(inputs=text)

In [None]:
# Load the (model, tokenizer) pair from the local ./flan-t5-small directory
# using the model_fn handler defined above.
model_and_tokenizer = model_fn("./flan-t5-small")

In [None]:
# Run the predict_fn handler on the sample request; the bare `results`
# expression displays the returned value as the cell output.
results = predict_fn(input_json,model_and_tokenizer)
results

In [None]:
# The inference script goes into a "code" sub-directory of the model
# directory, under the name inference.py.
inference_code_dir = Path(HF_MODEL_ID.split("/")[-1]) / "code"
if not inference_code_dir.exists():
    inference_code_dir.mkdir()
inference_script_dst = str(inference_code_dir / "inference.py")
# Left as the last expression so the copy_file result is shown as cell output.
copy_file(src="./scripts/inference_flan_t5_model_hub.py", dst=inference_script_dst)

In [None]:
import tarfile
import os

# helper to create the model.tar.gz
def compress(tar_dir=None, output_file="model.tar.gz"):
    """Pack the contents of ``tar_dir`` into a gzip-compressed tar archive.

    Every entry directly inside ``tar_dir`` is stored at the root of the
    archive (directories are added recursively) and its name is printed as it
    is added.

    Args:
        tar_dir: Directory whose contents are archived.
        output_file: Path of the archive to write; a relative path is resolved
            against the current working directory.

    Raises:
        TypeError: If ``tar_dir`` is None.
        OSError: If ``tar_dir`` cannot be listed or the archive cannot be
            written.
    """
    if tar_dir is None:
        raise TypeError("tar_dir must be the path of the directory to archive")

    archive_path = os.path.abspath(output_file)
    # List the directory first so a missing tar_dir fails before any archive
    # file is created. Sorting makes the member order deterministic.
    entries = sorted(os.listdir(tar_dir))

    # Explicit paths replace os.chdir(): changing the working directory without
    # try/finally leaves the whole notebook in the wrong directory on any error.
    with tarfile.open(archive_path, "w:gz") as tar:
        for item in entries:
            item_path = os.path.join(tar_dir, item)
            # Never add the archive that is currently being written to itself.
            if os.path.abspath(item_path) == archive_path:
                continue
            print(item)
            tar.add(item_path, arcname=item)

# Package the contents of the model directory into model.tar.gz (the default
# output_file) in the current working directory.
compress(str(model_tar_dir))

In [None]:
from sagemaker.s3 import S3Uploader

# upload model.tar.gz to s3
# The key prefix is derived from HF_MODEL_ID so it matches the model that was
# actually packaged (the previous hard-coded "flan-t5-large" prefix did not).
s3_model_uri = S3Uploader.upload(
    local_path="model.tar.gz",
    desired_s3_uri=f"s3://{sess.default_bucket()}/{HF_MODEL_ID.split('/')[-1]}",
)

print(f"model uploaded to: {s3_model_uri}")

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Describe the model to SageMaker: where the archive lives, which IAM role to
# use, and which framework versions to run it with.
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,       # S3 URI of the model archive (weights + script)
    role=role,                     # IAM role with permissions to create an Endpoint
    transformers_version="4.26",   # transformers version
    pytorch_version="1.13",        # PyTorch version
    py_version="py39",             # Python version
)

# Deploy the model to a single-instance endpoint.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)


#### Run inference against the deployed model with the Python SDK (boto3)

In [None]:
import boto3
import json

In [None]:
runtime = boto3.client("sagemaker-runtime")
# Use the endpoint created by `huggingface_model.deploy(...)` above, so that
# `endpoint_name` is always defined when the invoke cell runs.
# To call an endpoint from an earlier session instead, replace the assignment
# below with its name, e.g.
# endpoint_name = "huggingface-pytorch-inference-2023-03-24-03-24-49-183"
endpoint_name = predictor.endpoint_name

In [None]:
# Prompt that will be sent to the deployed endpoint.
text = "Put below sentences into positive and negative: The house is wonderful"

In [None]:
# Request payload: the prompt goes under the "inputs" key.
input_json = dict(inputs=text)

In [None]:
# Serialize the payload to JSON and send it to the endpoint.
request_body = json.dumps(input_json)
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)

# The response body is a stream; read it fully and print the raw result.
raw_prediction = response["Body"].read()
print(raw_prediction)