### Install deps

In [3]:
!pip install "sagemaker==2.116.0" "huggingface_hub==0.12.0" --upgrade --quiet

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Setting up environment

In [5]:
import sagemaker
import boto3
# Bootstrap session: in this cell it is only used to look up the default bucket
# name; `sess` is rebound to a bucket-specific session further down.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN used later when creating the model.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fallback: look the role up by a fixed name through IAM.
    # NOTE(review): presumably hit when running outside SageMaker; the role
    # name 'sagemaker_execution_role' must exist in the account -- confirm.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session, now explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved role, bucket and region for the reader.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker role arn: arn:aws:iam::346762710647:role/service-role/AmazonSageMaker-ExecutionRole-20221118T120164
sagemaker bucket: sagemaker-us-east-1-346762710647
sagemaker session region: us-east-1


### Steps
    Create FLAN-T5 XXL inference script with bnb quantization
    Create SageMaker model.tar.gz artifact
    Deploy the model to Amazon SageMaker
    Run inference using the deployed model

### Create FLAN-T5 XXL inference script with bnb quantization
Packing requirements.txt and inference.py in code/

In [7]:
!mkdir code

In [8]:
%%writefile code/requirements.txt
accelerate==0.16.0
transformers==4.26.0
bitsandbytes==0.37.0

Writing code/requirements.txt


In [9]:
%%writefile code/inference.py
from typing import Dict, List, Any
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch


def model_fn(model_dir):
    """Load the seq2seq model and its tokenizer from ``model_dir``.

    The model is loaded with ``device_map="auto"`` and ``load_in_8bit=True``.

    Args:
        model_dir: Directory holding the model weights and tokenizer files.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    load_options = {"device_map": "auto", "load_in_8bit": True}
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, **load_options)
    text_tokenizer = AutoTokenizer.from_pretrained(model_dir)

    return seq2seq_model, text_tokenizer


def predict_fn(data, model_and_tokenizer):
    """Generate text for a single request.

    Args:
        data: Request dict. ``"inputs"`` holds the prompt and the optional
            ``"parameters"`` holds keyword arguments forwarded to
            ``model.generate``. Both keys are popped from ``data``.
        model_and_tokenizer: ``(model, tokenizer)`` tuple as returned by
            ``model_fn``.

    Returns:
        A one-element list ``[{"generated_text": <str>}]`` decoded from the
        first generated sequence.
    """
    # unpack model and tokenizer
    model, tokenizer = model_and_tokenizer

    # process input
    inputs = data.pop("inputs", data)
    parameters = data.pop("parameters", None)
    if parameters is None:
        parameters = {}

    # preprocess
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids
    # The tokenizer output is not placed on the model's device automatically;
    # move it there so generate() does not run with CPU inputs against a
    # model that was placed on an accelerator.
    input_ids = input_ids.to(model.device)

    # pass inputs with all generation kwargs supplied by the caller
    outputs = model.generate(input_ids, **parameters)

    # postprocess the prediction
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return [{"generated_text": prediction}]



Writing code/inference.py


### Create SageMaker model.tar.gz artifact
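Requires around 30 GB of free disk space. The checkpoint is a sharded fp16 (half-precision) copy of the model; the 8-bit quantization itself is applied at load time by `inference.py`.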

In [10]:
from distutils.dir_util import copy_tree
from pathlib import Path
from tempfile import TemporaryDirectory
from huggingface_hub import snapshot_download

HF_MODEL_ID="philschmid/flan-t5-xxl-sharded-fp16"
# create model dir, named after the repository (the part after the last "/")
model_tar_dir = Path(HF_MODEL_ID.split("/")[-1])
# exist_ok=True keeps the cell re-runnable: a plain mkdir() raises
# FileExistsError when the directory is left over from a previous run
model_tar_dir.mkdir(exist_ok=True)

# setup temporary directory; it is removed again when the block exits
with TemporaryDirectory() as tmpdir:
    # download snapshot into the temporary cache
    snapshot_dir = snapshot_download(repo_id=HF_MODEL_ID, cache_dir=tmpdir)
    # copy snapshot to model dir before the temporary cache disappears
    copy_tree(snapshot_dir, str(model_tar_dir))


HBox(children=(FloatProgress(value=0.0, description='Fetching 23 files', max=23.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)0155a/.gitattributes', max=1477.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)6f60155a/config.json', max=759.0, style=Pr…





HBox(children=(FloatProgress(value=0.0, description='Downloading (…)2a6f60155a/README.md', max=2525.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)a6f60155a/handler.py', max=1167.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00003-of-00012.bin";', max=1929475550.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)a/createEndpoint.png', max=96271.0, style=…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00001-of-00012.bin";', max=1722882745.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00002-of-00012.bin";', max=1929475486.0, s…






HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00005-of-00012.bin";', max=1929475550.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00004-of-00012.bin";', max=1929475550.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00006-of-00012.bin";', max=1974577874.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00007-of-00012.bin";', max=1929485961.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00008-of-00012.bin";', max=1996604032.0, s…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00009-of-00012.bin";', max=1996604032.0, s…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00010-of-00012.bin";', max=1979817673.0, s…





HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00011-of-00012.bin";', max=1979817673.0, s…

HBox(children=(FloatProgress(value=0.0, description='Downloading (…)00012-of-00012.bin";', max=1236336721.0, s…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)model.bin.index.json', max=50781.0, style=…





HBox(children=(FloatProgress(value=0.0, description='Downloading (…)55a/requirements.txt', max=52.0, style=Pro…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)cial_tokens_map.json', max=2201.0, style=P…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)"spiece.model";', max=791656.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)0155a/tokenizer.json', max=2422164.0, styl…




HBox(children=(FloatProgress(value=0.0, description='Downloading (…)okenizer_config.json', max=2537.0, style=P…











In [11]:
# Place the inference code under <model dir>/code; the list of copied files is the cell output.
copy_tree("code/", str(model_tar_dir / "code"))


['flan-t5-xxl-sharded-fp16/code/requirements.txt',
 'flan-t5-xxl-sharded-fp16/code/inference.py']

### Creating tarball

In [12]:
import tarfile
import os

# helper to create the model.tar.gz
def compress(tar_dir=None,output_file="model.tar.gz"):
    """Pack the top-level entries of ``tar_dir`` into a gzip-compressed tarball.

    Each entry is stored under its own name (no leading directory), so the
    contents of ``tar_dir`` end up at the archive root. Directories are added
    recursively. The name of every top-level entry is printed as it is added.

    The current working directory is never changed, so a failure while
    archiving cannot leave the process in a different directory.

    Args:
        tar_dir: Directory whose contents are archived.
        output_file: Path of the archive to write; a relative path is resolved
            against the current working directory.

    Raises:
        TypeError: If ``tar_dir`` is None.
        OSError: If ``tar_dir`` cannot be listed or the archive cannot be written.
    """
    if tar_dir is None:
        raise TypeError("tar_dir must be a directory path, not None")

    # List first so an invalid directory fails before an archive file is created.
    entries = os.listdir(tar_dir)

    with tarfile.open(output_file, "w:gz") as tar:
        for item in entries:
            print(item)
            # arcname drops the tar_dir prefix so entries sit at the archive root
            tar.add(os.path.join(tar_dir, item), arcname=item)

# Archive the model directory; with the default output_file this writes model.tar.gz into the current working directory.
compress(str(model_tar_dir))

pytorch_model-00001-of-00012.bin
pytorch_model-00010-of-00012.bin
spiece.model
createEndpoint.png
pytorch_model-00009-of-00012.bin
tokenizer_config.json
pytorch_model-00005-of-00012.bin
pytorch_model-00003-of-00012.bin
pytorch_model-00012-of-00012.bin
requirements.txt
pytorch_model-00007-of-00012.bin
config.json
pytorch_model.bin.index.json
pytorch_model-00011-of-00012.bin
pytorch_model-00002-of-00012.bin
.gitattributes
special_tokens_map.json
handler.py
pytorch_model-00006-of-00012.bin
README.md
tokenizer.json
pytorch_model-00004-of-00012.bin
code
pytorch_model-00008-of-00012.bin


### Upload tarball to S3

In [14]:
from sagemaker.s3 import S3Uploader

# Push the archive under the flan-t5-xxl prefix of the session's default bucket.
s3_destination = f"s3://{sess.default_bucket()}/flan-t5-xxl"
s3_model_uri = S3Uploader.upload(local_path="model.tar.gz", desired_s3_uri=s3_destination)

print(f"model uploaded to: {s3_model_uri}")

model uploaded to: s3://sagemaker-us-east-1-346762710647/flan-t5-xxl/model.tar.gz


### Deploy the model to Amazon SageMaker

In [15]:
from sagemaker.huggingface.model import HuggingFaceModel


# create Hugging Face Model Class
# NOTE(review): code/requirements.txt pins transformers==4.26.0, which is newer
# than the 4.17 requested below -- presumably the pinned version is installed
# when the container starts; confirm.
huggingface_model = HuggingFaceModel(
   model_data=s3_model_uri,      # path to your model and script
   role=role,                    # iam role with permissions to create an Endpoint
   transformers_version="4.17",  # transformers version used
   pytorch_version="1.10",       # pytorch version used
   py_version='py38',            # python version used
)

# deploy the endpoint: a single ml.g5.xlarge instance
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.xlarge"
)

------------------!

### Run inference using the deployed model

In [17]:
# Summarization prompt.
payload = """Summarize the following text:
Peter and Elizabeth took a taxi to attend the night party in the city. While in the party, Elizabeth collapsed and was rushed to the hospital.
Since she was diagnosed with a brain injury, the doctor told Peter to stay besides her until she gets well.
Therefore, Peter stayed with her at the hospital for 3 days without leaving.
"""

# Generation settings sent alongside the prompt under the "parameters" key.
parameters = dict(
    early_stopping=True,
    length_penalty=2.0,
    max_new_tokens=50,
    temperature=0,
    min_length=10,
    no_repeat_ngram_size=3,
)

# Run prediction; the response is the cell output.
predictor.predict({"inputs": payload, "parameters": parameters})

[{'generated_text': 'Peter stayed with Elizabeth at the hospital for 3 days.'}]

In [18]:
# Step-by-step arithmetic word problem.
payload = """Answer the following question step by step:
Roger has 5 tennis balls. He buys 2 more cans of tennis balls.
Each can has 3 tennis balls. How many tennis balls does he have now?
"""

# Generation settings sent alongside the prompt under the "parameters" key.
parameters = dict(
    early_stopping=True,
    length_penalty=2.0,
    max_new_tokens=50,
    temperature=0,
)

# Run prediction; the response is the cell output.
predictor.predict({"inputs": payload, "parameters": parameters})

[{'generated_text': 'He buys 2 cans of tennis balls, so he has 2 * 3 = 6 tennis balls. He has 5 + 6 = 11 tennis balls now.'}]

In [28]:
# Open-ended factual question.
payload = """Answer the following question step-by-step:
How many stars are in the milky way?
"""

# Generation settings sent alongside the prompt under the "parameters" key.
parameters = dict(
    early_stopping=True,
    length_penalty=2.0,
    max_new_tokens=50,
    temperature=0,
)

# Run prediction; the response is the cell output.
predictor.predict({"inputs": payload, "parameters": parameters})

[{'generated_text': 'The Milky Way Galaxy is a spiral galaxy that contains about 200 billion stars. The Milky Way Galaxy is the largest galaxy in the known universe. The Milky Way Galaxy contains about 200 billion stars. Therefore, the final answer is 200 billion'}]

In [29]:
# English-to-German translation prompt.
payload = """Answer the following question step-by-step:
translate English to German: The house is wonderful.
"""

# Generation settings sent alongside the prompt under the "parameters" key.
parameters = dict(
    early_stopping=True,
    length_penalty=2.0,
    max_new_tokens=50,
    temperature=0,
)

# Run prediction; the response is the cell output.
predictor.predict({"inputs": payload, "parameters": parameters})

[{'generated_text': 'Das Haus ist wunderbar.'}]

In [40]:
# Long-form generation prompt; note the larger max_new_tokens budget below.
payload = """
Write an article on space travel:
"""

# Generation settings sent alongside the prompt under the "parameters" key.
parameters = dict(
    early_stopping=True,
    length_penalty=2.0,
    max_new_tokens=500,
    temperature=0,
)

# Run prediction; the response is the cell output.
predictor.predict({"inputs": payload, "parameters": parameters})

[{'generated_text': 'The first spacecraft to reach the Moon was the Soviet Luna 3 in 1959. The first spacecraft to reach the Moon was the Soviet Luna 3 in 1959. The first spacecraft to reach the Moon was the Soviet Luna 3 in 1959.'}]

### Delete the model and endpoint

In [41]:
# Clean up both resources created by deploy(). The model is deleted before the
# endpoint because the predictor resolves the model through the endpoint,
# which no longer exists after delete_endpoint().
predictor.delete_model()
predictor.delete_endpoint()