# Deploy pre-trained HF model with PyTorch 1.8.1 DL inference container

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker context shared by the rest of the notebook: execution role, region,
# session and the bucket obtained from the session's default_bucket().
role = get_execution_role()
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix used when building the S3 upload URI, and the local directory
# passed to Hugging Face as its download cache.
prefix = 'hfdeploypytorch'
hf_cache_dir = 'hf_cache_dir/'

# Show the resolved values, one per line.
print(region, role, bucket, sep='\n')

# Download model from HF and save to Amazon S3

In [None]:
# Install the Hugging Face transformers library, pinned to release 4.5.1.
!pip install transformers==4.5.1

In [None]:
# Create the local Hugging Face cache directory (-p: no error if it already exists).
! mkdir -p $hf_cache_dir

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Identifier of the pre-trained checkpoint to load.
PRE_TRAINED_MODEL_NAME = 'facebook/bart-large-cnn'

# Point Hugging Face at a dedicated cache directory instead of its default
# cache dirs, which might fill the root disk space.
model = BartForConditionalGeneration.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    cache_dir=hf_cache_dir,
)

# Save the model under models/, the folder that is archived into model.tar.gz.
model.save_pretrained('./models/bart_model/')

In [None]:
# Use the dedicated HF cache dir for the tokenizer too, so its files do not end up
# in the default cache dirs that might fill root disk space.
tokenizer = BartTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, cache_dir=hf_cache_dir)

# Save the tokenizer next to the model so both are packaged into the same archive.
tokenizer.save_pretrained('./models/bart_tokenizer/')

## Add inference code and requirements.txt
We are manually adding the inference code and requirements.txt to the model folder, to avoid the SM Python SDK having to repack the model.tar.gz archive when executing deployment.
Since these are large models, the repack operation can take some time (download from S3, repack, re-upload).

In [None]:
# Create the code/ folder inside the local models/ directory (-p: no error if it exists).
! mkdir -p models/code

The custom inference code must be stored in the code/ folder in the model archive, and the name of the entrypoint module is inference.py by default. You can customize that by passing an environment variable named SAGEMAKER_PROGRAM when creating the Model object (see below).

In [None]:
# Copy the inference entry point and its requirements file into models/code/.
! cp source_dir/inference.py models/code/inference.py
! cp source_dir/requirements.txt models/code/requirements.txt

## Create model archive and upload to S3

In [None]:
# Build model.tar.gz from the model, tokenizer and code folders. -C models/ makes the
# paths relative to models/, so the three folders sit at the root of the archive.
!tar -C models/ -cvzf model.tar.gz bart_model/ bart_tokenizer/ code/

In [None]:
from sagemaker.s3 import S3Uploader
# Destination has the form s3://<bucket>/<prefix>/model.
s3_model_destination = 's3://{0}/{1}/model'.format(bucket, prefix)

# Upload the archive and keep the returned value; it is passed as model_data
# when the Model is created.
model_artifact = S3Uploader.upload('model.tar.gz', s3_model_destination)
print(model_artifact)

# Deploy model

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class Summarizer(Predictor):
    """Predictor that talks JSON to the summarization endpoint.

    Request payloads are serialized with ``JSONSerializer`` and endpoint
    responses are decoded with ``JSONDeserializer``.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        sagemaker_session: Session used for the endpoint calls. Optional; when
            omitted, ``None`` is forwarded to ``Predictor``, which then falls
            back to its own default session. This allows attaching to an
            already-deployed endpoint with just its name.
    """

    def __init__(self, endpoint_name, sagemaker_session=None):
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=JSONSerializer(),
            deserializer=JSONDeserializer(),
        )

In [None]:
from sagemaker.image_uris import retrieve

# Instance type for the endpoint; also passed to the image lookup below.
deploy_instance_type = 'ml.g4dn.xlarge'

# Look up the PyTorch 1.8.1 (py3) inference image URI for the current region.
pytorch_inference_image_uri = retrieve(
    framework='pytorch',
    region=region,
    version='1.8.1',
    py_version='py3',
    instance_type=deploy_instance_type,
    accelerator_type=None,
    image_scope='inference',
)
print(pytorch_inference_image_uri)

In [None]:
from sagemaker.model import Model

# Describe the model to SageMaker: the uploaded archive plus the inference image.
# Summarizer is registered as the predictor class for this model.
hf_model = Model(
    image_uri=pytorch_inference_image_uri,
    model_data=model_artifact,
    role=role,
    predictor_cls=Summarizer,
    sagemaker_session=sagemaker_session,
    # Uncomment to name the entry point module explicitly:
    # env={
    #     'SAGEMAKER_PROGRAM': 'inference.py'
    # },
)

In [None]:
# Deploy the model to an endpoint backed by one instance of the chosen type.
predictor = hf_model.deploy(
    initial_instance_count=1,
    instance_type=deploy_instance_type,
)
# Last expression of the cell: displays the predictor object in the notebook.
predictor

# Test inference

In [None]:
# Read the sample article and flatten it onto a single line by replacing
# newlines with spaces.
with open('article.txt') as f:
    content = f.read().replace('\n', ' ')

# Request payload: the article goes under the "text" key. It is built directly
# rather than through a "{0}" placeholder and str.format, which yields the same
# string.
json_request_data = {"text": content}

# Last expression of the cell: displays the payload in the notebook.
json_request_data

In [None]:
%%time
# Send the request to the endpoint through the predictor and print the result.
# The %%time cell magic above must stay on the first line of the cell.
prediction = predictor.predict(json_request_data)
print(prediction)

# Delete endpoint

In [None]:
# Delete the endpoint once testing is finished.
predictor.delete_endpoint()