In [14]:
!python -m pip install --quiet --upgrade pip
!pip install --quiet -U sagemaker
!pip install -U --quiet transformers

In [15]:
import os
import torch
import transformers
!python -V
print(f"torch version: {torch.__version__}")
print(f"transformers version: {transformers.__version__}")

Python 3.6.13
torch version: 1.7.1
transformers version: 4.11.3


In [16]:
import time
import sagemaker

In [17]:
# S3 directory where the model is stored
bucket = "ai-inference-env"
prefix = "summarizer"
key = os.path.join(prefix, "model.tar.gz")
pretrained_model_data = "s3://{}/{}".format(bucket, key)
pretrained_model_data

's3://ai-inference-env/summarizer/model.tar.gz'

In [18]:
!pygmentize code/inference.py

[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m pipeline
[34mfrom[39;49;00m [04m[36mtest_cuda[39;49;00m [34mimport[39;49;00m test_cuda

JSON_CONTENT_TYPE = [33m"[39;49;00m[33mapplication/json[39;49;00m[33m"[39;49;00m

[37m# checking that cuda is available inside container[39;49;00m
test_cuda()


[34mdef[39;49;00m [32mmodel_fn[39;49;00m(model_dir):
    [36mprint[39;49;00m(model_dir)
    summarizer = pipeline(
        [33m"[39;49;00m[33msummarization[39;49;00m[33m"[39;49;00m, model=model_dir, tokenizer=model_dir, framework=[33m"[39;49;00m[33mpt[39;49;00m[33m"[39;49;00m, device=[34m0[39;49;00m
    )

    [34mreturn[39;49;00m summarizer


[34mdef[39;49;00m [32minput_fn[39;49;00m(serialized_input_data, content_type=JSON_CONTENT_TYPE):
    [36mprint[39;49;00m([36mtype[39;49;00m(serialized_input_data))
    [34mif[39;49;00m content_type == JSON_CONTENT_TYPE:
        inp

 ### Deploy Model ###

In [19]:
#instance_type = "local_gpu" # or "local"
instance_type = "ml.g4dn.xlarge"

In [20]:
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

In [21]:
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    model_data=pretrained_model_data,
    role=role,
    framework_version="1.8.1",
    source_dir="./code",
    py_version="py3",
    entry_point="inference.py",
)

In [22]:
predictor = pytorch_model.deploy(endpoint_name= "summarizer", initial_instance_count=1, instance_type=instance_type)

---------------!

In [23]:
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

### MODEL INFERENCE ###

In [30]:
article_to_summarize = "When Paul Jobs was mustered out of the Coast Guard after World War II, he made a wager with his crewmates. They had arrived in San Francisco, where their ship was decommissioned, and Paul bet that he would find himself a wife within two weeks. He was a taut, tattooed engine mechanic, six feet tall, with a passing resemblance to James Dean. But it wasn’t his looks that got him a date with Clara Hagopian, a sweet-humored daughter of Armenian immigrants. It was the fact that he and his friends had a car, unlike the group she had originally planned to go out with that evening. Ten days later, in March 1946, Paul got engaged to Clara and won his wager. It would turn out to be a happy marriage, one that lasted until death parted them more than forty years later."

In [34]:
result = predictor.predict(article_to_summarize)
print(result)
print()
print(result[0]["summary_text"])

[{'summary_text': ' Paul Jobs was mustered out of the Coast Guard after World War II . He bet that he would find himself a wife within two weeks . Ten days later, in March 1946, he got engaged to Clara Hagopian and won his wager . It would turn out to be a happy marriage, one that lasted until death .'}]

 Paul Jobs was mustered out of the Coast Guard after World War II . He bet that he would find himself a wife within two weeks . Ten days later, in March 1946, he got engaged to Clara Hagopian and won his wager . It would turn out to be a happy marriage, one that lasted until death .


In [29]:
inference_time = []
for _ in range(30):
    start = time.time()
    predictor.predict(article_to_summarize)
    inference_time.append(time.time()-start)
    
print(f"Average inference on GPU is: {sum(inference_time)/len(inference_time):.3} ms")

Average inference on GPU is: 0.552 ms


CLEAN UP

In [None]:
predictor.delete_endpoint(predictor.endpoint)