In [33]:
!python -m pip install --quiet --upgrade pip
!pip install --quiet -U sagemaker
!pip install -U --quiet transformers
import torch
import transformers

In [34]:
!python -V
print(f"torch version: {torch.__version__}")
print(f"transformers version: {transformers.__version__}")

Python 3.6.13
torch version: 1.7.1
transformers version: 4.11.3


In [35]:
import os
import time
import sagemaker

In [36]:
# S3 directory where the model is stored
bucket = "ai-inference-env"
prefix = "sentence-similarity"
key = os.path.join(prefix, "model.tar.gz")
pretrained_model_data = "s3://{}/{}".format(bucket, key)
pretrained_model_data

's3://ai-inference-env/sentence-similarity/model.tar.gz'

In [37]:
!pygmentize code_sentence_similarity/inference.py

[34mimport[39;49;00m [04m[36mre[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoTokenizer, AutoModel
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmetrics[39;49;00m[04m[36m.[39;49;00m[04m[36mpairwise[39;49;00m [34mimport[39;49;00m cosine_similarity
[34mfrom[39;49;00m [04m[36mtest_cuda[39;49;00m [34mimport[39;49;00m test_cuda

logger = logging.getLogger([31m__name__[39;49;00m)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


JSON_CONTENT_TYPE = [33m"[39;49;00m[33mapplication/json[39;49;00m[33m"[39;49;00m

[37m# checking that cuda is available inside container[39;49;00m
test_cuda()

MODEL_PATH = [33m"[39;49;00m[33m/opt/ml/model/[39;49;00m[33m"

In [38]:
#instance_type = "local_gpu" # or "local"
instance_type = "ml.g4dn.xlarge"

In [39]:
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

In [40]:
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    model_data=pretrained_model_data,
    role=role,
    framework_version="1.8.1",
    source_dir="./code_sentence_similarity",
    py_version="py3",
    entry_point="inference.py",
)

In [41]:
predictor = pytorch_model.deploy(endpoint_name= "sentence-similarity", initial_instance_count=1, instance_type=instance_type)

-------------------!

In [42]:
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [48]:
user_sentence = "I love going to the beach"
ground_truth = "the beach is beautiful"

In [50]:
result = predictor.predict({"user_input": user_sentence, "true_sentence": ground_truth})
print(result)
print(f"result: {result['similarity']}")

{'similarity': '0.6418977975845337'}
result: 0.6418977975845337


### Computes inference average time ###

In [45]:
inference_time = []
for _ in range(50):
    start = time.time()
    predictor.predict({"user_input": user_sentence, "true_sentence": ground_truth})
    inference_time.append(time.time()-start)
    
print(f"Average inference on GPU is: {sum(inference_time)/len(inference_time):.3} ms")

Average inference on GPU is: 0.0347 ms


### Clean Up ###

In [19]:
#predictor.delete_endpoint(predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
