In [573]:
# Upgrade pip itself, then install the pinned TensorFlow and Transformers
# releases this notebook was written against (-q keeps pip's output quiet).
!pip install -qU pip
!pip install -q tensorflow==2.1.0
!pip install -q transformers==2.8.0

In [None]:
import json

import boto3
import pandas as pd
import sagemaker

# SageMaker session and the values the rest of the notebook derives from it.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Low-level SageMaker client bound to the region detected above.
sm = boto3.Session().client(
    service_name='sagemaker',
    region_name=region,
)

In [None]:
# Restore `training_job_name` from IPython's %store persistence; it must have
# been saved with `%store training_job_name` before this cell is run.
%store -r training_job_name

In [None]:
# Show the restored training job name before it is used to build S3 paths.
print(training_job_name)

In [None]:
from sagemaker.tensorflow.serving import Model

# Deployment follows the SageMaker Python SDK guide for TensorFlow Serving:
#    https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/tensorflow/deploying_tensorflow_serving.rst#making-predictions-against-a-sagemaker-endpoint

# Network isolation mode (from the guide above):
#    If you are working in a network-isolation situation, or you don't want
#    dependencies installed at runtime every time the endpoint starts or a
#    batch transform job runs, put pre-downloaded dependencies under a lib
#    directory and include that directory as a dependency. The container adds
#    the modules to the Python path. Note that if both lib and
#    requirements.txt are present in the model archive, requirements.txt is
#    ignored.

# Change SAGEMAKER_TFS_DEFAULT_MODEL_NAME when using multi-model, but watch
# out: if it is set to something other than 'saved_model' and that model name
# doesn't exist, you may see the dreaded ping/ error in the logs.
env = dict(SAGEMAKER_TFS_DEFAULT_MODEL_NAME='saved_model')

# No inference script is attached here; the arguments that were commented out are
#     entry_point='inference.py', source_dir='src_inference'
model = Model(
    model_data='s3://{}/{}/output/model.tar.gz'.format(bucket, training_job_name),
    role=role,
    framework_version='2.1.0',
    env=env,
)

In [None]:
# Sanity check: print the concrete class of the `model` object.
print(type(model))

In [None]:
# Deploy the model on a single ml.m4.xlarge instance and keep the returned
# predictor for the cells that follow.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

# Copy the Inference Model

In [None]:
# Download model.tar.gz from the prefix named after the endpoint into the
# working directory ($bucket and $predictor.endpoint are expanded by IPython).
# NOTE(review): the Model in this notebook is built from
# s3://<bucket>/<training_job_name>/output/model.tar.gz — confirm an archive
# also exists under the endpoint-name prefix used here.
!aws s3 cp s3://$bucket/$predictor.endpoint/model.tar.gz ./model.tar.gz

In [None]:
# Unpack the downloaded archive into the current directory, listing each file.
!tar -xvzf ./model.tar.gz

In [None]:
# Print everything saved_model_cli reports (--all) for the SavedModel under
# ./tensorflow/saved_model/0/ so the expected request fields can be checked.
!saved_model_cli show --all --dir ./tensorflow/saved_model/0/

In [None]:
from transformers import DistilBertTokenizer

# Load the pretrained 'distilbert-base-uncased' tokenizer; it is handed to the
# request handler to encode raw text before it is sent to the endpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
class RequestHandler(object):
    """Serializer that turns raw text instances into a JSON request body.

    Each text is tokenized, wrapped in ``[CLS]`` / ``[SEP]``, truncated and
    zero-padded to exactly ``max_seq_length`` positions, and emitted as a
    dict with ``input_ids``, ``input_mask`` and ``segment_ids``.

    Args:
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_length: fixed length of every encoded sequence; must be at
            least 2 so the two special tokens fit.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """

    def __init__(self, tokenizer, max_seq_length):
        if max_seq_length < 2:
            # [CLS] and [SEP] alone need two positions; anything smaller can
            # never satisfy the fixed-length checks in __call__.
            raise ValueError(
                "max_seq_length must be >= 2 to hold [CLS] and [SEP], got {}".format(max_seq_length)
            )
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, instances):
        """Encode ``instances`` and return the request as a JSON string.

        Args:
            instances: iterable of text strings.

        Returns:
            JSON string of the form ``{"instances": [{...}, ...]}`` with one
            entry per input text.
        """
        transformed_instances = []

        for instance in instances:
            # Account for [CLS] and [SEP] with "- 2".
            tokens_a = self.tokenizer.tokenize(instance)[:self.max_seq_length - 2]
            tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]

            # Use the tokenizer given to this handler, not a notebook global.
            input_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))

            # 1 marks a real token, 0 marks padding.
            padding = [0] * (self.max_seq_length - len(input_ids))
            input_mask = [1] * len(input_ids) + padding
            input_ids = input_ids + padding
            # Single-sentence input: every position belongs to segment 0.
            segment_ids = [0] * self.max_seq_length

            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length

            transformed_instances.append({"input_ids": input_ids,
                                          "input_mask": input_mask,
                                          "segment_ids": segment_ids})

        transformed_data = {"instances": transformed_instances}

        return json.dumps(transformed_data)

In [None]:
class ResponseHandler(object):
    """Deserializer that maps each prediction row in a response to a class label.

    Args:
        classes: sequence of labels; position ``i`` is the label returned when
            the highest score of a prediction row is at index ``i``.
    """

    def __init__(self, classes):
        self.classes = classes

    def __call__(self, response, accept_header):
        """Decode the endpoint response and return the predicted labels.

        Args:
            response: object whose ``read()`` returns the UTF-8 encoded JSON
                body; the body must contain a ``"predictions"`` list with one
                row of scores per instance.
            accept_header: unused; accepted so the handler can be called with
                two arguments as a deserializer.

        Returns:
            JSON string holding the list of predicted labels, one per row.
        """
        response_body = response.read().decode('utf-8')

        response_json = json.loads(response_body)

        log_probabilities = response_json["predictions"]

        predicted_classes = []

        for log_probability in log_probabilities:
            # Softmax is monotonic, so the arg-max of the raw scores is the
            # arg-max of their softmax; taking it directly needs no TensorFlow
            # and yields a plain int that can index `self.classes`.
            predicted_class_idx = max(range(len(log_probability)),
                                      key=log_probability.__getitem__)
            predicted_classes.append(self.classes[predicted_class_idx])

        return json.dumps(predicted_classes)

In [None]:
from sagemaker.tensorflow.serving import Predictor

# Build the custom (de)serializers, then rebind `predictor` to a Predictor for
# the same endpoint that uses them.
response_handler = ResponseHandler(classes=[1, 2, 3, 4, 5])
request_handler = RequestHandler(tokenizer=tokenizer, max_seq_length=128)

predictor = Predictor(
    endpoint_name=predictor.endpoint,
    sagemaker_session=sess,
    serializer=request_handler,
    deserializer=response_handler,
    content_type='application/json',
    model_name='saved_model',
    model_version=0,
)

In [None]:
import tensorflow as tf
    
# Two sample sentences to send through the predictor.
instances = ["This is great!", "This is terrible."]

predicted_classes = predictor.predict(instances)
print(predicted_classes)