In [1]:
# Install the notebook's Python dependencies (quietly, via -q).
# Upgrade pip itself first so the installs below use the latest resolver.
!pip install -q --upgrade pip
# Force a fresh install of wrapt regardless of any copy already present
# (--ignore-installed). NOTE(review): presumably avoids an uninstall conflict
# when TensorFlow is installed next — confirm.
!pip install -q wrapt --upgrade --ignore-installed
# Pin the exact framework versions this notebook was written against.
!pip install -q tensorflow==2.1.0
!pip install -q transformers==2.8.0

In [2]:
import boto3
import sagemaker
import pandas as pd

# SageMaker-side context: the session, the notebook's execution role,
# and the session's default S3 bucket.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# boto3-side context: the configured region and a low-level SageMaker client
# bound to that region.
region = boto3.Session().region_name
sm = boto3.Session().client('sagemaker', region_name=region)

# Specify the S3 Location of the Features

In [3]:
# Restore `training_job_name` from IPython's %store persistence into this kernel.
# NOTE(review): presumably saved with `%store training_job_name` by an earlier
# notebook — this cell fails if that variable was never stored.
%store -r training_job_name

In [4]:
# Echo the restored value to confirm the %store lookup above succeeded.
print(training_job_name)

tensorflow-training-2020-05-12-16-05-35-313


In [5]:
# Labelled echo of the training job whose model artifact is used below.
print('Previous Training Job Name:', training_job_name)

Previous Training Job Name: tensorflow-training-2020-05-12-16-05-35-313


In [6]:
import time
# Suffix the job name with the current epoch second so that re-running this
# cell produces a distinct compilation job name each time.
timestamp = str(int(time.time()))
compilation_job_name = '%s-%s' % (training_job_name, timestamp)

# Client used for every compilation-job API call in this notebook.
sm_client = boto3.client('sagemaker')

# What to compile and for which target.
framework = 'TENSORFLOW'  # alternative tried: TFLITE
target_device = 'ml_c5'
# (a framework version of '2.1.0' was considered but is not passed to the job)

# JSON mapping of each model input name to its shape.
data_shape = '{"input_ids":[1,128],"input_mask":[1,128],"segment_ids":[1,128]}'

# Input artifact and output prefix both live under the training job's S3 prefix.
_training_job_s3_prefix = 's3://{}/{}'.format(bucket, training_job_name)
model_path = _training_job_s3_prefix + '/output/model.tar.gz'
compiled_model_path = _training_job_s3_prefix + '/compiled-output/'

# TODO: Work around the following compilation error, if possible:
```
Incompatible Tensorflow model: The following operators are not implemented: {'StatefulPartitionedCall'}
```

Variants of this error appear for our BERT model on each of the `ml_c5`, `ml_inf1`, and `ml_p3` target devices.  

In [7]:
# Where the trained model archive lives and how its inputs are shaped.
compilation_input = {
    'S3Uri': model_path,
    'DataInputConfig': data_shape,
    'Framework': framework,
}

# Where the compiled artifact is written and which device it targets.
compilation_output = {
    'S3OutputLocation': compiled_model_path,
    'TargetDevice': target_device,
}

# Cap the job at five minutes of runtime.
compilation_limits = {'MaxRuntimeInSeconds': 300}

# Submit the compilation job; its status is polled in a later cell.
response = sm_client.create_compilation_job(
    CompilationJobName=compilation_job_name,
    RoleArn=role,
    InputConfig=compilation_input,
    OutputConfig=compilation_output,
    StoppingCondition=compilation_limits,
)

In [8]:
# Import from the public `IPython.display` module rather than the internal
# `IPython.core.display` location.
from IPython.display import display, HTML

# Render a clickable link to this compilation job in the SageMaker console
# for the current region.
console_url = 'https://console.aws.amazon.com/sagemaker/home?region={}#/compilation-jobs/{}'.format(region, compilation_job_name)
display(HTML('<b>Review <a href="{}">Compilation Job</a></b>'.format(console_url)))

In [9]:
# Poll every 10 sec until the compilation job reaches a terminal state.
#   COMPLETED       -> leave the loop; `response` holds the final job description.
#   FAILED / STOPPED -> raise RuntimeError carrying the service's FailureReason.
# STOPPED is treated as terminal as well; otherwise a stopped job would keep
# this loop spinning forever.
while True:
    response = sm_client.describe_compilation_job(CompilationJobName=compilation_job_name)
    status = response['CompilationJobStatus']
    if status == 'COMPLETED':
        break
    if status in ('FAILED', 'STOPPED'):
        failure_reason = response.get('FailureReason') or 'no failure reason reported'
        raise RuntimeError('Compilation {}: {}'.format(status.lower(), failure_reason))
    print('Compiling ...')
    time.sleep(10)
print('Done!')

Compiling ...
Compiling ...
Compiling ...
Compiling ...
Compiling ...
Compiling ...
Compiling ...
Compiling ...
Compiling ...


RuntimeError: Compilation failed

In [None]:
# Pull the compiled artifact's S3 URI out of the final job description.
# Note that this rebinds `compiled_model_path`, which previously held the
# requested output prefix, to the concrete artifact location.
model_artifacts = response['ModelArtifacts']
compiled_model_path = model_artifacts['S3ModelArtifacts']

# TODO: TFLite currently throws an error related to GPUs, CUDA, and TensorRT

In [None]:
# Download the training job's model archive from S3 into the working directory.
# `$bucket` and `$training_job_name` are interpolated from the Python variables
# of the same name.
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

In [None]:
# Unpack the downloaded archive into the current directory, listing each file (-v).
# NOTE(review): the converter cell reads ./tensorflow/saved_model/0/, which is
# presumably produced by this extraction — confirm against the archive layout.
!tar -xzvf model.tar.gz

In [None]:
import tensorflow as tf

# Build a TFLite converter from the SavedModel directory on local disk.
saved_model_dir = './tensorflow/saved_model/0/'
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)

In [None]:
# Request a conversion without post-training quantization.
# NOTE(review): `post_training_quantize` looks like a TF 1.x converter attribute;
# with the tensorflow==2.1.0 pin used by this notebook this assignment may have
# no effect — confirm, and consider `converter.optimizations` instead.
converter.post_training_quantize = False

In [None]:
# Run the conversion; the returned value is what the following cell writes to
# disk in binary mode.
tflite_model = converter.convert()

In [None]:
import os

# Directory that receives the converted model. It is defined here because no
# earlier cell sets it, so on a fresh kernel the path formatting below would
# otherwise raise NameError. Created up front so the write cannot fail on a
# missing directory.
tflite_model_base_path = './tflite'
os.makedirs(tflite_model_base_path, exist_ok=True)

# Full path of the serialized model; later cells load the interpreter from it.
tflite_model_path = '%s/tflite_optimized_model.tflite' % tflite_model_base_path

# Write inside a context manager so the file is flushed and closed before the
# interpreter reads it; `write` returns the number of bytes written.
with open(tflite_model_path, "wb") as tflite_file:
    model_size = tflite_file.write(tflite_model)

print('\nModel size reduced to %s bytes' % model_size)

In [None]:
import numpy as np
import tensorflow as tf

# Load the TFLite model stored at `tflite_model_path` into an interpreter.
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
# Allocate tensors up front; the following cells query tensor details, set
# inputs and invoke the interpreter.
interpreter.allocate_tensors()

In [None]:
# Query the interpreter for the metadata of its input and output tensors,
# then print both so they can be inspected before feeding data.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print('Input Tensor Details: %s' % input_details)
print('Output Tensor Details: %s' % output_details)

In [None]:
# Test model on random input data.
# Every input tensor reported by the interpreter is populated, not just the
# first one: the model was described with three inputs (input_ids, input_mask,
# segment_ids), and each tensor must receive data of its own declared dtype.
#   integer tensors -> random values in {0, 1}
#   other tensors   -> uniform samples in [0, 1), cast to the tensor's dtype
for input_detail in input_details:
    input_shape = tuple(input_detail['shape'])
    input_dtype = input_detail['dtype']
    if np.issubdtype(input_dtype, np.integer):
        input_data = np.random.randint(0, 2, size=input_shape).astype(input_dtype)
    else:
        input_data = np.array(np.random.random_sample(input_shape), dtype=input_dtype)
    print('Input: %s' % (input_data,))
    interpreter.set_tensor(input_detail['index'], input_data)

In [None]:
%%time
# Run one inference pass on the tensors set in the previous cell; the %%time
# cell magic reports how long this cell takes.
interpreter.invoke()

In [None]:
# Fetch the first output tensor produced by the invocation and show it.
prediction_tensor_index = output_details[0]['index']
output_data = interpreter.get_tensor(prediction_tensor_index)
print('Prediction: %s' % output_data)