# AWS Inferentia inference on Amazon EC2 Inf1 instance
This example demonstrates AWS Inferentia inference with TensorFlow and the AWS Neuron SDK compiler and runtime.

This example was tested on an Amazon EC2 `inf1.xlarge` instance with the following AWS Deep Learning AMI:
`Deep Learning AMI (Ubuntu 18.04) Version 35.0`

Run this notebook using the following conda environment:
`aws_neuron_tensorflow_p36`

Prepare your ImageNet validation TFRecord files using the following helper script:
https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh

Save the generated TFRecord files to `/home/ubuntu/datasets/`, or update the dataset location in the `get_dataset()` function.

In [1]:
# !pip install matplotlib pandas

In [2]:
!/opt/aws/neuron/bin/neuron-cli reset
import os
import time
import shutil
import json
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.neuron as tfn
import tensorflow.compat.v1.keras as keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from concurrent import futures
from itertools import compress

print('test')

test


In [3]:
# https://github.com/tensorflow/tensorflow/issues/29931
# Workaround for the TensorFlow issue linked above: call `preprocess_input`
# once on a dummy all-zero batch at the top level of the notebook, before it is
# used inside the `tf.data` map function (`val_preprocessing`). The result is
# discarded; only the side effect of the first call matters.
temp = tf.zeros([8, 224, 224, 3])
_ = tf.keras.applications.resnet50.preprocess_input(temp)

### ResNet50 FP32 saved model

In [4]:
# Export the pretrained Keras ResNet50 as a TF1-style SavedModel.
saved_model_dir = 'resnet50_saved_model'
# Start from a clean directory: the export fails if the target already exists.
shutil.rmtree(saved_model_dir, ignore_errors=True)

# Learning phase 0 = inference mode, so layers such as batch norm are exported
# with their inference behaviour.
keras.backend.set_learning_phase(0)
model = ResNet50(weights='imagenet')

# `tf.saved_model.simple_save` is deprecated; TensorFlow's own warning directs
# callers to the `tf.compat.v1` alias, which behaves identically.
# The signature keys below are reused as the feed-dict key when compiling.
tf.compat.v1.saved_model.simple_save(session=keras.backend.get_session(),
                                     export_dir=saved_model_dir,
                                     inputs={'input_1:0': model.inputs[0]},
                                     outputs={'probs/Softmax:0': model.outputs[0]})

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: resnet50_saved_model/saved_model.pb


### Compile models with different batch sizes and cores

In [5]:
def compile_inf1_model(saved_model_dir, inf1_model_dir, batch_size=1, num_cores=1, use_static_weights=False):
    """Compile a SavedModel for Inferentia with `tfn.saved_model.compile`.

    The compiled model is written to
    `<inf1_model_dir>/resnet50_batch_<batch_size>_inf1_cores_<num_cores>`;
    any existing directory at that path is removed first.

    Args:
        saved_model_dir: Directory of the source SavedModel.
        inf1_model_dir: Parent directory for compiled models.
        batch_size: Batch dimension of the example input used for compilation.
        num_cores: Value passed to `--neuroncore-pipeline-cores`.
        use_static_weights: When true, `--static-weights` is added to the
            compiler arguments.

    Returns:
        True when more than 50% of the graph was placed on Neuron according to
        the compiler's reported `OnNeuronRatio`, otherwise False.
    """
    print(f'-----------batch size: {batch_size}, num cores: {num_cores}----------')
    print('Compiling...')

    output_dir = os.path.join(inf1_model_dir,
                              f'resnet50_batch_{batch_size}_inf1_cores_{num_cores}')
    shutil.rmtree(output_dir, ignore_errors=True)

    # Dummy all-zero batch; only fed to the compiler as an example input.
    dummy_batch = np.zeros([batch_size, 224, 224, 3], dtype='float32')

    neuron_flags = ['--verbose', '1', '--neuroncore-pipeline-cores', str(num_cores)]
    if use_static_weights:
        neuron_flags += ['--static-weights']

    compile_started = time.time()
    compile_report = tfn.saved_model.compile(
        model_dir=saved_model_dir,
        model_feed_dict={'input_1:0': dummy_batch},
        new_model_dir=output_dir,
        dynamic_batch_size=True,
        compiler_args=neuron_flags,
    )
    print(f'Compile time: {time.time() - compile_started}')

    # Success means that more than half of the graph runs on Neuron.
    on_neuron_percent = compile_report['OnNeuronRatio'] * 100
    succeeded = True if on_neuron_percent > 50 else False

    print(output_dir)
    print(compile_report)
    print('----------- Done! ----------- \n')

    return succeeded

### Use `tf.data` to read ImageNet validation dataset

In [6]:
def deserialize_image_record(record):
    """Parse one serialized example into its image bytes, label and label text.

    Args:
        record: A serialized example read from a TFRecord file.

    Returns:
        A tuple `(imgdata, label, label_text)`: the `image/encoded` string,
        the `image/class/label` feature cast to int32, and the
        `image/class/text` string.
    """
    schema = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
        'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
        'image/class/text': tf.io.FixedLenFeature([], tf.string, ''),
    }
    parsed = tf.io.parse_single_example(serialized=record, features=schema)
    return (
        parsed['image/encoded'],
        tf.cast(parsed['image/class/label'], tf.int32),
        tf.cast(parsed['image/class/text'], tf.string),
    )

def val_preprocessing(record):
    """Turn one serialized record into a preprocessed 224x224 image and labels.

    The JPEG is decoded, resized with bicubic interpolation so that its
    shorter side becomes 256 pixels, cropped or padded to 224x224, and passed
    through the ResNet50 `preprocess_input` function.

    Args:
        record: A serialized example read from a TFRecord file.

    Returns:
        A tuple `(image, label, label_text)` where `label` is the stored label
        minus one.
    """
    encoded_jpeg, label, label_text = deserialize_image_record(record)
    # Presumably the stored labels are 1-based; shift them down by one.
    label = label - 1

    decoded = tf.io.decode_jpeg(
        encoded_jpeg,
        channels=3,
        fancy_upscaling=False,
        dct_method='INTEGER_FAST',
    )

    dims = tf.shape(decoded)
    height = tf.cast(dims[0], tf.float32)
    width = tf.cast(dims[1], tf.float32)
    target_short_side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    # Scale relative to the shorter side so it ends up 256 pixels long.
    def _scale_from_width():
        return target_short_side / width

    def _scale_from_height():
        return target_short_side / height

    scale = tf.cond(tf.greater(height, width), _scale_from_width, _scale_from_height)

    resized_shape = [
        tf.cast(tf.math.rint(height * scale), tf.int32),
        tf.cast(tf.math.rint(width * scale), tf.int32),
    ]
    resized = tf.image.resize(decoded, resized_shape, method='bicubic')
    cropped = tf.image.resize_with_crop_or_pad(resized, 224, 224)

    return tf.keras.applications.resnet50.preprocess_input(cropped), label, label_text

def get_dataset(batch_size, use_cache=False, data_pattern='/home/ubuntu/datasets/*'):
    """Build the batched `tf.data` pipeline over the validation TFRecord files.

    Args:
        batch_size: Number of images per batch.
        use_cache: When true, batches are cached on disk under
            `./tfdatacache/` (the directory is recreated on every call).
        data_pattern: Glob pattern matching the TFRecord files. Defaults to
            every file in `/home/ubuntu/datasets/`.

    Returns:
        A `tf.data.Dataset` yielding `(image, label, label_text)` batches as
        produced by `val_preprocessing`.

    Raises:
        FileNotFoundError: If no file matches `data_pattern`.
    """
    files = tf.io.gfile.glob(data_pattern)
    if not files:
        # Fail fast: an empty file list would otherwise produce an empty
        # dataset and a confusing error much later in the benchmark.
        raise FileNotFoundError(
            f'No TFRecord files match {data_pattern!r}; '
            'prepare the dataset or pass a different data_pattern.')

    dataset = tf.data.TFRecordDataset(files)

    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.repeat(count=1)

    if use_cache:
        shutil.rmtree('tfdatacache', ignore_errors=True)
        # exist_ok: the best-effort rmtree above may have left the directory.
        os.makedirs('tfdatacache', exist_ok=True)
        dataset = dataset.cache('./tfdatacache/imagenet_val')

    # Prefetch is the last stage so it overlaps the consumer with whatever
    # precedes it, including reads from the cache.
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

## Single AWS Inferentia chip execution
* Single-core compiled models with automatic data-parallel execution on up to 4 cores
* Multi-core compiled models for pipeline execution

In [7]:
def inf1_predict_benchmark_single_threaded(neuron_saved_model_name, batch_size, user_batch_size, num_cores, use_cache=False, warm_up=10):
    """Run a single-threaded accuracy/latency benchmark of a compiled SavedModel.

    The model is loaded with `tf.contrib.predictor.from_saved_model`, fed
    every batch produced by `get_dataset(user_batch_size, use_cache)`, and
    timed per inference call.

    Args:
        neuron_saved_model_name: Directory of the compiled SavedModel to load.
        batch_size: Batch size the model was compiled with; used only to label
            the results.
        user_batch_size: Batch size used when reading the dataset and calling
            the model.
        num_cores: Number of cores the model was compiled for; used only to
            label the results column.
        use_cache: When true, the dataset is iterated once up front to fill
            the on-disk cache before timing starts.
        warm_up: Number of untimed inference calls made on the first batch
            before measurements begin.

    Returns:
        A tuple `(results, iter_times)`: a one-column `pandas.DataFrame` of
        summary metrics (latencies in milliseconds) and a NumPy array of the
        per-call inference times in seconds.

    Raises:
        ValueError: If the dataset produced no batches.
    """
    print(f'Running model {neuron_saved_model_name}, user_batch_size: {user_batch_size}\n')

    model_inf1 = tf.contrib.predictor.from_saved_model(neuron_saved_model_name)

    iter_times = []
    pred_labels = []
    actual_labels = []
    warmup_time = []
    extend_time = []

    # Progress reporting. NOTE(review): total_datas only appears in the
    # progress message; it presumably matches the size of the dataset used
    # when this notebook was written - confirm against your data.
    total_datas = 1000
    display_every = 100
    display_threshold = display_every

    try:
        ds = get_dataset(user_batch_size, use_cache)

        ds_iter = ds.make_initializable_iterator()
        ds_next = ds_iter.get_next()
        ds_init_op = ds_iter.initializer

        # The predictor exposes its signature as dicts; use the first entries.
        ipname = list(model_inf1.feed_tensors.keys())[0]
        resname = list(model_inf1.fetch_tensors.keys())[0]

        with tf.Session() as sess:
            if use_cache:
                # One full pass over the data populates the cache so that the
                # timed pass below reads from it.
                sess.run(ds_init_op)
                print('\nCaching dataset ...')
                start_time = time.time()
                try:
                    while True:
                        sess.run(ds_next)
                except tf.errors.OutOfRangeError:
                    pass
                print(f'Caching finished: {time.time()-start_time} sec')

            sess.run(ds_init_op)
            counter = 0
            walltime_start = time.time()
            try:
                while True:
                    validation_ds, batch_labels, _ = sess.run(ds_next)
                    model_feed_dict = {ipname: validation_ds}

                    # Warm-up calls happen only on the first batch and are
                    # kept out of `iter_times`.
                    warmup_start = time.time()
                    if counter == 0:
                        for _ in range(warm_up):
                            model_inf1(model_feed_dict)
                    warmup_time.append(time.time() - warmup_start)

                    start_time = time.time()
                    inf1_results = model_inf1(model_feed_dict)
                    iter_times.append(time.time() - start_time)

                    extend_start = time.time()
                    actual_labels.extend(label for label_list in batch_labels for label in label_list)
                    pred_labels.extend(list(np.argmax(inf1_results[resname], axis=1)))
                    extend_time.append(time.time() - extend_start)

                    if counter*user_batch_size >= display_threshold:
                        print(f'Images {counter*user_batch_size}/{total_datas}. Average i/s {np.mean(user_batch_size/np.array(iter_times[-display_every:]))}')
                        display_threshold += display_every

                    counter += 1
            except tf.errors.OutOfRangeError:
                # Raised by the iterator when the dataset is exhausted.
                pass
    finally:
        # Close the predictor's session so the loaded model is released;
        # otherwise every call leaves another loaded model in the kernel.
        model_inf1.session.close()

    if not iter_times:
        raise ValueError('The dataset produced no batches; nothing to benchmark.')

    labeling_start = time.time()
    acc_inf1 = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    iter_times = np.array(iter_times)
    labeling_time = time.time() - labeling_start

    results = pd.DataFrame(columns = [f'inf1_compiled_batch_size_{batch_size}_compiled_cores_{num_cores}'])
    results.loc['compiled_batch_size']     = [batch_size]
    results.loc['user_batch_size']         = [user_batch_size]
    results.loc['accuracy']                = [acc_inf1]
    results.loc['prediction_time']         = [np.sum(iter_times)]
    results.loc['warmup_time']             = [np.sum(np.array(warmup_time))]
    results.loc['extend_time']             = [np.sum(np.array(extend_time))]
    results.loc['labeling_time']           = [np.sum(np.array(labeling_time))]
    results.loc['wall_time']               = [time.time() - walltime_start]
    results.loc['images_per_sec_mean']     = [np.mean(user_batch_size / iter_times)]
    results.loc['images_per_sec_std']      = [np.std(user_batch_size / iter_times, ddof=1)]
    results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
    results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
    results.loc['latency_median']          = [np.median(iter_times) * 1000]
    results.loc['latency_min']             = [np.min(iter_times) * 1000]
    display(results.T)
    return results, iter_times

In [12]:
# Locations of the exported FP32 SavedModel and of the compiled models.
inf1_model_dir = 'resnet50_inf1_saved_models'
saved_model_dir = 'resnet50_saved_model'

# Compiled batch sizes and pipeline core counts to benchmark.
batch_list = [1]
num_of_cores = [1]


def col_name(opt):
    """Column label for the per-iteration timings of one configuration."""
    return f"inf1_{opt['batch_size']}_multicores_{opt['num_cores']}"


for batch_size in batch_list:
    # Collect per-configuration frames and concatenate once per batch size
    # instead of growing DataFrames inside the loop.
    iter_frames = []
    result_frames = []
    for num_cores in num_of_cores:
        opt = {'batch_size': batch_size, 'num_cores': num_cores}
        compiled_model_dir = f'resnet50_batch_{batch_size}_inf1_cores_{num_cores}'
        inf1_compiled_model_dir = os.path.join(inf1_model_dir, compiled_model_dir)

        print(f'inf1_compiled_model_dir: {inf1_compiled_model_dir}')

        # Compile on demand so the notebook works from a fresh kernel; an
        # already compiled model is reused as before.
        if not os.path.isdir(inf1_compiled_model_dir):
            if not compile_inf1_model(saved_model_dir, inf1_model_dir,
                                      batch_size=batch_size, num_cores=num_cores):
                print(f'Warning: less than half of the graph was placed on Neuron for {inf1_compiled_model_dir}')

        res, iter_times = inf1_predict_benchmark_single_threaded(inf1_compiled_model_dir,
                                                                 batch_size=batch_size,
                                                                 user_batch_size=batch_size*10,
                                                                 num_cores=num_cores,
                                                                 use_cache=False,
                                                                 warm_up=10)

        iter_frames.append(pd.DataFrame(iter_times, columns=[col_name(opt)]))
        result_frames.append(res)

    iter_ds = pd.concat(iter_frames, axis=1) if iter_frames else pd.DataFrame()
    results = pd.concat(result_frames, axis=1) if result_frames else pd.DataFrame()

    display(results)

inf1_compiled_model_dir: resnet50_inf1_saved_models/resnet50_batch_1_inf1_cores_1
Running model resnet50_inf1_saved_models/resnet50_batch_1_inf1_cores_1, user_batch_size: 10

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.


InternalError: nrt::load failed with grpc status code 0, error message ""; nrt status code 3, details "[NMGR:eg_use] Failed to find EG: 25
[NMGR:kmgr_load_nn] Failed to load NN: 1.4.1.0+737cbb69a-/tmp/tmp66hamo5z/neuron_op_d6f098c01c780733, err: 3
[NRTD:load] DLR model load failed
"
	 [[node conv5_block3_3_bn/FusedBatchNormV3/ReadVariableOp/neuron_op_d6f098c01c780733 (defined at /home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'conv5_block3_3_bn/FusedBatchNormV3/ReadVariableOp/neuron_op_d6f098c01c780733':
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/asyncio/base_events.py", line 442, in run_forever
    self._run_once()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/asyncio/base_events.py", line 1462, in _run_once
    handle._run()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/contextvars/__init__.py", line 38, in run
    return callable(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 358, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/contextvars/__init__.py", line 38, in run
    return callable(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/contextvars/__init__.py", line 38, in run
    return callable(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 538, in execute_request
    user_expressions, allow_stdin,
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/contextvars/__init__.py", line 38, in run
    return callable(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 302, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 539, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2867, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2895, in _run_cell
    return runner(coro)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3072, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3263, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-fcb5ce5792f5>", line 27, in <module>
    warm_up=10)
  File "<ipython-input-7-df035650934a>", line 4, in inf1_predict_benchmark_single_threaded
    model_inf1 = tf.contrib.predictor.from_saved_model(neuron_saved_model_name)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/contrib/predictor/predictor_factories.py", line 153, in from_saved_model
    config=config)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/contrib/predictor/saved_model_predictor.py", line 153, in __init__
    loader.load(self._session, tags.split(','), export_dir)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 324, in new_func
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/saved_model/loader_impl.py", line 269, in load
    return loader.load(sess, tags, import_scope, **saver_kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/saved_model/loader_impl.py", line 422, in load
    **saver_kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/saved_model/loader_impl.py", line 352, in load_graph
    meta_graph_def, import_scope=import_scope, **saver_kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1477, in _import_meta_graph_with_return_elements
    **kwargs))
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/meta_graph.py", line 809, in import_scoped_meta_graph_with_return_elements
    return_elements=return_elements)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/importer.py", line 405, in import_graph_def
    producer_op_list=producer_op_list)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/importer.py", line 517, in _import_graph_def_internal
    _ProcessNewOps(graph)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/importer.py", line 243, in _ProcessNewOps
    for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3561, in _add_new_tf_operations
    for c_op in c_api_util.new_tf_operations(self)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3561, in <listcomp>
    for c_op in c_api_util.new_tf_operations(self)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3451, in _create_op_from_tf_operation
    ret = Operation(c_op, self)
  File "/home/ubuntu/anaconda3/envs/aws_neuron_tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()
