# AWS Inferentia inference on Amazon EC2 Inf1 instance
This example demonstrates AWS Inferentia inference with TensorFlow and AWS Neuron SDK compiler and runtime

This example was tested on an Amazon EC2 `inf1.xlarge` instance with the following AWS Deep Learning AMI:
`Deep Learning AMI (Ubuntu 18.04) Version 35.0`

Run this notebook using the following conda environment:
`aws_neuron_tensorflow_p36`

Prepare your ImageNet validation TFRecord files using the following helper script:
https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh

Save it to `/home/ubuntu/datasets/` or update the dataset location in the `get_dataset()` function

In [8]:
# !pip install matplotlib pandas

In [9]:
# !/opt/aws/neuron/bin/neuron-cli reset
import os
import time
import shutil
import json
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.neuron as tfn
import tensorflow.compat.v1.keras as keras
from tensorflow.keras.applications import ( 
    xception,
    vgg16,
    vgg19,
    resnet,
    resnet50,
    resnet_v2,
    inception_v3,
    inception_resnet_v2,
    mobilenet,
    densenet,
    nasnet,
    mobilenet_v2
)
from tensorflow.keras.preprocessing import image
from concurrent import futures
from itertools import compress

# Maps a model name to the `tf.keras.applications` sub-module that belongs to it.
# The notebook looks up `models[model_type].preprocess_input` to apply the
# model-specific input preprocessing, so several names that share a module
# (e.g. all DenseNet variants) point to the same entry.
models = {
    'xception':xception,
    'vgg16':vgg16,
    'vgg19':vgg19,
    'resnet50':resnet50,
    'resnet101':resnet,
    'resnet152':resnet,
    'resnet50_v2':resnet_v2,
    'resnet101_v2':resnet_v2,
    'resnet152_v2':resnet_v2,
    # ResNeXt entries are disabled; no `resnext` module is imported in this notebook.
#     'resnext50':resnext,
#     'resnext101':resnext,
    'inception_v3':inception_v3,
    'inception_resnet_v2':inception_resnet_v2,
    'mobilenet':mobilenet,
    'densenet121':densenet,
    'densenet169':densenet,
    'densenet201':densenet,
    'nasnetlarge':nasnet,
    'nasnetmobile':nasnet,
    'mobilenet_v2':mobilenet_v2
}

# Intended to map a model name to an instantiated Keras model with ImageNet
# weights. Every entry is currently commented out, so this dict is empty;
# uncomment only the entries you need, since each one builds a full model.
models_detail = {
#     'xception':xception.Xception(weights='imagenet',include_top=False),
#     'vgg16':vgg16.VGG16(weights='imagenet'),
#     'vgg19':vgg19.VGG19(weights='imagenet'),
#     'resnet50':resnet.ResNet50(weights='imagenet'),
#     'resnet101':resnet.ResNet101(weights='imagenet'),
#     'resnet152':resnet.ResNet152(weights='imagenet'),
#     'resnet50_v2':resnet_v2.ResNet50V2(weights='imagenet'),
#     'resnet101_v2':resnet_v2.ResNet101V2(weights='imagenet'),
#     'resnet152_v2':resnet_v2.ResNet152V2(weights='imagenet'),
#     'resnext50':resnext.ResNeXt50(weights='imagenet'),
#     'resnext101':resnext.ResNeXt101(weights='imagenet'),
#     'inception_v3':inception_v3.InceptionV3(weights='imagenet',include_top=False),
#     'inception_resnet_v2':inception_resnet_v2.InceptionResNetV2(weights='imagenet'),
#     'mobilenet':mobilenet.MobileNet(weights='imagenet'),
#     'densenet121':densenet.DenseNet121(weights='imagenet'),
#     'densenet169':densenet.DenseNet169(weights='imagenet'),
#     'densenet201':densenet.DenseNet201(weights='imagenet'),
#     'nasnetlarge':nasnet.NASNetLarge(weights='imagenet'),
#     'nasnetmobile':nasnet.NASNetMobile(weights='imagenet'),
#     'mobilenet_v2':mobilenet_v2.MobileNetV2(weights='imagenet')
}

# NOTE(review): looks like a leftover sanity check that the cell ran to the end.
print('test')

test


### Resnet50 FP32 saved model

In [10]:
# The SavedModel export steps in this cell are commented out; only the
# learning-phase switch below is executed.
# # Export SavedModel
# model_type = 'resnet50'

# saved_model_dir = f'{model_type}_saved_model'
# shutil.rmtree(saved_model_dir, ignore_errors=True)

# Learning phase 0 is the Keras backend flag for test/inference mode (see the
# Keras backend docs); it is set before any model would be built for export.
keras.backend.set_learning_phase(0)
# model = ResNet50(weights='imagenet')
# tf.saved_model.simple_save(session = keras.backend.get_session(),
#                            export_dir = saved_model_dir,
#                            inputs = {'input_1:0': model.inputs[0]},
#                            outputs = {'probs/Softmax:0': model.outputs[0]})

### Compile models with different batch sizes and cores

In [11]:
def compile_inf1_model(saved_model_dir, inf1_model_dir, batch_size=1, num_cores=1, use_static_weights=False,
                       input_shape=(224, 224, 3), model_name=None):
    """Compile a TensorFlow SavedModel for AWS Inferentia with the Neuron compiler.

    The compiled model is written to
    ``<inf1_model_dir>/<model_name>_batch_<batch_size>_inf1_cores_<num_cores>``;
    any existing directory at that path is removed first.

    Args:
        saved_model_dir: Directory of the source SavedModel.
        inf1_model_dir: Parent directory for the compiled model.
        batch_size: Batch dimension of the example input used for compilation.
        num_cores: Value passed to ``--neuroncore-pipeline-cores``.
        use_static_weights: When true, ``--static-weights`` is added to the
            compiler arguments.
        input_shape: Per-image shape of the example input. Defaults to
            ``(224, 224, 3)``; pass the model's own input size for networks
            that do not take 224x224 images (e.g. 299x299 for Inception V3).
        model_name: Name used in the output directory. Defaults to the
            notebook-level ``model_type`` variable so existing calls keep
            producing the same directory names.

    Returns:
        bool: ``True`` when the ``OnNeuronRatio`` reported by the compiler is
        above 50%, ``False`` otherwise.
    """
    if model_name is None:
        # Backward compatible fallback to the notebook-global model selection.
        model_name = model_type

    print(f'-----------batch size: {batch_size}, num cores: {num_cores}----------')
    print('Compiling...')

    compiled_model_dir = f'{model_name}_batch_{batch_size}_inf1_cores_{num_cores}'
    inf1_compiled_model_dir = os.path.join(inf1_model_dir, compiled_model_dir)
    # The compiler needs a fresh output directory, so drop any previous result.
    shutil.rmtree(inf1_compiled_model_dir, ignore_errors=True)

    example_input = np.zeros([batch_size, *input_shape], dtype='float32')

    compiler_args = ['--verbose', '1', '--neuroncore-pipeline-cores', str(num_cores)]
    if use_static_weights:
        compiler_args.append('--static-weights')

    start_time = time.time()
    compiled_res = tfn.saved_model.compile(model_dir=saved_model_dir,
                                           model_feed_dict={'input_1:0': example_input},
                                           new_model_dir=inf1_compiled_model_dir,
                                           dynamic_batch_size=True,
                                           compiler_args=compiler_args)
    print(f'Compile time: {time.time() - start_time}')

    # Treat the compilation as successful only if more than half of the model
    # (as reported by OnNeuronRatio) was placed on the Neuron device.
    perc_on_inf = compiled_res['OnNeuronRatio'] * 100
    compile_success = bool(perc_on_inf > 50)

    print(inf1_compiled_model_dir)
    print(compiled_res)
    print('----------- Done! ----------- \n')

    return compile_success

### Use `tf.data` to read ImageNet validation dataset

In [12]:
def deserialize_image_record(record):
    """Parse one serialized example into its encoded image and class label.

    Args:
        record: A serialized example holding the ``image/encoded`` and
            ``image/class/label`` features.

    Returns:
        Tuple ``(imgdata, label)``: the encoded image string and the label
        feature cast to ``tf.int32``.
    """
    schema = {
        # Missing features fall back to an empty string / -1 respectively.
        'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
        'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
    }
    parsed = tf.io.parse_single_example(serialized=record, features=schema)
    return parsed['image/encoded'], tf.cast(parsed['image/class/label'], tf.int32)

def val_preprocessing(record):
    """Turn one serialized validation record into a model-ready (image, label) pair.

    Steps: parse the record, shift the label down by one, decode the JPEG,
    resize so the shorter side is 256 pixels (bicubic, aspect ratio kept),
    crop/pad to 224x224 and apply the `preprocess_input` of the module
    selected by the notebook-level `model_type`.
    """
    encoded_jpeg, label = deserialize_image_record(record)
    label = label - 1

    decoded = tf.io.decode_jpeg(encoded_jpeg, channels=3,
                                fancy_upscaling=False,
                                dct_method='INTEGER_FAST')

    dims = tf.shape(decoded)
    orig_h = tf.cast(dims[0], tf.float32)
    orig_w = tf.cast(dims[1], tf.float32)

    # Dividing by the shorter side makes that side exactly 256 after scaling.
    shorter_side_target = tf.constant(256.0, dtype=tf.float32)
    ratio = shorter_side_target / tf.minimum(orig_h, orig_w)

    target_size = [
        tf.cast(tf.math.rint(orig_h * ratio), tf.int32),
        tf.cast(tf.math.rint(orig_w * ratio), tf.int32),
    ]
    resized = tf.image.resize(decoded, target_size, method='bicubic')
    cropped = tf.image.resize_with_crop_or_pad(resized, 224, 224)

    return models[model_type].preprocess_input(cropped), label

def get_dataset(batch_size, use_cache=False, file_pattern='/home/ubuntu/datasets/images-50000/*'):
    """Build the `tf.data` pipeline over the ImageNet validation TFRecord files.

    Args:
        batch_size: Number of records per batch.
        use_cache: When true, the batched dataset is cached on disk under
            ``./tfdatacache`` (the directory is recreated on every call).
        file_pattern: Glob pattern matching the TFRecord files. Defaults to
            the location used by this notebook; override it instead of
            editing the function when the dataset lives elsewhere.

    Returns:
        A dataset yielding ``(images, labels)`` batches produced by
        `val_preprocessing`.

    Raises:
        FileNotFoundError: If `file_pattern` matches no files. Without this
            check the pipeline would be silently empty and the benchmark
            would report meaningless results.
    """
    files = tf.io.gfile.glob(file_pattern)
    if not files:
        raise FileNotFoundError(
            f'No TFRecord files match {file_pattern!r}; '
            'prepare the ImageNet validation set or pass a different file_pattern.')

    dataset = tf.data.TFRecordDataset(files)

    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)

    if use_cache:
        # Start from an empty cache directory; exist_ok covers the case where
        # the best-effort rmtree above could not remove it.
        shutil.rmtree('tfdatacache', ignore_errors=True)
        os.makedirs('tfdatacache', exist_ok=True)
        dataset = dataset.cache('./tfdatacache/imagenet_val')

    return dataset

## Single AWS Inferentia chip execution
* Single-core compiled models with automatic data-parallel execution on up to 4 cores
* Multi-core compiled models for pipeline execution

In [13]:
import os
    
def inf1_predict_benchmark_single_threaded(neuron_saved_model_name, batch_size, user_batch_size, num_cores, use_cache=False, warm_up=10):
    """Benchmark a compiled SavedModel on the ImageNet validation dataset.

    The model's ``serving_default`` signature is called once per batch from a
    single thread. The first iteration is timed separately and excluded from
    the per-iteration statistics.

    Args:
        neuron_saved_model_name: Directory of the compiled SavedModel.
        batch_size: Batch size the model was compiled with; used for the
            result column name and the ``batch_size`` row.
        user_batch_size: Number of images fed to the model per iteration.
        num_cores: Currently unused; kept for interface compatibility.
        use_cache: Forwarded to `get_dataset`.
        warm_up: Currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(results, iter_times)``: a one-column DataFrame of summary
        metrics and a NumPy array with the duration of every iteration after
        the first.
    """
    print(f'Running model {neuron_saved_model_name}, user_batch_size: {user_batch_size}\n')

    # Time the model load itself so that `load_time` is meaningful.
    load_start = time.time()
    model_inf1 = tf.saved_model.load(neuron_saved_model_name)
    load_time = time.time() - load_start

    inference_function = model_inf1.signatures['serving_default']
    walltime_start = time.time()
    first_iter_time = 0
    iter_times = []
    iter_sizes = []  # images per timed iteration (the last batch may be smaller)
    pred_labels = []
    actual_labels = []
    total_datas = 50000
    display_every = 1000
    display_threshold = display_every
    images_seen = 0

    ds = get_dataset(user_batch_size, use_cache)
    print(model_inf1)
    for counter, (batch, batch_labels) in enumerate(ds):
        start_time = time.time()
        outputs = inference_function(batch)
        elapsed = time.time() - start_time

        # SavedModel signatures return a dict of named outputs; the
        # classification scores are the single entry of that dict.
        if isinstance(outputs, dict):
            outputs = next(iter(outputs.values()))

        images_in_batch = int(batch.shape[0])
        if counter == 0:
            first_iter_time = elapsed
        else:
            iter_times.append(elapsed)
            iter_sizes.append(images_in_batch)

        actual_labels.extend(np.asarray(batch_labels).ravel().tolist())
        pred_labels.extend(np.argmax(np.asarray(outputs), axis=1).tolist())
        images_seen += images_in_batch

        if images_seen >= display_threshold and iter_times:
            recent_rates = np.array(iter_sizes[-display_every:]) / np.array(iter_times[-display_every:])
            print(f'Images {images_seen}/{total_datas}. Average i/s {np.mean(recent_rates)}')
            # Skip ahead so that one large batch triggers a single report.
            while display_threshold <= images_seen:
                display_threshold += display_every

    iter_times = np.array(iter_times)
    if actual_labels:
        acc_inf1 = np.sum(np.array(actual_labels) == np.array(pred_labels)) / len(actual_labels)
    else:
        acc_inf1 = float('nan')  # empty dataset: no accuracy to report
    average_time = np.mean(iter_times) if iter_times.size else float('nan')

    metrics = {
        'batch_size': batch_size,
        'accuracy': acc_inf1,
        'first_prediction_time': first_iter_time,
        'average_prediction_time': average_time,
        'load_time': load_time,
        'wall_time': time.time() - walltime_start,
    }
    results = pd.DataFrame({f'inf1_tf2_{model_type}_{batch_size}': list(metrics.values())},
                           index=list(metrics.keys()), dtype=object)

    return results, iter_times

In [14]:
model_type = 'vgg16'
# Call preprocess_input once eagerly before it is used inside the tf.data
# pipeline; workaround for https://github.com/tensorflow/tensorflow/issues/29931
temp = tf.zeros([8, 224, 224, 3])
_ = models[model_type].preprocess_input(temp)

# testing batch size
batch_list = [64]      # batch sizes the models were compiled with
num_of_cores = [1]     # core counts the models were compiled for
user_batchs = [64]     # multiplier: images per inference call = batch_size * user_batch
inf1_model_dir = f'{model_type}_inf1_saved_models'

for user_batch in user_batchs:
    iter_frames = []
    result_frames = []
    for batch_size in batch_list:
        for num_cores in num_of_cores:
            # Record the core count of this run, not the whole list of candidates.
            opt = {'batch_size': batch_size, 'num_cores': num_cores}
            compiled_model_dir = f'{model_type}_batch_{batch_size}_inf1_cores_{num_cores}'
            inf1_compiled_model_dir = os.path.join(inf1_model_dir, compiled_model_dir)

            print(f'inf1_compiled_model_dir: {inf1_compiled_model_dir}')
            col_name = lambda opt: f"inf1_{opt['batch_size']}_multicores_{opt['num_cores']}"

            res, iter_times = inf1_predict_benchmark_single_threaded(inf1_compiled_model_dir,
                                                                     batch_size=batch_size,
                                                                     user_batch_size=batch_size * user_batch,
                                                                     num_cores=num_cores,
                                                                     use_cache=False,
                                                                     warm_up=10)

            # Collect inside the innermost loop so that every
            # (batch_size, num_cores) combination is kept, not just the last one.
            iter_frames.append(pd.DataFrame(iter_times, columns=[col_name(opt)]))
            result_frames.append(res)

    # Concatenate once instead of growing the frames on every iteration.
    iter_ds = pd.concat(iter_frames, axis=1) if iter_frames else pd.DataFrame()
    results = pd.concat(result_frames, axis=1) if result_frames else pd.DataFrame()
    display(results)
    # NOTE: the file name uses the last batch size of `batch_list` and does not
    # include `user_batch`, so later iterations overwrite earlier files.
    results.to_csv(f'{model_type}_batch_size_{batch_size}.csv')

inf1_compiled_model_dir: vgg16_inf1_saved_models/vgg16_batch_64_inf1_cores_1
Running model vgg16_inf1_saved_models/vgg16_batch_64_inf1_cores_1, user_batch_size: 4096

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject object at 0x7f2144734e48>


UnavailableError: 2 root error(s) found.
  (0) Unavailable:  grpc server unix:/run/neuron.sock is unavailable. Please check the status of neuron-rtd service by `systemctl is-active neuron-rtd`. If it shows `inactive`, please install the service by `sudo apt-get install aws-neuron-runtime`. If `aws-neuron-runtime` is already installed, you may activate neuron-rtd service by `sudo systemctl restart neuron-rtd`.
	 [[{{node StatefulPartitionedCall/aws_neuron_model/StatefulPartitionedCall/neuron_op_dd78f53bee0a8715}}]]
  (1) Unavailable:  grpc server unix:/run/neuron.sock is unavailable. Please check the status of neuron-rtd service by `systemctl is-active neuron-rtd`. If it shows `inactive`, please install the service by `sudo apt-get install aws-neuron-runtime`. If `aws-neuron-runtime` is already installed, you may activate neuron-rtd service by `sudo systemctl restart neuron-rtd`.
	 [[{{node StatefulPartitionedCall/aws_neuron_model/StatefulPartitionedCall/neuron_op_dd78f53bee0a8715}}]]
	 [[StatefulPartitionedCall/aws_neuron_model/StatefulPartitionedCall/neuron_op_dd78f53bee0a8715/_4]]
0 successful operations.
0 derived errors ignored. [Op:__inference_signature_wrapper_44]

Function call stack:
signature_wrapper -> signature_wrapper
