In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt
import time

In [2]:
# all three models
models = [{'model_path': './artifacts/model1.pb', 
           'input_node_name': 'tower_0/inference_input:0', 
           'output_node_name': 'tower_0/inference_output:0', 
           'n_channels': 3, 'n_height': 128, 'n_width': 128},
          {'model_path': './artifacts/model2.pb', 
           'input_node_name': 'import/input_1:0', 
           'output_node_name': 'import/dense_2/Sigmoid:0', 
           'n_channels': 3, 'n_height': 256, 'n_width': 256}, 
          {'model_path': './artifacts/model3.pb', 
           'input_node_name': 'tower_0/inference_input:0', 
           'output_node_name': 'tower_0/inference_output:0', 
           'n_channels': 3, 'n_height': 256, 'n_width': 256}]

In [3]:
# pick a model
model_number = 0  # possible values are 0, 1, and 2
model = models[model_number]

# settings
model_path = model['model_path']
input_node_name = model['input_node_name']
output_node_name = model['output_node_name']
n_channels, n_height, n_width = model['n_channels'], model['n_height'], model['n_width']

In [4]:
# inference settings
n_inferences = 1000
batch_size = 1
precision_mode = 'FP16'  # possible values are FP16 (default) and FP32

In [5]:
# create random data
data = np.random.randint(low=0, high=256, size=(batch_size, n_height, n_width, n_channels))
warmup_sample = np.uint8(data)

In [6]:
# Inference with TF frozen graph workflow
with tf.Session() as sess:
    with tf.gfile.GFile(model_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        if model_number == 1:
            _ = tf.import_graph_def(graph_def)
        else:
            _ = tf.import_graph_def(graph_def, name='')
        

        input_node  = sess.graph.get_tensor_by_name(input_node_name)
        output_node = sess.graph.get_tensor_by_name(output_node_name)
        
        # get input placeholder for phase for model #1
        if model_number == 1:
            ph_1 = sess.graph.get_tensor_by_name('import/bn_conv1/keras_learning_phase:0')
        
        # Warmup
        if model_number == 1:
            _ = sess.run(output_node, feed_dict={ph_1: False, input_node: warmup_sample})
        else:
            _ = sess.run(output_node, feed_dict={input_node: warmup_sample})
        
        # Run
        timings = []
        for _ in range(n_inferences):
            if model_number == 1:
                start = time.time()
                results_tf = sess.run(output_node, feed_dict={ph_1: False, input_node: warmup_sample})
                end = time.time()
            else:
                start = time.time()
                results_tf = sess.run(output_node, feed_dict={input_node: warmup_sample})
                end = time.time()
            timings.append(end - start)

In [7]:
# Check timings
print(timings)

[0.005265712738037109, 0.004200935363769531, 0.004057407379150391, 0.00394749641418457, 0.004011869430541992, 0.004251956939697266, 0.004097700119018555, 0.0035610198974609375, 0.0033524036407470703, 0.003335237503051758, 0.003421306610107422, 0.003316640853881836, 0.0034036636352539062, 0.003354787826538086, 0.0035064220428466797, 0.0032951831817626953, 0.003365755081176758, 0.0035657882690429688, 0.003334522247314453, 0.0034284591674804688, 0.003516674041748047, 0.003473520278930664, 0.003568410873413086, 0.0034575462341308594, 0.0033240318298339844, 0.0035119056701660156, 0.003336191177368164, 0.0034418106079101562, 0.0033295154571533203, 0.003309488296508789, 0.0032918453216552734, 0.003368377685546875, 0.0033714771270751953, 0.003277301788330078, 0.003332853317260742, 0.0032854080200195312, 0.0035033226013183594, 0.003302335739135742, 0.003286600112915039, 0.0032591819763183594, 0.0034623146057128906, 0.0034399032592773438, 0.00323486328125, 0.003238201141357422, 0.003372192382812

In [8]:
# Benchmark TF
timings = np.asarray(timings)
delta_tf = np.sum(timings)
average_latency_tf = np.mean(timings)
std_latency_tf = np.std(timings)
average_throughput_tf = batch_size * (1 / average_latency_tf)
print('Total Inference Time: {} seconds'.format(delta_tf))
print('Number of Inferences: {}'.format(len(timings)))
print('Average Latency: {} +- {} seconds'.format(average_latency_tf, std_latency_tf))
print('Average Throughput w/ Batch Size {}: {} examples per second'.format(batch_size, average_throughput_tf))

Total Inference Time: 3.268392562866211 seconds
Number of Inferences: 1000
Average Latency: 0.003268392562866211 +- 0.00015616132438339632 seconds
Average Throughput w/ Batch Size 1: 305.9607990060569 examples per second


In [9]:
# Inference with TF-TRT frozen graph workflow:
graph = tf.Graph()
with graph.as_default():
    with tf.Session() as sess:
        # First deserialize your frozen graph:
        with tf.gfile.GFile(model_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            if model_number == 1:
                _ = tf.import_graph_def(graph_def)
            else:
                _ = tf.import_graph_def(graph_def, name='')
        
        if model_number == 1:
            output_node_name = 'dense_2/Sigmoid:0'
            warmup_sample = np.float32(warmup_sample)
        
        # Create TF-TRT Graph
        trt_graph = trt.create_inference_graph(
            input_graph_def=graph_def,
            outputs=[output_node_name],
            max_batch_size=batch_size,
            is_dynamic_op=True,
            max_workspace_size_bytes=2<<20,
            precision_mode=precision_mode)
        
        # Import the TensorRT graph into a new graph:
        output_node = tf.import_graph_def(
            trt_graph,
            return_elements=[output_node_name])
        
        # Warmup
        if model_number == 1:
            _ = sess.run(output_node, feed_dict={'import_1/input_1:0': warmup_sample})
        else:
            _ = sess.run(output_node, feed_dict={'import/{}'.format(input_node_name): warmup_sample})
        
        # Run
        timings = []
        for _ in range(n_inferences):
            if model_number == 1:
                start = time.time()
                results_tf_trt = sess.run(output_node, feed_dict={'import_1/input_1:0': warmup_sample})
                end = time.time()
            else:
                start = time.time()
                results_tf_trt = sess.run(output_node, feed_dict={'import/{}'.format(input_node_name): warmup_sample})
                end = time.time()
            timings.append(end - start)

INFO:tensorflow:Running against TensorRT version 5.1.5


In [10]:
# Check timings
print(timings)

[0.0037479400634765625, 0.0027039051055908203, 0.0022881031036376953, 0.0024483203887939453, 0.002333402633666992, 0.0023293495178222656, 0.002386331558227539, 0.0022733211517333984, 0.002376079559326172, 0.002266407012939453, 0.002200603485107422, 0.002271413803100586, 0.0025472640991210938, 0.0024232864379882812, 0.0024824142456054688, 0.002321481704711914, 0.002460479736328125, 0.0022954940795898438, 0.002176523208618164, 0.0022885799407958984, 0.002387523651123047, 0.00225067138671875, 0.0022051334381103516, 0.002330303192138672, 0.0022966861724853516, 0.0025908946990966797, 0.002275228500366211, 0.0021936893463134766, 0.0023310184478759766, 0.0022881031036376953, 0.0023179054260253906, 0.0026192665100097656, 0.002389669418334961, 0.0024733543395996094, 0.002374887466430664, 0.0023529529571533203, 0.002252817153930664, 0.0022509098052978516, 0.002220630645751953, 0.0022208690643310547, 0.0023207664489746094, 0.002460479736328125, 0.0023391246795654297, 0.0022535324096679688, 0.0021

In [11]:
# Benchmark TF-TRT
timings = np.asarray(timings)
delta_tf_trt = np.sum(timings)
average_latency_tf_trt = np.mean(timings)
std_latency_tf_trt = np.std(timings)
average_throughput_tf_trt = batch_size * (1 / average_latency_tf_trt)
print('Total Inference Time: {} seconds'.format(delta_tf_trt))
print('Number of Inferences: {}'.format(len(timings)))
print('Average Latency: {} +- {} seconds'.format(average_latency_tf_trt, std_latency_tf_trt))
print('Average Throughput w/ Batch Size {}: {} examples per second'.format(batch_size, average_throughput_tf_trt))

Total Inference Time: 1.8399090766906738 seconds
Number of Inferences: 1000
Average Latency: 0.0018399090766906738 +- 0.00019795778054295358 seconds
Average Throughput w/ Batch Size 1: 543.5051180891154 examples per second


In [12]:
# Compare results
print('Results are identical:', results_tf - results_tf_trt)
speedup = average_latency_tf / average_latency_tf_trt
print('Speedup of TF-TRT over TF is: {}x'.format(speedup))

Results are identical: [[[-5.0032626e-12 -6.3944198e-03  2.5724113e-02 -1.9329704e-02
   -5.6191989e-11]]]
Speedup of TF-TRT over TF is: 1.7763880858421865x
