In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt
import time

In [2]:
# all three models
models = [{'model_path': './artifacts/model1.pb', 
           'input_node_name': 'tower_0/inference_input:0', 
           'output_node_name': 'tower_0/inference_output:0', 
           'n_channels': 3, 'n_height': 128, 'n_width': 128},
          {'model_path': './artifacts/model2.pb', 
           'input_node_name': 'import/input_1:0', 
           'output_node_name': 'import/dense_2/Sigmoid:0', 
           'n_channels': 3, 'n_height': 256, 'n_width': 256}, 
          {'model_path': './artifacts/model3.pb', 
           'input_node_name': 'tower_0/inference_input:0', 
           'output_node_name': 'tower_0/inference_output:0', 
           'n_channels': 3, 'n_height': 256, 'n_width': 256}]

In [3]:
# pick a model
model_number = 0  # possible values are 0, 1, and 2
model = models[model_number]

# settings
model_path = model['model_path']
tftrt_graph_path = './artifacts/model1_tftrt_graph.pb'
tftrt_int8_graph_path = './artifacts/model1_tftrt_int8_graph.pb'
input_node_name = model['input_node_name']
output_node_name = model['output_node_name']
n_channels, n_height, n_width = model['n_channels'], model['n_height'], model['n_width']

In [4]:
# inference settings
n_inferences = 1000
batch_size = 1
precision_mode = 'INT8'
n_calibration_loops = 500  # n_calibration_loops * batch_size = number_of_total_calibration_images

In [5]:
# create random data
data = np.random.randint(low=0, high=255, size=(batch_size, n_height, n_width, n_channels))
# warmup_sample = data.astype(np.uint8)
warmup_sample = data.astype(np.float32)

In [6]:
# Inference with TF frozen graph workflow
with tf.Session() as sess:
    with tf.gfile.GFile(model_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        if model_number == 1:
            _ = tf.import_graph_def(graph_def)
        else:
            _ = tf.import_graph_def(graph_def, name='')
        

        input_node  = sess.graph.get_tensor_by_name(input_node_name)
        output_node = sess.graph.get_tensor_by_name(output_node_name)
        
        # get input placeholder for phase for model #1
        if model_number == 1:
            ph_1 = sess.graph.get_tensor_by_name('import/bn_conv1/keras_learning_phase:0')
        
        # Warmup
        if model_number == 1:
            _ = sess.run(output_node, feed_dict={ph_1: False, input_node: warmup_sample})
        else:
            _ = sess.run(output_node, feed_dict={input_node: warmup_sample})
        
        # Run
        timings = []
        for _ in range(n_inferences):
            if model_number == 1:
                start = time.time()
                results_tf = sess.run(output_node, feed_dict={ph_1: False, input_node: warmup_sample})
                end = time.time()
            else:
                start = time.time()
                results_tf = sess.run(output_node, feed_dict={input_node: warmup_sample})
                end = time.time()
            timings.append(end - start)

In [7]:
# Check timings
print(timings)

[0.007463693618774414, 0.006991863250732422, 0.006924867630004883, 0.004235267639160156, 0.003983736038208008, 0.004035472869873047, 0.0040433406829833984, 0.003922224044799805, 0.004025936126708984, 0.0033147335052490234, 0.0034232139587402344, 0.003220081329345703, 0.0032901763916015625, 0.003161907196044922, 0.003353118896484375, 0.0034885406494140625, 0.003193378448486328, 0.0032050609588623047, 0.0031785964965820312, 0.003154277801513672, 0.0031197071075439453, 0.003391742706298828, 0.0032083988189697266, 0.0031747817993164062, 0.003216981887817383, 0.0032777786254882812, 0.0033855438232421875, 0.0033779144287109375, 0.003519296646118164, 0.003249645233154297, 0.0032205581665039062, 0.003200531005859375, 0.0032455921173095703, 0.003210783004760742, 0.0031294822692871094, 0.0031173229217529297, 0.0032477378845214844, 0.003152608871459961, 0.003260374069213867, 0.003182649612426758, 0.003165721893310547, 0.0031249523162841797, 0.0033082962036132812, 0.0032334327697753906, 0.00322055

In [8]:
# Benchmark TF
timings = np.asarray(timings)
delta_tf = np.sum(timings)
average_latency_tf = np.mean(timings)
std_latency_tf = np.std(timings)
average_throughput_tf = batch_size * (1 / average_latency_tf)
print('Total Inference Time: {} seconds'.format(delta_tf))
print('Number of Inferences: {}'.format(len(timings)))
print('Average Latency: {} +- {} seconds'.format(average_latency_tf, std_latency_tf))
print('Average Throughput w/ Batch Size {}: {} examples per second'.format(batch_size, average_throughput_tf))

Total Inference Time: 3.2824668884277344 seconds
Number of Inferences: 1000
Average Latency: 0.0032824668884277345 +- 0.00025755496218351115 seconds
Average Throughput w/ Batch Size 1: 304.64892228630794 examples per second


In [9]:
# Deserialize frozen graph
with tf.gfile.GFile(model_path, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

# Create TF-TRT Graph
trt_graph = trt.create_inference_graph(
    input_graph_def=graph_def,
    outputs=[output_node_name],
    max_batch_size=batch_size,
    is_dynamic_op=True,
    max_workspace_size_bytes=2<<20,
    precision_mode=precision_mode)

# Serialize TF-TRT graph
with tf.gfile.GFile(tftrt_graph_path, 'wb') as f:
    f.write(trt_graph.SerializeToString())

del trt_graph

INFO:tensorflow:Running against TensorRT version 5.1.5


In [10]:
# calibrate the graph to INT8
with tf.Session() as sess:
    # Deserialize TF-TRT graph
    with tf.gfile.GFile(tftrt_graph_path, 'rb') as f:
        calib_graph = tf.GraphDef()
        calib_graph.ParseFromString(f.read())
    
    # import TF-TRT graph
    print('Importing TF-TRT Graph...')
    output_node = tf.import_graph_def(calib_graph, return_elements=[output_node_name])
    print('TF-TRT Graph imported!')
    
    tensor_names = [n.name for n in tf.get_default_graph().as_graph_def().node]
    print(tensor_names)
    
    # modify node name
    modified_input_node_name = 'import/{}'.format(input_node_name)
    
    print('Running calibration data through TF-TRT Graph...')
    for i in range(n_calibration_loops):
        # create random data for calibration - TODO replace this with calibration data
        calibration_data = np.random.randint(low=0, high=256, size=(batch_size, n_height, n_width, n_channels))
        calibration_data = calibration_data.astype(np.float32)
        
        # run net to dummy calibrate
        _ = sess.run(output_node, feed_dict={modified_input_node_name: calibration_data})
    print('Calibration data ran through TF-TRT Graph!')
    
    # calibrate graph
    print('Converting calibration graph to inference graph...')
    #calibrated_graph = trt.calib_graph_to_infer_graph(calib_graph, is_dynamic_op=True)
    calibrated_graph = trt.calib_graph_to_infer_graph(calib_graph)
    print('Calibration graph converted to inference graph!')
    
    # Serialize INT8 calibrated TF-TRT graph
    with tf.gfile.GFile(tftrt_int8_graph_path, 'wb') as f:
        f.write(calibrated_graph.SerializeToString())
    
    del calib_graph
    del calibrated_graph

Importing TF-TRT Graph...
TF-TRT Graph imported!
['tower_0/inference_input', 'tower_0/Const', 'tower_0/div', 'tower_0/Const_1', 'tower_0/Sub', 'tower_0/Const_2', 'tower_0/Mul', 'block_1/conv2d/kernel', 'tower_0/block_1/conv2d/Conv2D', 'block_4/conv2d/kernel', 'block_9/conv2d/kernel', 'tower_0/Reshape/shape', 'dense_1/kernel', 'dense_1/bias', 'dense_2/kernel', 'dense_2/bias', 'block_1/conv2d_1/kernel', 'tower_0/block_1/conv2d_2/Conv2D', 'tower_0/block_1/conv2d_2/Conv2D_bn_offset', 'tower_0/block_1/batch_normalization/FusedBatchNorm', 'tower_0/block_1/Relu', 'block_1/conv2d_2/kernel', 'tower_0/block_1/conv2d_3/Conv2D', 'tower_0/block_1/conv2d_3/Conv2D_bn_offset', 'tower_0/block_1/batch_normalization_2/FusedBatchNorm', 'tower_0/add', 'tower_0/Relu', 'tower_0/average_pooling2d/AvgPool', 'block_2/conv2d/kernel', 'tower_0/block_2/conv2d/Conv2D', 'tower_0/block_2/conv2d/Conv2D_bn_offset', 'tower_0/block_2/batch_normalization/FusedBatchNorm', 'tower_0/block_2/Relu', 'block_2/conv2d_1/kernel', 

Calibration data ran through TF-TRT Graph!
Converting calibration graph to inference graph...
Calibration graph converted to inference graph!


In [11]:
# calibrate the graph to INT8
with tf.Session() as sess:
    # Deserialize INT8 calibrated TF-TRT graph
    with tf.gfile.GFile(tftrt_int8_graph_path, 'rb') as f:
        tftrt_int8_graph = tf.GraphDef()
        tftrt_int8_graph.ParseFromString(f.read())
    
    # modify node names
    modified_input_node_name = 'import_1/{}'.format(input_node_name)
    #modified_output_node_name = 'import/{}'.format(output_node_name)
        
    # Import the calibrated TF-TRT graph
    output_node = tf.import_graph_def(tftrt_int8_graph, return_elements=[output_node_name])
    #output_node = tf.import_graph_def(tftrt_int8_graph, return_elements=[modified_output_node_name])
    print(output_node)
    
    tensor_names = [n.name for n in tf.get_default_graph().as_graph_def().node]
    print(tensor_names)
    
    # create random data for warming up
    warmup_sample = np.random.randint(low=0, high=256, size=(batch_size, n_height, n_width, n_channels))
    warmup_sample = warmup_sample.astype(np.float32)
    _ = sess.run(output_node, feed_dict={modified_input_node_name: warmup_sample})

    # Run
    timings = []
    for _ in range(n_inferences):
        # create random data for testing - TODO replace this with test data
        test_data = np.random.randint(low=0, high=256, size=(batch_size, n_height, n_width, n_channels))
        test_data = test_data.astype(np.float32)
        
        # time inference
        start = time.time()
        results_tf_trt = sess.run(output_node, feed_dict={modified_input_node_name: test_data})
        end = time.time()
        timings.append(end - start)

[<tf.Tensor 'import_1/tower_0/inference_output:0' shape=<unknown> dtype=float32>]
['tower_0/inference_input', 'tower_0/Const', 'tower_0/div', 'tower_0/Const_1', 'tower_0/Sub', 'tower_0/Const_2', 'tower_0/Mul', 'block_1/conv2d/kernel', 'tower_0/block_1/conv2d/Conv2D', 'block_4/conv2d/kernel', 'block_9/conv2d/kernel', 'tower_0/Reshape/shape', 'dense_1/kernel', 'dense_1/bias', 'dense_2/kernel', 'dense_2/bias', 'block_1/conv2d_1/kernel', 'tower_0/block_1/conv2d_2/Conv2D', 'tower_0/block_1/conv2d_2/Conv2D_bn_offset', 'tower_0/block_1/batch_normalization/FusedBatchNorm', 'tower_0/block_1/Relu', 'block_1/conv2d_2/kernel', 'tower_0/block_1/conv2d_3/Conv2D', 'tower_0/block_1/conv2d_3/Conv2D_bn_offset', 'tower_0/block_1/batch_normalization_2/FusedBatchNorm', 'tower_0/add', 'tower_0/Relu', 'tower_0/average_pooling2d/AvgPool', 'block_2/conv2d/kernel', 'tower_0/block_2/conv2d/Conv2D', 'tower_0/block_2/conv2d/Conv2D_bn_offset', 'tower_0/block_2/batch_normalization/FusedBatchNorm', 'tower_0/block_2/R

In [12]:
# Check timings
print(timings)

[0.00648188591003418, 0.004743337631225586, 0.004198789596557617, 0.004148006439208984, 0.004114389419555664, 0.004171133041381836, 0.004261970520019531, 0.003983259201049805, 0.004075527191162109, 0.00412297248840332, 0.00400543212890625, 0.004207611083984375, 0.0040547847747802734, 0.004126787185668945, 0.004075050354003906, 0.0042057037353515625, 0.004080057144165039, 0.0040514469146728516, 0.004174709320068359, 0.004047393798828125, 0.004068851470947266, 0.0041429996490478516, 0.004045724868774414, 0.003919839859008789, 0.00420379638671875, 0.00415349006652832, 0.003972291946411133, 0.004252433776855469, 0.004261493682861328, 0.004236698150634766, 0.0039038658142089844, 0.0034399032592773438, 0.0033180713653564453, 0.003351449966430664, 0.0032715797424316406, 0.0038657188415527344, 0.003404378890991211, 0.0034673213958740234, 0.003287792205810547, 0.003895282745361328, 0.0033206939697265625, 0.003272533416748047, 0.003278493881225586, 0.003315448760986328, 0.003345966339111328, 0.0

In [13]:
# Benchmark TF-TRT
timings = np.asarray(timings)
delta_tf_trt = np.sum(timings)
average_latency_tf_trt = np.mean(timings)
std_latency_tf_trt = np.std(timings)
average_throughput_tf_trt = batch_size * (1 / average_latency_tf_trt)
print('Total Inference Time: {} seconds'.format(delta_tf_trt))
print('Number of Inferences: {}'.format(len(timings)))
print('Average Latency: {} +- {} seconds'.format(average_latency_tf_trt, std_latency_tf_trt))
print('Average Throughput w/ Batch Size {}: {} examples per second'.format(batch_size, average_throughput_tf_trt))

Total Inference Time: 3.26582407951355 seconds
Number of Inferences: 1000
Average Latency: 0.00326582407951355 +- 0.00022483372545169648 seconds
Average Throughput w/ Batch Size 1: 306.2014289970425 examples per second


In [14]:
# Compare results
print('Results area identical:', results_tf - results_tf_trt)
speedup = average_latency_tf / average_latency_tf_trt
print('Speedup of TF-TRT over TF is: {}x'.format(speedup))

Results area identical: [[[-1.2761276e-09  6.0645770e-04  3.1037450e-02 -3.1643908e-02
   -1.1389229e-08]]]
Speedup of TF-TRT over TF is: 1.005096051872048x
