# Tensorflow 2.0 tests on Nvidia Jetson Nano and Google Coral TPU Edge

** WORK in progress **

In [1]:
# Imports
import sys
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from edgetpu.basic import edgetpu_utils
from platform import python_version


# Enable device placement logging
tf.debugging.set_log_device_placement(True)

# Check configuration
print("Python: {} ({})".format(python_version(), sys.prefix))
print("Tensorflow:{}".format(tf.version.VERSION))
if tf.test.is_gpu_available():
    gpu = tf.config.experimental.list_physical_devices('GPU')[0]
    tf.config.experimental.set_memory_growth(gpu, True)
    print("Available GPU: {}".format(tf.test.gpu_device_name()))
    print("Built with CUDA: {}".format(tf.test.is_built_with_cuda()))
    print("GPU details:" , gpu)

tpu = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_NONE)
if len(tpu) >0:
    print("USB TPU Edge device: {}".format(tpu))
else:
    print("No USB TPU Edge device found")

Python: 3.6.8 (/opt/local/virtual-env)
Tensorflow:2.0.0
Available GPU: /device:GPU:0
Built with CUDA: True
GPU details: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
USB TPU Edge device: ('/sys/bus/usb/devices/2-1.3',)


## Build and Train a basic Tensorflow model (using Keras API)

In [2]:
# Dataset download and preparation

fashion_mnist = keras.datasets.fashion_mnist
(train_img, train_labels), (test_img, test_labels) = fashion_mnist.load_data()

class_names = ['t-shirt/top', 'trouser', 'pullover', 'dress', ' coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot' ]

# Add a channel dimension [batch, width, height, channel]
train_img = train_img.reshape(train_img.shape[0], train_img.shape[1], train_img.shape[2], 1)
test_img = test_img.reshape(test_img.shape[0], test_img.shape[1], test_img.shape[2], 1)

# Normalize values
train_img = train_img / 255.0
test_img = test_img / 255.0

In [3]:
# Build and Train a basic model in Keras

mode='ADVANCED' #'BASIC'

model = keras.Sequential()

if (mode=='BASIC'):
    
    # basic model : small fully conected neural network (100K params)
    model.add(keras.layers.Flatten(input_shape=(28,28)))
    model.add(keras.layers.Dense(128, activation='relu'))
    model.add(keras.layers.Dense(10, activation='softmax'))

else:
    
    # Advanced model : small convolutional neural network (230K params)
    model.add(keras.layers.InputLayer(input_shape=(28,28,1)))
    
    model.add(keras.layers.Conv2D(filters=32, kernel_size=(3,3), activation=None))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.Activation('relu'))
    model.add(keras.layers.MaxPooling2D(pool_size=(2,2)))

    model.add(keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation=None))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.Activation('relu'))
    model.add(keras.layers.MaxPooling2D(pool_size=(2,2)))

    model.add(keras.layers.Flatten())
    
    model.add(keras.layers.Dense(128, activation=None))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.Activation('relu'))
    model.add(keras.layers.Dropout(rate=0.3))
              
    model.add(keras.layers.Dense(64, activation=None))
    model.add(keras.layers.BatchNormalization(axis=-1))
    model.add(keras.layers.Activation('relu'))          
    model.add(keras.layers.Dropout(rate=0.3))
              
    model.add(keras.layers.Dense(10, activation='softmax'))


model.compile(optimizer=keras.optimizers.Adam(learning_rate=3e-4),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/tas

In [4]:
# Train the model (Note: The Jetson Nano is optimized for inference, not for training...)

batch_size = 64
epochs = 10

start_time=time.time()
model.fit(train_img, train_labels, epochs=epochs, batch_size=batch_size)
print("\n--- Elapsed Time: {:.6f} sec (Batch size = {}) ---".format(time.time() - start_time, batch_size))

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Train on 60000 samples
Epoch 1/10
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousIteratorV2 in device /job:localhost/r

In [5]:
# Evaluate the model on the test dataset

start_time=time.time()
test_loss, test_acc = model.evaluate(test_img, test_labels, batch_size=batch_size, verbose=0)
print("\nAccuracy on the test dataset {:3.5}".format(test_acc))
print("\n--- Elapsed Time: {:.6f} sec (Batch size = {}) ---".format(time.time() - start_time, batch_size))


Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_distributed_function_30858 in device /job:localhost/replica:0/task:0/device:GPU:0

Accuracy on the test dataset 0.9052

--- Elapsed Time: 4.468205 sec (Batch size = 64) ---


### Inference with Tensorflow (Keras API)

In [6]:
# Perform inference on the CPU cores

start_time=time.time()
with tf.device('/CPU:0'):
    predictions = model.predict(test_img, batch_size=batch_size, verbose=0)
print("\n--- Elapsed Time: {:.6f} sec (Batch size = {}) ---".format(time.time() - start_time, batch_size))


Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousIteratorV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_distributed_function_31508 in device /job:localhost/replica:0/task:

In [7]:
# Perform inference on the Jetson Nano GPU

start_time=time.time()
with tf.device('/GPU:0'):
    predictions = model.predict(test_img, batch_size=batch_size, verbose=0)
print("\n--- Elapsed Time: {:.6f} sec (Batch size = {}) ---".format(time.time() - start_time, batch_size))


Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_distributed_function_31508 in device /job:localhost/replica:0/task:0/device:GPU:0

--- Elapsed Time: 3.925985 sec (Batch size = 64) ---


In [8]:
# Display some inference results
for i in range(5):
    print("predition: {:15}  label: {:15}".format(class_names[np.argmax(predictions[i])], 
                                          class_names[test_labels[i]]))


predition: ankle boot       label: ankle boot     
predition: pullover         label: pullover       
predition: trouser          label: trouser        
predition: trouser          label: trouser        
predition: shirt            label: shirt          


### Save the Model

In [9]:
# Export the model to Tensorflow Saved Model format
model.save('./models', save_format='tf')

# Verify that the Saved Model is ok
new_model = keras.models.load_model('./models')
new_predictions = new_model.predict(test_img, batch_size=batch_size, verbose=0)

np.testing.assert_allclose(predictions, new_predictions, rtol=1e-6, atol=1e-6)
           

Executing op __inference_initialize_variables_31999 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32012 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32023 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32062 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32133 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32173 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32182 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32195 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_variables_32206 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_initialize_v

In [10]:
%reset -f
%who_ls

Executing op DestroyResourceOp in device /job:localhost/replica:0/task:0/device:GPU:0


['sys']

### Convert the Tensorflow Saved Model into Tensorflow Lite models 


In [11]:
# Reload Tensorflow 

import time
import pprint
import numpy as np
import tensorflow as tf
from tensorflow import keras
from edgetpu.basic import edgetpu_utils
from tensorflow.lite.python.interpreter import load_delegate

tf.debugging.set_log_device_placement(True)

# Dataset reload & preparation
fashion_mnist = keras.datasets.fashion_mnist
(train_img, train_labels), (test_img, test_labels) = fashion_mnist.load_data()

class_names = ['t-shirt/top', 'trouser', 'pullover', 'dress', ' coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot' ]

# Add a channel dimension [batch, width, height, channel]
train_img = train_img.reshape(train_img.shape[0], train_img.shape[1], train_img.shape[2], 1)
test_img = test_img.reshape(test_img.shape[0], test_img.shape[1], test_img.shape[2], 1)

# Normalize values
train_img = train_img / 255.0
test_img = test_img / 255.0

In [12]:
# TF Lite with Weight Quantization (for CPU/TPU inference)

# Define a Representative dataset generator to measure the dynamic  
# range of activations and inputs during the quantization process

def representative_dataset_gen():

    representative_dataset = tf.data.Dataset.from_tensor_slices(train_img)
    representative_dataset = representative_dataset.shuffle(buffer_size=1000, seed=42)
    representative_dataset = representative_dataset.batch(1)
    
    for representative_input in representative_dataset.take(100):
        yield [tf.cast(representative_input, dtype=tf.float32) ]


converter = tf.lite.TFLiteConverter.from_saved_model('./models')

converter.representative_dataset = representative_dataset_gen

converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_LATENCY]

converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.target_spec.supported_types = [tf.int8]

tflite_model = converter.convert()

open("./models/fashion_mnsit_quant_INT8.tflite", "wb").write(tflite_model)

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:

240048

In [13]:
# TF Lite without Weight Quantization (for CPU inference)

converter = tf.lite.TFLiteConverter.from_saved_model('./models')
tflite_model = converter.convert()
open("./models/fashion_mnsit.tflite", "wb").write(tflite_model)

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:

933780

In [14]:
# TF Lite with Weight Quantization (for CPU inference)

converter = tf.lite.TFLiteConverter.from_saved_model('./models')
converter.optimizations = [tf.lite.Optimize.DEFAULT] 
tflite_model = converter.convert()
open("./models/fashion_mnsit_quant.tflite", "wb").write(tflite_model)


Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:

239592

In [15]:
# TF Lite with Weight Quantization (for CPU inference)

converter = tf.lite.TFLiteConverter.from_saved_model('./models')
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE] 
tflite_model = converter.convert()
open("./models/fashion_mnsit_quant_size.tflite", "wb").write(tflite_model)


Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:

239592

In [16]:
# TF Lite with Weight Quantization (for CPU inference)

converter = tf.lite.TFLiteConverter.from_saved_model('./models')
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_LATENCY] 
tflite_model = converter.convert()
open("./models/fashion_mnsit_quant_latency.tflite", "wb").write(tflite_model)


Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:

239592

In [17]:
# List TFLite models with size

! ls -lh ./models/*.tflite | awk '{print $5,$9}'

309K ./models/fashion_mnsit_quant_INT8_edgetpu.tflite
235K ./models/fashion_mnsit_quant_INT8.tflite
234K ./models/fashion_mnsit_quant_latency.tflite
234K ./models/fashion_mnsit_quant_size.tflite
234K ./models/fashion_mnsit_quant.tflite
912K ./models/fashion_mnsit.tflite


### Inference with Tensorflow Lite ** Work in Progress **

In [18]:
class TflModel:

    def __init__(self, model=None, target_tpu=False, batch_size=1):
        self.model = model
        self.target_tpu = target_tpu
        self.batch_size = batch_size 
        self.interpreter = None
        self.input_details = None
        self.output_details = None
        
        if model is not None:
            self.load(model, target_tpu, batch_size)
        
            
    def load(self, model, target_tpu=False, batch_size=1):
        """  
        Load a Tensorflow Lite model and return

            model: Tensorflow Lite model (full path to)

        """

        # Load TFLite model
        
        if target_tpu:
            print("Using tf.lite with TPU delegate")
            self.interpreter = tf.lite.Interpreter(model,
                                                   experimental_delegates=[load_delegate('libedgetpu.so.1.0')])
        else:
            self.interpreter = tf.lite.Interpreter(model)
        
        # Resize the model's input to match the batch_size
        input_shape = self.interpreter.get_input_details()[0]['shape']
        input_index = self.interpreter.get_input_details()[0]['index']
        input_shape[0] = batch_size
        self.interpreter.resize_tensor_input(input_index, input_shape)

        # Get input/output tensor's details
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()
        
        # Allocate the tensors
        self.interpreter.allocate_tensors()

        # Display information about the model's input/output
        print("\nModel's Input details:")
        pprint.pprint(self.input_details)
        print("\nModel's Output details:")
        pprint.pprint(self.output_details)
        print("")

    
    def prepare_data(self, input_data):
        """ Prepare input data """ 
        
        # Working with Numpy arrays
        if isinstance(input_data , np.ndarray):
           
            # Add batch dim if necessary
            if len(input_data.shape) == len(self.input_details[0]['shape']) -1 :
                input_data = np.expand_dims(input_data, axis=0)   
        
            # Cast input_data type according to the model's input
            input_data = input_data.astype(self.input_details[0]['dtype']) 
           
         # Working with Tensors   
        elif isinstance(input_data , tf.Tensor):

            # Add batch dim if necessary
            if len(input_data.shape) == len(self.input_details[0]['shape']) -1 : 
                input_data = tf.expand_dims(test_img[0], axis=0)
            
            # Cast input_data type according to the model's input
            input_data = tf.dtypes.cast(input_data, self.input_details[0]['dtype'])
            
        # Ensure that the input data shape match the model's input shape
        assert np.array(input_data.shape).all() == self.input_details[0]['shape'].all(), \
                "input_data shape {} does not match model input shape {}".format(input_data.shape, self.input_details[0]['shape'])
        
        return input_data
    
    
    
    def predict(self, input_data):   

        # Perform inference on the input data
        self.interpreter.set_tensor(self.input_details[0]['index'], input_data)
        start_time = time.time()
        self.interpreter.invoke()

        # The function `get_tensor()` returns a copy of the tensor data.
        # Use `tensor()` in order to get a pointer to the tensor.
        output_data = self.interpreter.get_tensor(self.output_details[0]['index'])

        # Do we need this ?
        # If the model is quantized (uint8 data), then dequantize the results
        #if output_details['dtype'] == np.uint8:

        #    print(ou)

        #    scale, zero_point = output_details['quantization']
        #    output = scale * (output - zero_point)
        
        elapsed_time = time.time() - start_time

        return output_data, elapsed_time


In [19]:
def fashion_mnsit_inference(model, target_tpu=False, batch_size=64, batch_limit=None):

    print("Model:", model)
    print("Batch size:", batch_size)
    
    batch = 1
    total_accuracy = 0

    # Load Tensorflow Lite Model
    fashion_mnsit = TflModel(model, target_tpu=target_tpu, batch_size=batch_size)
 
    test_dataset = tf.data.Dataset.from_tensor_slices((test_img, test_labels))
    test_dataset = test_dataset.batch(batch_size, drop_remainder=True)
        
    for batch_input_data, batch_labels in test_dataset:

        # Check and prepare the input dataset
        input_data = fashion_mnsit.prepare_data(batch_input_data)

        # Perform Inference
        raw_predictions , elapsed_time = fashion_mnsit.predict(input_data)   # [batch, classes]
        
        # Return the indices that gives the prdictions sorted values
        predictions = tf.argsort(raw_predictions, axis =-1, direction='DESCENDING')  # [batch, sorted classes]
        
        # The first colum contains the prediction with highest confidence
        predictions = tf.gather(predictions, indices=[0], axis=1) # [batch, 1]
        
        # Reshape the predictions tensor like the labels tensor
        predictions = tf.transpose(predictions)   # [1, batch]
        predictions = tf.squeeze(predictions)     # [ batch ]
        
        # Check if the predictions are correct ->  [ batch ] array of 0/1 values 
        correct_predictions = (tf.cast(tf.cast(predictions, tf.int32) == tf.cast(batch_labels, tf.int32), tf.int32 )   )
        
         # Compute accuracy
        absolute_accuracy = tf.reduce_sum(correct_predictions)    
        accuracy = absolute_accuracy / batch_size * 100
        total_accuracy += absolute_accuracy 
        print("batch {:3} (batch size {:3}): accuracy={:3.2f}% - Global Accuracy={:3.2f}% (Inference time {:.5f} sec - {:3.2f} µsec/sample)"\
              .format(batch, batch_size, accuracy, (total_accuracy/(batch*len(batch_labels))*100), elapsed_time, elapsed_time/batch_size*1000000))
        
        # Early stop
        if (batch is not None) and batch == batch_limit:
            print("Early stop after {} batch(es)".format(batch))
            break
        
        # Next Batch
        batch +=1
        
batch_size = 128        

In [20]:
!edgetpu_compiler -s -o ./models ./models/fashion_mnsit_quant_INT8.tflite 

Edge TPU Compiler version 2.0.267685300

Model compiled successfully in 211 ms.

Input model: ./models/fashion_mnsit_quant_INT8.tflite
Input size: 234.42KiB
Output model: ./models/fashion_mnsit_quant_INT8_edgetpu.tflite
Output size: 308.84KiB
On-chip memory available for caching model parameters: 7.95MiB
On-chip memory used for caching model parameters: 234.75KiB
Off-chip memory used for streaming uncached model parameters: 0.00B
Number of Edge TPU subgraphs: 1
Total number of operations: 10
Operation log: ./models/fashion_mnsit_quant_INT8_edgetpu.log

Model successfully compiled but not all operations are supported by the Edge TPU. A percentage of the model will instead run on the CPU, which is slower. If possible, consider updating your model to use only operations supported by the Edge TPU. For details, visit g.co/coral/model-reqs.
Number of operations that will run on Edge TPU: 8
Number of operations that will run on CPU: 2

Operator                       Count      Status

MAX_POO

#### WORK IN PROGRESS

See: 
- https://www.tensorflow.org/lite/performance/post_training_quantization
- https://www.tensorflow.org/lite/performance/post_training_integer_quant
- https://coral.withgoogle.com/docs/edgetpu/tflite-python/

While the tensorflow 2.0 documentation has greatly inproved and provide great guides, it looks like some of them not fully up-to-date for Tensorflow 2.0. For example the `converter.inference_input_type` and `converter.inference_output_type` have been removed from the 2.0 API)

Open questions:
- When comiling the TFLite model for the TPU Edge, is it normal teh quantize/dequnatize operation will remain to be executed on the CPU ?
- When running the model on the TPU, why the accuracy is that bad ? (while being OK when run on the Jetson Nano GPU)
- Why `self.interpreter.get_input_details()[0]['quantization']` value is `(0.0, 0)`? Same question for `self.interpreter.get_output_details()[0]['quantization']` ?
- What about the maturity of tf.lite in TF 2.0 ? What about the maturity of TF 2.0 with the TPU Edge


Unsoved Problem:
- Unable to build Tennsorlow Lite runtime for Python 3.6 (navively or via cross-compilation)

TODO:
- Check Tensor RT 
- Example with OpenCV2/PiCam


In [21]:
fashion_mnsit_inference("./models/fashion_mnsit_quant_INT8.tflite", batch_size=batch_size, batch_limit=3)


Model: ./models/fashion_mnsit_quant_INT8.tflite
Batch size: 128

Model's Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 19,
  'name': 'input_1',
  'quantization': (0.0, 0),
  'shape': array([128,  28,  28,   1], dtype=int32)}]

Model's Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 20,
  'name': 'Identity',
  'quantization': (0.0, 0),
  'shape': array([ 1, 10], dtype=int32)}]

Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousIteratorV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op IteratorGetNextSync in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Shape in device /job:localhost/replica:0/task:0/device:GPU:0
Exec

In [22]:
## TEST

fashion_mnsit_inference("./models/fashion_mnsit_quant_INT8_edgetpu.tflite", target_tpu=True, batch_size=batch_size, batch_limit=3)


Model: ./models/fashion_mnsit_quant_INT8_edgetpu.tflite
Batch size: 128
Using tf.lite with TPU delegate

Model's Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 2,
  'name': 'input_1',
  'quantization': (0.0, 0),
  'shape': array([128,  28,  28,   1], dtype=int32)}]

Model's Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 3,
  'name': 'Identity',
  'quantization': (0.0, 0),
  'shape': array([ 1, 10], dtype=int32)}]

batch   1 (batch size 128): accuracy=7.81% - Global Accuracy=7.81% (Inference time 0.05433 sec - 424.43 µsec/sample)
batch   2 (batch size 128): accuracy=14.84% - Global Accuracy=11.33% (Inference time 0.05926 sec - 462.95 µsec/sample)
batch   3 (batch size 128): accuracy=7.81% - Global Accuracy=10.16% (Inference time 0.05142 sec - 401.70 µsec/sample)
Early stop after 3 batch(es)


In [23]:
fashion_mnsit_inference("./models/fashion_mnsit.tflite", batch_size=batch_size, batch_limit=3)

Model: ./models/fashion_mnsit.tflite
Batch size: 128

Model's Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 18,
  'name': 'input_1',
  'quantization': (0.0, 0),
  'shape': array([128,  28,  28,   1], dtype=int32)}]

Model's Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 0,
  'name': 'Identity',
  'quantization': (0.0, 0),
  'shape': array([ 1, 10], dtype=int32)}]

batch   1 (batch size 128): accuracy=90.62% - Global Accuracy=90.62% (Inference time 0.17294 sec - 1351.10 µsec/sample)
batch   2 (batch size 128): accuracy=92.19% - Global Accuracy=91.41% (Inference time 0.13290 sec - 1038.31 µsec/sample)
batch   3 (batch size 128): accuracy=95.31% - Global Accuracy=92.71% (Inference time 0.12306 sec - 961.41 µsec/sample)
Early stop after 3 batch(es)


In [24]:
fashion_mnsit_inference("./models/fashion_mnsit_quant.tflite", batch_size=batch_size, batch_limit=3)

Model: ./models/fashion_mnsit_quant.tflite
Batch size: 128

Model's Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 18,
  'name': 'input_1',
  'quantization': (0.0, 0),
  'shape': array([128,  28,  28,   1], dtype=int32)}]

Model's Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 0,
  'name': 'Identity',
  'quantization': (0.0, 0),
  'shape': array([ 1, 10], dtype=int32)}]

batch   1 (batch size 128): accuracy=89.84% - Global Accuracy=89.84% (Inference time 0.18021 sec - 1407.89 µsec/sample)
batch   2 (batch size 128): accuracy=92.19% - Global Accuracy=91.02% (Inference time 0.16977 sec - 1326.32 µsec/sample)
batch   3 (batch size 128): accuracy=95.31% - Global Accuracy=92.45% (Inference time 0.17185 sec - 1342.56 µsec/sample)
Early stop after 3 batch(es)


In [25]:
fashion_mnsit_inference("./models/fashion_mnsit_quant_size.tflite", batch_size=batch_size, batch_limit=3)

Model: ./models/fashion_mnsit_quant_size.tflite
Batch size: 128

Model's Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 18,
  'name': 'input_1',
  'quantization': (0.0, 0),
  'shape': array([128,  28,  28,   1], dtype=int32)}]

Model's Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 0,
  'name': 'Identity',
  'quantization': (0.0, 0),
  'shape': array([ 1, 10], dtype=int32)}]

batch   1 (batch size 128): accuracy=89.84% - Global Accuracy=89.84% (Inference time 0.18317 sec - 1430.99 µsec/sample)
batch   2 (batch size 128): accuracy=92.19% - Global Accuracy=91.02% (Inference time 0.18257 sec - 1426.35 µsec/sample)
batch   3 (batch size 128): accuracy=95.31% - Global Accuracy=92.45% (Inference time 0.17980 sec - 1404.71 µsec/sample)
Early stop after 3 batch(es)


In [26]:
fashion_mnsit_inference("./models/fashion_mnsit_quant_latency.tflite", batch_size=batch_size, batch_limit=3)

Model: ./models/fashion_mnsit_quant_latency.tflite
Batch size: 128

Model's Input details:
[{'dtype': <class 'numpy.float32'>,
  'index': 18,
  'name': 'input_1',
  'quantization': (0.0, 0),
  'shape': array([128,  28,  28,   1], dtype=int32)}]

Model's Output details:
[{'dtype': <class 'numpy.float32'>,
  'index': 0,
  'name': 'Identity',
  'quantization': (0.0, 0),
  'shape': array([ 1, 10], dtype=int32)}]

batch   1 (batch size 128): accuracy=89.84% - Global Accuracy=89.84% (Inference time 0.17634 sec - 1377.69 µsec/sample)
batch   2 (batch size 128): accuracy=92.19% - Global Accuracy=91.02% (Inference time 0.17045 sec - 1331.65 µsec/sample)
batch   3 (batch size 128): accuracy=95.31% - Global Accuracy=92.45% (Inference time 0.17025 sec - 1330.06 µsec/sample)
Early stop after 3 batch(es)
