### 1. Library Imports

In [None]:
import time
import nvidia_smi

# Initialize NVML before any device query.
nvidia_smi.nvmlInit()

try:
    # The handle for GPU 0 does not change between samples, so resolve it once
    # instead of on every loop iteration.
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)

    while True:
        # The raw reading is divided by 1000 to express it in watts.
        power_draw = nvidia_smi.nvmlDeviceGetPowerUsage(handle) / 1000.0

        # Print power draw
        print("Power Draw: {:.2f} W".format(power_draw))

        # Sample once per second
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the monitor.
    pass
finally:
    # Always release NVML -- also when a device query raises -- so the
    # library is not left initialized after the cell exits.
    nvidia_smi.nvmlShutdown()

In [None]:
# Reload Extensions
%load_ext autoreload
%autoreload 2

# Base Libraries
from skimage.transform import resize
import matplotlib.pyplot as plt
from skimage import io
import numpy as np
from utility_scripts import utils
import os

# Utility py files
from utility_scripts import tft_optimizer as tft

# DL Base Libraries
import tensorflow as tf

# Let TensorFlow grow its GPU memory usage as required on every visible GPU.
# Iterating an empty list is a no-op, so no separate emptiness check is needed.
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

import tensorflow.keras.applications as tf_app
from tensorflow.keras.applications.mobilenet_v3 import (
    preprocess_input,
    decode_predictions,
)

# Model Conversion and Inference Libraries
import torch
import onnx
import tf2onnx
import onnxruntime as rt
from onnx2torch import convert

from EMA import (
    EMA_finalize,
    EMA_init,
)

### 2. Downloading the Model

In [None]:
# Interactive run configuration: every prompt expects an integer answer.
nb = int(input("\n\nIf using a jnb enter 1 else 0\n"))
uni = int(input("\n\nIf using uni gpu enter 1 else 0\n"))
gpu_id = int(input("\n\n Which GPU to USE 1 else 0\n"))

# Result files are tagged "<uni|work>_<nb|py>": the first part reflects the
# `uni` answer, the second whether the run comes from a notebook (`nb`).
results_suffix = ("uni" if uni else "work") + ("_nb" if nb else "_py")
print(f"Suffix used with result files will be {results_suffix}!!")

# GPU index handed to the benchmark helper in later cells.
GPU_ID = gpu_id
print(f"GPU: {GPU_ID} is being used")

# Number of images placed in the input batch built by later cells.
BATCH_SIZE = 512

In [None]:
# Model name plus the root directories used for results and saved models.
model_name = "MobileNetV3L"
results_directory = "benchmark_results"
models_directory = "models_lib"


def _ensure_dir(path, label):
    """Create ``path`` (including parents) if needed and return it.

    A message prefixed with ``label`` is printed only when the path did not
    exist beforehand. ``os.makedirs(..., exist_ok=True)`` is idempotent, so it
    is called unconditionally rather than behind a check-then-create test.
    """
    if not os.path.exists(path):
        print(f"{label} {path} doesn't exist Creating!!")
    os.makedirs(path, exist_ok=True)
    return path


# Benchmark CSVs for this model go under benchmark_results/<model_name>.
results_save_path = _ensure_dir(
    os.path.join(results_directory, model_name), "Results Dir"
)

# One sub-directory of models_lib per model format: TensorFlow, TF-TRT,
# ONNX and PyTorch. `model_type` ends as "torch_models", exactly as before.
_save_paths = {}
for model_type in ("tf_models", "trt_models", "onnx_models", "torch_models"):
    _save_paths[model_type] = _ensure_dir(
        os.path.join(models_directory, model_type), "Save Path"
    )

tf_model_save_path = _save_paths["tf_models"]
trt_model_save_path = _save_paths["trt_models"]
onnx_model_save_path = _save_paths["onnx_models"]
torch_model_save_path = _save_paths["torch_models"]

In [None]:
# Instantiate MobileNetV3-Large with ImageNet weights, classification head included.
tf_model = tf_app.MobileNetV3Large(include_top=True, weights="imagenet")

#### 2.1 Making Prediction with Downloaded Model

In [None]:
### Doing one prediction is necessary to compile the model
# url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
url = "https://i.pinimg.com/originals/56/ea/2b/56ea2bb991a7446776ac2f2f27fdc397.jpg"

# Load the image, resize it to 224x224, scale it by 255 and add a leading
# batch axis before handing it to the model's preprocessing function.
img = resize(io.imread(url), (224, 224))
img = 255 * np.expand_dims(img, axis=0)
img = preprocess_input(img)
preds = tf_model.predict(img)

# Decode once and reuse the result for both the log line and the plot title.
top3 = decode_predictions(preds, top=3)[0]
print(f"Predicted {top3}")

# NOTE(review): dividing by 255 for display assumes preprocess_input left the
# pixel values in the 0-255 range -- confirm for this model family.
plt.imshow(img[0] / 255)
plt.title(top3[0][1])
# A bare plt.axis() only returns the current limits and changes nothing;
# hide the pixel-coordinate axes, which carry no information for an image.
plt.axis("off")
plt.show()

### 3. Saving the Model

In [None]:
# tf_model.save(os.path.join(tf_model_save_path, model_name))
# tf_model.save(f"{os.path.join(tf_model_save_path, model_name)}.keras")

### 4. Loading the Model

In [None]:
# Both artifacts are read from disk under the same base path: the bare path
# and the same path with a ".keras" suffix. They must already exist, because
# the save calls in section 3 are commented out.
_saved_model_base = os.path.join(tf_model_save_path, model_name)

tf_model = tf.keras.models.load_model(_saved_model_base)
keras_model = tf.keras.models.load_model(f"{_saved_model_base}.keras")

### 5. Converting & Benchmarking for TF-TRT models

#### 5.1 Inferencing & Benchmarking TF-TRT FP32 Model

In [None]:
# Convert the saved TensorFlow model to TF-TRT at FP32 and write it to disk.
# NOTE: the CUDA and TensorRT modules must be loaded before running this cell.
PRECISION = "FP32"
print("Converting to TF-TRT FP32...")

# Destination: <trt_model_save_path>/<model_name>_TFTRT_<PRECISION>
file_name = "{}_TFTRT_{}".format(model_name, PRECISION)
trt_model_path = os.path.join(trt_model_save_path, file_name)

# Source: the TensorFlow model saved under <tf_model_save_path>/<model_name>
_tf_saved_model_path = os.path.join(tf_model_save_path, model_name)
opt_model = tft.ModelOptimizer(_tf_saved_model_path)
trt_fp32 = opt_model.convert(trt_model_path, precision=PRECISION)
print(f"Done Converting to TF-TRT {PRECISION}")

In [None]:
# Build the benchmark input: one downloaded image turned into a batch of
# BATCH_SIZE preprocessed 224x224 inputs, cast to float32.
# url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
url = "https://i.pinimg.com/originals/56/ea/2b/56ea2bb991a7446776ac2f2f27fdc397.jpg"
img = io.imread(url)
input_data, _ = utils.batch_sigle_img(
    img,
    num_images=BATCH_SIZE,
    target_size=(224, 224),
    preprocessor=preprocess_input,
)
input_data = input_data.astype(np.float32)

# Re-open the converted FP32 model from the path it was written to.
trt_fp32 = tft.OptimizedModel(trt_model_path)


# Optional sanity check of the loaded model:
# preds = trt_fp32.predict(input_data).numpy()
# print(f"Predicted {decode_predictions(preds, top = 3)[0]}")

In [None]:
# Benchmark the TF-TRT FP32 model over a range of batch sizes.
# EMA_init()/EMA_finalize() bracket the run; finalize sits in `finally` so it
# still executes when the benchmark raises, and the next cell's EMA_init()
# does not follow an unfinished session.
EMA_init()
try:
    num_warmup_runs = 50
    num_model_runs = 10

    # Results CSV: TFTRT<PRECISION>_<runs>_it_<suffix>.csv under results_save_path
    fname = f"TFTRT{PRECISION}_{num_model_runs}_it_{results_suffix}.csv"
    csv_save_path = os.path.join(results_save_path, fname)

    utils.batch_model_performances(
        framework_name=f"TFTRT{PRECISION}",
        model=trt_fp32,
        input_data=input_data,
        batch_sizes=[8, 16, 32, 64, 128, 256, 512],
        csv_path=csv_save_path,
        num_warmup_runs=num_warmup_runs,
        num_model_runs=num_model_runs,
        trt=True,
        onnx=False,
        torch=False,
        gpu_id=GPU_ID,
    )
finally:
    EMA_finalize()
# 30m and 1.4m

#### 5.2 Inferencing & Benchmarking TF-TRT FP16 Model

In [None]:
# Convert the saved TensorFlow model to TF-TRT at FP16 and write it to disk.
PRECISION = "FP16"
print("Converting to TF-TRT FP16...")

# Destination: <trt_model_save_path>/<model_name>_TFTRT_<PRECISION>
file_name = "{}_TFTRT_{}".format(model_name, PRECISION)
trt_model_path = os.path.join(trt_model_save_path, file_name)

# Source: the TensorFlow model saved under <tf_model_save_path>/<model_name>
_tf_saved_model_path = os.path.join(tf_model_save_path, model_name)
opt_model = tft.ModelOptimizer(_tf_saved_model_path)
trt_fp16 = opt_model.convert(trt_model_path, precision=PRECISION)
print(f"Done Converting to TF-TRT {PRECISION}")

In [None]:
# Build the benchmark input: one downloaded image turned into a batch of
# BATCH_SIZE preprocessed 224x224 inputs, cast to float32.
# url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
url = "https://i.pinimg.com/originals/56/ea/2b/56ea2bb991a7446776ac2f2f27fdc397.jpg"
img = io.imread(url)
input_data, _ = utils.batch_sigle_img(
    img,
    num_images=BATCH_SIZE,
    target_size=(224, 224),
    preprocessor=preprocess_input,
)
input_data = input_data.astype(np.float32)

# Re-open the converted FP16 model from the path it was written to.
trt_fp16 = tft.OptimizedModel(trt_model_path)


# Optional sanity check of the loaded model:
# preds = trt_fp16.predict(input_data).numpy()
# print(f"Predicted {decode_predictions(preds, top = 3)[0]}")

In [None]:
# Benchmark the TF-TRT FP16 model over a range of batch sizes.
# EMA_init()/EMA_finalize() bracket the run; finalize sits in `finally` so it
# still executes when the benchmark raises, and the next cell's EMA_init()
# does not follow an unfinished session.
EMA_init()
try:
    num_warmup_runs = 50
    num_model_runs = 10

    # Results CSV: TFTRT<PRECISION>_<runs>_it_<suffix>.csv under results_save_path
    fname = f"TFTRT{PRECISION}_{num_model_runs}_it_{results_suffix}.csv"
    csv_save_path = os.path.join(results_save_path, fname)

    utils.batch_model_performances(
        framework_name=f"TFTRT{PRECISION}",
        model=trt_fp16,
        input_data=input_data,
        batch_sizes=[8, 16, 32, 64, 128, 256, 512],
        csv_path=csv_save_path,
        num_warmup_runs=num_warmup_runs,
        num_model_runs=num_model_runs,
        trt=True,
        onnx=False,
        torch=False,
        gpu_id=GPU_ID,
    )
finally:
    EMA_finalize()
# 7m and 1.4m

#### 5.3 Inferencing & Benchmarking TF-TRT Int8 Model

Whether you want to further reduce to INT8 precision depends on the hardware: on Turing cards and later, INT8 is often better. Inference-focused cards such as the NVIDIA T4, or systems-on-module such as the Jetson AGX Xavier, do well with INT8. In contrast, on a training-focused GPU like the V100, INT8 often isn't any faster than FP16.

To perform INT8 inference, we need to see what the normal range of activations is in the network so we can quantize our INT8 representations based on a normal set of values for our dataset. It is important that this dataset is representative of the testing samples in order to maintain accuracy levels.

Here, we just want to see how our network performs in TensorRT from a runtime standpoint - so we will just feed dummy data and dummy calibration data into TensorRT.

In [None]:
# url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
# img = io.imread(url)
# int8_data = utils.batch_sigle_img(
#     img, target_size=(224, 224), num_images=8, preprocessor=preprocess_input
# )

# Converting and saving the model
# PRECISION = "INT8"
# print("Converting to TF-TRT INT8...")
# save_dir = f"models_lib/trt_models/{original_model_name}_TFTRT_{PRECISION}"
# opt_model = tft.ModelOptimizer(original_save_path)
# opt_model.set_calibration_data(int8_data)
# trt_int8  = opt_model.convert(save_dir, precision = PRECISION)
# print(f"Done Converting to TF-TRT {PRECISION}")

TensorFlow TensorRT integration typically expects input data in the form of TensorFlow tensors. When working with TensorFlow models that are optimized or converted to run with TensorRT (e.g., using the trt.TrtGraphConverterV2), the inference is performed using TensorFlow tensor inputs.

You can convert NumPy arrays to TensorFlow tensors using tf.constant or tf.convert_to_tensor before feeding them to a TensorFlow-TRT model for inference.

### 6. Converting & Benchmarking for Onnx models

#### 6.1 Saving the tf model to onnx format

In [None]:
# Input signature for the export: a float32 tensor named "input" whose first
# dimension is left unspecified (replace the shape with your model's actual
# input shape if it differs).
input_shape = (None, 224, 224, 3)
_input_spec = tf.TensorSpec(shape=input_shape, dtype=tf.float32, name="input")

# Convert the Keras model to an ONNX model proto; the second return value
# is not needed here.
onnx_model, _ = tf2onnx.convert.from_keras(
    tf_model,
    input_signature=[_input_spec],
)

# Serialize the proto to <onnx_model_save_path>/<model_name>.onnx.
# `onnx_model_path` deliberately carries no extension; it is appended on use.
onnx_model_path = os.path.join(onnx_model_save_path, model_name)
with open(f"{onnx_model_path}.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

# Alternative: let tf2onnx write the file itself.
# spec = (tf.TensorSpec((None, 224, 224, 3), tf.float32, name="input"),)
# output_path = "models_lib/onnx_models/MobileNetV3L.onnx"
# model_proto, _ = tf2onnx.convert.from_keras(
#     tf_model, input_signature=spec, opset=15, output_path=output_path
# )
# output_names = [n.name for n in model_proto.graph.output]

In [None]:
# Extension-less path of the ONNX model; ".onnx" is appended where the file
# is opened. Same value as computed in the export cell -- presumably repeated
# so the benchmark can run without re-exporting (TODO confirm).
onnx_model_path = os.path.join(onnx_model_save_path, model_name)

#### 6.2 Inferencing & Benchmarking Onnx model

In [None]:
# From here on PRECISION is a NumPy dtype (it was a string such as "FP16" in
# the TF-TRT cells); it controls the dtype of the benchmark input.
PRECISION = np.float32

# Build the benchmark input: one downloaded image turned into a batch of
# BATCH_SIZE preprocessed 224x224 inputs.
# url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
url = "https://i.pinimg.com/originals/56/ea/2b/56ea2bb991a7446776ac2f2f27fdc397.jpg"
img = io.imread(url)
input_data, _ = utils.batch_sigle_img(
    img,
    num_images=BATCH_SIZE,
    target_size=(224, 224),
    preprocessor=preprocess_input,
)
input_data = input_data.astype(PRECISION)

In [None]:
# Testing the performance
# providers = ["CUDAExecutionProvider"]
# session = rt.InferenceSession(
#     "models_lib/onnx_models/MobileNetV3L.onnx",
#     providers=providers,
# )
# results = session.run(["Predictions"],{'input':input_data})
# results = np.squeeze(results, axis=0)
# print(f"Predicted {decode_predictions((results),top=3)[0]}")
# # Y = io_binding.copy_outputs_to_cpu()[0]

In [None]:
# options = rt.SessionOptions()
# options.enable_profiling = True
# providers = ["CUDAExecutionProvider"]
# session = rt.InferenceSession(
#     "models_lib/onnx_models/MobileNetV3L.onnx",
#     providers=providers,
#     sess_options=options,
# )
# io_binding = session.io_binding()
# io_binding.bind_cpu_input("input", input_data)
# io_binding.bind_output("Predictions")
# session.run_with_iobinding(io_binding)
# Y = io_binding.copy_outputs_to_cpu()[0]
# print(f"Predicted {decode_predictions(Y,top=3)[1]}")

In [None]:
# Benchmark the ONNX model with ONNX Runtime over a range of batch sizes.
# EMA_init()/EMA_finalize() bracket session creation and the run; finalize
# sits in `finally` so it still executes when loading or benchmarking raises.
EMA_init()
try:
    # Only the CUDA execution provider is requested for the session.
    providers = ["CUDAExecutionProvider"]
    session = rt.InferenceSession(f"{onnx_model_path}.onnx", providers=providers)

    num_warmup_runs = 50
    num_model_runs = 10
    batch_sizes = [8, 16, 32, 64, 128, 256, 512]

    # Results CSV: onnxrt_<runs>_it_<suffix>.csv under results_save_path
    fname = f"onnxrt_{num_model_runs}_it_{results_suffix}.csv"
    csv_save_path = os.path.join(results_save_path, fname)

    results = utils.batch_model_performances(
        framework_name="onnxrt",
        model=session,
        batch_sizes=batch_sizes,
        num_warmup_runs=num_warmup_runs,
        num_model_runs=num_model_runs,
        input_data=input_data,
        csv_path=csv_save_path,
        onnx=True,
        trt=False,
        torch=False,
        gpu_id=GPU_ID,
    )
finally:
    EMA_finalize()

### 7. Converting and Benchmarking for .trt models

In [None]:
# # Load the onnx model
# BATCH_SIZE = 32
# PRECISION = np.float32
# onnx_model_path = os.path.join(onnx_model_save_path, model_name)
# onnx_model = onnx.load_model(f"{onnx_model_path}.onnx")

In [None]:
# # Add the inference BATCH_SIZE for infernce and perform inference NOTE with only this batch size nothing else.
# inputs = onnx_model.graph.input
# for input in inputs:
#     dim1 = input.type.tensor_type.shape.dim[0]
#     dim1.dim_value = BATCH_SIZE

# # for input in onnx_model.graph.input:
# #     for dim in input.type.tensor_type.shape.dim:
# #         dim.dim_param = -1

In [None]:
# onnx.save_model(
#     onnx_model,
#     os.path.join(onnx_model_save_path,model_name)+f"_batch_size{BATCH_SIZE}.onnx"
# )

##### Command to convert ONNX to handle dynamic input shape Via TRTEXEC (Doesn't work with min, opt, max, shapes options)

```bash
trtexec --onnx=models_lib/onnx_models/MobileNetV3L.onnx \
        --saveEngine=models_lib/trt_models/Mobilenet.trt \
        --explicitBatch \
        --minShapes=input_1:8x224x224x3 \
        --optShapes=input_1:32x224x224x3 \
        --maxShapes=input_1:512x224x224x3 \
        --shapes=input_1:16x224x224x3 \
        --workspace=1024*8<In MBs>
```
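--explicitBatch: Builds the network in explicit-batch mode, i.e. the batch dimension is part of each tensor's shape. This mode is required for dynamic shapes and is the only mode available for ONNX models.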

--minShapes, --optShapes, --maxShapes: Define the range of batch sizes for which TensorRT should optimize the engine.

--shapes: Specifies a preferred input shape for optimization, but it doesn't constrain the engine to only that shape. It's useful for indicating a common or preferred input size.

--workspace: Sets the GPU workspace size for TensorRT optimization

<img src="http://developer.download.nvidia.com/notebooks/dlsw-notebooks/tensorrt_tftrt_dynamic_shapes/nvidia_logo.png" style="width: 90px; float: right;">

**Using Dynamic Shapes with TensorFlow TensorRT**

The NVIDIA TensorRT is a library that facilitates high performance inference on NVIDIA graphics processing units (GPUs). TensorRT takes a trained network, which consists of a network definition and a set of trained parameters, and produces a highly optimized runtime engine which performs inference for that network. 

TensorFlow™ integration with TensorRT™ (TF-TRT) optimizes and executes compatible subgraphs, allowing TensorFlow to execute the remaining graph. While you can still use TensorFlow's wide and flexible feature set, TensorRT will parse the model and apply optimizations to the portions of the graph wherever possible.

This notebook demonstrates the use of dynamic shape tensors when using TensorFlow-TensorRT.


**Introduction**

If you are unfamiliar with how TensorFlow TensorRT works, you can refer to this [video](https://www.youtube.com/watch?v=w7871kMiAs8) for a quick overview. Some understanding of how TF-TRT works is required to digest the information in the following section. A quick and dirty explanation of the above is as follows: TF-TRT partitions the network graph into supported and unsupported sub-graphs. For each of these supported subgraphs, TRTEngineOp builds a TensorRT Engine. With this information in mind, let's proceed to the task at hand.

TensorFlow TensorRT has two concepts relevant to this discussion:
* Dynamic Ops
* Dynamic Shape

**Explaining Dynamic Ops**

Dynamic Ops can be treated as a mode which lets users leverage the optimized model in "implicit shape" mode, i.e., when the model's input tensor shape is defined as (for example) `[?, ?, ?, 3]`. How does this work? The TRTEngineOp creates the TensorRT engine at inference time with the shape of the input tensor (let's say, `[8, 224, 224, 3]`). So upon execution, if we supply a tensor with a different shape (say `[16, 224, 224, 3]`), another engine will be created. While this provides flexibility, the downside is that each TRT Engine consumes memory (a set of model weights for each "profile").

###### Explaining Dynamic Shapes

Dynamic Shape mode requires the user to define `minimum`, `optimal` and `maximum` shapes for the input tensor. This shifts the task at hand from being one about supporting implicit tensor shapes to supporting a set of explicit batch shapes. The engine built in this case can handle any shape between the `minimum` and `maximum` shape, without a need for building separate engines.

For a visual representation of the above, refer to the image below. The image on the right shows the scenario where the use of three different shapes has resulted in three different engines, as opposed to the single engine used for dynamic shapes.

#### 7.3 Inferencing & Benchmarking .trt Model

In [None]:
# # url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
# url = "https://i.pinimg.com/originals/56/ea/2b/56ea2bb991a7446776ac2f2f27fdc397.jpg"
# img = io.imread(url)
# input_data = utils.batch_sigle_img(
#     img,
#     target_size=(224, 224),
#     num_images=BATCH_SIZE,
#     preprocessor=preprocess_input,
# ).astype(PRECISION)

##### Produce the trt file (Before running next cell)
```bash
polygraphy convert models_lib/onnx_models/MobileNetV3L_batch_size32.onnx --convert-to trt -o models_lib/trt_models/MobileNetV3L.trt
```

In [None]:
# # Using the trtexec trt file
# save_dir = "models_lib/trt_models/MobileNetV3L.trt"
# trt_model = ONNXClassifierWrapper(
#     save_dir,
#     [BATCH_SIZE, 1000],
#     target_dtype=PRECISION,
# )

In [None]:
# preds = trt_model.predict(input_data)
# print(f"Preds: {decode_predictions(preds, top = 3)[31]}")

### 8. Inferencing and Benchmarking models via pytorch

In [None]:
# Reset the batch size and input dtype for the PyTorch benchmark.
BATCH_SIZE = 512
PRECISION = np.float32

# Build the benchmark input: one downloaded image turned into a batch of
# BATCH_SIZE preprocessed 224x224 inputs.
# url = "https://images.dog.ceo/breeds/retriever-golden/n02099601_3004.jpg"
url = "https://i.pinimg.com/originals/56/ea/2b/56ea2bb991a7446776ac2f2f27fdc397.jpg"
img = io.imread(url)
input_data, _ = utils.batch_sigle_img(
    img,
    num_images=BATCH_SIZE,
    target_size=(224, 224),
    preprocessor=preprocess_input,
)
input_data = input_data.astype(PRECISION)

In [None]:
# Extension-less location of the exported ONNX model.
onnx_model_path = os.path.join(onnx_model_save_path, model_name)
# Build the PyTorch model directly from the .onnx file on disk.
_onnx_file = f"{onnx_model_path}.onnx"
torch_model = convert(_onnx_file)

In [None]:
# Move the model to the same device as the input data (GPU in this case)
torch_model = torch_model.cuda()
# Switch the model to evaluation mode before running it.
torch_model.eval()
# Create example data on the GPU
# NOTE(review): transpose(0, 2, 1, 3) swaps axes 1 and 2 of `input_data` and
# leaves the last axis in place. If the intent was to move a trailing channel
# axis forward this is not the permutation that does it -- confirm the layout
# the converted model expects. `x` is only referenced by the commented-out
# line below within the cells visible here.
x = torch.tensor(input_data.transpose(0, 2, 1, 3), dtype=torch.float32).cuda()
# preds = torch_model(x).detach().cpu().numpy()

In [None]:
# Benchmark the converted PyTorch model over a range of batch sizes.
# EMA_init()/EMA_finalize() bracket the run; finalize sits in `finally` so it
# still executes when the benchmark raises.
EMA_init()
try:
    num_warmup_runs = 50
    num_model_runs = 10

    # Results CSV: torch_<runs>_it_<suffix>.csv under results_save_path
    fname = f"torch_{num_model_runs}_it_{results_suffix}.csv"
    csv_save_path = os.path.join(results_save_path, fname)

    # NOTE(review): unlike the TF-TRT and ONNX runs, this list stops at 256
    # and the call does not pass `onnx=` -- confirm both are intentional.
    batch_sizes = [8, 16, 32, 64, 128, 256]
    utils.batch_model_performances(
        framework_name="torch",
        model=torch_model,
        input_data=input_data,
        batch_sizes=batch_sizes,
        csv_path=csv_save_path,
        num_warmup_runs=num_warmup_runs,
        num_model_runs=num_model_runs,
        trt=False,
        torch=True,
        gpu_id=GPU_ID,
    )
finally:
    EMA_finalize()