# Demo Notebook

In [1]:
# PyTorch, OpenCV, etc.
import torch
import cv2
import numpy as np
import time
import statistics

# ONNX libraries
import onnx
import onnxruntime

# EML libraries
from tinyyolov2_pruned import TinyYoloV2, PrunedTinyYoloV2, FusedTinyYoloV2, PrunedFusedTinyYoloV2
from utils.yolo import nms, filter_boxes
from utils.viz import display_result
from utils.camera import CameraDisplay

# Globals and Hyperparameters

In [2]:
# Detection thresholds used by the inference functions
CONFIDENCE_THRESHOLD = 0.5
NMS_THRESHOLD = 0.5

# Compute device: CUDA when PyTorch reports it as available, CPU otherwise
if not torch.cuda.is_available():
    torch_device = torch.device("cpu")
    print("Using CPU")
else:
    torch_device = torch.device("cuda")
    print("Using GPU")

# Timestamp of the previous frame; the camera callbacks read and update it
# to derive the FPS value they draw onto the image
now = time.time()

Using GPU


# Utility Functions

In [3]:
def to_numpy(tensor):
    """Return the data of ``tensor`` as a NumPy array living on the CPU.

    A tensor that requires gradients is detached first; any other tensor is
    converted directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

## Preprocess function
- Convert to RGB
- Normalize pixel values to the range [0, 1]
- Convert to torch.Tensor + add batch-dimension

In [4]:
def preprocess(image, target_size=(320, 320)):
    """Turn a BGR frame into a batched float32 tensor for the PyTorch model.

    Args:
        image: BGR image array in height x width x channel layout.
        target_size: accepted for interface compatibility; it is not used by
            this function, so no resizing takes place here.

    Returns:
        torch.Tensor of shape (1, channels, height, width) with the pixel
        values divided by 255.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    scaled = rgb.astype(np.float32) / 255.0
    # HWC -> CHW, then prepend the batch dimension
    chw = torch.from_numpy(scaled).permute(2, 0, 1)
    return chw.unsqueeze(0)

In [108]:
def preprocess_onnx(image, target_size=(320, 320)):
    """Turn a BGR frame into a batched float16 NumPy array for ONNX Runtime.

    Args:
        image: BGR image array in height x width x channel layout.
        target_size: accepted for interface compatibility; it is not used by
            this function, so no resizing takes place here.

    Returns:
        np.ndarray of shape (1, channels, height, width) and dtype float16
        with the pixel values divided by 255.
    """
    # NOTE(review): the array is produced as float16 -- confirm that the ONNX
    # model loaded into the session really declares a float16 input.
    rgb_half = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float16)
    scaled = rgb_half / 255.0
    # HWC -> CHW, then prepend the batch dimension
    return scaled.transpose(2, 0, 1)[np.newaxis, ...]

## Inference function
- perform inference of given ML-model
- filter boxes w.r.t. confidence threshold
- perform non-maximum-suppression

In [6]:
def inference(image, net, torch_device):
    """Run ``net`` on one preprocessed frame and post-filter its detections.

    Args:
        image: input tensor; it is moved to ``torch_device`` as float32.
        net: the PyTorch model to evaluate.
        torch_device: device on which the forward pass is executed.

    Returns:
        The result of ``nms`` applied to the boxes that passed
        ``filter_boxes`` with ``CONFIDENCE_THRESHOLD``.
    """
    batch = image.to(torch_device, dtype=torch.float32)

    # No gradients are needed for inference; filtering and non-maximum
    # suppression stay inside the same no-grad scope.
    with torch.no_grad():
        raw = net(batch)
        confident = filter_boxes(raw, CONFIDENCE_THRESHOLD)
        detections = nms(confident, NMS_THRESHOLD)

    return detections

In [92]:
# ONNX Runtime element-type strings (as reported by ``NodeArg.type``) mapped
# to the NumPy dtype that has to be fed for them.
_ORT_INPUT_DTYPES = {
    "tensor(float16)": np.float16,
    "tensor(float)": np.float32,
    "tensor(double)": np.float64,
}


def inference_onnx(image, onnx_session, torch_device):
    """Run the ONNX session on one preprocessed frame and post-filter the result.

    The input array is cast to the element type declared by the session's
    first input whenever the two differ, so a float16 array can be fed to a
    float32 model (and vice versa) instead of being rejected by ``run``.

    Args:
        image: NumPy array holding the batched input frame.
        onnx_session: ``onnxruntime.InferenceSession`` to execute; only its
            first input and first output are used.
        torch_device: unused; kept so the signature mirrors ``inference``.

    Returns:
        The result of ``nms`` applied to the boxes that passed
        ``filter_boxes`` with ``CONFIDENCE_THRESHOLD``.
    """
    input_meta = onnx_session.get_inputs()[0]

    # Only cast for element types we know; anything else is passed through
    # untouched and left for ONNX Runtime to validate.
    expected_dtype = _ORT_INPUT_DTYPES.get(input_meta.type)
    if expected_dtype is not None and image.dtype != expected_dtype:
        image = image.astype(expected_dtype)

    raw = onnx_session.run(None, {input_meta.name: image})[0]

    boxes = torch.from_numpy(raw)
    boxes = filter_boxes(boxes, CONFIDENCE_THRESHOLD)
    return nms(boxes, NMS_THRESHOLD)

## Postprocess function
- draw bboxes on image

In [8]:
def postprocess(image, output):
    """Draw the valid detections of the first batch entry onto ``image``.

    Each box row is read as (center x, center y, width, height, confidence,
    ..., class); rows whose last value is negative are skipped. Coordinates
    are scaled by 320 before drawing. Boxes are drawn in place as red
    rectangles labelled "person <confidence>".

    Args:
        image: BGR image array that is drawn on in place.
        output: sequence of per-image box tensors; an empty/falsy value
            leaves the image untouched.

    Returns:
        The same ``image`` object that was passed in.
    """
    img_shape = 320

    if not output:
        return image

    stacked = torch.stack(output, dim=0)
    keep = stacked[0, :, -1] >= 0  # negative last column marks an invalid box
    if not keep.any():
        return image

    boxes = stacked[:, keep][0]
    centre_x, centre_y, width, height, score = (boxes[:, k] for k in range(5))

    # Vectorised corner computation (same operation order as before)
    left = (centre_x - width / 2) * img_shape
    top = (centre_y - height / 2) * img_shape
    right = left + width * img_shape
    bottom = top + height * img_shape

    # Convert once to Python numbers instead of indexing tensors per box
    corners = zip(left.tolist(), top.tolist(), right.tolist(), bottom.tolist(), score.tolist())
    for x0, y0, x1, y1, conf in corners:
        cv2.rectangle(image, (int(x0), int(y0)), (int(x1), int(y1)), (0, 0, 255), 2)
        cv2.putText(image, f"person {conf:.2f}", (int(x0), int(y0) - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    return image

In [100]:
def postprocess_onnx(image, output):
    """Draw the detections of the ONNX pipeline onto ``image``.

    The drawing logic is the same as for the PyTorch pipeline, so this
    function simply forwards to ``postprocess`` instead of carrying a second
    copy of it.

    Args:
        image: BGR image array that is drawn on in place.
        output: sequence of per-image box tensors; an empty/falsy value
            leaves the image untouched.

    Returns:
        The same ``image`` object that was passed in.
    """
    return postprocess(image, output)

## Camera Callback

In [10]:
def callback(image):
    """Camera callback for the PyTorch pipeline.

    Measures the time since the previous call (via the global ``now``),
    records positive FPS values in the global ``fps_list`` and stops and
    releases the global ``cam`` once 500 samples have been collected. The
    frame is cropped to its top-left 320x320 region, run through
    ``preprocess`` / ``inference`` / ``postprocess`` with the global
    ``model``, and annotated with the FPS value.

    Args:
        image: camera frame.

    Returns:
        The cropped frame with boxes and FPS text drawn on it.
    """
    global now

    frame_time = time.time() - now
    fps = int(1 / frame_time) if frame_time > 0 else 0
    if fps > 0:
        fps_list.append(fps)
    if len(fps_list) >= 500:
        cam.stop()
        cam.release()
    now = time.time()

    frame = image[:320, :320]
    detections = inference(preprocess(frame), model, torch_device)
    annotated = postprocess(frame, detections)

    cv2.putText(annotated, f"fps={fps}", (2, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 255, 0), 2, cv2.LINE_AA)

    return annotated

In [91]:
def callback_onnx(image):
    """Camera callback for the ONNX Runtime pipeline.

    Measures the time since the previous call (via the global ``now``),
    crops the frame to its top-left 320x320 region, runs it through
    ``preprocess_onnx`` / ``inference_onnx`` / ``postprocess_onnx`` with the
    global ``onnx_session`` and annotates it with the FPS value. Unlike
    ``callback`` it does not record FPS samples or stop the camera.

    Args:
        image: camera frame.

    Returns:
        The cropped frame with boxes and FPS text drawn on it.
    """
    global now

    frame_time = time.time() - now
    fps = int(1 / frame_time) if frame_time > 0 else 0
    now = time.time()

    frame = image[:320, :320]
    detections = inference_onnx(preprocess_onnx(frame), onnx_session, torch_device)
    annotated = postprocess_onnx(frame, detections)

    cv2.putText(annotated, f"fps={fps}", (2, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 255, 0), 2, cv2.LINE_AA)

    return annotated

# Camera Loop

## Model PyTorch

In [75]:
# Checkpoint to load. The commented-out paths below are alternative
# checkpoints; enable exactly one together with the matching model class.
sd_str = f"./weights/voc/pruning_3-7_fused/voc_fine_tuned_fused{0.9}.pt"
# sd_str = f"./weights/voc/pruning_1-7/voc_fine_tuned_pruned{0.65}.pt"
# sd_str = f"./weights/coco/voc_coco_fine_tuned_augmented.pt"
# sd_str = f"./voc_fine_tuned.pt"
# Load the state dict, mapping its tensors onto the selected device
sd = torch.load(sd_str, weights_only=True, map_location=torch_device)

# Model class; must match the checkpoint chosen above
model = PrunedFusedTinyYoloV2(num_classes=1)
# model = PrunedTinyYoloV2(num_classes=1)
# model = TinyYoloV2(num_classes=1)
# NOTE(review): strict=False presumably lets key mismatches between the
# checkpoint and the model pass without an error -- confirm this is intended.
model.load_state_dict(sd, strict=False)
# model.to(torch_device, dtype=torch.float16)
# Move the model to the device only after the weights have been loaded
model.to(torch_device)
# Switch to evaluation mode; as the last expression this also displays the model
model.eval()

GST_ARGUS: Cleaning up
CONSUMER: Done Success
GST_ARGUS: Done Success
Camera released


PrunedFusedTinyYoloV2(
  (pad): ReflectionPad2d((0, 1, 0, 1))
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(20, 46, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv5): Conv2d(46, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv6): Conv2d(96, 199, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv7): Conv2d(199, 401, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv8): Conv2d(401, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv9): Conv2d(1024, 30, kernel_size=(1, 1), stride=(1, 1))
)

In [74]:
# Initialize the camera display and register `callback` (PyTorch pipeline)
# as the function that processes the camera frames
cam = CameraDisplay(callback)

Initializing camera...
GST_ARGUS: Cleaning up
CONSUMER: Done Success
GST_ARGUS: Done Success
GST_ARGUS: Creating output stream
CONSUMER: Waiting until producer is connected...
GST_ARGUS: Available Sensor modes :
GST_ARGUS: 3264 x 2464 FR = 21.000000 fps Duration = 47619048 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 3264 x 1848 FR = 28.000001 fps Duration = 35714284 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1920 x 1080 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1640 x 1232 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1280 x 720 FR = 59.999999 fps Duration = 16666667 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1280 x 720 FR



Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x02\x01\x0…

In [77]:
# The camera stream can be started with cam.start()
# The callback gets asynchronously called (can be stopped with cam.stop())
# Start with an empty FPS sample list; `callback` appends to it and stops
# the camera once 500 samples have been collected
fps_list = []
cam.start()


Exception in thread Thread-22:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jetson/embedded-ml-lab-students-ws2425/exercises/4-challenge/utils/camera.py", line 67, in _capture_frames
    self.value = self._read()
  File "/usr/local/lib/python3.8/dist-packages/traitlets/traitlets.py", line 716, in __set__
    self.set(obj, value)
  File "/usr/local/lib/python3.8/dist-packages/traitlets/traitlets.py", line 706, in set
    obj._notify_trait(self.name, old_value, new_value)
  File "/usr/local/lib/python3.8/dist-packages/traitlets/traitlets.py", line 1513, in _notify_trait
    self.notify_change(
  File "/usr/local/lib/python3.8/dist-packages/traitlets/traitlets.py", line 

In [73]:
# The camera should always be stopped and released before a new camera is
# instantiated (i.e. before calling CameraDisplay(callback) again).
# Optional FPS statistics of the run that just finished:
# print(len(fps_list))
# print(fps_list)
# print(statistics.mean(fps_list))
# print(statistics.variance(fps_list))

cam.stop()
cam.release()

Camera released


## Model ONNX

In [106]:
# Leftover PyTorch loading code, kept for reference only; the ONNX pipeline
# below does not need a PyTorch model.
# sd_str = f"./weights/voc/pruning_3-7_fused/voc_fine_tuned_fused{0.9}.pt"
# sd_str = f"./weights/voc/pruning_1-7/voc_fine_tuned_pruned{0.65}.pt"
# sd_str = f"./weights/coco/voc_coco_fine_tuned_augmented.pt"
# sd_str = f"./weights/voc/voc_fine_tuned.pt"
# sd = torch.load(sd_str, weights_only=True)

# model = PrunedFusedTinyYoloV2(num_classes=1)
# model = PrunedTinyYoloV2(num_classes=1)
# model = TinyYoloV2(num_classes=1)
# model.load_state_dict(sd)

# Path of the exported ONNX model and the execution provider list that is
# passed to the inference session
onnx_filepath = f"./weights/voc/onnx_3-7/tiny_yolo_pruned_0.9.onnx"
provider     = ["CUDAExecutionProvider"]

In [107]:
# Define inference session options: enable all graph optimizations
session_options = onnxruntime.SessionOptions()
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

# Define the inference session used by `callback_onnx` (labelled 32-bit float).
# NOTE(review): `preprocess_onnx` produces float16 arrays -- confirm that the
# model at `onnx_filepath` actually expects that input type.
onnx_session = onnxruntime.InferenceSession(onnx_filepath, providers=provider, sess_options=session_options)
# Define Inference Session for 16 Bit Float (disabled; `onnx_filepath_fp16`
# is not defined in this notebook)
# ort_session_fp16 = onnxruntime.InferenceSession(onnx_filepath_fp16, providers=provider, sess_options=session_options)

GST_ARGUS: Cleaning up
CONSUMER: Done Success
GST_ARGUS: Done Success


In [103]:
# Initialize the camera display and register `callback_onnx` (ONNX Runtime
# pipeline) as the function that processes the camera frames
cam = CameraDisplay(callback_onnx)

Initializing camera...
GST_ARGUS: Creating output stream
CONSUMER: Waiting until producer is connected...
GST_ARGUS: Available Sensor modes :
GST_ARGUS: 3264 x 2464 FR = 21.000000 fps Duration = 47619048 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 3264 x 1848 FR = 28.000001 fps Duration = 35714284 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1920 x 1080 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1640 x 1232 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1280 x 720 FR = 59.999999 fps Duration = 16666667 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1280 x 720 FR = 120.000005 fps Duration = 8333333 ; Analog Gain range min 1.000000,



Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x02\x01\x0…

In [104]:
# The camera stream can be started with cam.start()
# The callback gets asynchronously called (can be stopped with cam.stop())
# `callback_onnx` does not stop the camera itself, so stop it manually below
cam.start()

In [105]:
# The camera should always be stopped and released before a new camera is
# instantiated (i.e. before calling CameraDisplay(callback) again)
cam.stop()
cam.release()

Camera released


# Misc

## Old Loop

In [None]:
def preprocess(image, target_size=(320, 320)):
    """Convert a BGR frame into a batched float32 tensor (older variant).

    Args:
        image: BGR image array in height x width x channel layout.
        target_size: not used by this function.

    Returns:
        torch.Tensor of shape (1, channels, height, width) with the pixel
        values divided by 255.
    """
    # BGR (cv2) -> RGB, then normalize to the value range 0-1
    normalized = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    # To tensor, HWC -> CHW, and add the batch dimension
    return torch.from_numpy(normalized).permute(2, 0, 1).unsqueeze(0)

In [None]:
def inference(image, net, torch_device):
    """Run ``net`` on one preprocessed frame and post-filter it (older variant).

    Args:
        image: input tensor; moved to ``torch_device`` and converted to
            float right before the forward pass.
        net: the PyTorch model to evaluate.
        torch_device: device on which the forward pass is executed.

    Returns:
        The result of ``nms`` applied to the boxes that passed
        ``filter_boxes`` with ``CONFIDENCE_THRESHOLD``.
    """
    on_device = image.to(torch_device)

    with torch.no_grad():
        raw = net(on_device.float())
        confident = filter_boxes(raw, CONFIDENCE_THRESHOLD)
        result = nms(confident, NMS_THRESHOLD)

    return result

In [None]:
def postprocess(image, output):
    """Draw the valid detections of the first batch entry (older loop variant).

    Each box row is read as (center x, center y, width, height, confidence,
    ..., class); rows whose last value is negative are skipped. Coordinates
    are scaled by 320 before drawing. Boxes are drawn in place as red
    rectangles labelled "person <confidence>".

    Args:
        image: BGR image array that is drawn on in place.
        output: sequence of per-image box tensors; an empty/falsy value
            leaves the image untouched.

    Returns:
        The same ``image`` object that was passed in.
    """
    img_shape = 320

    if not output:
        return image

    bboxes = torch.stack(output, dim=0)

    # Walk the rows of the first batch entry directly; the previously
    # computed but never used validity mask has been dropped.
    for box in bboxes[0]:
        if not (box[-1] >= 0):
            continue

        x_min = int(box[0] * img_shape - box[2] * img_shape / 2)
        y_min = int(box[1] * img_shape - box[3] * img_shape / 2)
        x_max = x_min + int(box[2] * img_shape)
        y_max = y_min + int(box[3] * img_shape)
        conf = float(box[4])

        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
        cv2.putText(image, f"person {conf:.2f}", (x_min, y_min - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    return image

In [None]:
def callback(image):
    """Camera callback of the older loop (PyTorch pipeline).

    Derives an FPS value from the time since the previous call (global
    ``now``), crops the frame to its top-left 320x320 region, runs it through
    ``preprocess`` / ``inference`` / ``postprocess`` with the global
    ``model`` and draws the FPS value onto the result.

    Args:
        image: camera frame.

    Returns:
        The cropped frame with boxes and FPS text drawn on it.
    """
    global now

    # Guard against a zero (or negative) time delta, which previously raised
    # ZeroDivisionError when two frames shared the same timestamp.
    elapsed_time = time.time() - now
    fps = f"{int(1 / elapsed_time) if elapsed_time > 0 else 0}"
    now = time.time()

    # Crop to the network input size
    image = image[0:320, 0:320, :]

    image_pre = preprocess(image)
    output = inference(image_pre, model, torch_device)
    image_post = postprocess(image, output)

    cv2.putText(image_post, "fps="+fps, (2, 25), cv2.FONT_HERSHEY_SIMPLEX, 1,
                (100, 255, 0), 2, cv2.LINE_AA)

    # Return the annotated frame explicitly, like the current callback does
    return image_post