## IMAGE

In [None]:
import onnxruntime
import numpy as np
import cv2

# Single-image demo: run the ONNX detector on one picture and draw a green
# rectangle around every detection of the target class that clears both thresholds.

# --- Configuration ---
MODEL_PATH = "test.onnx"
IMAGE_PATH = "C:/Users/user/Desktop/human.jpg"  # Path to the image file
INPUT_NAME = 'onnx::Cast_0'  # Name under which the image tensor is fed to the session
INPUT_SIZE = 640  # The image is resized to INPUT_SIZE x INPUT_SIZE before inference
TARGET_CLASS_ID = 0  # Only this class is drawn — presumably "person"; confirm against the label map

# Thresholds
confidence_threshold = 0.5
class_threshold = 0.5

# Load the ONNX model
onnx_session = onnxruntime.InferenceSession(MODEL_PATH)

# Load and prepare image
image = cv2.imread(IMAGE_PATH)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising; fail here with a clear message rather than inside cv2.resize.
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")
orig_image = cv2.resize(image, (INPUT_SIZE, INPUT_SIZE))  # Kept in BGR; this is the canvas drawn on below
image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)  # Convert color space from BGR to RGB
image = image / 255.0  # Normalize pixel values (range [0, 1])
image = np.transpose(image, (2, 0, 1))  # Change image shape from (H, W, C) to (C, H, W)
image = np.expand_dims(image, axis=0).astype(np.float32)  # Add batch dimension and convert to float32

# Run the model with the processed image
outputs = onnx_session.run(None, {INPUT_NAME: image})

# outputs is a list of all the model's output tensors; only the first is used
output = outputs[0]

# Split the first batch item column-wise: columns 0-3 are taken as the box,
# column 4 as the detection confidence, and the remaining columns as per-class scores.
boxes = output[0, :, :4]
confidences = output[0, :, 4]
class_probs = output[0, :, 5:]

# Best-scoring class per row, and the score of that class
class_ids = np.argmax(class_probs, axis=1)
class_confidences = class_probs[np.arange(len(class_probs)), class_ids]

# Select the rows to draw in one vectorised step instead of testing every row in Python
keep = (
    (confidences > confidence_threshold)
    & (class_confidences > class_threshold)
    & (class_ids == TARGET_CLASS_ID)
)

# Draw the selected detections
for box in boxes[keep]:
    x, y, w, h = box
    # NOTE(review): the box is treated as normalised (x, y, w, h) with (x, y) the
    # top-left corner. Many detector exports emit centre-based and/or pixel-unit
    # boxes instead — confirm against the model before trusting the rectangles.
    x, y, w, h = int(x * INPUT_SIZE), int(y * INPUT_SIZE), int(w * INPUT_SIZE), int(h * INPUT_SIZE)
    cv2.rectangle(orig_image, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Draw bounding box

# Show the annotated image and block until a key is pressed
cv2.imshow("Detection", orig_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

## VIDEO

In [11]:
import onnxruntime
import numpy as np
import cv2
import csv

# Video demo: run the ONNX detector on every frame of a video and append one CSV
# row per detection of the target class that clears the confidence threshold.

# --- Configuration ---
MODEL_PATH = "test.onnx"
VIDEO_PATH = 'C:/Users/user/Desktop/video.mp4'
CSV_PATH = 'detections.csv'
INPUT_NAME = 'onnx::Cast_0'  # Name under which the frame tensor is fed to the session
INPUT_SIZE = 640  # Frames are resized to INPUT_SIZE x INPUT_SIZE before inference
TARGET_CLASS_ID = 0  # Only this class is recorded ("person" per the original comment)

# Thresholds (loop-invariant, so set once rather than on every frame)
confidence_threshold = 0.5
# NOTE(review): class_threshold is not applied in this cell — class scores are only
# used for argmax — whereas the image cell also requires class score > class_threshold.
# Confirm whether the same filter is wanted here.
class_threshold = 0.5

# Load the ONNX model
onnx_session = onnxruntime.InferenceSession(MODEL_PATH)

# Open video file
video = cv2.VideoCapture(VIDEO_PATH)

try:
    if not video.isOpened():
        # Without this check a bad path silently yields a header-only CSV.
        raise IOError(f"Could not open video: {VIDEO_PATH}")

    # newline='' is required by the csv module; without it each row is followed
    # by a blank line on platforms that translate '\n' to '\r\n' (e.g. Windows).
    with open(CSV_PATH, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["frame_number","class_id","confidence","x","y","w","h"])

        # Frame counter
        frame_number = 0
        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                # read() returning False is also how the end of the video shows up
                print("Can't receive frame. Exiting ...")
                break

            # Prepare the frame to input the network: BGR -> RGB, resize, scale to
            # [0, 1], HWC -> CHW, add batch dimension, cast to float32
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_resized = cv2.resize(frame_rgb, (INPUT_SIZE, INPUT_SIZE)) / 255.0
            frame_input = np.transpose(frame_resized, (2, 0, 1))
            frame_input = np.expand_dims(frame_input, axis=0).astype(np.float32)

            # Run the model with the frame
            outputs = onnx_session.run(None, {INPUT_NAME: frame_input})

            # Split the first output's first batch item column-wise: columns 0-3 are
            # taken as the box, column 4 as the confidence, the rest as class scores
            detections = outputs[0][0]
            boxes = detections[:, :4]
            confidences = detections[:, 4]
            class_ids = np.argmax(detections[:, 5:], axis=-1)

            # Select the rows to record in one vectorised step
            keep = (confidences > confidence_threshold) & (class_ids == TARGET_CLASS_ID)

            for box, confidence, class_id in zip(boxes[keep], confidences[keep], class_ids[keep]):
                x, y, w, h = box
                # NOTE(review): assumes normalised box values — confirm against the model
                x, y, w, h = int(x * INPUT_SIZE), int(y * INPUT_SIZE), int(w * INPUT_SIZE), int(h * INPUT_SIZE)
                writer.writerow([frame_number, class_id, confidence, x, y, w, h])

            frame_number += 1
finally:
    # Release everything, even if inference or writing failed part-way through
    video.release()
    cv2.destroyAllWindows()

Can't receive frame. Exiting ...
