In [1]:
import cv2
import time
import sys
import numpy as np
from openvino.runtime import Core

def getLayers(model_xml="./best.xml", device_name="CPU"):
    """Read a model with OpenVINO and compile it for one device.

    Args:
        model_xml: Path of the model file handed to ``Core.read_model``.
            Defaults to ``"./best.xml"``.
        device_name: Device string handed to ``Core.compile_model``
            (for example ``"CPU"`` or ``"MYRIAD"``). Defaults to ``"CPU"``.

    Returns:
        A tuple ``(input_layer, output_layer, compiled_model)`` where the
        layers are ``compiled_model.input(0)`` and ``compiled_model.output(0)``.
    """
    core = Core()
    model = core.read_model(model=model_xml)
    compiled_model = core.compile_model(model=model, device_name=device_name)
    return compiled_model.input(0), compiled_model.output(0), compiled_model

def build_model(is_cuda, model_path="yolov5s.onnx"):
    """Load a network with OpenCV's DNN module and select its backend.

    Args:
        is_cuda: When truthy, request the CUDA backend with the FP16 CUDA
            target; otherwise request the OpenCV backend on the CPU target.
        model_path: Path passed to ``cv2.dnn.readNet``. Defaults to
            ``"yolov5s.onnx"``.

    Returns:
        The network object returned by ``cv2.dnn.readNet`` after the backend
        and target preferences have been set.
    """
    net = cv2.dnn.readNet(model_path)
    if is_cuda:
        print("Attempting to use CUDA")
        backend, target = cv2.dnn.DNN_BACKEND_CUDA, cv2.dnn.DNN_TARGET_CUDA_FP16
    else:
        print("Running on CPU")
        backend, target = cv2.dnn.DNN_BACKEND_OPENCV, cv2.dnn.DNN_TARGET_CPU
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)
    return net

# Size given to blobFromImage in detect(); wrap_detection() divides the image
# dimensions by these values to rescale box coordinates.
INPUT_WIDTH, INPUT_HEIGHT = 640, 640

# Threshold settings.
# NOTE(review): none of these three names is referenced by the code visible in
# this file -- confirm whether they are still needed.
SCORE_THRESHOLD = 0.2
NMS_THRESHOLD = 0.4
CONFIDENCE_THRESHOLD = 0.8

def detect(image, input_layer, output_layer, compiled_model):
    """Run the compiled model on one image and return the selected output.

    The image is converted with ``cv2.dnn.blobFromImage`` (values scaled by
    1/255, resized to INPUT_WIDTH x INPUT_HEIGHT, first and last channels
    swapped, no cropping), wrapped in a list and passed to ``compiled_model``.

    Args:
        image: Image handed to ``blobFromImage``.
        input_layer: Accepted for interface compatibility; not used here.
        output_layer: Key used to pick the result out of the model's return
            value.
        compiled_model: Callable model, invoked as ``compiled_model([blob])``.

    Returns:
        ``compiled_model([blob])[output_layer]``.
    """
    scale = 1 / 255.0
    target_size = (INPUT_WIDTH, INPUT_HEIGHT)
    blob = cv2.dnn.blobFromImage(image, scale, target_size, swapRB=True, crop=False)
    results = compiled_model([blob])
    return results[output_layer]

def load_capture(video_path=None):
    """Open a video source with ``cv2.VideoCapture``.

    Args:
        video_path: Path of the video to open. When ``None`` (the default),
            ``chloe_video.MOV`` inside the ``test_data`` directory is used.

    Returns:
        The ``cv2.VideoCapture`` object created for the path.
    """
    # Imported locally so this function does not rely on a file-level import.
    import os

    if video_path is None:
        # Build the path with the platform separator instead of a literal
        # "test_data\c...": "\c" is an invalid string escape, and a backslash
        # separator only works on Windows.
        video_path = os.path.join("test_data", "chloe_video.MOV")
    return cv2.VideoCapture(video_path)

def load_classes(path="classes.txt"):
    """Read class names from a text file, one name per line.

    Args:
        path: File to read. Defaults to ``"classes.txt"``.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace. Blank lines are kept as empty strings so the
        list index of every name still matches its line number.
    """
    with open(path, "r") as f:
        return [line.strip() for line in f]

# Class names, loaded once at module level; the main loop below indexes this
# list by class id when printing each detection.
class_list = load_classes()

def wrap_detection(input_image, output_data, *, conf_threshold=0.4,
                   class_score_threshold=0.25, nms_score_threshold=0.25,
                   nms_threshold=0.45):
    """Convert raw detector rows into NMS-filtered detections.

    Each row of ``output_data`` is read as
    ``[x, y, w, h, confidence, class_score_0, class_score_1, ...]`` where
    ``x, y`` is the box centre (the top-left corner is derived as
    ``x - w / 2``, ``y - h / 2``). Coordinates are rescaled from the
    INPUT_WIDTH x INPUT_HEIGHT grid to the size of ``input_image``.

    Args:
        input_image: Image array; only its first two ``shape`` entries
            (height, width) are used.
        output_data: 2-D array of detector rows in the layout above.
        conf_threshold: Minimum ``confidence`` for a row to be considered.
        class_score_threshold: The best class score must be strictly greater
            than this value.
        nms_score_threshold: Score threshold passed to ``cv2.dnn.NMSBoxes``.
        nms_threshold: NMS threshold passed to ``cv2.dnn.NMSBoxes``.

    Returns:
        ``(class_ids, confidences, boxes)``: three parallel lists for the
        detections kept by NMS. Each box is ``np.array([left, top, width,
        height])`` with integer values.
    """
    # ndarray.shape is (rows, cols, ...) == (height, width, ...); unpacking it
    # as (width, height) mis-scales boxes on non-square images.
    image_height, image_width = input_image.shape[:2]
    x_factor = image_width / INPUT_WIDTH
    y_factor = image_height / INPUT_HEIGHT

    class_ids = []
    confidences = []
    boxes = []

    # Drop low-confidence rows in one vectorised step so the Python loop only
    # visits candidates instead of every row.
    rows = np.asarray(output_data)
    candidates = rows[rows[:, 4] >= conf_threshold]

    for row in candidates:
        classes_scores = row[5:]
        class_id = int(np.argmax(classes_scores))
        if not classes_scores[class_id] > class_score_threshold:
            continue

        confidences.append(row[4])
        class_ids.append(class_id)

        x, y, w, h = row[0].item(), row[1].item(), row[2].item(), row[3].item()
        left = int((x - 0.5 * w) * x_factor)
        top = int((y - 0.5 * h) * y_factor)
        width = int(w * x_factor)
        height = int(h * y_factor)
        boxes.append(np.array([left, top, width, height]))

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, nms_score_threshold, nms_threshold)
    # Flatten so an empty result, a flat index array and an Nx1 index array
    # are all handled the same way.
    kept = [int(i) for i in np.asarray(indexes, dtype=int).reshape(-1)]

    result_class_ids = [class_ids[i] for i in kept]
    result_confidences = [confidences[i] for i in kept]
    result_boxes = [boxes[i] for i in kept]

    return result_class_ids, result_confidences, result_boxes

def format_yolov5(frame):
    """Pad a 3-D frame with zeros to a square 3-channel uint8 image.

    The frame is copied into the top-left corner of a zero-filled array whose
    side equals the larger of the frame's height and width.

    Args:
        frame: Array with a three-entry ``shape`` (height, width, channels).

    Returns:
        A new ``np.uint8`` array of shape ``(side, side, 3)``.
    """
    height, width, _channels = frame.shape
    side = max(width, height)
    canvas = np.zeros((side, side, 3), np.uint8)
    canvas[:height, :width] = frame
    return canvas


# Colour tuples. NOTE(review): not referenced by the visible code.
colors = [(255, 0, 0), (0, 255, 0), (0, 255, 255), (255, 0, 0)]

# Alternative OpenCV-DNN path via build_model(), currently disabled; inference
# below goes through the OpenVINO model returned by getLayers().
# is_cuda = len(sys.argv) > 1 and sys.argv[1] == "cuda"

# net = build_model(is_cuda)
capture = load_capture()

# Bookkeeping. NOTE(review): `start` and `fps` are set here but never read or
# updated in the visible code.
start = time.time_ns()
frame_count = 0
total_frames = 0
fps = -1
input_layer, output_layer, compiled_model = getLayers()

# try/finally so the capture is released on every exit path, including an
# interrupt raised while a frame is being processed.
try:
    while True:
        ok, frame = capture.read()
        if not ok or frame is None:
            print("End of stream")
            break

        # Pad to a square image, run the model, then decode the first entry
        # of its output into detections.
        inputImage = format_yolov5(frame)
        outs = detect(inputImage, input_layer, output_layer, compiled_model)
        class_ids, confidences, boxes = wrap_detection(inputImage, outs[0])

        frame_count += 1
        total_frames += 1

        for (classid, confidence, box) in zip(class_ids, confidences, boxes):
            print(class_list[classid], confidence, box) # Prints class, confidence, and bounding box
        print("Frame Complete ", frame_count)

        # NOTE(review): no window is created in this script, so waitKey may
        # never report a key press -- confirm this exit path is reachable.
        if cv2.waitKey(1) > -1:
            print("finished by user")
            break
finally:
    capture.release()


RED 0.7359761 [ 420 1527   65  109]
RED 0.628767 [2088 1070   98  167]
RED 0.6186788 [3233 1717   52   94]
RED 0.49048838 [3213 1444  106  200]
RED 0.410374 [ 526 1537   44   73]
Frame Complete  1
RED 0.75395876 [ 420 1528   65  109]
RED 0.6017242 [2087 1070  100  169]
RED 0.599598 [3233 1717   52   94]
RED 0.5490018 [3213 1445  105  198]
RED 0.42276645 [ 525 1538   44   75]
Frame Complete  2
RED 0.7538036 [ 420 1528   65  107]
RED 0.6199843 [2087 1071  101  170]
RED 0.59188026 [3233 1717   51   94]
RED 0.5477674 [3212 1444  107  201]
RED 0.44936532 [  20 1754   39   60]
Frame Complete  3
RED 0.7499866 [ 420 1529   65  107]
RED 0.5702046 [2088 1072   99  166]
RED 0.5567949 [3232 1717   52   97]
RED 0.4780988 [3212 1445  105  198]
RED 0.4084454 [ 527 1540   43   72]
RED 0.40035224 [  19 1753   41   64]
Frame Complete  4
RED 0.7447009 [ 420 1531   65  105]
RED 0.5729164 [3231 1718   54  100]
RED 0.5562316 [2089 1074   98  163]
RED 0.5300957 [3211 1444  108  204]
RED 0.5129123 [ 527 1540 

KeyboardInterrupt: 