## Read a video and show it in a new window

In [12]:
import numpy as np
import cv2 as cv
 
# Play the clip in a resizable window, downscaled for display.
# Press 'q' in the window to stop early.
cap = cv.VideoCapture('2053100-uhd_3840_2160_30fps.mp4')
try:
    if not cap.isOpened():
        # Without this the loop below is silently skipped and nothing is shown.
        print("Could not open the video file. Check the path.")

    cv.namedWindow('frame', cv.WINDOW_NORMAL)
    # Report the native (width, height) of the source video.
    print((cap.get(cv.CAP_PROP_FRAME_WIDTH), cap.get(cv.CAP_PROP_FRAME_HEIGHT)))

    while cap.isOpened():
        ret, frame = cap.read()

        # ret is False when no frame could be read (e.g. end of the stream).
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break

        # Shrink the frame to 960x540 before displaying it.
        frame_resized = cv.resize(frame, (960, 540))
        cv.imshow('frame', frame_resized)

        # Mask to the low byte so the comparison matches the other cells.
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Runs even when the kernel is interrupted, so the capture handle and
    # the window are not left open.
    cap.release()
    cv.destroyAllWindows()

(3840.0, 2160.0)


KeyboardInterrupt: 

## Read a video and run object detection on it

### Load Model

In [None]:
import cv2
from ultralytics import YOLO

# Load the YOLO model from the 'yolov8s.pt' weights file.
# The resulting `yolo` object is reused by the detection cells below.
yolo = YOLO('yolov8s.pt')

### Webcam

In [None]:
# Open video capture on device index 0 (the webcam) for object detection.
# To process an MP4 file instead, pass its path here in place of 0.
videoCap = cv2.VideoCapture(0)

def getColours(cls_num):
    """Map an integer class ID to a deterministic 3-channel colour tuple.

    The ID selects one of three base colours (``cls_num % 3``) and then
    shifts each channel by a per-channel increment multiplied by
    ``cls_num // 3``. Every channel is wrapped into ``0..255``.

    Example: ``getColours(5)`` returns ``(1, 255, 1)``.

    Args:
        cls_num: Non-negative integer class ID.

    Returns:
        Tuple of three ints, each in ``0..255``.
    """
    base_colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
    increments = [(1, -2, 1), (-2, 1, -1), (1, -1, 2)]
    color_index = cls_num % len(base_colors)
    step = cls_num // len(base_colors)
    # Wrap the *sum* into range. `%` binds tighter than `+`, so without the
    # parentheses only the increment is wrapped and a channel can exceed 255.
    return tuple(
        (base + inc * step) % 256
        for base, inc in zip(base_colors[color_index], increments[color_index])
    )

# Run detection on webcam frames and show annotated frames until 'q' is
# pressed or a frame cannot be read.
try:
    while True:
        ret, frame = videoCap.read()
        if not ret:
            # Stop instead of retrying forever: a failed read skipped the
            # waitKey call below, so the loop could not be quit with 'q'.
            print("Can't receive frame (camera unavailable?). Exiting ...")
            break

        # NOTE(review): each call passes a single frame; if track IDs must
        # persist across frames, check the Ultralytics `persist` argument.
        results = yolo.track(frame, stream=True)

        for result in results:
            # Mapping from class ID to class name.
            classes_names = result.names

            for box in result.boxes:
                confidence = float(box.conf[0])
                # Only draw boxes with confidence above 40 percent.
                if confidence <= 0.4:
                    continue

                # Box corners as integer pixel coordinates.
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])

                cls = int(box.cls[0])
                class_name = classes_names[cls]
                colour = getColours(cls)

                # Draw the box, then the class name and confidence at its
                # top-left corner.
                cv2.rectangle(frame, (x1, y1), (x2, y2), colour, 2)
                cv2.putText(frame, f'{class_name} {confidence:.2f}', (x1, y1),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, colour, 2)

        # Show the annotated frame.
        cv2.imshow('frame', frame)

        # Break the loop if 'q' is pressed.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close windows even if the kernel is interrupted.
    videoCap.release()
    cv2.destroyAllWindows()


### File

In [6]:
import cv2
from ultralytics import YOLO

# Load the YOLO model from the 'yolov8s.pt' weights file.
yolo = YOLO('yolov8s.pt')

# Open the MP4 file (not the webcam) as the frame source for detection.
# Use `cv2`, the name this cell imports, rather than the `cv` alias that only
# exists if an earlier cell has been run.
cap = cv2.VideoCapture('2053100-uhd_3840_2160_30fps.mp4')
if not cap.isOpened():
    print("Could not open the video file. Check the path.")

# Function to get class colors from an unique ID to RGB
# E.g.: an input of cls_num = 5 (class ID) will return (1, 255, 1), which is a bright green with a slight blue tint
def getColours(cls_num):
    base_colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
    color_index = cls_num % len(base_colors)
    increments = [(1, -2, 1), (-2, 1, -1), (1, -1, 2)]
    color = [base_colors[color_index][i] + increments[color_index][i] * 
    (cls_num // len(base_colors)) % 256 for i in range(3)]
    return tuple(color)

# Run detection on each frame of the video file and show annotated frames
# until the stream ends or 'q' is pressed.
try:
    while cap.isOpened():
        ret, frame = cap.read()

        # ret is False when no frame could be read (e.g. end of the stream).
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break

        # Downscale to 960x540 before detection and display.
        frame_resized = cv2.resize(frame, (960, 540))

        # NOTE(review): each call passes a single frame; if track IDs must
        # persist across frames, check the Ultralytics `persist` argument.
        results = yolo.track(frame_resized, stream=True)

        for result in results:
            # Mapping from class ID to class name.
            classes_names = result.names

            for box in result.boxes:
                confidence = float(box.conf[0])
                # Only draw boxes with confidence above 40 percent.
                if confidence <= 0.4:
                    continue

                # Box corners as integer pixel coordinates.
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])

                cls = int(box.cls[0])
                class_name = classes_names[cls]
                colour = getColours(cls)

                # Draw the box, then the class name and confidence at its
                # top-left corner.
                cv2.rectangle(frame_resized, (x1, y1), (x2, y2), colour, 2)
                cv2.putText(frame_resized, f'{class_name} {confidence:.2f}', (x1, y1),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, colour, 2)

        # Show the annotated frame.
        cv2.imshow('frame', frame_resized)

        # Break the loop if 'q' is pressed.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture this cell opened (`cap`, not the webcam cell's
    # `videoCap`) and close windows, even if the kernel is interrupted.
    cap.release()
    cv2.destroyAllWindows()




0: 384x640 1 person, 1 motorcycle, 1 cell phone, 304.0ms
Speed: 11.2ms preprocess, 304.0ms inference, 9.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 car, 1 motorcycle, 1 cell phone, 294.7ms
Speed: 2.0ms preprocess, 294.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 car, 1 motorcycle, 1 cell phone, 301.7ms
Speed: 11.6ms preprocess, 301.7ms inference, 8.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 motorcycle, 1 cell phone, 312.6ms
Speed: 9.4ms preprocess, 312.6ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 car, 1 motorcycle, 1 cell phone, 328.8ms
Speed: 0.0ms preprocess, 328.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 car, 1 motorcycle, 1 cell phone, 286.7ms
Speed: 4.0ms preprocess, 286.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone

KeyboardInterrupt: 