### Setup

Install `ultralytics` and its [dependencies](https://github.com/ultralytics/ultralytics/blob/main/pyproject.toml) with pip, then verify the software and hardware environment.

[![PyPI - Version](https://img.shields.io/pypi/v/ultralytics?logo=pypi&logoColor=white)](https://pypi.org/project/ultralytics/) [![Downloads](https://static.pepy.tech/badge/ultralytics)](https://www.pepy.tech/projects/ultralytics) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/ultralytics?logo=python&logoColor=gold)](https://pypi.org/project/ultralytics/)

In [None]:
import os
import sys

# Put the parent of the current working directory on the import path.
sys.path.append(os.path.abspath(os.pardir))

In [None]:
import ultralytics
import cv2 as cv
import numpy as np
from ultralytics import solutions
from datetime import datetime

from utils import get_coordinates, crop_and_mask_image, stack_image

# Print the installed Ultralytics / Python / torch versions and a CPU, RAM and disk summary.
ultralytics.checks()

Ultralytics 8.3.74  Python-3.10.11 torch-2.6.0+cpu CPU (11th Gen Intel Core(TM) i7-11700 2.50GHz)
Setup complete  (20 CPUs, 31.8 GB RAM, 365.5/476.1 GB disk)
Setup complete  (20 CPUs, 31.8 GB RAM, 365.5/476.1 GB disk)


### Read the Video File and Write the Output to AVI

- You can either read the video file directly or stream the content from an RTSP (Real-Time Streaming Protocol) source, allowing for flexible video input depending on your needs.
- We will also set up the video writer to handle the output video writing.

In [None]:
# Names used to locate the inputs and to build a unique output file name.
name_model = "yolo11n_100_best"
# name_model = "yolo11m"
name_video = "alpha.mp4"

# Timestamp formatted as YYYYMMDD_HHMMSS so every run writes to its own file.
name_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
# Strip only the final extension so video names containing extra dots stay intact.
name_result = f"{name_model}_{os.path.splitext(name_video)[0]}_{name_datetime}"

cap = cv.VideoCapture(f"../../asset/{name_video}")
# Explicit raise instead of assert: asserts are stripped when Python runs with -O.
if not cap.isOpened():
    raise RuntimeError("Error reading video file")

w = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float; truncating e.g. 29.97 to 29 changes the playback speed.
fps = cap.get(cv.CAP_PROP_FPS)

# Video writer
result_dir = "../../asset/result"
# The writer does not create missing directories, so make sure the target exists.
os.makedirs(result_dir, exist_ok=True)
video_writer = cv.VideoWriter(f"{result_dir}/{name_result}.avi", cv.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
# A writer that failed to open drops every frame silently; fail before the long processing run.
if not video_writer.isOpened():
    raise RuntimeError(f"Could not open video writer for {result_dir}/{name_result}.avi")

### Define Region Coordinates

Here, we set the coordinates for specific regions to ensure accurate object tracking and analysis within the video or stream. This helps monitor and count objects effectively in different areas.

In [None]:
# Grab one frame to use as the reference image for picking coordinates.
# Note: every cap.read() advances the capture, so this frame is not seen by later reads.
success, imx = cap.read()
if not success:
    raise RuntimeError("Could not read a frame from the video source")

# alpha.mp4 (active preset)
# region_points = [(780, 225), (1136, 622)]  # original
region_points = [(120, 30), (400, 475)]   # crop
# Crop window, in pixel coordinates of the original frame.
x_min, x_max = 705, 1456
y_min, y_max = 132, 665
imx_crop = imx[y_min:y_max, x_min:x_max]
# Polygon handed to crop_and_mask_image; presumably expressed in crop coordinates - confirm in utils.
polygon = np.array([(0, 0), (388, 531), (750, 189), (304, 1)])

# # delta.mp4
# region_points = [(233, 800), (1795, 800)]  # original
# region_points = [(0, 250), (1178, 250)]    # crop
# x_min, x_max = 422, 1600
# y_min, y_max = 140, 510
# imx_crop = imx[y_min:y_max, x_min:x_max]
# polygon = np.array([(0, imx_crop.shape[0]), (273, 0), (950, 0), (imx_crop.shape[1], imx_crop.shape[0])])

# # beta.mp4
# region_points = [(275, 250), (580, 250)]  # original
# region_points = [(25, 130), (330, 130)]   # crop
# x_min, x_max = 250, 600
# y_min, y_max = 120, 290
# imx_crop = imx[y_min:y_max, x_min:x_max]
# polygon = np.array([(0, imx_crop.shape[0]), (130, 0), (245, 0), (imx_crop.shape[1], imx_crop.shape[0])])

# region_points = [(20, 400), (1080, 400), (1080, 360), (20, 360)]  # For rectangle region counting
# region_points = [(20, 400), (1080, 400), (1080, 360), (20, 360), (20, 400)]  # For polygon region counting

Original Image

In [None]:
# Read the next frame and show it so coordinates can be picked with the mouse.
success, imx = cap.read()
if not success:
    raise RuntimeError("Could not read a frame from the video source")

# Create the window and register the mouse event callback.
# A single namedWindow call is enough: calling it again for an existing window does nothing.
cv.namedWindow("Image", cv.WINDOW_NORMAL)
cv.setMouseCallback("Image", get_coordinates)
cv.imshow("Image", imx)
# Block until a key is pressed while the window has focus.
cv.waitKey(0)

Mouse Hovering at: (737, 242)
Mouse Hovering at: (1301, 467)


-1

Crop Image

In [None]:
# Disabled cell: preview the cropped (and optionally masked) frame to pick crop-relative coordinates.
# Uncomment to run; it relies on x_min/x_max/y_min/y_max and polygon defined in an earlier cell.
# success, imx = cap.read()

# imx_crop = imx[y_min:y_max, x_min:x_max]
# imx_crop = crop_and_mask_image(imx, x_min, x_max, y_min, y_max, polygon)
# print(imx_crop.shape)

# cv.namedWindow("Image", cv.WINDOW_NORMAL)
# cv.namedWindow("Image")
# cv.setMouseCallback("Image", get_coordinates)
# cv.imshow("Image", imx_crop)
# cv.waitKey(0)

(700, 750, 3)
Mouse Hovering at: (154, 130)
Mouse Hovering at: (452, 415)
Mouse Hovering at: (122, 32)
Mouse Hovering at: (406, 476)


-1

### Initialize the ObjectCounter Class

- Now, let's initialize the `ObjectCounter` class to track and count objects in each frame of the video.

In [473]:
# Disabled cell: export a YOLO checkpoint to ONNX.
# NOTE: `YOLO` is not imported in this notebook; add `from ultralytics import YOLO` before enabling.
# model = YOLO("yolo11m.pt")
# path = model.export(format="onnx")

In [474]:
# Build the ObjectCounter that tracks and counts objects crossing `region_points`.
# Use model="yolo11n-obb.pt" instead for object counting with the YOLO11 OBB model.
weights_path = f"../../asset/result/{name_model}.pt"

counter = solutions.ObjectCounter(
    # Region points that define where objects are counted
    region=region_points,
    # Weights file loaded by the counter
    model=weights_path,
    # To count only specific classes (e.g. person and car with a COCO pretrained model):
    # classes=[1, 2, 3, 5, 6, 7, 8, 17, ],
    # Display the output
    show=True,
    # Display in counts
    show_in=True,
    # Display out counts
    show_out=True,
    # Line width for bounding boxes and text display
    line_width=1,
    # font_size=0.1,
)

Ultralytics Solutions:  {'region': [(25, 130), (330, 130)], 'show_in': True, 'show_out': True, 'colormap': None, 'up_angle': 145.0, 'down_angle': 90, 'kpts': [6, 8, 10], 'analytics_type': 'line', 'json_file': None, 'records': 5, 'show': True, 'model': '../../asset/result/yolo11n_100_best.pt', 'classes': [1, 2, 3, 5, 6, 7, 8, 17], 'line_width': 1, 'font_size': 0.1}


### Process Video Frames

In this step, we will process each frame of the video to detect and analyze objects. This allows for real-time tracking and counting, based on the visual data in the frames.

In [None]:
# Frame-skipping state: after every `n_skip` processed frames, one frame is dropped
# (only while `is_skip` is True). `i` counts frames processed since the last drop.
i = 0
n_skip = 1
is_skip = True

cv.namedWindow("Ultralytics Solutions", cv.WINDOW_NORMAL)
try:
    # Frames already consumed by earlier cap.read() calls are not revisited here.
    while cap.isOpened():
        success, im0 = cap.read()
        if not success:
            print("Video frame is empty or video processing has been successfully completed.")
            break

        # Drop this frame and restart the counter; dropped frames are not written to the output.
        if i == n_skip and is_skip:
            i = 0
            continue

        # Keep the untouched frame so the processed crop can be combined with it afterwards.
        im0_original = im0.copy()
        im0 = crop_and_mask_image(im0, x_min, x_max, y_min, y_max, polygon)
        im0 = counter.count(im0)
        print(f"Classwise counts: {counter.classwise_counts}")
        print(f"Counted IDs: {counter.counted_ids}")

        # Merge the processed crop with the original frame (see utils.stack_image), then
        # draw the counts on the combined frame before writing it out.
        im0 = stack_image(im0_original, im0, x_min, x_max, y_min, y_max)
        counter.display_counts(im0)
        video_writer.write(im0)

        i += 1
finally:
    # Always release resources, even on an exception or a notebook interrupt;
    # otherwise the capture and the output file stay open.
    cap.release()   # Release the capture
    video_writer.release()
    cv.destroyAllWindows()


0: 320x640 5 motorbikes, 2 cars, 1 person, 40.6ms
Speed: 1.3ms preprocess, 40.6ms inference, 0.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 motorbikes, 2 cars, 35.7ms
Speed: 1.5ms preprocess, 35.7ms inference, 0.6ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 motorbikes, 2 cars, 32.9ms
Speed: 1.0ms preprocess, 32.9ms inference, 3.2ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 motorbikes, 2 cars, 35.0ms
Speed: 1.0ms preprocess, 35.0ms inference, 0.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 motorbikes, 2 cars, 35.9ms
Speed: 1.0ms preprocess, 35.9ms inference, 2.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 3 motorbikes, 2 cars, 46.4ms
Speed: 0.0ms preprocess, 46.4ms inference, 0.6ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 motorbikes, 2 cars, 42.0ms
Speed: 0.0ms preprocess, 42.0ms inference, 0.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 3 motorbikes

In [None]:
# Manual cleanup cell, e.g. after the processing loop was interrupted.
# Wait for a key press before tearing everything down.
cv.waitKey(0)
cap.release()   # Release the capture
video_writer.release()  # Close the output video file
cv.destroyAllWindows()  # Close any remaining OpenCV windows

In [478]:
# Show the per-class IN/OUT totals held by the counter.
counter.classwise_counts

{'motorbike': {'IN': 120, 'OUT': 0},
 'car': {'IN': 50, 'OUT': 0},
 'person': {'IN': 0, 'OUT': 0}}

In [None]:
# cv.imshow("img", im0_original)
# cv.waitKey(0)

Camlytics Benchmark

In [480]:
# ### manual
# bicycle		: 5
# car		    : 16
# motorbike	: 68
# bus		    : 1
# person		: 4

# ### yolo11n_crop_skip1
# bicycle		: 5
# car		    : 16
# motorbike	: 65
# bus		    : 1
# person		: 4

# ### camlytics
# exit		: 76
# ---
# pedestrian	: 70
# unknown		: 1
# vehicle		: 5

# enter		: 49
# ---
# pedestrian	: 31
# unknown		: 5
# vehicle		: 13