# DATA5000 Workshop 8: YOLOV8 SUPERVISION DETECTS OBJECTS IN VIDEOS

## Preliminaries

In [None]:
# Install PyTorch and torchvision; %pip installs into the environment of the running kernel.
%pip install torch torchvision

In [None]:
# import torch
import torch
from torch import nn
import torch.nn.functional as F

## Check for GPU availability - speeds up video and image recognition and other AI tasks

In [None]:
# List the NVIDIA GPUs visible to this machine, one per line.
# Needs an NVIDIA driver with the nvidia-smi tool on PATH (e.g. a Colab GPU runtime);
# on a CPU-only machine the command is simply not found.
!nvidia-smi -L

In [None]:
# Verbose query: print the full attribute report for every GPU.
!nvidia-smi -q

In [None]:
# Compact summary table of the GPUs and the processes currently using them.
!nvidia-smi

In [None]:
!lscpu | grep 'Core(s) each processor has/per socket:'

In [None]:
!lscpu | grep 'Number of threads/core:'

In [None]:
# Total RAM: print the second column ("total") of free's "Mem:" row, in SI units.
!free -h --si | awk  '/Mem:/{print $2}'

# Set HOME folder to store videos

In [None]:
# Record the directory the notebook was started in; the videos folder and the
# clip path below are built from it.
import os
HOME = os.getcwd()
print(HOME)

In [None]:
!mkdir {HOME}/videos

In [None]:
%cd {HOME}/videos

# get videos of traffic conditions
!wget -q https://s3.ap-southeast-2.wasabisys.com/data5000/pexels_videos_1860079_traffic.mp4

In [None]:
# Absolute path of the downloaded traffic clip; the bare name on the last line
# makes the notebook display it.
SOURCE_VIDEO_PATH = f"{HOME}/videos/pexels_videos_1860079_traffic.mp4"
SOURCE_VIDEO_PATH

## Install Ultralytics

In [None]:
!pip install ultralytics

from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()

# Install Supervision

In [None]:
!pip install supervision

from IPython import display
display.clear_output()

# we are importing supervision aliased as superviz
import supervision as superviz
print("supervision.__version__:", superviz.__version__)

# Object detection in videos

A supervision `Detections` object describes the `n` objects found in one frame through the following fields:

- xyxy `(np.ndarray)`: An array of shape `(n, 4)` containing the bounding box coordinates in the format `[x1, y1, x2, y2]`.
- mask `(Optional[np.ndarray])`: An array of shape `(n, H, W)` containing the segmentation masks.
- confidence `(Optional[np.ndarray])`: An array of shape `(n,)` containing the confidence scores of the detections.
- class_id `(Optional[np.ndarray])`: An array of shape `(n,)` containing the class ids of the detections.
- tracker_id `(Optional[np.ndarray])`: An array of shape `(n,)` containing the tracker ids of the detections.

In [None]:
# Name of the YOLOv8 weights file that is handed to YOLO() in the next cell.
MODEL = "yolov8x.pt"

In [None]:
from ultralytics import YOLO

# Build the detector from the weights named by MODEL.
model = YOLO(MODEL)
# presumably merges layers to speed up inference — confirm in the Ultralytics docs
model.fuse()

In [None]:
# dict mapping class_id to class_name
CLASS_NAMES_DICT = model.model.names

# class_ids of interest - car, motorcycle, bus and truck
selected_classes = [2, 3, 5, 7]

In [None]:
import supervision as superviz
import numpy as np

In [None]:
# create frame generator
frame_generator = superviz.get_video_frames_generator(SOURCE_VIDEO_PATH)


# create instance of BoxAnnotator
box_annotator = superviz.BoxAnnotator(thickness=4)


# acquire first video frame
iterator = iter(frame_generator)


frame = next(iterator)

# model prediction on single frame and conversion to supervision Detections
results = model(frame, verbose=False)[0]

# convert to Detections
detections = superviz.Detections.from_ultralytics(results)

# only take class id from selected_classes define above
detections = detections[np.isin(detections.class_id, selected_classes)]


# format custom labels

labels = [
    f"{CLASS_NAMES_DICT[class_id]} {confidence:0.2f}"
    for confidence, class_id in zip(detections.confidence, detections.class_id)
]


# annotate and display frame
anotated_frame=box_annotator.annotate(scene=frame, detections=detections)

%matplotlib inline
superviz.plot_image(anotated_frame, (16,16))

# Great - we now know we can detect vehicles and other objects in a single frame of video (i.e., an image). Now let us extend this to the entire video.

# Predict and label the entire video clip

In [None]:
!pip install ultralytics

from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()

"""install byte-track"""

# Commented out IPython magic to ensure Python compatibility.
# %cd {HOME}
!git clone https://github.com/ifzhang/ByteTrack.git
!cd ByteTrack && pip3 install -q -r requirements.txt
!cd ByteTrack && python3 setup.py -q develop
!pip install -q cython_bbox
!pip install -q onemetric

from IPython import display
display.clear_output()

In [None]:
!pip install roboflow==0.2.34

In [None]:
# NOTE(review): an unpinned supervision was already installed and imported as
# `superviz` earlier in this notebook; this pin replaces that install with 0.1.0.
# The module paths imported below are the ones this cell expects from the pinned
# release — confirm both API generations are really meant to coexist in one kernel.
!pip install supervision==0.1.0
import supervision
print("supervision.__version__:", supervision.__version__)

# drawing, geometry, video and detection helpers, imported by their full module paths
from supervision.draw.color import ColorPalette
from supervision.geometry.dataclasses import Point
from supervision.video.dataclasses import VideoInfo
from supervision.video.source import get_video_frames_generator
from supervision.video.sink import VideoSink
from supervision.notebook.utils import show_frame_in_notebook
from supervision.tools.detections import Detections, BoxAnnotator
from supervision.tools.line_counter import LineCounter, LineCounterAnnotator

from typing import List

import numpy as np



In [None]:
import sys

import os
HOME = os.getcwd()
print(HOME)
sys.path.append(f"{HOME}/ByteTrack")


!pip install loguru
!pip install tqdm
!pip install lap
from yolox.tracker.byte_tracker import BYTETracker, STrack
from onemetric.cv.utils.iou import box_iou_batch
from dataclasses import dataclass
import tqdm

In [None]:
# Flattens Detections into one numeric row per detection: box coordinates, then confidence.
def detections2boxes(detections: Detections) -> np.ndarray:
    """Return ``detections.xyxy`` with ``detections.confidence`` appended as a last column."""
    confidence_column = detections.confidence[:, None]
    return np.concatenate([detections.xyxy, confidence_column], axis=1)


# Gathers the tlbr box of every track into a single float array.
def tracks2boxes(tracks: List[STrack]) -> np.ndarray:
    """Return an array holding ``track.tlbr`` for each track, in input order."""
    boxes = []
    for current_track in tracks:
        boxes.append(current_track.tlbr)
    return np.array(boxes, dtype=float)


# Assigns tracker ids to detections by best IoU overlap with the tracked boxes.
def match_detections_with_tracks(
    detections: Detections,
    tracks: List[STrack]
) -> List:
    """Pair every track with the detection it overlaps most.

    Args:
        detections: detections of the current frame; only ``detections.xyxy``
            is read.
        tracks: tracks whose ``tlbr`` boxes and ``track_id`` values are used.

    Returns:
        A list with exactly one entry per detection, in detection order: the
        ``track_id`` of the matched track, or ``None`` when no track overlaps
        that detection.
    """
    detection_count = len(detections.xyxy)

    # Nothing to match. Still return one slot per detection so the result
    # always lines up with `detections` (a zero-length array here would not
    # when detections exist but no track does).
    if detection_count == 0 or len(tracks) == 0:
        return [None] * detection_count

    tracks_boxes = tracks2boxes(tracks=tracks)
    iou = box_iou_batch(tracks_boxes, detections.xyxy)
    # for every track, the index of the detection with the highest IoU
    track2detection = np.argmax(iou, axis=1)

    tracker_ids = [None] * detection_count

    for tracker_index, detection_index in enumerate(track2detection):
        # zero IoU means the track does not touch even its best candidate
        if iou[tracker_index, detection_index] != 0:
            tracker_ids[detection_index] = tracks[tracker_index].track_id

    return tracker_ids


In [None]:
@dataclass(frozen=True)
class BYTETrackerArgs:
    """Immutable bundle of settings handed to ``BYTETracker`` when the tracker is built.

    Field names presumably mirror the attributes ByteTrack reads from its args
    object; their exact meaning is defined by ByteTrack, not by this notebook —
    TODO confirm against the ByteTrack sources before tuning any value.
    """
    track_thresh: float = 0.25
    track_buffer: int = 30
    match_thresh: float = 0.8
    aspect_ratio_thresh: float = 3.0
    min_box_area: float = 1.0
    mot20: bool = False

In [None]:
# Open a frame generator over the clip whose path was defined earlier.
generator = get_video_frames_generator(SOURCE_VIDEO_PATH)

# Last expression of the cell, so the notebook displays the clip's metadata.
VideoInfo.from_video_path(SOURCE_VIDEO_PATH)


In [None]:
# Track the selected vehicle classes through the whole clip, count how many
# cross a line, and write the annotated result to TARGET_VIDEO_PATH.
# Everything here uses the `superviz` API, which is the API the callback's
# update_with_detections / LineZone / TraceAnnotator calls belong to.

# the annotated clip is written next to the source clip
_source_stem, _source_ext = os.path.splitext(SOURCE_VIDEO_PATH)
TARGET_VIDEO_PATH = f"{_source_stem}_tracked{_source_ext}"

# create the tracker; supervision's ByteTrack provides update_with_detections()
byte_tracker = superviz.ByteTrack()

# create VideoInfo instance
video_info = superviz.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)

# create frame generator
generator = superviz.get_video_frames_generator(SOURCE_VIDEO_PATH)

# counting line: horizontal, spanning the full frame width at mid-height
LINE_START = superviz.Point(0, video_info.height // 2)
LINE_END = superviz.Point(video_info.width, video_info.height // 2)

# create LineZone instance, it is previously called LineCounter class
line_zone = superviz.LineZone(start=LINE_START, end=LINE_END)

# create the annotators: boxes, text labels, motion traces and the counting line
box_annotator = superviz.BoxAnnotator(thickness=4)
label_annotator = superviz.LabelAnnotator(text_thickness=4, text_scale=2)
trace_annotator = superviz.TraceAnnotator(thickness=4, trace_length=50)
line_zone_annotator = superviz.LineZoneAnnotator(thickness=4, text_thickness=4, text_scale=2)


# define call back function to be used in video processing
def callback(frame: np.ndarray, index: int) -> np.ndarray:
    """Detect, track and annotate the vehicles in one video frame.

    Args:
        frame: the frame handed over by ``superviz.process_video``.
        index: position of the frame in the video (not used here).

    Returns:
        A copy of ``frame`` with traces, boxes, labels and the counting line
        drawn on it.
    """
    # model prediction on single frame and conversion to supervision detections
    results = model(frame, verbose=False)[0]
    detections = superviz.Detections.from_ultralytics(results)

    # only consider class ids listed in selected_classes
    detections = detections[np.isin(detections.class_id, selected_classes)]

    # tracking detections
    detections = byte_tracker.update_with_detections(detections)

    # Read the fields by name instead of unpacking each detection as a tuple,
    # so the label code does not depend on how many items iteration yields.
    labels = [
        f"#{tracker_id} {model.model.names[class_id]} {confidence:0.2f}"
        for confidence, class_id, tracker_id
        in zip(detections.confidence, detections.class_id, detections.tracker_id)
    ]

    # draw on a copy so the frame passed in is never modified
    annotated_frame = trace_annotator.annotate(
        scene=frame.copy(),
        detections=detections
    )
    annotated_frame = box_annotator.annotate(
        scene=annotated_frame,
        detections=detections
    )
    annotated_frame = label_annotator.annotate(
        scene=annotated_frame,
        detections=detections,
        labels=labels
    )

    # update line counter
    line_zone.trigger(detections)

    # return frame with box and line annotated result
    return line_zone_annotator.annotate(annotated_frame, line_counter=line_zone)


# process the whole video
superviz.process_video(
    source_path=SOURCE_VIDEO_PATH,
    target_path=TARGET_VIDEO_PATH,
    callback=callback
)

In [None]:
from IPython.display import Video

# Show the source clip. The earlier plain (non f-) string left the literal text
# "{SOURCE_VIDEO_PATH}" in the HTML, so no video was ever loaded.
# embed=True inlines the file, so it plays even where the notebook front end
# cannot reach local file paths (note: this stores the clip in the cell output).
Video(SOURCE_VIDEO_PATH, embed=True, width=320, height=240)

In [None]:
from IPython.display import Video

# Show the annotated clip. The earlier plain (non f-) string left the literal
# text "{TARGET_VIDEO_PATH}" in the HTML, so no video was ever loaded.
# embed=True inlines the file, so it plays even where the notebook front end
# cannot reach local file paths (note: this stores the clip in the cell output).
Video(TARGET_VIDEO_PATH, embed=True, width=320, height=240)