### Install and import necessary libraries

In [None]:
!pip install transformers supervision trackers

Collecting supervision
  Downloading supervision-0.25.1-py3-none-any.whl.metadata (14 kB)
Collecting trackers
  Downloading trackers-2.0.1-py3-none-any.whl.metadata (9.5 kB)
Collecting firerequests>=0.1.2 (from trackers)
  Downloading firerequests-0.1.4-py3-none-any.whl.metadata (8.3 kB)
Collecting supervision
  Downloading supervision-0.26.0rc7-py3-none-any.whl.metadata (14 kB)
Collecting validators>=0.34.0 (from trackers)
  Downloading validators-0.35.0-py3-none-any.whl.metadata (3.9 kB)
Collecting fire (from firerequests>=0.1.2->trackers)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading trackers-2.0.1-py3-none-any.whl (24 kB)
Downloading supervision-0.26.0rc7-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m


In [None]:
import torch
import supervision as sv
from trackers import DeepSORTFeatureExtractor, DeepSORTTracker
from transformers import AutoModelForObjectDetection, AutoImageProcessor

### Defining constants

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Hex codes of the colors used when drawing annotations
palette_hex = [
    "#ffff00", "#ff9b00", "#ff8080", "#ff66b2",
    "#ff66ff", "#b266ff", "#9999ff", "#3399ff",
    "#66ffff", "#33ff99", "#66ff66", "#99ff00",
]
color = sv.ColorPalette.from_hex(palette_hex)

# Pick annotation colors by track ID: objects that share a track ID
# are annotated with the same color
color_lookup = sv.ColorLookup.TRACK

Using device: cuda


### Demo Video

In [None]:
# Video to analyse, and where the annotated result will be written
source_video_path = "/content/Video_Ready_Cat_and_Play.mp4"
save_video_path = "/content/DFine_Object_Detection_Result.mp4"

# Read the source video's metadata (resolution, fps, frame count) and show it
video_info = sv.VideoInfo.from_video_path(video_path=source_video_path)
print(video_info)

VideoInfo(width=1280, height=720, fps=24, total_frames=192)


### Object Detection Model

In [None]:
# D-FINE checkpoint trained on the Objects365 dataset
checkpoint = "ustc-community/dfine-large-obj365"
print(f"Loading object detection model: {checkpoint}")

# The image processor prepares frames for the model and post-processes its outputs
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# Load the detector and move it to the selected device
model = AutoModelForObjectDetection.from_pretrained(checkpoint)
model = model.to(device)

# Lower-cased class name -> class id, so class names can be looked up case-insensitively
label2id = dict(
    (class_name.lower(), class_id)
    for class_name, class_id in model.config.label2id.items()
)

Loading object detection model: ustc-community/dfine-large-obj365


preprocessor_config.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/20.2k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/128M [00:00<?, ?B/s]

### Tracking Model

In [None]:
# DeepSORT appearance features come from a MobileNetV4 backbone loaded via timm.
# This backbone is not pretrained for the ReID task, so a dedicated ReID model
# is worth trying if you have one.
reid_backbone = "mobilenetv4_conv_small.e1200_r224_in1k"
feature_extractor = DeepSORTFeatureExtractor.from_timm(reid_backbone)

# The tracker is configured with the frame rate of the source video
tracker = DeepSORTTracker(feature_extractor, frame_rate=video_info.fps)

model.safetensors:   0%|          | 0.00/15.2M [00:00<?, ?B/s]

### Process Video

In [None]:
# Draws a rectangle around each detection, colored via the palette and lookup mode defined above
box_annotator = sv.BoxAnnotator(color=color, color_lookup=color_lookup)

# Draws a text label next to each detection (the class name, see `process_frame`)
label_annotator = sv.LabelAnnotator(
    color=color,
    text_color=sv.Color.BLACK,
    text_scale=0.8,
    color_lookup=color_lookup,
)

In [None]:
# Class names to keep after detection (matched case-insensitively against the
# model's labels); an empty list disables the filtering
TARGET_CLASSES = [
    "person",
    "cat",
    "other balls",
]

In [None]:
def process_frame(frame, index):
    """
    Process a single video frame: detect objects, keep only the target classes,
    track them, and annotate the frame.

    Args:
        frame: The current video frame (numpy array, height x width x channels).
            Frames supplied by `sv.process_video` are decoded with OpenCV and
            are therefore in BGR channel order.
        index: The frame number in the sequence (not used here; it is part of
            the callback signature)

    Returns:
        The frame annotated with bounding boxes and class-name labels
    """

    # The image processor expects RGB input, so the detector gets an RGB copy.
    # The original BGR frame is kept for the tracker and for drawing.
    rgb_frame = frame[..., ::-1].copy()
    inputs = image_processor(images=rgb_frame, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert raw model outputs to bounding boxes, labels, and scores
    h, w, _ = frame.shape
    detections = image_processor.post_process_object_detection(outputs, target_sizes=[(h, w)], threshold=0.3)
    detections = detections[0]  # Get first image results (we're processing one frame at a time)

    # Keep only detections whose class is listed in TARGET_CLASSES
    if TARGET_CLASSES:
        predicted_labels = detections["labels"]
        # Names the model does not know are skipped
        target_class_ids = [
            label2id[class_name.lower()]
            for class_name in TARGET_CLASSES
            if class_name.lower() in label2id
        ]
        # Match dtype/device of the predictions so the comparison is well-defined
        # even when no target class matched (an empty list would default to float)
        target_ids = torch.tensor(
            target_class_ids,
            dtype=predicted_labels.dtype,
            device=predicted_labels.device,
        )
        keep = torch.isin(predicted_labels, target_ids)
        detections = {k: v[keep] for k, v in detections.items()}

    # Convert detections to Supervision format and update the tracker with new detections
    detections = sv.Detections.from_transformers(detections, id2label=model.config.id2label)
    detections = tracker.update(detections, frame=frame)

    # One label per detection: the class name
    labels = [model.config.id2label[class_id] for class_id in detections.class_id]

    frame = box_annotator.annotate(scene=frame, detections=detections)
    frame = label_annotator.annotate(scene=frame, detections=detections, labels=labels)

    return frame

In [None]:
# Run `process_frame` on every frame of the source video and write the
# returned frames to the target file
sv.process_video(
    callback=process_frame,
    source_path=source_video_path,
    target_path=save_video_path,
    show_progress=True,  # show a progress bar while processing
)
print("Video processing complete!")

Processing video:   0%|          | 0/192 [00:00<?, ?it/s]

Video processing complete!


### View the result!

In [None]:
# We need to encode video with H264 codec to show in browser
converted_video_path = save_video_path.replace(".mp4", "-h264.mp4")
!ffmpeg -y -loglevel error -i {save_video_path} -vcodec libx264 -acodec aac {converted_video_path}

In [None]:
from IPython.display import Video

# Embed the H.264 re-encoded result in the notebook output
Video(data=converted_video_path, embed=True, width=600)