### Import necessary libraries

In [None]:
!pip install transformers supervision trackers

In [None]:
import torch
import supervision as sv
from trackers import DeepSORTFeatureExtractor, DeepSORTTracker
from transformers import AutoModelForObjectDetection, AutoImageProcessor

### Defining constants

In [None]:
# Pick the compute device: the CUDA GPU when one is available, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Color palette for visualization, built from a list of hex color codes
color = sv.ColorPalette.from_hex(
    [
        "#ffff00",
        "#ff9b00",
        "#ff8080",
        "#ff66b2",
        "#ff66ff",
        "#b266ff",
        "#9999ff",
        "#3399ff",
        "#66ffff",
        "#33ff99",
        "#66ff66",
        "#99ff00",
    ]
)

# Look colors up by track ID.
# This means objects with the same track ID are annotated with the same color.
color_lookup = sv.ColorLookup.TRACK

### Demo Video

In [None]:
# Define input and output video paths
source_video_path = "/content/Video_Ready_Cat_and_Play.mp4"
save_video_path = "/content/DFine_Object_Detection_Result.mp4"

# Extract video information (width, height, fps) from the source;
# video_info.fps is used below to configure the tracker's frame rate
video_info = sv.VideoInfo.from_video_path(source_video_path)
print(video_info)

### Object Detection Model

In [None]:
# DFine checkpoint trained on the Objects365 dataset
checkpoint = "ustc-community/dfine-large-obj365"
print(f"Loading object detection model: {checkpoint}")

# Load the image processor and the detector from the same checkpoint,
# then move the detector to the selected device
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForObjectDetection.from_pretrained(checkpoint)
model = model.to(device)

# Lower-cased class name -> class id, so class names can be looked up case-insensitively
label2id = {name.lower(): class_id for name, class_id in model.config.label2id.items()}

### Tracking Model

In [None]:
# Initialize the DeepSORT feature extractor with a MobileNetV4 backbone loaded through timm.
# NOTE: this checkpoint is not pretrained for the ReID task, so a dedicated ReID model
# may work better here; feel free to swap in one of your own.
feature_extractor = DeepSORTFeatureExtractor.from_timm("mobilenetv4_conv_small.e1200_r224_in1k")
# Build the tracker on top of the extractor, using the source video's fps as its frame rate
tracker = DeepSORTTracker(feature_extractor, frame_rate=video_info.fps)

### Process Video

In [None]:
# Box annotator draws rectangles around detected objects
box_annotator = sv.BoxAnnotator(
    color=color,
    color_lookup=color_lookup,
)

# Label annotator draws a text label for each detection;
# the label text itself is supplied when annotate() is called
label_annotator = sv.LabelAnnotator(
    color=color,
    text_color=sv.Color.BLACK,
    text_scale=0.8,
    color_lookup=color_lookup,
)

In [None]:
# Class names to keep after detection. process_frame matches them case-insensitively
# against the model's label map and silently ignores names the model does not know;
# an empty list disables the class filter.
TARGET_CLASSES = ["person", "cat", "other balls"]

In [None]:
def process_frame(frame, index):
    """
    Process a single video frame: detect objects, keep only the target classes,
    update the tracker, and annotate the frame.

    Args:
        frame: The current video frame (numpy array with height, width and channel axes).
            It is used as the callback of `sv.process_video`, whose frames are decoded
            with OpenCV and therefore arrive in BGR channel order.
        index: The frame number in the sequence (not used by this function; it is part
            of the callback signature)

    Returns:
        Annotated frame with detection boxes and class-name labels
    """

    # The Hugging Face image processor expects RGB input, while the incoming frame is BGR.
    # Reverse the channel axis and copy, so the detector gets a contiguous RGB array
    # (a negatively-strided view cannot be converted to a torch tensor).
    rgb_frame = frame[:, :, ::-1].copy()

    inputs = image_processor(images=rgb_frame, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert raw model outputs to bounding boxes, labels, and scores in frame coordinates
    h, w = frame.shape[:2]
    results = image_processor.post_process_object_detection(outputs, target_sizes=[(h, w)], threshold=0.3)
    detections = results[0]  # Get first image results (we're processing one frame at a time)

    # Keep only predictions whose class is listed in TARGET_CLASSES (an empty list keeps everything)
    if TARGET_CLASSES:
        # Names the model does not know are skipped rather than raising a KeyError
        target_class_ids = [
            label2id[class_name.lower()]
            for class_name in TARGET_CLASSES
            if class_name.lower() in label2id
        ]
        predicted_labels = detections["labels"]
        # Build the id tensor with the labels' own dtype and device; without an explicit
        # dtype an empty id list would become a float32 tensor.
        wanted_ids = torch.tensor(
            target_class_ids,
            dtype=predicted_labels.dtype,
            device=predicted_labels.device,
        )
        keep = torch.isin(predicted_labels, wanted_ids)
        detections = {k: v[keep] for k, v in detections.items()}

    # Convert detections to Supervision format and update the tracker with new detections.
    # The tracker receives the original, unmodified frame.
    detections = sv.Detections.from_transformers(detections, id2label=model.config.id2label)
    detections = tracker.update(detections, frame=frame)

    # One label per detection: the class name of the detected object
    labels = [model.config.id2label[class_id] for class_id in detections.class_id]

    annotated = box_annotator.annotate(scene=frame, detections=detections)
    annotated = label_annotator.annotate(scene=annotated, detections=detections, labels=labels)

    return annotated

In [None]:
# Run process_frame over the source video and write the result to save_video_path
sv.process_video(
    source_path=source_video_path,
    target_path=save_video_path,
    callback=process_frame,  # Apply our processing function to each frame
    show_progress=True,      # Display a progress bar
)
print("Video processing complete!")

### View the result!

In [None]:
# We need to encode video with H264 codec to show in browser
converted_video_path = save_video_path.replace(".mp4", "-h264.mp4")
!ffmpeg -y -loglevel error -i {save_video_path} -vcodec libx264 -acodec aac {converted_video_path}

In [None]:
# Display the H264-encoded result inline in the notebook output
from IPython.display import Video
Video(converted_video_path, embed=True, width=600)