<a href="https://colab.research.google.com/github/databydepew/Finance/blob/master/notebooks/video_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -U pip uv
!pip install -U git+https://github.com/qubvel/rt-pose.git
!uv pip install --system "moviepy==2.*" supervision

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Collecting uv
  Downloading uv-0.6.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uv-0.6.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv, pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1 uv-0.6.14
Collecting git+https://github.com/qubvel/rt-pose.git
  Cloning https://github.com/qubvel/rt-pose.git to /tmp/pip-req-build-ogevp3a6
  Running command git clone --filter=blob:none --quiet https://github.c

In [3]:
import os
import torch
import moviepy
import argparse
import numpy as np
import supervision as sv
import huggingface_hub

from tqdm import tqdm
from rt_pose import PoseEstimationPipeline, PoseEstimationOutput

ModuleNotFoundError: No module named 'supervision'

In [None]:
device = "cuda"

capability = torch.cuda.get_device_capability(device)
dtype = torch.bfloat16 if capability > (8, 0) else torch.float16

print(f"Using device: {device}")
print(f"Using dtype: {dtype}")

In [None]:
# Load pose estimation pipeline
pipeline = PoseEstimationPipeline(
    object_detection_checkpoint="PekingU/rtdetr_r34vd",
    pose_estimation_checkpoint="usyd-community/vitpose-plus-small",
    device="cuda",
    dtype=torch.float16,
    compile=True,  # True to get more speedup
)

In [None]:
# As you can see from logs below, model compilation is pretty long step.
# Compilation happens just-in-time, that is why we use warmup, to pass
# a few batches to the models to compile them.
pipeline.warmup()

## Defining some useful functions and loading demo clip

In [None]:
from IPython.display import HTML
from base64 import b64encode

def show_clip(path):
    with open(path, "rb") as f:
        mp4 = f.read()
    data = "data:video/mp4;base64," + b64encode(mp4).decode()
    return HTML(f'<video width=400 controls><source src={data} type="video/mp4"></video>')

def visualize_output(image: np.ndarray, output: PoseEstimationOutput, confidence: float = 0.3) -> np.ndarray:
    """
    Visualize pose estimation output.
    """
    keypoints_xy = output.keypoints_xy.float().cpu().numpy()
    scores = output.scores.float().cpu().numpy()

    # Supervision will not draw vertices with `0` score
    # and coordinates with `(0, 0)` value
    invisible_keypoints = scores < confidence
    scores[invisible_keypoints] = 0
    keypoints_xy[invisible_keypoints] = 0

    keypoints = sv.KeyPoints(xy=keypoints_xy, confidence=scores)

    _, y_min, _, y_max = output.person_boxes_xyxy.T
    height = int((y_max - y_min).mean().item())
    radius = max(height // 100, 4)
    thickness = max(height // 200, 3)
    edge_annotator = sv.EdgeAnnotator(color=sv.Color.YELLOW, thickness=thickness)
    vertex_annotator = sv.VertexAnnotator(color=sv.Color.ROBOFLOW, radius=radius)

    annotated_frame = image.copy()
    annotated_frame = edge_annotator.annotate(annotated_frame, keypoints)
    annotated_frame = vertex_annotator.annotate(annotated_frame, keypoints)

    return annotated_frame

In [None]:
# Load demo clip from dataset, but you can use a local one
path = huggingface_hub.hf_hub_download(
    repo_id="qubvel-hf/assets", filename="rt_pose_break_dance_v1.mp4", repo_type="dataset"
)
clip = moviepy.VideoFileClip(path)

In [None]:
# Uncomment next line to show annotated clip
show_clip(path)

## Running pose estimation pipeline

In [None]:
annotated_frames = []
frames = list(clip.iter_frames())

for frame in tqdm(frames, total=clip.n_frames):
    output = pipeline(frame)
    annotated_frame = visualize_output(frame, output, confidence=0.3)
    annotated_frames.append(annotated_frame)

In [None]:
# Save annotated frames as video with the same audio from clip
annotated_clip = moviepy.ImageSequenceClip(annotated_frames, fps=clip.fps)
annotated_clip.audio = clip.audio

dst_path = "saved_video.mp4"
annotated_clip.write_videofile(dst_path)

In [None]:
# Uncomment next line to show annotated clip
show_clip(dst_path)