In [1]:
from transformers import Sam3VideoModel, Sam3VideoProcessor
from accelerate import Accelerator
import torch
import glob
from utils import save_masklet_video
import os

# Resolve the compute device through Accelerate, then load the video model and its
# processor from the same checkpoint. The model is moved to that device in bfloat16.
checkpoint = "facebook/sam3"
device = Accelerator().device
model = Sam3VideoModel.from_pretrained(checkpoint)
model = model.to(device, dtype=torch.bfloat16)
processor = Sam3VideoProcessor.from_pretrained(checkpoint)

Loading weights:   0%|          | 0/1797 [00:00<?, ?it/s]

In [2]:
# Load video frames
# Finds the video file for one video id by glob pattern and decodes it with load_video.
from transformers.video_utils import load_video

video_id = "jreOUu1p6MjTlf18ZOwF"
# Frames directory for this video; the save cell derives its output directory from it.
video_path = f"/data/engine_reite_results/{video_id}/ext_top/frames"

# The exact file name is not spelled out here, so it is located by pattern.
# Without recursive=True, glob treats "**" the same as "*".
video_pattern = f"/data/engine_reite_results/downloads/{video_id}/{video_id}**top**.mp4"
print(video_pattern)

# Sort so the chosen file is deterministic when several match, and fail with a clear
# message (rather than a bare IndexError) when nothing matches.
video_matches = sorted(glob.glob(video_pattern))
if not video_matches:
    raise FileNotFoundError(f"No video file matches pattern: {video_pattern}")
video_url = video_matches[0]
print(video_url)

# load_video returns two values; only the frames are kept.
video_frames, _ = load_video(video_url)

/data/engine_reite_results/downloads/jreOUu1p6MjTlf18ZOwF/jreOUu1p6MjTlf18ZOwF**top**.mp4
/data/engine_reite_results/downloads/jreOUu1p6MjTlf18ZOwF/jreOUu1p6MjTlf18ZOwF_ext_top_1761665220.mp4


In [6]:
# Build the video inference session from the decoded frames
session_options = dict(
    video=video_frames,
    inference_device=device,
    processing_device="cpu",
    video_storage_device="cpu",
    dtype=torch.bfloat16,
)
inference_session = processor.init_video_session(**session_options)

# Register the text prompt on the session
text = "small object"
prompt_options = dict(inference_session=inference_session, text=text)
inference_session = processor.add_text_prompt(**prompt_options)

# Propagate through the video and store the post-processed outputs keyed by frame index
outputs_per_frame = {}
frame_stream = model.propagate_in_video_iterator(
    inference_session=inference_session,
    max_frame_num_to_track=len(video_frames),
)
for model_outputs in frame_stream:
    outputs_per_frame[model_outputs.frame_idx] = processor.postprocess_outputs(inference_session, model_outputs)

print(f"Processed {len(outputs_per_frame)} frames")

# Summarise the outputs stored under frame index 0
frame_0_outputs = outputs_per_frame[0]
print(f"Detected {len(frame_0_outputs['object_ids'])} objects")
print(f"Object IDs: {frame_0_outputs['object_ids'].tolist()}")
print(f"Scores: {frame_0_outputs['scores'].tolist()}")
print(f"Boxes shape (XYXY format, absolute coordinates): {frame_0_outputs['boxes'].shape}")
print(f"Masks shape: {frame_0_outputs['masks'].shape}")


  0%|          | 0/480 [00:00<?, ?it/s]

Processed 480 frames
Detected 4 objects
Object IDs: [0, 1, 2, 4]
Scores: [0.75390625, 0.52734375, 0.68359375, 0.53125]
Boxes shape (XYXY format, absolute coordinates): torch.Size([4, 4])
Masks shape: torch.Size([4, 480, 640])


In [7]:
# Write the rendered video into a "vis" directory that sits next to the frames directory.
# Only the final path component is swapped: str.replace("frames", "vis") would also
# rewrite any earlier occurrence of "frames" in the path (for example inside a video id).
# normpath drops a trailing separator so dirname always returns the parent directory.
save_root = os.path.join(os.path.dirname(os.path.normpath(video_path)), "vis")
os.makedirs(save_root, exist_ok=True)

# The prompt text becomes part of the file name; a path separator inside it would
# otherwise point into a sub-directory that does not exist.
file_name = f"{text.replace(os.sep, '_')}_v2.mp4"

save_masklet_video(
    video_frames=video_frames,
    outputs=outputs_per_frame,
    out_path=os.path.join(save_root, file_name),
    fps=30,
)

  5%|▍         | 22/480 [00:00<00:02, 219.78it/s]

100%|██████████| 480/480 [00:03<00:00, 158.37it/s]
ffmpeg version 7.0.2-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 8 (Debian 8.3.0-6)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libvmaf --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librubberband --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libvorbis --enable-libopus --enable-libtheora --enable-libvidstab --enable-libvo-amrwbenc --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libdav1d --enable-libxvid --enable-libzvbi --enable-libzimg
  libavutil      59.  8.100 / 59.  8.100
  liba

Re-encoded video saved to /data/engine_reite_results/jreOUu1p6MjTlf18ZOwF/ext_top/vis/small object_v2.mp4


[out#0/mp4 @ 0x1ed24e80] video:2094KiB audio:0KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.294563%
frame=  480 fps=337 q=-1.0 Lsize=    2100KiB time=00:00:15.93 bitrate=1079.6kbits/s speed=11.2x    
[libx264 @ 0x1ed25080] frame I:3     Avg QP:19.21  size: 13713
[libx264 @ 0x1ed25080] frame P:178   Avg QP:23.21  size:  8125
[libx264 @ 0x1ed25080] frame B:299   Avg QP:26.85  size:  2193
[libx264 @ 0x1ed25080] consecutive B-frames: 11.9% 11.2% 11.9% 65.0%
[libx264 @ 0x1ed25080] mb I  I16..4: 16.4% 74.2%  9.4%
[libx264 @ 0x1ed25080] mb P  I16..4:  3.1% 13.2%  1.8%  P16..4: 36.8% 12.0%  8.2%  0.0%  0.0%    skip:24.8%
[libx264 @ 0x1ed25080] mb B  I16..4:  0.9%  2.5%  0.4%  B16..8: 30.3%  4.6%  1.3%  direct: 2.7%  skip:57.2%  L0:47.9% L1:44.4% BI: 7.7%
[libx264 @ 0x1ed25080] 8x8 transform intra:71.2% inter:73.7%
[libx264 @ 0x1ed25080] coded y,uvDC,uvAC intra: 46.6% 63.8% 17.3% inter: 13.3% 18.5% 4.7%
[libx264 @ 0x1ed25080] i16 v,h,dc,p: 19% 45% 13% 24%
[libx264 