In [1]:
# Copyright (c) Meta Platforms, Inc. and affiliates.

# SAM 3 Agent

This notebook shows an example of how an MLLM can use SAM 3 as a tool, i.e., "SAM 3 Agent", to segment more complex text queries such as "the rightmost person wearing blue vest".

## Env Setup

First install `sam3` in your environment using the [installation instructions](https://github.com/facebookresearch/sam3?tab=readme-ov-file#installation) in the repository.

In [2]:
import torch
# Notebook-wide PyTorch configuration. Everything in this cell changes global
# state that stays in effect for all cells executed afterwards.

# turn on tfloat32 for Ampere GPUs
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# use bfloat16 for the entire notebook. If your card doesn't support it, try float16 instead
# (the context is entered here and never exited, so it is not limited to this cell)
torch.autocast("cuda", dtype=torch.bfloat16).__enter__()

# inference mode for the whole notebook. Disable if you need gradients
# (entered without a matching exit, same as the autocast context above)
torch.inference_mode().__enter__()

In [3]:
import os

# Resolve the repository root only once per kernel session. The root is taken to be
# the parent of the directory the notebook was started in. Without this guard,
# re-running the cell would climb one more directory level on every execution,
# because the previous run has already changed the working directory.
if "SAM3_ROOT" not in globals():
    SAM3_ROOT = os.path.dirname(os.getcwd())
os.chdir(SAM3_ROOT)

# Select the GPU to use - a single GPU is sufficient for this demo.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Print the GPU status; the shell exit code is discarded on purpose.
_ = os.system("nvidia-smi")

Fri Oct 31 00:15:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:53:00.0 Off |                    0 |
| N/A   26C    P0             69W /  700W |       4MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  |   00

## Build SAM3 Model

In [4]:
from sam3 import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor

# BPE vocabulary file located under the repository's assets directory.
bpe_path = os.path.join(SAM3_ROOT, "assets/bpe_simple_vocab_16e6.txt.gz")

# Checkpoint location. Set the SAM3_CHECKPOINT_PATH environment variable to point at
# your own checkpoint file; when it is unset, the original default path is used.
checkpoint_path = os.environ.get(
    "SAM3_CHECKPOINT_PATH",
    "/checkpoint/sam3/chayryali/omnivision_onevision/config/experiments/chayryali/sam3v1_stage3/final/paper_launch_model_oct/launch_stage3_v2.yaml/0/checkpoints/checkpoint.pt",
)

# Build the image model, move it to the GPU and wrap it in the processor.
# NOTE: the name `model` is reassigned to an LLM config key in the "LLM Setup" cell;
# later cells reach the SAM 3 model through `processor`.
model = build_sam3_image_model(bpe_path=bpe_path, checkpoint_path=checkpoint_path).cuda()
processor = Sam3Processor(model, confidence_threshold=0.5)

## LLM Setup

Configure which MLLM to use. It can be either a model served by vLLM that you launch on your own machine or a model served via an external API. If you want to use a vLLM model, instructions are provided below.

In [5]:
# Table of supported MLLMs, keyed by a short name. Each entry names its provider and
# model identifier; entries for external APIs additionally carry the API base URL.
LLM_CONFIGS = {
    # vLLM-served models
    "qwen2.5_7b": {
        "provider": "vllm",
        "model": "Qwen/Qwen2.5-VL-7B-Instruct",
    },
    "qwen2.5_72b": {
        "provider": "vllm",
        "model": "Qwen/Qwen2.5-VL-72B-Instruct",
    },
    "qwen3_vl_2b_instruct": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-2B-Instruct",
    },
    "qwen3_vl_2b_thinking": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-2B-Thinking",
    },
    "qwen3_vl_4b_instruct": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-4B-Instruct",
    },
    "qwen3_vl_4b_thinking": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-4B-Thinking",
    },
    "qwen3_vl_8b_instruct": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-8B-Instruct",
    },
    "qwen3_vl_8b_thinking": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-8B-Thinking",
    },
    "llama4_maverick": {
        "provider": "vllm",
        "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    },
    "llama4_scout": {
        "provider": "vllm",
        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    },
    # models with external API calls
    "gpt5": {
        "provider": "openai",
        "base_url": "https://api.openai.com/v1/",
        "model": "gpt-5",
    },
    "gemini_pro": {
        "provider": "gemini",
        "base_url": "https://generativelanguage.googleapis.com/v1beta/",
        "model": "gemini-2.5-pro",
    },
    "gemini_flash": {
        "provider": "gemini",
        "base_url": "https://generativelanguage.googleapis.com/v1beta/",
        "model": "gemini-2.5-flash",
    },
}

# Select the MLLM by its key in LLM_CONFIGS, e.g. "qwen2.5_7b", "gpt5" or "gemini_pro".
model = "qwen3_vl_8b_thinking"

# API key sent with every request. It is read from the LLM_API_KEY environment
# variable so that real OpenAI / Gemini keys never have to be pasted into the
# notebook; when the variable is unset, the original placeholder value is used.
LLM_API_KEY = os.environ.get("LLM_API_KEY", "DUMMY_API_KEY")

# Build a per-run copy of the selected entry. Copying (instead of assigning into
# LLM_CONFIGS[model]) keeps the API key and name out of the shared table.
llm_config = {**LLM_CONFIGS[model], "api_key": LLM_API_KEY, "name": model}

# setup API endpoint
if llm_config["provider"] == "vllm":
    LLM_SERVER_URL = "http://0.0.0.0:8001/v1"  # replace this with your vLLM server address as needed
else:
    LLM_SERVER_URL = llm_config["base_url"]

### Setup vLLM server (optional)
This step is only required if you are using a model served by vLLM. Skip it if you are calling the LLM through an external API such as Gemini or GPT.

* Install vLLM (in a separate conda env from SAM 3 to avoid dependency conflicts).
  ```bash
    conda create -n vllm python=3.12
    conda activate vllm
    pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
  ```
* Start the vLLM server on the same machine as this notebook
  ```bash
    # qwen 2.5 VL 7B
    vllm serve Qwen/Qwen2.5-VL-7B-Instruct --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001
    
    # qwen 2.5 VL 72B
    vllm serve Qwen/Qwen2.5-VL-72B-Instruct --tensor-parallel-size 8 --allowed-local-media-path / --enforce-eager --port 8001

    # qwen 3 VL 2B instruct
    vllm serve Qwen/Qwen3-VL-2B-Instruct --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001

    # qwen 3 VL 2B thinking
    vllm serve Qwen/Qwen3-VL-2B-Thinking --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001

    # qwen 3 VL 4B instruct
    vllm serve Qwen/Qwen3-VL-4B-Instruct --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001

    # qwen 3 VL 4B thinking
    vllm serve Qwen/Qwen3-VL-4B-Thinking --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001

    # qwen 3 VL 8B instruct
    vllm serve Qwen/Qwen3-VL-8B-Instruct --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001

    # qwen 3 VL 8B thinking
    vllm serve Qwen/Qwen3-VL-8B-Thinking --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001

    # Llama 4 Maverick
    VLLM_DISABLE_COMPILE_CACHE=1 vllm serve meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8   --tensor-parallel-size 8   --max-model-len 430000 --allowed-local-media-path / --port 8001
    
    # Llama 4 Scout
    VLLM_DISABLE_COMPILE_CACHE=1 vllm serve meta-llama/Llama-4-Scout-17B-16E-Instruct   --tensor-parallel-size 4   --max-model-len 30000 --allowed-local-media-path / --max-num-seqs 1 --port 8001
  ```

## Run SAM3 Agent Inference

In [6]:
from functools import partial
from IPython.display import display, Image
from sam3.agent.client_llm import send_generate_request as send_generate_request_orig
from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig
from sam3.agent.inference import run_single_image_inference

  assert (


In [None]:
# Inputs for the single-image run: the text query and the absolute image path.
prompt = "the rightmost person wearing blue vest"
image = os.path.abspath("assets/images/test_image.jpg")

# Pre-bind the LLM endpoint details and the SAM 3 processor to the client helpers.
send_generate_request = partial(
    send_generate_request_orig,
    server_url=LLM_SERVER_URL,
    model=llm_config["model"],
    api_key=llm_config["api_key"],
)
call_sam_service = partial(call_sam_service_orig, sam3_processor=processor)

# Run the agent on the image with debug output enabled.
output_image_path = run_single_image_inference(
    image,
    prompt,
    llm_config,
    send_generate_request,
    call_sam_service,
    debug=True,
)

# Show the result only when the run returned an output path.
if output_image_path is not None:
    display(Image(filename=output_image_path))

------------------------------ Starting SAM 3 Agent Session... ------------------------------ 
> Text prompt: the rightmost person wearing blue vest
> Image path: /storage/home/jielei/sam3/assets/images/test_image.jpg



------------------------------ Round 1------------------------------



image_path /storage/home/jielei/sam3/assets/images/test_image.jpg
🔍 Calling model Qwen/Qwen3-VL-8B-Thinking...

>>> MLLM Response [start]
State that there is only one image in the message history (the raw input image). Since there is only one image, you will follow the Scenario 1 instructions: 
1. Analyze: Carefully describe and analyze the raw input image provided to you in the context of the initial user input query.
2. Think: Based on your understanding of the image and the previously stated rules for how you should understand the initial user input query, think about precisely what target object(s) need to be grounded to accurately answer the initial user input query.
3. Remind: Remind yours