### TODOs

- [x] Load SAM3 as a model
- [x] Gemini / OpenAI API support
- [ ] Final model checkpoint path
- [ ] Clean up client_sam3.py
- [ ] Dependencies
- [ ] Add a short intro
- [ ] Clean up TODOs in code


This notebook requires one GPU to run SAM3.

In [1]:
# Go to the sam3 root dir: later cells use paths relative to it
# (e.g. "assets/images/test_image.jpg").
# NOTE(review): assumes the repo is checked out at ~/code/sam3 — adjust if yours lives elsewhere.
%cd ~/code/sam3

/storage/home/jialez/code/sam3


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import os
import json
from functools import partial
from pathlib import Path
from typing import List

import torch
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.colors import to_rgb
import pycocotools.mask as mask_utils


from sam3 import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor
from sam3.train.masks_ops import rle_encode
from sam3.agent.helpers.mask_overlap_removal import remove_overlapping_masks
from sam3.agent.viz import visualize
from sam3.agent.agent_core import agent_inference
from sam3.agent.client_llm import send_generate_request as send_generate_request_orig
#from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig



Enabled the use of perflib.


# LLM Setup

In [3]:
# Registry of supported LLM back-ends, keyed by a short alias.
# "provider" decides how the server URL is chosen in the next cell;
# hosted providers additionally carry the "base_url" of their API.
LLM_CONFIGS = {
    # VLLM-served models
    "qwen2.5_7b": {
        "provider": "vllm",
        "model": "Qwen/Qwen2.5-VL-7B-Instruct",
    },
    "qwen2.5_72b": {
        "provider": "vllm",
        "model": "Qwen/Qwen2.5-VL-72B-Instruct",
    },
    "qwen3_235b": {
        "provider": "vllm",
        "model": "Qwen/Qwen3-VL-235B-A22B-Instruct",
    },
    "llama4_maverick": {
        "provider": "vllm",
        "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    },
    "llama4_scout": {
        "provider": "vllm",
        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    },
    # Hosted APIs
    "gpt5": {
        "provider": "openai",
        "base_url": "https://api.openai.com/v1/",
        "model": "gpt-5",
    },
    "gemini_pro": {
        "provider": "gemini",
        "base_url": "https://generativelanguage.googleapis.com/v1beta/",
        "model": "gemini-2.5-pro",
    },
    "gemini_flash": {
        "provider": "gemini",
        "base_url": "https://generativelanguage.googleapis.com/v1beta/",
        "model": "gemini-2.5-flash",
    },
}

# The alias is deliberately NOT stored in a variable called `model`: that name
# is bound to the SAM3 network later in the notebook, and re-running this cell
# to switch LLMs would otherwise overwrite the network with a string.
llm_name = "qwen2.5_7b"  # @param ["qwen2.5_7b","qwen2.5_72b","qwen3_235b","llama4_maverick","llama4_scout","gpt5","gemini_pro","gemini_flash"] {"allow-input":true}
# Placeholder key; do not commit a real key to the notebook.
LLM_API_KEY = "DUMMY_API_KEY"  # @param ["DUMMY_API_KEY"]

#llm_name = "gpt5"
#LLM_API_KEY = "YOUR_OPENAI_API_KEY"

#llm_name = "gemini_pro"
#LLM_API_KEY = "YOUR_GEMINI_API_KEY"

if llm_name not in LLM_CONFIGS:
    raise KeyError(f"Unknown LLM '{llm_name}'; choose one of {sorted(LLM_CONFIGS)}")

# Build a fresh dict instead of mutating the registry entry, so the API key
# never ends up inside LLM_CONFIGS and the cell can be re-run safely.
llm_config = {**LLM_CONFIGS[llm_name], "api_key": LLM_API_KEY, "name": llm_name}

In [4]:
# Start the LLM server before running this cell; skip that step if you are
# calling a hosted LLM through its API.
#
# Qwen2.5-VL 7B
#   vllm serve Qwen/Qwen2.5-VL-7B-Instruct --tensor-parallel-size 1 --allowed-local-media-path / --enforce-eager --port 8001
#
# Qwen2.5-VL 72B
#   vllm serve Qwen/Qwen2.5-VL-72B-Instruct --tensor-parallel-size 8 --allowed-local-media-path / --enforce-eager --port 8001
#
# Llama 4 Maverick
#   VLLM_DISABLE_COMPILE_CACHE=1 vllm serve meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8   --tensor-parallel-size 8   --max-model-len 430000 --allowed-local-media-path / --port 8001
#
# Llama 4 Scout
#   VLLM_DISABLE_COMPILE_CACHE=1 vllm serve meta-llama/Llama-4-Scout-17B-16E-Instruct   --tensor-parallel-size 4   --max-model-len 30000 --allowed-local-media-path / --max-num-seqs 1 --port 8001

if llm_config["provider"] == "vllm":
    # Defaults to a vLLM server on this machine (port 8001, matching the
    # commands above). Set the LLM_SERVER_URL environment variable when the
    # server runs on another host.
    LLM_SERVER_URL = os.environ.get("LLM_SERVER_URL", "http://localhost:8001/v1")
else:
    # Hosted providers carry their endpoint in the config entry.
    LLM_SERVER_URL = llm_config["base_url"]


# Build SAM3 Model

In [5]:

# Resolve the current user's home directory once and derive everything from it.
home_dir = Path.home()

# $HOME/me/code — printed only as a sanity check of the home directory.
me_path = home_dir.joinpath("me", "code")
print(me_path)

# SAM3 checkout and the BPE vocabulary shipped in its assets folder.
sam3_root = home_dir.joinpath("code", "sam3")
bpe_path = f"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz"


/home/jialez/me/code


In [6]:
# Locate the SAM3 checkout; all asset paths below are derived from it.
sam3_root = Path.home() / "code" / "sam3"

# BPE vocabulary file handed to the model builder.
bpe_path = f"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz"

# Checkpoint to load. A previously used alternative:
# checkpoint_path = f"{sam3_root}/assets/checkpoints/sam3_prod_v12_interactive_5box_image_only.pt"
checkpoint_path = f"{sam3_root}/assets/checkpoints/checkpoint.pt"
has_presence_token = True

# `model` and `processor` are read as globals by call_sam_service below.
model = build_sam3_image_model(
    bpe_path=bpe_path,
    checkpoint_path=checkpoint_path,
    has_presence_token=has_presence_token,
)
processor = Sam3Processor()


In [7]:
def sam3_inference(model, processor, image_path, prompt, output_prob_thresh=0.5):
    """Run SAM3 on one image with a text prompt and return serializable predictions.

    Args:
        model: SAM3 image model; must provide ``run_inference(inference_state)``.
        processor: ``Sam3Processor`` used to build the inference state, add the
            text prompt and post-process the raw output.
        image_path: Path of the image to segment.
        prompt: Text prompt describing what to segment.
        output_prob_thresh: Probability threshold forwarded to
            ``processor.postprocess_output``. Defaults to 0.5, the value that
            was previously hard-coded.

    Returns:
        A dict with:
            ``orig_img_h`` / ``orig_img_w``: size of the input image in pixels,
            ``pred_boxes``: ``out['out_boxes_xywh']`` as nested lists
                (presumably xywh boxes, going by the key name — confirm),
            ``pred_masks``: the ``'counts'`` entry of every RLE-encoded mask,
            ``pred_scores``: ``out['out_probs']`` as a list.
    """
    # Keep the image open for the whole inference and close it afterwards so
    # the file handle is released deterministically.
    with Image.open(image_path) as image:
        orig_img_w, orig_img_h = image.size

        inference_state = processor(image, instance_prompt=False)
        processor.add_prompt(inference_state, text_str=prompt, instance_prompt=False)
        model.run_inference(inference_state)
        out = processor.postprocess_output(inference_state, output_prob_thresh=output_prob_thresh)

    # Only the RLE 'counts' are kept; the mask size is carried by orig_img_h/w.
    encoded_masks = rle_encode(torch.from_numpy(out['out_binary_masks']))
    pred_masks = [m['counts'] for m in encoded_masks]

    return {
        "orig_img_h": orig_img_h,
        "orig_img_w": orig_img_w,
        "pred_boxes": out['out_boxes_xywh'].tolist(),
        "pred_masks": pred_masks,
        "pred_scores": out['out_probs'].tolist(),
    }

In [8]:
def call_sam_service(image_path: str, text_prompt: str, output_folder_path: str = "sam3_output", threshold: float = 0.5, selected_masks: List[int]=None):
    """
    Run SAM3 on an image with a text prompt, save the predictions as JSON,
    and render a visualization of the predicted masks.

    Inference runs in-process through ``sam3_inference`` using the
    notebook-level ``model`` and ``processor`` globals.

    Args:
        image_path: Path of the image to segment.
        text_prompt: Text prompt passed to SAM3.
        output_folder_path: Root output folder. Results are written to a
            sub-folder named after ``image_path`` with "/" replaced by "-".
        threshold: Currently unused; kept so the signature stays compatible.
        selected_masks: Currently unused; kept so the signature stays compatible.

    Returns:
        Path of the JSON file holding the predictions. If any step fails the
        error is printed and the path is still returned, so the file may be
        missing (or left over from an earlier run).
    """
    print(f"📞 Loading image '{image_path}' and sending with prompt '{text_prompt}'...")

    # "/" in the prompt would otherwise be treated as a directory separator.
    text_prompt_for_save_path = text_prompt.replace("/", "_")

    # One sub-folder per input image, shared by the JSON and the rendered PNG.
    result_dir = os.path.join(output_folder_path, image_path.replace("/", "-"))
    os.makedirs(result_dir, exist_ok=True)
    output_json_path = os.path.join(result_dir, f"{text_prompt_for_save_path}.json")
    output_image_path = os.path.join(result_dir, f"{text_prompt_for_save_path}.png")

    try:
        # 1. Run SAM3 and drop overlapping/duplicate masks
        serialized_response = sam3_inference(model, processor, image_path, text_prompt)
        serialized_response = remove_overlapping_masks(serialized_response)
        # The two path keys come first so they lead the saved JSON.
        serialized_response = {
            "output_image_path": output_image_path,
            "original_image_path": image_path,
            **serialized_response,
        }

        # 2. Reorder predictions by scores (highest to lowest) if scores are available
        scores = serialized_response.get('pred_scores')
        if scores:
            score_indices = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
            for key in ('pred_scores', 'pred_boxes', 'pred_masks'):
                values = serialized_response[key]
                serialized_response[key] = [values[i] for i in score_indices]

        # 3. Remove any invalid RLE mask that is too short (shorter than 5 characters),
        #    together with its box and score
        masks = serialized_response['pred_masks']
        keep = [i for i, rle in enumerate(masks) if len(rle) > 4]
        serialized_response['pred_masks'] = [masks[i] for i in keep]
        serialized_response['pred_boxes'] = [serialized_response['pred_boxes'][i] for i in keep]
        serialized_response['pred_scores'] = [serialized_response['pred_scores'][i] for i in keep]

        with open(output_json_path, 'w') as f:
            json.dump(serialized_response, f, indent=4)
        print(f"✅ Raw JSON response saved to '{output_json_path}'")

        # 4. Render the visualization on the image and save it next to the JSON
        print("🔍 Rendering visualizations on the image...")
        bgr_img = cv2.imread(image_path)
        if bgr_img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError(f"cv2 could not read image '{image_path}'")
        cv2_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
        boxes_array = np.array(serialized_response['pred_boxes'])
        mask_size = (serialized_response["orig_img_h"], serialized_response["orig_img_w"])
        coco_rle_masks = [{'size': mask_size, 'counts': rle} for rle in serialized_response['pred_masks']]
        binary_masks = [mask_utils.decode(rle_mask) for rle_mask in coco_rle_masks]
        viz_image = visualize(cv2_img, boxes_array, coco_rle_masks, binary_masks)
        viz_image.save(output_image_path)
        print("✅ Saved visualization at:", output_image_path)

    except Exception as e:
        # Best effort: report the failure and still hand the JSON path back.
        print(f"❌ Error calling service: {e}")

    return output_json_path

In [9]:
def run_single_image_inference(image_path, text_prompt, llm_config, send_generate_request, call_sam_service, output_dir="agent_output"):
    """Run the SAM3 agent on a single image with the provided prompt and save its outputs.

    Three files are written to ``output_dir``, all sharing a base name built
    from the image name, the prompt and the LLM name: ``*_Pred.json`` (final
    output), ``*_Pred.png`` (rendered final output) and ``*_History.json``
    (agent history).

    Args:
        image_path: Path of the input image; must exist.
        text_prompt: Query handed to the agent.
        llm_config: LLM configuration dict; only ``"name"`` is read here.
        send_generate_request: Callable forwarded to ``agent_inference`` for LLM calls.
        call_sam_service: Callable forwarded to ``agent_inference`` for SAM3 calls.
        output_dir: Folder for the outputs; created if missing.

    Returns:
        None. If the prediction JSON already exists the run is skipped.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    llm_name = llm_config["name"]

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Generate output file names ("/" and spaces are not wanted in file names)
    image_basename = os.path.splitext(os.path.basename(image_path))[0]
    prompt_for_filename = text_prompt.replace("/", "_").replace(" ", "_")

    base_filename = f"{image_basename}_{prompt_for_filename}_Agent_{llm_name}"
    output_json_path = os.path.join(output_dir, f"{base_filename}_Pred.json")
    output_image_path = os.path.join(output_dir, f"{base_filename}_Pred.png")
    agent_history_path = os.path.join(output_dir, f"{base_filename}_History.json")

    # Check if output already exists and skip
    if os.path.exists(output_json_path):
        print(f"Output JSON {output_json_path} already exists. Skipping.")
        return

    print(f"\n\n\n--------------Processing single image with prompt: {text_prompt}--------------\n")
    print(f"Image: {image_path}")
    print(f"Output directory: {output_dir}")

    agent_history, final_output_dict, rendered_final_output = agent_inference(
        image_path, text_prompt,
        send_generate_request=send_generate_request,
        call_sam_service=call_sam_service
    )

    final_output_dict["text_prompt"] = text_prompt
    final_output_dict["image_path"] = image_path

    # Save outputs; `with` guarantees the files are flushed and closed.
    with open(output_json_path, 'w') as f:
        json.dump(final_output_dict, f, indent=4)
    with open(agent_history_path, 'w') as f:
        json.dump(agent_history, f, indent=4)
    rendered_final_output.save(output_image_path)

    print("\n✅ Successfully processed single image!")
    print(f"Output JSON: {output_json_path}")
    print(f"Output Image: {output_image_path}")
    print(f"Agent History: {agent_history_path}")

# Run SAM3 Agent Inference

In [10]:
prompt = "People wearing blue clothes"

# The relative path is resolved against the current working directory, so the
# agent receives an absolute image path.
image = os.path.abspath("assets/images/test_image.jpg")

# Bind the LLM endpoint, model id and API key once; the agent then calls the
# resulting function without repeating them.
send_generate_request = partial(
    send_generate_request_orig,
    server_url=LLM_SERVER_URL,
    model=llm_config["model"],
    api_key=llm_config["api_key"],
)
#call_sam_service = partial(call_sam_service_orig, server_url=SAM3_SERVICE_URL)

# Run the agent on the single image/prompt pair.
run_single_image_inference(image, prompt, llm_config, send_generate_request, call_sam_service)




--------------Processing single image with prompt: People wearing blue clothes--------------

Image: /storage/home/jialez/code/sam3/assets/images/test_image.jpg
Output directory: agent_output

Initial text prompt:

 People wearing blue clothes


Initial image path:

 /storage/home/jialez/code/sam3/assets/images/test_image.jpg
image_path /storage/home/jialez/code/sam3/assets/images/test_image.jpg
Calling model Qwen/Qwen2.5-VL-7B-Instruct...
Received response: ChatCompletionMessage(content='<think>There is only one image in the message history (the raw input image). Since there is only one image, I will follow the Scenario 1 instructions: \n1. Analyze: The image shows a group of children playing on a basketball court. Some of the children are wearing blue clothes, while others are wearing red and pink clothes.\n2. Think: Based on the image, the target object(s) are the children wearing blue clothes. The initial user input query is straightforward and does not require any complex inter

  with torch.cuda.amp.autocast(enabled=False):


✅ Raw JSON response saved to '/checkpoint/sam3/jialez//code/out/sam_out/-storage-home-jialez-code-sam3-assets-images-test_image.jpg/people wearing blue clothes.json'
🔍 Rendering visualizations on the image...
✅ Saved visualization at: /checkpoint/sam3/jialez//code/out/sam_out/-storage-home-jialez-code-sam3-assets-images-test_image.jpg/people wearing blue clothes.png


sam3_output_text_message:
 The segment_phrase tool generated 3 available masks. All 3 available masks are rendered in this image below, now you must analyze the 3 available mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action. Please be reminded that the original user query was 'People wearing blue clothes'.
image_path /storage/home/jialez/code/sam3/assets/images/test_image.jpg
image_path /checkpoint/sam3/jialez//code/out/sam_out/-storage-home-jialez-code-sam3-assets-images-test_image.jpg/people wearing blue clothes.png
Calling model Qwen/Qwen2.5-VL