In [1]:
import cv2
import numpy as np
from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor

In [2]:
# === Configuration ===
# Image to annotate and the YOLOE checkpoint handed to the model constructor.
image_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Berries_LLM\train\images\IMG_9394_jpg.rf.93cd662dac6324bfa4ef17b55494eaf7.jpg"
model_path = "yoloe-11l-seg.pt"
# With use_same_class=True every new box receives default_class_id; otherwise
# the class ID is read with input() when the box is committed.
use_same_class, default_class_id = True, 0
# Images wider than this are downscaled before being displayed.
max_display_width = 1200

# === Globals ===
# Mutable UI state shared by the mouse callback and the display loop.
boxes, class_ids = [], []      # parallel lists: [x1, y1, x2, y2] and its class ID
current_box = []               # anchor point (+ current point) of the box being drawn
drawing = False                # True while a new box is being dragged out
corner_drag = False            # True while a corner handle of an existing box is dragged
selected_idx = -1              # index into `boxes` of the selected box, -1 = none
corner_index = -1              # which corner (0..3) is being dragged, -1 = none
corner_size = 8                # handle half-size in pixels, also the grab tolerance
resize_ratio = 1.0             # display size / original size
mouse_x = 0                    # last cursor position reported to the callback
mouse_y = 0

def point_near(p1, p2, thresh=10):
    """Tell whether two points lie within `thresh` of each other on both axes.

    The comparison is per axis and strict, i.e. the points must fall inside an
    open square of half-width `thresh`, not inside a circle.

    Args:
        p1: First point; index 0 is x and index 1 is y.
        p2: Second point, same layout as `p1`.
        thresh: Exclusive upper bound on the per-axis distance.

    Returns:
        True when both |dx| and |dy| are below `thresh`, otherwise False.
    """
    # Check x first and bail out early, mirroring a short-circuit `and`.
    if not (abs(p1[0] - p2[0]) < thresh):
        return False
    return abs(p1[1] - p2[1]) < thresh

def find_corner(point, box):
    """Find which corner of `box`, if any, `point` is close enough to grab.

    Corners are indexed 0=(x1, y1), 1=(x2, y1), 2=(x2, y2), 3=(x1, y2) and are
    tested in that order, so the lowest matching index wins. Closeness is
    decided by `point_near` using the global `corner_size` as tolerance.

    Args:
        point: (x, y) position to test.
        box: Sequence of four values [x1, y1, x2, y2].

    Returns:
        The index (0..3) of the first matching corner, or -1 when none match.
    """
    xa, ya, xb, yb = box
    handles = ((xa, ya), (xb, ya), (xb, yb), (xa, yb))
    matches = (
        slot
        for slot, handle in enumerate(handles)
        if point_near(point, handle, thresh=corner_size)
    )
    return next(matches, -1)

def click_event(event, x, y, flags, param):
    """Mouse callback for the annotation window: draw boxes and drag corners.

    All state lives in module-level globals that the display loop also reads.

    Behaviour per event:
      * left button down -- if the press lands on a corner handle of an
        existing box, select that box and start dragging that corner;
        otherwise start drawing a new box anchored at the press position.
      * mouse move -- update the box being drawn, or move the dragged corner.
      * left button up -- commit the drawn box together with its class ID and
        end any draw/drag in progress.

    A box is only committed when it has a positive width and height, and the
    box and its class ID are appended together so `boxes` and `class_ids`
    always stay the same length (a non-numeric class ID discards the box).

    Args:
        event: OpenCV mouse event code (compared against ``cv2.EVENT_*``).
        x: Cursor x position in displayed-image pixels.
        y: Cursor y position in displayed-image pixels.
        flags: Unused.
        param: Unused.
    """
    global drawing, current_box
    global selected_idx, corner_drag, corner_index, mouse_x, mouse_y

    # Record the cursor on every event so the guide lines can follow it.
    mouse_x, mouse_y = x, y

    if event == cv2.EVENT_LBUTTONDOWN:
        # A press on a corner handle starts an edit instead of a new box.
        for idx, box in enumerate(boxes):
            ci = find_corner((x, y), box)
            if ci != -1:
                selected_idx = idx
                corner_drag = True
                corner_index = ci
                return
        drawing = True
        current_box = [(x, y)]

    elif event == cv2.EVENT_MOUSEMOVE:
        if drawing and current_box:
            # Keep the anchor, replace the moving end.
            current_box = [current_box[0], (x, y)]
        elif corner_drag and 0 <= selected_idx < len(boxes):
            # (x slot, y slot) inside [x1, y1, x2, y2] for corner index 0..3:
            # (x1, y1), (x2, y1), (x2, y2), (x1, y2).
            corner_slots = ((0, 1), (2, 1), (2, 3), (0, 3))
            if 0 <= corner_index < len(corner_slots):
                x_slot, y_slot = corner_slots[corner_index]
                box = boxes[selected_idx]  # mutated in place
                box[x_slot], box[y_slot] = x, y

    elif event == cv2.EVENT_LBUTTONUP:
        if drawing and len(current_box) == 2:
            (x1, y1), (x2, y2) = current_box
            box = [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]
            # A zero-width or zero-height drag encloses nothing; skip it.
            if box[0] < box[2] and box[1] < box[3]:
                try:
                    if use_same_class:
                        class_id = default_class_id
                    else:
                        # Resolve the class ID *before* touching the lists so a
                        # bad entry cannot leave `boxes` longer than `class_ids`.
                        class_id = int(input(f"Enter class ID for box {len(boxes) + 1}: "))
                except ValueError:
                    print("Invalid class ID; box discarded.")
                else:
                    boxes.append(box)
                    class_ids.append(class_id)
            current_box.clear()
        drawing = False
        corner_drag = False
        corner_index = -1

def draw_all(img):
    """Render every stored box and its four corner handles onto `img` in place.

    The box at `selected_idx` is outlined with (0, 0, 255); all others with
    (0, 255, 0). Handles are filled squares of half-size `corner_size` centred
    on each corner, drawn after (and therefore on top of) the outline.

    Args:
        img: Image array to draw on; it is modified directly.
    """
    handle_color = (255, 255, 0)
    for position, rect in enumerate(boxes):
        left, top, right, bottom = (int(value) for value in rect)

        outline = (0, 0, 255) if position == selected_idx else (0, 255, 0)
        cv2.rectangle(img, (left, top), (right, bottom), outline, 2)

        # One filled square per corner, same order as the corner indices.
        corners = ((left, top), (right, top), (right, bottom), (left, bottom))
        for hx, hy in corners:
            upper_left = (hx - corner_size, hy - corner_size)
            lower_right = (hx + corner_size, hy + corner_size)
            cv2.rectangle(img, upper_left, lower_right, handle_color, -1)

def draw_cursor_guides(img):
    """Draw a dashed crosshair through the last known cursor position.

    A vertical dashed line at `mouse_x` and a horizontal one at `mouse_y` span
    the whole image; each dash is 10 px long followed by a 10 px gap.

    Args:
        img: Image array to draw on; it is modified directly.
    """
    dash = 10
    period = 2 * dash  # dash plus the gap that follows it
    guide_color = (200, 200, 200)

    # Vertical guide.
    for top in range(0, img.shape[0], period):
        cv2.line(img, (mouse_x, top), (mouse_x, top + dash), guide_color, 1)

    # Horizontal guide.
    for left in range(0, img.shape[1], period):
        cv2.line(img, (left, mouse_y), (left + dash, mouse_y), guide_color, 1)

def main():
    """Collect box prompts interactively on `image_path`, then run YOLOE.

    Steps:
      1. Load the image and, if it is wider than `max_display_width`,
         downscale a copy for display (the scale is stored in `resize_ratio`).
      2. Show the image in an OpenCV window with `click_event` attached and
         redraw until Enter is pressed; Backspace deletes the selected box.
      3. Convert the boxes from display pixels to original-image pixels,
         clamp them to the image, and run visual-prompt inference.

    Boxes already present in the module-level `boxes` / `class_ids` lists are
    kept; this function never clears them.

    Returns:
        None. Returns early when the image cannot be read or no box exists.
    """
    global resize_ratio, selected_idx

    original = cv2.imread(image_path)
    if original is None:
        print("Image not found.")
        return

    h, w = original.shape[:2]
    if w > max_display_width:
        resize_ratio = max_display_width / w
        display = cv2.resize(original, (int(w * resize_ratio), int(h * resize_ratio)))
    else:
        # Reset explicitly: `resize_ratio` is a global, so a value left over
        # from an earlier run on a larger image would mis-scale every box.
        resize_ratio = 1.0
        display = original.copy()

    clone = display.copy()
    cv2.namedWindow("Image")
    cv2.setMouseCallback("Image", click_event)

    print("Draw boxes (click-drag). Drag corners to edit. \nPress 'Enter' to run inference.\nPress 'Backspace' while box is selected to delete it.")

    while True:
        # Redraw from the clean copy each frame so overlays do not accumulate.
        img_show = clone.copy()
        draw_all(img_show)
        draw_cursor_guides(img_show)

        if drawing and len(current_box) == 2:
            cv2.rectangle(img_show, current_box[0], current_box[1], (255, 255, 255), 1)

        cv2.imshow("Image", img_show)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('\r'):
            break
        elif key == ord('\b'):  # Backspace key
            if selected_idx != -1 and selected_idx < len(boxes):
                print(f"Deleted box {selected_idx + 1}")
                boxes.pop(selected_idx)
                class_ids.pop(selected_idx)
                selected_idx = -1

    cv2.destroyAllWindows()

    if not boxes:
        print("No boxes labeled.")
        return

    # Order each box's corners (a dragged corner may have crossed its
    # opposite) and map display pixels back to original-image pixels.
    scaled = []
    for box in boxes:
        x1, x2 = sorted((box[0], box[2]))
        y1, y2 = sorted((box[1], box[3]))
        scaled.append([x1 / resize_ratio, y1 / resize_ratio,
                       x2 / resize_ratio, y2 / resize_ratio])
    bboxes = np.array(scaled, dtype=np.float32)

    # A drag can end outside the displayed image; keep prompts inside it.
    bboxes[:, [0, 2]] = np.clip(bboxes[:, [0, 2]], 0, w)
    bboxes[:, [1, 3]] = np.clip(bboxes[:, [1, 3]], 0, h)

    visual_prompts = dict(
        bboxes=bboxes,
        cls=np.array(class_ids, dtype=np.int32),
    )

    print("Running inference...")
    model = YOLOE(model_path)
    results = model.predict(
        source=image_path,
        visual_prompts=visual_prompts,
        predictor=YOLOEVPSegPredictor,
        conf=0.05,
    )
    results[0].show()

In [3]:
# Open the annotation window; inference on the drawn boxes runs once Enter is pressed.
main()

Draw boxes (click-drag). Drag corners to edit. 
Press 'Enter' to run inference.
Press 'Backspace' while box is selected to delete it.
Running inference...
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yoloe-11l-seg.pt to 'yoloe-11l-seg.pt'...


100%|██████████| 67.7M/67.7M [00:12<00:00, 5.78MB/s]


Ultralytics 8.3.123  Python-3.12.8 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4080, 16376MiB)
YOLOe-11l-seg summary (fused): 227 layers, 35,117,862 parameters, 2,254,374 gradients

image 1/1 C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Berries_LLM\train\images\IMG_9394_jpg.rf.93cd662dac6324bfa4ef17b55494eaf7.jpg: 448x640 12 object0s, 106.4ms
Speed: 14.6ms preprocess, 106.4ms inference, 1081.1ms postprocess per image at shape (1, 3, 448, 640)


# Prompt-free

In [9]:
# Load the prompt-free YOLOE checkpoint.
model = YOLOE("yoloe-11l-seg-pf.pt")

# Predict on a single image; no prompts are passed to this variant.
results = model.predict(
    r"C:\Users\Mechanized Systems\DataspellProjects\AutoAnnotate\autoannotate study\berries-1\train\images\IMG_9331_jpg.rf.8cb583bb444b98e1eaf3f2f6d46d2f4c.jpg",
    conf=0.40,
)

# Display the first result.
results[0].show()

Ultralytics 8.3.122  Python-3.12.4 torch-2.4.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24576MiB)
YOLOe-11l-seg summary (fused): 229 layers, 36,680,414 parameters, 2,638,581 gradients, 159.7 GFLOPs

image 1/1 C:\Users\Mechanized Systems\DataspellProjects\AutoAnnotate\autoannotate study\berries-1\train\images\IMG_9331_jpg.rf.8cb583bb444b98e1eaf3f2f6d46d2f4c.jpg: 448x640 1 nature, 55.9ms
Speed: 2.3ms preprocess, 55.9ms inference, 4.5ms postprocess per image at shape (1, 3, 448, 640)
