In [2]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2
import csv
import os
import warnings
import pynvml
import torch
import time
from ultralytics import SAM
from pathlib import Path
from shapely.geometry import Polygon
from PIL import Image, ImageDraw
import numpy as np
from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor
from tkinter import filedialog, Tk

warnings.filterwarnings("ignore")

In [8]:
def clean_labels(boxes, max_area):
    """Filter out boxes whose width * height reaches ``max_area``.

    Only indices 2 and 3 of every row are read, so rows are expected to be
    laid out as ``[cx, cy, w, h]`` (centre/size), not as corner coordinates.

    Args:
        boxes: Tensor of boxes, one row per box.
        max_area: Exclusive upper bound for ``row[2] * row[3]``.

    Returns:
        A new ``torch.FloatTensor`` with the rows that pass the filter, or the
        original ``boxes`` object itself when no row passes.
    """
    kept = [row for row in boxes.tolist() if (row[2] * row[3]) < max_area]
    # Fall back to the unfiltered input instead of returning nothing at all.
    if not kept:
        return boxes
    return torch.FloatTensor(kept)

def load_dino_model(model_size):
    """Load a Grounding DINO model for the requested backbone.

    Args:
        model_size: Either ``'swint'`` or ``'swinb'``.

    Returns:
        The object returned by ``load_model(config_path, checkpoint_path)``.

    Raises:
        ValueError: If ``model_size`` is not one of the supported names.
            (Previously an unknown name fell through both branches and failed
            with an UnboundLocalError on ``config_path``.)
    """
    # (config file, checkpoint file) per supported backbone.
    model_files = {
        'swint': (
            r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\GroundingDINO\groundingdino\config\GroundingDINO_SwinT_OGC.py",
            r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GUI and Pipeline\GroundingDINO\weights\groundingdino_swint_ogc.pth",
        ),
        'swinb': (
            r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\GroundingDINO\groundingdino\config\GroundingDINO_SwinB_cfg.py",
            r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GUI and Pipeline\GroundingDINO\weights\groundingdino_swinb_cogcoor.pth",
        ),
    }
    if model_size not in model_files:
        raise ValueError(
            f"Unknown model_size {model_size!r}; expected one of {sorted(model_files)}")

    config_path, checkpoint_path = model_files[model_size]
    return load_model(config_path, checkpoint_path)


def run_dino_from_model(model, img_path, prompt, box_threshold, text_threshold, maxarea=0.7, save_dir="DINO-labels"):
    """Run Grounding DINO on one image, save the labels and return pixel boxes.

    The predicted boxes are treated as normalised ``(cx, cy, w, h)`` rows.
    They are filtered with ``clean_labels`` and then written to
    ``<save_dir>/<image stem>.txt`` as space-separated ``0 cx cy w h`` lines
    (class id is always 0).

    Args:
        model: Model object accepted by ``predict``.
        img_path: Path of the image to process.
        prompt: Text caption passed to ``predict``.
        box_threshold: Box confidence threshold passed to ``predict``.
        text_threshold: Text confidence threshold passed to ``predict``.
        maxarea: Upper bound on normalised ``w * h`` used by ``clean_labels``.
        save_dir: Directory for the label file; created if it does not exist.

    Returns:
        List of ``[x1, y1, x2, y2]`` boxes in pixel coordinates.
    """
    image_source, image = load_image(img_path)
    boxes, _logits, _phrases = predict(model=model, image=image, caption=prompt, box_threshold=box_threshold,
                                       text_threshold=text_threshold)

    # Use the size of the array the model was fed rather than decoding the
    # file a second time with cv2: cv2.imread applies EXIF rotation by default,
    # so its width/height can disagree with the loaded image.
    # NOTE(review): assumes load_image returns an H x W x C array first - confirm.
    img_height, img_width = image_source.shape[:2]

    clean_boxes = clean_labels(boxes, maxarea).tolist()

    # Convert normalised centre/size boxes to absolute corner coordinates.
    absolute_boxes = [[(box[0] - (box[2] / 2)) * img_width,
                       (box[1] - (box[3] / 2)) * img_height,
                       (box[0] + (box[2] / 2)) * img_width,
                       (box[1] + (box[3] / 2)) * img_height] for box in clean_boxes]

    # Label rows: class id 0 followed by the normalised box.
    label_rows = [[0, *box] for box in clean_boxes]

    # The target directory may not exist yet (e.g. the default "DINO-labels").
    os.makedirs(save_dir, exist_ok=True)
    label_path = os.path.join(save_dir, os.path.splitext(os.path.basename(img_path))[0] + '.txt')
    with open(label_path, 'w', newline='') as csvfile:
        csv.writer(csvfile, delimiter=' ').writerows(label_rows)

    return absolute_boxes

def calculate_metrics(TP, FP, FN, TN):
    """Derive classification metrics from confusion-matrix counts.

    Args:
        TP, FP, FN, TN: True/false positive/negative counts.

    Returns:
        Tuple ``(precision, recall, f1, mcc, specificity)``. Any metric whose
        denominator is not positive is reported as ``0``.
    """

    def _ratio(numerator, denominator):
        # Undefined ratios (denominator <= 0) collapse to 0.
        if denominator > 0:
            return numerator / denominator
        return 0

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    f1 = _ratio(2 * (precision * recall), precision + recall)
    # Matthews correlation coefficient.
    mcc = _ratio((TP * TN) - (FP * FN),
                 np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
    specificity = _ratio(TN, TN + FP)
    return precision, recall, f1, mcc, specificity


def read_and_draw_boxes(file_path, image_dim=(1280, 720), debug_save_path=None):
    """Rasterise a YOLO-style box label file into a filled mask.

    Every non-blank line must hold five numbers ``class x y width height``
    with the box given as normalised centre/size. Each box is drawn as a
    filled rectangle (value 255) on a black single-channel canvas.

    Args:
        file_path: Path of the label text file.
        image_dim: ``(width, height)`` of the canvas in pixels.
        debug_save_path: Optional path; when given, the rendered mask is also
            saved there. The previous unconditional write of ``test.jpg`` on
            every call was a debugging leftover and is now opt-in.

    Returns:
        ``numpy.uint8`` array of the rendered canvas.

    Raises:
        ValueError: If a non-blank line does not contain exactly five numbers.
    """
    boxes = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Tolerate blank / trailing empty lines.
                continue
            _class_id, x, y, width, height = map(float, fields)
            x1 = (x - (width / 2)) * image_dim[0]
            x2 = (x + (width / 2)) * image_dim[0]
            y1 = (y - (height / 2)) * image_dim[1]
            y2 = (y + (height / 2)) * image_dim[1]
            boxes.append([x1, y1, x2, y2])

    image = Image.new('L', image_dim, 0)
    draw = ImageDraw.Draw(image)
    for box in boxes:
        draw.rectangle(box, fill=255)

    if debug_save_path is not None:
        image.save(debug_save_path)
    return np.array(image, dtype=np.uint8)

def process_files(predicted_mask_dir, ground_truth_mask_dir):
    """Score predicted box labels against ground-truth labels, file by file.

    File names are taken from ``ground_truth_mask_dir``. For each name the
    prediction is looked up under the same name in ``predicted_mask_dir`` and
    the ground truth under ``<stem>.txt`` in ``ground_truth_mask_dir``. Both
    label files are rasterised with ``read_and_draw_boxes`` and compared
    pixel-wise.

    Args:
        predicted_mask_dir: Directory with predicted label files.
        ground_truth_mask_dir: Directory with ground-truth label files.

    Returns:
        Dict of lists (one entry per listed file) with keys ``iou_scores``,
        ``precision_scores``, ``recall_scores``, ``f1_scores``, ``mcc_scores``
        and ``specificity_scores``. A file whose prediction or ground truth is
        missing scores 0 on every metric.
    """
    label_files = os.listdir(ground_truth_mask_dir)
    metrics = {
        'iou_scores': [],
        'precision_scores': [],
        'recall_scores': [],
        'f1_scores': [],
        'mcc_scores': [],
        'specificity_scores': []
    }

    # cv2.resize takes (width, height), so masks end up 720 wide by 1280 tall.
    # NOTE(review): read_and_draw_boxes renders 1280x720 by default, so this
    # resize changes the aspect ratio of both masks alike; kept as-is so scores
    # stay comparable with earlier runs.
    COMMON_HEIGHT, COMMON_WIDTH = 1280, 720

    for fname in label_files:
        predicted_mask_path = os.path.join(predicted_mask_dir, fname)
        ground_truth_mask_path = os.path.join(ground_truth_mask_dir, os.path.splitext(fname)[0] + '.txt')

        # The names come from the ground-truth directory, so the file that can
        # really be absent is the prediction; check both paths.
        if not (os.path.exists(predicted_mask_path) and os.path.exists(ground_truth_mask_path)):
            for scores in metrics.values():
                scores.append(0)
            continue

        predicted_mask = read_and_draw_boxes(predicted_mask_path)
        ground_truth_mask = read_and_draw_boxes(ground_truth_mask_path)

        predicted_mask = cv2.resize(predicted_mask, (COMMON_WIDTH, COMMON_HEIGHT))
        ground_truth_mask = cv2.resize(ground_truth_mask, (COMMON_WIDTH, COMMON_HEIGHT))

        _, predicted_mask_bin = cv2.threshold(predicted_mask, 127, 255, cv2.THRESH_BINARY)
        _, ground_truth_mask_bin = cv2.threshold(ground_truth_mask, 127, 255, cv2.THRESH_BINARY)

        # Scale 0/255 masks to 0/1.
        predicted_mask_bin = predicted_mask_bin / 255
        ground_truth_mask_bin = ground_truth_mask_bin / 255

        TP = np.float64(np.sum(np.logical_and(predicted_mask_bin == 1, ground_truth_mask_bin == 1)))
        TN = np.float64(np.sum(np.logical_and(predicted_mask_bin == 0, ground_truth_mask_bin == 0)))
        FP = np.float64(np.sum(np.logical_and(predicted_mask_bin == 1, ground_truth_mask_bin == 0)))
        FN = np.float64(np.sum(np.logical_and(predicted_mask_bin == 0, ground_truth_mask_bin == 1)))

        intersection = np.logical_and(predicted_mask_bin, ground_truth_mask_bin)
        union = np.logical_or(predicted_mask_bin, ground_truth_mask_bin)
        union_area = np.sum(union)
        # Two empty masks used to give 0/0 = nan, which poisoned np.mean later;
        # report 0 like calculate_metrics does for undefined ratios.
        iou = np.sum(intersection) / union_area if union_area > 0 else 0
        metrics['iou_scores'].append(iou)

        precision, recall, f1, mcc, specificity = calculate_metrics(TP, FP, FN, TN)
        metrics['precision_scores'].append(precision)
        metrics['recall_scores'].append(recall)
        metrics['f1_scores'].append(f1)
        metrics['mcc_scores'].append(mcc)
        metrics['specificity_scores'].append(specificity)

    return metrics

def _write_segments(sam_results, output_dir, mode):
    """Write the polygons of ``sam_results[0]`` to ``<output_dir>/<stem>.txt``.

    One line is written per non-empty polygon: class id ``0`` followed by the
    flattened coordinates taken from ``masks.xyn``. ``mode`` is the file mode
    (``"w"`` to overwrite, ``"a"`` to append).
    """
    first = sam_results[0]
    # ``masks`` is None when the result carries no masks; treat as no polygons.
    segments = first.masks.xyn if first.masks is not None else []

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Append ".txt" to the stem instead of swapping the suffix: image stems in
    # this project contain dots themselves.
    label_path = out_dir / f"{Path(first.path).stem}.txt"

    with open(label_path, mode) as f:
        for segment in segments:
            if len(segment) == 0:
                continue
            coords = map(str, np.asarray(segment).reshape(-1).tolist())
            f.write("0 " + " ".join(coords) + "\n")


def save_masks(sam_results, output_dir):
    """Save the segmentation polygons of the first result, overwriting the file.

    Args:
        sam_results: Sequence whose first element exposes ``masks.xyn`` (list
            of polygon coordinate arrays) and ``path`` (source image path).
        output_dir: Directory for ``<image stem>.txt``; created if missing.

    The earlier version also built a shapely ``Polygon`` per segment only to
    print its area; that was debug output and raised on degenerate segments,
    so it has been dropped.
    """
    _write_segments(sam_results, output_dir, "w")


def append_mask(sam_results, output_dir):
    """Append the segmentation polygons of the first result to the label file.

    Args:
        sam_results: Sequence whose first element exposes ``masks.xyn`` and
            ``path``.
        output_dir: Directory for ``<image stem>.txt``; created if missing.
    """
    _write_segments(sam_results, output_dir, "a")

def load_YOLOE(model_path="yoloe-11l-seg.pt"):
    """Construct a ``YOLOE`` model from the given weights path.

    Args:
        model_path: Weights file handed to the ``YOLOE`` constructor.

    Returns:
        The constructed ``YOLOE`` instance.
    """
    return YOLOE(model_path)
# The text prompt (class names, e.g. "person" and "bus") is set with model.set_classes(...) inside run_YOLOE below. You only need to do this once after you load the model.

def _filter_xyxy_by_area(boxes, max_area):
    """Keep corner-format boxes whose pixel area is below ``max_area``.

    ``boxes`` is an ``(N, 4)`` tensor of ``[x1, y1, x2, y2]`` rows. If no box
    passes, the input is returned unchanged (same fallback as ``clean_labels``).
    """
    if len(boxes) == 0:
        return boxes
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    keep = areas < max_area
    if not bool(keep.any()):
        return boxes
    return boxes[keep]


def _last_result_boxes(results, box_threshold):
    """Return the area-filtered boxes of the last result, or None if there is none."""
    boxes = None
    for result in results:
        height, width = result.orig_shape[:2]
        boxes = _filter_xyxy_by_area(result.boxes.xyxy, height * width * box_threshold)
    return boxes


def run_YOLOE(model, folder_path, names="", conf=0.05, box_threshold=0.9):
    """Run text-prompted YOLOE and return the filtered boxes.

    Args:
        model: YOLOE model exposing ``set_classes``, ``get_text_pe``, ``predict``.
        folder_path: Source passed to ``model.predict`` (image or folder).
        names: Class name(s) used as text prompt. A single string is wrapped
            into a one-element list, matching the list-of-names form used in
            the Ultralytics YOLOE examples.
        conf: Confidence threshold passed to ``model.predict``.
        box_threshold: Boxes covering at least this fraction of the image area
            are dropped (unless that would drop every box).

    Returns:
        Tensor of ``[x1, y1, x2, y2]`` boxes for the last result, or ``None``
        when there are no results or no boxes.

    Note:
        The area is now computed from the corner coordinates; the previous
        code passed xyxy boxes to ``clean_labels``, which multiplies indices 2
        and 3 (``x2 * y2``) instead of width and height.
    """
    if isinstance(names, str):
        names = [names]
    model.set_classes(names, model.get_text_pe(names))
    results = model.predict(folder_path, conf=conf)
    boxes = _last_result_boxes(results, box_threshold)
    if boxes is not None and len(boxes):
        return boxes
    return None


def run_YOLOE_vis(model, image_path, visual_prompts, conf=0.05, box_threshold=0.9):
    """Run visual-prompted YOLOE and return the filtered boxes with the results.

    Args:
        model: YOLOE model exposing ``predict``.
        image_path: Source passed to ``model.predict``.
        visual_prompts: Visual prompt dict passed to ``model.predict``.
        conf: Confidence threshold passed to ``model.predict``.
        box_threshold: Boxes covering at least this fraction of the image area
            are dropped (unless that would drop every box).

    Returns:
        ``(boxes, results)`` where ``boxes`` is the filtered xyxy tensor of the
        last result, or ``None`` when there are no results or no boxes.
    """
    results = model.predict(source=image_path, visual_prompts=visual_prompts, predictor=YOLOEVPSegPredictor, conf=conf)
    boxes = _last_result_boxes(results, box_threshold)
    if boxes is not None and len(boxes):
        return boxes, results
    return None

# Text-Prompt

### Grounding DINO

In [None]:
# --- Shared experiment configuration (also used by the later benchmark cells) ---
ground_truth_paths = [
    r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Berries_LLM\train\images",
    r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Buds_LLM\train\images",
    r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_redleaf_LLM\train\images",
    r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Fescue_LLM\train\images"]
dino_model = 'swint'
prompts = ['Dark blue globes', 'Young blueberry clusters', 'Red leaves', 'light-colored mold stain']
confidences = [0.35, 0.30, 0.40, 0.30]
SAM_model = 'sam2_t.pt'
categories = ['berries', 'buds', 'red leaf', 'fescue']
max_areas = [0.3, 0.3, 0.8, 0.5]
save_dir = 'DINO-labels'
sam_masks = []

# Per-category list of per-image inference times in seconds.
DINO_time = {category: [] for category in categories}

# One output directory per category.
dino_output_root = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\study_2\GroundingDINO"
for category in categories:
    os.makedirs(os.path.join(dino_output_root, category), exist_ok=True)

# Initialize NVML; the try/finally guarantees shutdown even if inference fails.
pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming single GPU (index 0)
    power = []

    dino = load_dino_model(dino_model)
    initial_time = time.time()
    power_before = pynvml.nvmlDeviceGetPowerUsage(handle)  # baseline, in milliwatts
    for category, img_path, prompt, conf, max_area in zip(categories, ground_truth_paths, prompts,
                                                          confidences, max_areas):
        save_path = os.path.join(dino_output_root, category)
        for fname in os.listdir(img_path):
            path = os.path.join(img_path, fname)
            time_start = time.time()
            run_dino_from_model(dino, path, prompt, conf, 0.1, max_area, save_path)
            DINO_time[category].append(time.time() - time_start)
            power.append(pynvml.nvmlDeviceGetPowerUsage(handle))

    duration = time.time() - initial_time  # Time in seconds
finally:
    # Shutdown NVML
    pynvml.nvmlShutdown()

# Mean power over all samples; fall back to the baseline when no image was
# processed so the result is 0 J instead of nan.
power_after = np.mean(power) if power else float(power_before)
# Mean power above the pre-run baseline, in milliwatts.
avg_power = (power_after - power_before)
energy_consumed = (avg_power * duration) / 1000  # mW * s / 1000 -> Joules

for j in power:
    print(j)
print(power_before)
print(f"{energy_consumed} J")
print(DINO_time)


In [None]:
# Compare the predicted fescue labels with the ground-truth labels and report
# the mean of each per-file score.
predicted_label_dir = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\fescue"
ground_truth_label_dir = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Fescue_LLM\train\labels"
metrics = process_files(predicted_label_dir, ground_truth_label_dir)

for label, key in (("IoU", "iou_scores"),
                   ("Precision", "precision_scores"),
                   ("Recall", "recall_scores"),
                   ("F1", "f1_scores")):
    print(f"Average {label}: {np.mean(metrics[key])}")

### Grounded SAM

In [None]:
# Masks are written to the same per-category directories that are created
# here. (Previously the directories were created under study_2\GroundedSAM
# while the masks were saved under SAM_test, which was never created.)
sam_output_root = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\study_2\GroundedSAM"
for category in categories:
    os.makedirs(os.path.join(sam_output_root, category), exist_ok=True)

# run_dino_from_model always writes a box label file; make sure its target
# directory exists and pass it explicitly.
dino_label_dir = "DINO-labels"
os.makedirs(dino_label_dir, exist_ok=True)

# Per-category list of per-image (DINO + SAM) times in seconds.
SAM_time = {category: [] for category in categories}

# Initialize NVML; the try/finally guarantees shutdown even if inference fails.
pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming single GPU (index 0)
    power = []

    dino = load_dino_model(dino_model)
    model = SAM(SAM_model)
    initial_time = time.time()
    power_before = pynvml.nvmlDeviceGetPowerUsage(handle)  # baseline, in milliwatts
    for category, img_path, prompt, conf, max_area in zip(categories, ground_truth_paths, prompts,
                                                          confidences, max_areas):
        save_path = os.path.join(sam_output_root, category)
        for fname in os.listdir(img_path):
            sam_masks.clear()
            path = os.path.join(img_path, fname)
            time_start = time.time()
            dino_boxes = run_dino_from_model(dino, path, prompt, conf, 0.1, max_area, dino_label_dir)
            # Box-prompted SAM call; the redundant ``model=model`` keyword was dropped.
            sam_results = model(path, bboxes=dino_boxes)
            save_masks(sam_results, save_path)
            SAM_time[category].append(time.time() - time_start)
            power.append(pynvml.nvmlDeviceGetPowerUsage(handle))

    duration = time.time() - initial_time  # Time in seconds
finally:
    # Shutdown NVML
    pynvml.nvmlShutdown()

# Mean power over all samples; fall back to the baseline when no image was
# processed so the result is 0 J instead of nan.
power_after = np.mean(power) if power else float(power_before)
# Mean power above the pre-run baseline, in milliwatts.
avg_power = (power_after - power_before)
energy_consumed = (avg_power * duration) / 1000  # mW * s / 1000 -> Joules

for j in power:
    print(j)
print(power_before)
print(f"{energy_consumed} J")
print(SAM_time)

In [None]:
# Per-category output directories. NOTE(review): run_YOLOE as called here only
# returns boxes, so this cell does not appear to write into them - confirm.
yoloe_output_root = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\study_2\YOLOE"
for category in categories:
    os.makedirs(os.path.join(yoloe_output_root, category), exist_ok=True)

# Per-category list of per-image inference times in seconds.
YOLO_time = {category: [] for category in categories}

# Initialize NVML; the try/finally guarantees shutdown even if inference fails.
pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming single GPU (index 0)
    power = []

    yoloe = load_YOLOE()
    initial_time = time.time()
    power_before = pynvml.nvmlDeviceGetPowerUsage(handle)  # baseline, in milliwatts
    for category, img_path, prompt, conf in zip(categories, ground_truth_paths, prompts, confidences):
        save_path = os.path.join(yoloe_output_root, category)
        for fname in os.listdir(img_path):
            path = os.path.join(img_path, fname)
            time_start = time.time()
            # Class names are passed as a list, the form used in the Ultralytics
            # examples for set_classes / get_text_pe.
            run_YOLOE(yoloe, path, [prompt], conf)
            YOLO_time[category].append(time.time() - time_start)
            power.append(pynvml.nvmlDeviceGetPowerUsage(handle))

    duration = time.time() - initial_time  # Time in seconds
finally:
    # Shutdown NVML
    pynvml.nvmlShutdown()

# Mean power over all samples; fall back to the baseline when no image was
# processed so the result is 0 J instead of nan.
power_after = np.mean(power) if power else float(power_before)
# Mean power above the pre-run baseline, in milliwatts.
avg_power = (power_after - power_before)
energy_consumed = (avg_power * duration) / 1000  # mW * s / 1000 -> Joules

for j in power:
    print(j)
print(power_before)
print(f"{energy_consumed} J")
print(YOLO_time)

# Box-Prompt

In [4]:
# === Settings ===
use_same_class = True      # True: every new box gets default_class_id without asking
default_class_id = 0       # class id used when use_same_class is True
max_display_width = 1200   # wider images are scaled down to this width for display
corner_size = 8            # half-size of a corner handle; also the corner hit threshold

# === Globals ===
# Labelled boxes ([x1, y1, x2, y2] in display coordinates) and their class ids.
boxes = list()
class_ids = list()

# State of the box currently being drawn.
drawing = False
current_box = list()

# Display scale factor (display size / original size).
resize_ratio = 1.0

# State of the box / corner currently selected or dragged (-1 means none).
selected_idx = -1
corner_drag = False
corner_index = -1

# Last known mouse position inside the window.
mouse_x = 0
mouse_y = 0


def point_near(p1, p2, thresh=10):
    """Return True when ``p1`` and ``p2`` differ by less than ``thresh`` on both axes.

    Args:
        p1, p2: ``(x, y)`` points.
        thresh: Exclusive per-axis distance limit.
    """
    if not abs(p1[0] - p2[0]) < thresh:
        return False
    return abs(p1[1] - p2[1]) < thresh


def find_corner(point, box):
    """Return the index of the first corner of ``box`` that ``point`` is near.

    Corners are indexed 0 = top-left, 1 = top-right, 2 = bottom-right,
    3 = bottom-left, and "near" means ``point_near`` with ``corner_size`` as
    threshold. Returns -1 when no corner matches.
    """
    left, top, right, bottom = box
    handles = ((left, top), (right, top), (right, bottom), (left, bottom))
    matches = (idx for idx, handle in enumerate(handles)
               if point_near(point, handle, thresh=corner_size))
    return next(matches, -1)


def click_event(event, x, y, flags, param):
    """Mouse callback of the annotation window.

    * Left button down: start dragging a corner if the press hits a handle of
      an existing box, otherwise start drawing a new box.
    * Mouse move: update the box being drawn or the corner being dragged.
    * Left button up: commit a drawn box (with its class id) and end any
      drawing or dragging.

    All state lives in the module-level globals; ``flags`` and ``param`` are
    unused.
    """
    global drawing, current_box, boxes, class_ids
    global selected_idx, corner_drag, corner_index, mouse_x, mouse_y

    mouse_x, mouse_y = x, y

    if event == cv2.EVENT_LBUTTONDOWN:
        for box_idx, candidate in enumerate(boxes):
            hit = find_corner((x, y), candidate)
            if hit == -1:
                continue
            # Pressed on a handle: select that box and start a corner drag.
            selected_idx, corner_drag, corner_index = box_idx, True, hit
            return
        # No handle hit: begin a new box anchored at the press position.
        drawing = True
        current_box = [(x, y)]
        return

    if event == cv2.EVENT_MOUSEMOVE:
        if drawing and current_box:
            current_box = [current_box[0], (x, y)]
        elif corner_drag and selected_idx != -1:
            dragged = boxes[selected_idx]
            # Which (x slot, y slot) of [x1, y1, x2, y2] each corner controls:
            # 0 top-left, 1 top-right, 2 bottom-right, 3 bottom-left.
            slots = {0: (0, 1), 1: (2, 1), 2: (2, 3), 3: (0, 3)}.get(corner_index)
            if slots is not None:
                dragged[slots[0]], dragged[slots[1]] = x, y
        return

    if event == cv2.EVENT_LBUTTONUP:
        if drawing and len(current_box) == 2:
            (ax, ay), (bx, by) = current_box
            boxes.append([min(ax, bx), min(ay, by), max(ax, bx), max(ay, by)])
            if use_same_class:
                new_class = default_class_id
            else:
                new_class = int(input(f"Enter class ID for box {len(boxes)}: "))
            class_ids.append(new_class)
            current_box.clear()
        corner_index = -1
        corner_drag = False
        drawing = False


def draw_all(img):
    """Draw every labelled box and its four corner handles onto ``img`` in place.

    The selected box is outlined with colour (0, 0, 255), all others with
    (0, 255, 0); handles are filled squares of half-size ``corner_size``.
    """
    selected_color = (0, 0, 255)
    default_color = (0, 255, 0)
    handle_color = (255, 255, 0)

    for idx, box in enumerate(boxes):
        left, top, right, bottom = (int(value) for value in box)
        outline = selected_color if idx == selected_idx else default_color
        cv2.rectangle(img, (left, top), (right, bottom), outline, 2)

        # Draw corner handles
        for cx, cy in ((left, top), (right, top), (right, bottom), (left, bottom)):
            top_left = (cx - corner_size, cy - corner_size)
            bottom_right = (cx + corner_size, cy + corner_size)
            cv2.rectangle(img, top_left, bottom_right, handle_color, -1)


def draw_cursor_guides(img):
    """Draw dashed cross-hair lines through the current mouse position, in place.

    Dashes are 10 px long with 10 px gaps, colour (200, 200, 200), thickness 1.
    """
    dash = 10
    period = dash * 2
    guide_color = (200, 200, 200)
    height, width = img.shape[0], img.shape[1]

    # Vertical guide through mouse_x.
    for top in range(0, height, period):
        cv2.line(img, (mouse_x, top), (mouse_x, top + dash), guide_color, 1)
    # Horizontal guide through mouse_y.
    for left in range(0, width, period):
        cv2.line(img, (left, mouse_y), (left + dash, mouse_y), guide_color, 1)


def main(image_path):
    """Interactively label boxes on an image and return them as visual prompts.

    Opens an OpenCV window showing the image (scaled down to
    ``max_display_width`` if wider), lets the user draw and edit boxes, and
    on Enter converts the boxes back to original-image coordinates.

    Every call starts a fresh labelling session: the global box list, class
    ids, selection/drag state and ``resize_ratio`` are reset. Previously a
    second call kept the boxes of the earlier image and, for an image that
    needs no downscaling, the scale factor of the earlier one.

    Args:
        image_path: Path of the image to label.

    Returns:
        Dict with ``bboxes`` (float32 array of ``[x1, y1, x2, y2]`` rows in
        original-image pixels) and ``cls`` (int32 array of class ids), or
        ``None`` when the image cannot be read or no box was labelled.
    """
    global resize_ratio, selected_idx, boxes, class_ids
    global drawing, current_box, corner_drag, corner_index

    original = cv2.imread(image_path)
    if original is None:
        print("Image not found.")
        return None

    # Reset all per-session state (rebinding also recovers from ``boxes``
    # having been overwritten with a non-list object by another cell).
    boxes = []
    class_ids = []
    current_box = []
    drawing = False
    corner_drag = False
    corner_index = -1
    selected_idx = -1
    resize_ratio = 1.0

    h, w = original.shape[:2]
    if w > max_display_width:
        resize_ratio = max_display_width / w
        display = cv2.resize(original, (int(w * resize_ratio), int(h * resize_ratio)))
    else:
        display = original.copy()

    clone = display.copy()
    cv2.namedWindow("Image")
    cv2.setMouseCallback("Image", click_event)

    print(
        "Draw boxes (click-drag). Drag corners to edit. \nPress 'Enter' to run inference.\nPress 'Backspace' while box is selected to delete it.")

    while True:
        # Redraw from the clean copy each frame so overlays do not accumulate.
        img_show = clone.copy()
        draw_all(img_show)
        draw_cursor_guides(img_show)

        if drawing and len(current_box) == 2:
            cv2.rectangle(img_show, current_box[0], current_box[1], (255, 255, 255), 1)

        cv2.imshow("Image", img_show)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('\r'):
            break
        elif key == ord('\b'):  # Backspace key
            if selected_idx != -1 and selected_idx < len(boxes):
                print(f"Deleted box {selected_idx + 1}")
                boxes.pop(selected_idx)
                class_ids.pop(selected_idx)
                selected_idx = -1

    cv2.destroyAllWindows()

    if not boxes:
        print("No boxes labeled.")
        return None

    # Normalize coordinates before inference: corner drags can leave a box
    # with x1 > x2 or y1 > y2, and display coordinates must be scaled back to
    # the original image.
    bboxes = []
    for box in boxes:
        x1, y1, x2, y2 = box
        x1, x2 = sorted([x1, x2])
        y1, y2 = sorted([y1, y2])
        bboxes.append([x1 / resize_ratio, y1 / resize_ratio, x2 / resize_ratio, y2 / resize_ratio])

    visual_prompts = dict(
        bboxes=np.array(bboxes, dtype=np.float32),
        cls=np.array(class_ids, dtype=np.int32),
    )
    return visual_prompts

In [5]:
# Ask the user for an image with a native file dialog. The Tk root window is
# only needed as the dialog's parent, so it is hidden and destroyed afterwards
# instead of being left alive in the kernel.
root = Tk()
root.withdraw()  # Hide the main window
try:
    file_path = filedialog.askopenfilename(
        title="Select an Image",
        # Patterns are space-separated, the list form Tk documents for
        # -filetypes; the previous ';'-joined string was a single pattern.
        filetypes=[("Image Files", "*.png *.jpg *.jpeg *.bmp *.gif")]
    )
finally:
    root.destroy()

In [6]:
# Open the annotation window for the selected image. ``prompt`` receives
# whatever main() returns: the visual-prompt dict (``bboxes`` and ``cls``
# arrays), or None when main() returns early (image not found / no boxes).
prompt = main(file_path)

Draw boxes (click-drag). Drag corners to edit. 
Press 'Enter' to run inference.
Press 'Backspace' while box is selected to delete it.


### YOLOE

In [10]:
yoloe = load_YOLOE()

if prompt is None:
    # main() returns None when the image could not be read or no box was drawn.
    print("No visual prompts available - run the annotation cell and draw at least one box.")
else:
    vis_output = run_YOLOE_vis(yoloe, file_path, prompt, conf=0.05, box_threshold=0.9)
    if vis_output is None:
        # run_YOLOE_vis returns None (not a tuple) when nothing was detected.
        print("YOLOE returned no boxes for the given visual prompts.")
    else:
        # Stored as ``detected_boxes`` so the annotation tool's global ``boxes``
        # list is not overwritten with a tensor.
        detected_boxes, results = vis_output
        results[0].show()

Ultralytics 8.3.129  Python-3.12.8 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce RTX 4080, 16376MiB)
YOLOe-11l-seg summary (fused): 227 layers, 35,117,862 parameters, 2,254,374 gradients

image 1/1 C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\Bounding_Berries_LLM\train\images\IMG_9394_jpg.rf.93cd662dac6324bfa4ef17b55494eaf7.jpg: 448x640 9 object0s, 83.6ms
Speed: 1.6ms preprocess, 83.6ms inference, 2.2ms postprocess per image at shape (1, 3, 448, 640)
