In [10]:
from groundingdino.util.inference import load_model, load_image, predict
import cv2
import torch
import csv
from ultralytics import SAM
from pathlib import Path
import time as t
from PIL import Image, ImageDraw
import numpy as np
from PyQt5 import QtWidgets, QtGui, QtCore
import os

In [11]:
def clean_labels(boxes, max_area):
    clean_boxes = []
    box_list = boxes.tolist()
    for box in box_list:
        # if width * height < 0.9, add box to list.
        if (box[2] * box[3]) < max_area:
            clean_boxes.append(box)
    if len(clean_boxes) < 2:
        return boxes
    return torch.FloatTensor(clean_boxes)

def load_dino_model(model_size='swint'):
    #choose swinb or swint
    if model_size == 'swint':
        config_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GroundingDINO\groundingdino\config\GroundingDINO_SwinT_OGC.py"
        checkpoint_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GroundingDINO\weights\groundingdino_swint_ogc.pth"
    elif model_size == 'swinb':
        checkpoint_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GroundingDINO\weights\groundingdino_swinb_cogcoor.pth"
        config_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GroundingDINO\groundingdino\config\GroundingDINO_SwinB_cfg.py"

    model = load_model(config_path, checkpoint_path)
    return model

def run_dino_from_model(model, img_path, prompt, box_threshold, text_threshold, maxarea=0.7, save_dir="DINO-labels"):
    image_source, image = load_image(img_path)
    boxes, accuracy, obj_name = predict(model=model, image=image, caption=prompt, box_threshold=box_threshold,
                                        text_threshold=text_threshold)

    #Convert boxes from YOLOv8 format to xyxy
    img_height, img_width = cv2.imread(img_path).shape[:2]
    clean_boxes = clean_labels(boxes, maxarea)
    absolute_boxes = [[(box[0] - (box[2] / 2)) * img_width,
                       (box[1] - (box[3] / 2)) * img_height,
                       (box[0] + (box[2] / 2)) * img_width,
                       (box[1] + (box[3] / 2)) * img_height] for box in clean_boxes.tolist()]
    save_labels = True
    if save_labels:
        clean_boxes = clean_boxes.tolist()
        for x in clean_boxes:
            x.insert(0, 0)
        with open(f'{save_dir}/{os.path.splitext(os.path.basename(img_path))[0]}.txt', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=' ')
            writer.writerows(clean_boxes)
    return absolute_boxes

def save_masks(sam_results, output_dir):
    segments = sam_results[0].masks.xyn
    with open(f"{Path(output_dir) / Path(sam_results[0].path).stem}.txt", "w") as f:
        for i in range(len(segments)):
            s = segments[i]
            if len(s) == 0:
                continue
            segment = map(str, segments[i].reshape(-1).tolist())
            f.write(f"0 " + " ".join(segment) + "\n")

def run_image(DINO, img_dir, output_dir, prompt, conf, box_threshold, save_dir):
    sam_model = "sam2_t.pt"
    dino_model = "swint"
    start = t.time()
    fname = os.path.basename(img_dir)
    path = img_dir
    boxes = run_dino_from_model(DINO, img_dir, prompt, conf, 0.1, box_threshold, save_dir=save_dir)
    model = SAM(sam_model)
    sam_results = model(img_dir, model=sam_model, bboxes=boxes, verbose=False)
    save_masks(sam_results, output_dir)

    print(f"Completed in: {t.time() - start} seconds, masks saved in {output_dir}")
    return sam_results

def adjust_masks(sam_results):
    result = sam_results[0]

    masks = result.masks.data.cpu().numpy()  # masks, (N, H, W)
    masks = np.moveaxis(masks, 0, -1)  # masks, (H, W, N)
    masks = np.moveaxis(masks, -1, 0)  # masks, (N, H, W)

    return masks

def overlay_with_borders(image, mask, color, thickness=2):
    # Convert mask to uint8 type
    mask_uint8 = (mask * 255).astype(np.uint8)

    # Find contours in the mask
    contours, _ = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Draw contours on the image
    cv2.drawContours(image, contours, -1, color, thickness)
    return image

def draw_boxes_on_image(image, boxes):
    """
    Draw bounding boxes on the image using absolute coordinates.

    Args:
        image (np.ndarray): The original image.
        boxes (list): List of bounding boxes in the format [x1, y1, x2, y2].

    Returns:
        np.ndarray: Image with bounding boxes drawn on it.
    """
    # Convert the OpenCV image (BGR) to PIL for drawing
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Create a drawing object
    draw = ImageDraw.Draw(pil_image)

    # Iterate over the list of boxes and draw them
    for box in boxes:
        x1, y1, x2, y2 = box
        draw.rectangle([x1, y1, x2, y2], outline=(255, 0, 255), width=2)  # Drawing a rectangle with purple border

    # Convert back to OpenCV format for display
    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

def optimize_prompts(prompts_file, gt_path, img_dir, save_file, threshold, DINO):
    inf_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GUI and Pipeline\DINO-labels"
    if not os.path.exists(inf_path):
        try:
            os.makedirs(inf_path)
            print(f"Directory '{inf_path}' created as it was missing.")
        except:
            pass

    with open(prompts_file, 'r') as file:
        result_dict = {}
        for x in file:
            result_dict[x.strip()] = {}

    # result_dict = dict.fromkeys(prompts,{})
    for prompt in result_dict.keys():
        print(f'Trying prompt: "{prompt}"')

        box_threshold = 0.3
        text_threshold = 0.1
        model_size = 'swint'
        run_dino_from_model(DINO, img_dir, prompt, box_threshold, text_threshold, maxarea=threshold)

        metrics = process_file(inf_path, gt_path, threshold=threshold)

        result_dict[prompt]['iou_scores'] = np.mean(metrics['iou_scores'])

    results = sorted(list(result_dict.items()), key=lambda a: a[1]['iou_scores'], reverse=True)
    print(results)

    with open(save_file, 'w') as output:
        for prompt_stats in results:
            output.write(str(prompt_stats) + '\n')

    return results

def calculate_metrics(tp, fp, fn, tn):
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    mcc = ((tp * tn) - (fp * fn)) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) \
        if np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) > 0 else 0
    specificity = tn / (tn + fp) if tn + fp > 0 else 0
    return precision, recall, f1, mcc, specificity

def read_and_draw_boxes(file_path, image_dim=(1280, 720)):
    boxes = []
    with open(file_path, 'r') as file:
        for line in file:
            class_id, x, y, width, height = map(float, line.strip().split())
            x1 = (x-(width/2))*image_dim[0]
            x2 = (x+(width/2))*image_dim[0]
            y1 = (y-(height/2))*image_dim[1]
            y2 = (y+(height/2))*image_dim[1]
            boxes.append([x1, y1, x2, y2])
    image = Image.new('L', image_dim, 0)
    draw = ImageDraw.Draw(image)
    for box in boxes:
        draw.rectangle(box, fill=255)
        #draw.rectangle([1,1,20,20], fill=255)
    #image.save("test.jpg")
    return np.array(image, dtype=np.uint8)

def clean_labels_from_file(file_path, cleaning_threshold=0.6):
    # Read the file and check if it has more than one line
    with open(file_path, 'r') as f:
        lines = f.readlines()

    if len(lines) > 1:
        accepted_lines = []

        # Process each line
        for line in lines:
            class_id, x, y, width, height = map(float, line.strip().split())
            # if width * height < 0.9:
            if (width * height) < cleaning_threshold:
                accepted_lines.append(line)

        # Overwrite the file with accepted lines
        with open(file_path, 'w') as f:
            if len(accepted_lines) > 0:
                for line in accepted_lines:
                    f.write(line)

In [12]:
def draw_boxes(boxes, image_dim=(1280, 720)):
    """
    Draw bounding boxes directly from a list of absolute boxes.

    Parameters:
    boxes (list): List of absolute box coordinates in xyxy format.
    image_dim (tuple): Dimensions of the output image (width, height).

    Returns:
    np.array: Binary image with boxes drawn.
    """
    # Create a blank image to draw the boxes
    image = Image.new('L', image_dim, 0)
    draw = ImageDraw.Draw(image)

    # Draw each box on the image
    for box in boxes:
        draw.rectangle(box, fill=255)

    return np.array(image, dtype=np.uint8)

In [19]:
def prompt_optimizer(prompts_file, gt_path, img_path, save_file, threshold, DINO):
    # Ensure inference path exists
    inf_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GUI and Pipeline\DINO-labels"
    os.makedirs(inf_path, exist_ok=True)

    # Initialize result dictionary from prompt file
    with open(prompts_file, 'r') as file:
        result_dict = {x.strip(): {} for x in file}

    # Process each prompt
    for prompt in result_dict.keys():
        print(f'Trying prompt: "{prompt}"')

        # Run prediction and save labels
        run_dino_from_model(DINO, img_path, prompt, box_threshold=0.3, text_threshold=0.1, maxarea=threshold)

        # Process single predicted and ground truth file
        predicted_mask_file = os.path.join(inf_path, f"{os.path.splitext(os.path.basename(img_path))[0]}.txt")
        metrics = process_file(predicted_mask_file, gt_path, threshold)

        # Save the IoU score for the prompt
        result_dict[prompt]['iou_scores'] = np.mean(metrics['iou_scores'])

    # Sort and save results
    results = sorted(result_dict.items(), key=lambda a: a[1]['iou_scores'], reverse=True)
    print("Results:", results)

    with open(save_file, 'w') as output:
        for prompt_stats in results:
            output.write(str(prompt_stats) + '\n')

    return results

def process_file(predicted_mask_file, ground_truth_mask_file, threshold):
    # Initialize metrics dictionary
    metrics = {
        'iou_scores': [],
        'precision_scores': [],
        'recall_scores': [],
        'f1_scores': [],
        'mcc_scores': [],
        'specificity_scores': []
    }

    # Preprocess predicted mask
    clean_labels_from_file(predicted_mask_file, threshold)
    predicted_mask = read_and_draw_boxes(predicted_mask_file)
    ground_truth_mask = read_and_draw_boxes(ground_truth_mask_file)

    # Convert masks to binary
    _, predicted_mask_bin = cv2.threshold(predicted_mask, 127, 255, cv2.THRESH_BINARY)
    _, ground_truth_mask_bin = cv2.threshold(ground_truth_mask, 127, 255, cv2.THRESH_BINARY)

    predicted_mask_bin = predicted_mask_bin / 255
    ground_truth_mask_bin = ground_truth_mask_bin / 255

    # Calculate true positives, true negatives, false positives, and false negatives
    tp = np.float64(np.sum(np.logical_and(predicted_mask_bin == 1, ground_truth_mask_bin == 1)))
    tn = np.float64(np.sum(np.logical_and(predicted_mask_bin == 0, ground_truth_mask_bin == 0)))
    fp = np.float64(np.sum(np.logical_and(predicted_mask_bin == 1, ground_truth_mask_bin == 0)))
    fn = np.float64(np.sum(np.logical_and(predicted_mask_bin == 0, ground_truth_mask_bin == 1)))

    # Calculate metrics
    intersection = np.logical_and(predicted_mask_bin, ground_truth_mask_bin)
    union = np.logical_or(predicted_mask_bin, ground_truth_mask_bin)
    metrics['iou_scores'].append(np.sum(intersection) / np.sum(union))
    # Calculate precision, recall, f1-score, MCC, and specificity
    precision, recall, f1, mcc, specificity = calculate_metrics(tp, fp, fn, tn)
    metrics['precision_scores'].append(precision)
    metrics['recall_scores'].append(recall)
    metrics['f1_scores'].append(f1)
    metrics['mcc_scores'].append(mcc)
    metrics['specificity_scores'].append(specificity)
    #print(metrics['iou_scores'])
    return metrics

def process_mask_arrays(predicted_mask_array, ground_truth_mask_array):
    # Resize predicted mask to match the ground truth mask's dimensions
    if predicted_mask_array.shape != ground_truth_mask_array.shape:
        predicted_mask_array = cv2.resize(predicted_mask_array, (ground_truth_mask_array.shape[1], ground_truth_mask_array.shape[0]), interpolation=cv2.INTER_NEAREST)

    # Initialize metrics dictionary
    metrics = {
        'iou_scores': [],
        #'pixel_accuracies': [],
        'precision_scores': [],
        'recall_scores': [],
        'f1_scores': [],
        'mcc_scores': [],
        'specificity_scores': []
    }

    # Convert masks to binary based on threshold
    _, predicted_mask_bin = cv2.threshold(predicted_mask_array, 127, 255, cv2.THRESH_BINARY)
    _, ground_truth_mask_bin = cv2.threshold(ground_truth_mask_array, 127, 255, cv2.THRESH_BINARY)

    # Normalize binary masks for calculation
    predicted_mask_bin = predicted_mask_bin / 255
    ground_truth_mask_bin = ground_truth_mask_bin / 255

    # Calculate true positives, true negatives, false positives, and false negatives
    tp = np.float64(np.sum(np.logical_and(predicted_mask_bin == 1, ground_truth_mask_bin == 1)))
    tn = np.float64(np.sum(np.logical_and(predicted_mask_bin == 0, ground_truth_mask_bin == 0)))
    fp = np.float64(np.sum(np.logical_and(predicted_mask_bin == 1, ground_truth_mask_bin == 0)))
    fn = np.float64(np.sum(np.logical_and(predicted_mask_bin == 0, ground_truth_mask_bin == 1)))

    # Calculate IoU and pixel accuracy
    intersection = np.logical_and(predicted_mask_bin, ground_truth_mask_bin)
    union = np.logical_or(predicted_mask_bin, ground_truth_mask_bin)
    metrics['iou_scores'].append(np.sum(intersection) / np.sum(union))
    #metrics['pixel_accuracies'].append(pixel_accuracy(predicted_mask_bin, ground_truth_mask_bin))

    # Calculate precision, recall, f1-score, MCC, and specificity
    precision, recall, f1, mcc, specificity = calculate_metrics(tp, fp, fn, tn)
    metrics['precision_scores'].append(precision)
    metrics['recall_scores'].append(recall)
    metrics['f1_scores'].append(f1)
    metrics['mcc_scores'].append(mcc)
    metrics['specificity_scores'].append(specificity)

    return metrics

def confidence_optimizer(prompt, DINO, gt_path, img_path, threshold):
    inf_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\GUI and Pipeline\DINO-labels"
    os.makedirs(inf_path, exist_ok=True)

    best_iou = 0
    best_conf = 0

    image = cv2.imread(img_path)
    shape = image.shape

    # Step 1: Precision 1 sweep (coarse) from 0.0 to 0.9 in steps of 0.1
    for conf in np.arange(0.0, 0.91, 0.1):
        box_threshold = conf
        text_threshold = 0.1
        boxes = run_dino_from_model(DINO, img_path, prompt, box_threshold, text_threshold)
        pred_masks = draw_boxes(boxes, (shape[1], shape[0]))
        gt_masks = read_and_draw_boxes(gt_path)

        metrics = process_mask_arrays(pred_masks, gt_masks)
        iou = np.mean(metrics['iou_scores'])
        print('P1 rep')
        print(f"[Precision 1] Confidence: {conf:.1f}, IoU: {iou:.4f}")

        if iou > best_iou:
            best_iou = iou
            best_conf = conf

    print(f"Best from Precision 1: Confidence = {best_conf:.1f}, IoU = {best_iou:.4f}")

    # Step 2: Precision 2 sweep from (best_conf - 0.1) to (best_conf + 0.1) in steps of 0.01
    lower = best_conf - 0.1
    upper = best_conf + 0.1
    step = 0.01

    for conf in np.arange(lower, upper + step, step):
        box_threshold = conf
        text_threshold = 0.01
        boxes = run_dino_from_model(DINO, img_path, prompt, box_threshold, text_threshold)
        pred_masks = draw_boxes(boxes, (shape[1], shape[0]))
        gt_masks = read_and_draw_boxes(gt_path)

        metrics = process_mask_arrays(pred_masks, gt_masks)
        iou = np.mean(metrics['iou_scores'])
        print('P2 rep')
        print(f"[Precision 2] Confidence: {conf:.2f}, IoU: {iou:.4f}")

        if iou > best_iou:
            best_iou = iou
            best_conf = conf

    print(f"Final Best: Confidence = {best_conf:.2f}, IoU = {best_iou:.4f}")
    return best_iou, best_conf


def multi_optimizer(img_dir, gt_label_dir, DINO, prompts, threshold=0.9):
    start = t.time()
    best_iou = 0
    best_prompt = ""
    best_conf = 0
    for prompt in prompts:
        print(f"Trying prompt: '{prompt}'")
        iou, conf = confidence_optimizer(prompt, DINO, gt_label_dir, img_dir, threshold)
        if iou > best_iou:
            best_iou = iou
            best_conf = conf
            best_prompt = prompt
        print(f"So far: best prompt is '{best_prompt}', conf is {best_conf}, resulting in {best_iou} IOU)")
    print(f"\n\n\n\n\nFinal Result: best prompt is '{best_prompt}', conf is {best_conf}, resulting in {best_iou} IOU)")
    print(f"final time: {t.time() - start}")
    return {"prompt": best_prompt, "conf": best_conf, "iou": best_iou}

In [20]:
def sort_largest_file(folder_path):
    # Dictionary to store file names and their line counts
    file_line_counts = {}

    # Iterate through files in the folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a .txt file
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            # Open the file and count lines
            with open(file_path, 'r') as file:
                line_count = sum(1 for line in file)
            # Add the file and line count to the dictionary
            file_line_counts[file_name] = line_count
        else:
            print("File encountered not in .txt format.")
    # Sort files by line count in descending order and return as list of file names
    sorted_files = sorted(file_line_counts, key=file_line_counts.get, reverse=True)
    return sorted_files

# Usage
folder_path = r'C:/Users/cmull/DataspellProjects/AutoAnnotate/autoannotate study/berries-bounding-box-1/train/labels'
image_folder_path = r'C:/Users/cmull/DataspellProjects/AutoAnnotate/autoannotate study/berries-bounding-box-1/train/images'
sorted_txt_files = sort_largest_file(folder_path)
print("Files sorted by line count:", sorted_txt_files)
reference_txt = folder_path + '\\' + sorted_txt_files[0]
reference_image = image_folder_path + '\\' + sorted_txt_files[0].split(".txt")[0] + ".jpg"

Files sorted by line count: ['IMG_9331_jpg.rf.20009327b80c55eec840b8b4f5cddf57.txt', 'IMG_9355_jpg.rf.40d4de298491188a33bcdfd995d9e855.txt', 'IMG_9379_jpg.rf.42c280b08420d4271486e3cdebe8a30e.txt', 'IMG_9394_jpg.rf.93cd662dac6324bfa4ef17b55494eaf7.txt', 'IMG_9383_jpg.rf.7af81e391f70df26bca8c741d75bcf24.txt', 'IMG_9387_jpg.rf.9ae726fc1ddc490013a19db8c1c2a1f1.txt']


In [21]:
box_threshold = 0.5
DINO = load_dino_model()
prompts_file = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\prompts\blueberry-prompts.txt"

final text_encoder_type: bert-base-uncased


In [22]:
prompt_result = prompt_optimizer(prompts_file, reference_txt, reference_image, "best.txt", box_threshold, DINO)

top_2 = prompt_result[:2]
top2 = [result[0] for result in prompt_result][0:2]

Trying prompt: "blueberry"
Trying prompt: "a blueberry"
Trying prompt: "single blueberry"
Trying prompt: "a single blueberry"
Trying prompt: "a single, round blueberry"
Trying prompt: "single, round blueberry"
Trying prompt: "individual blueberry"
Trying prompt: "an individual blueberry"
Trying prompt: "one blueberries"
Trying prompt: "small blue sphere"
Trying prompt: "a small blue sphere"
Trying prompt: "a small blue berry"
Trying prompt: "a blueberry among a patch of green leaves"
Trying prompt: "a wild blueberry"
Trying prompt: "wild blueberry"
Trying prompt: "single  wild blueberry"
Trying prompt: "a single wild blueberry"
Trying prompt: "a single, round  wild blueberry"
Trying prompt: "blueberries"
Trying prompt: "a blue berry"
Trying prompt: "blue berry"
Trying prompt: "a small blueberry"
Trying prompt: "blueberry among a patch of green leaves"
Trying prompt: "a round blueberry among a patch of green leaves"
Trying prompt: "a blue sphere  among a patch of green leaves"
Trying pr

In [23]:
print(top2)

['a single blueberry', 'a small blueberry']


In [24]:
multi_optimizer(reference_image, reference_txt, DINO, top2, box_threshold)

Trying prompt: 'a single blueberry'
P1 rep
[Precision 1] Confidence: 0.0, IoU: 0.0297
P1 rep
[Precision 1] Confidence: 0.1, IoU: 0.4341
P1 rep
[Precision 1] Confidence: 0.2, IoU: 0.5795
P1 rep
[Precision 1] Confidence: 0.3, IoU: 0.5478
P1 rep
[Precision 1] Confidence: 0.4, IoU: 0.3398
P1 rep
[Precision 1] Confidence: 0.5, IoU: 0.1316
P1 rep
[Precision 1] Confidence: 0.6, IoU: 0.0000
P1 rep
[Precision 1] Confidence: 0.7, IoU: 0.0000
P1 rep
[Precision 1] Confidence: 0.8, IoU: 0.0000
P1 rep
[Precision 1] Confidence: 0.9, IoU: 0.0000
Best from Precision 1: Confidence = 0.2, IoU = 0.5795
P2 rep
[Precision 2] Confidence: 0.10, IoU: 0.4341
P2 rep
[Precision 2] Confidence: 0.11, IoU: 0.4341
P2 rep
[Precision 2] Confidence: 0.12, IoU: 0.4345
P2 rep
[Precision 2] Confidence: 0.13, IoU: 0.4476
P2 rep
[Precision 2] Confidence: 0.14, IoU: 0.4476
P2 rep
[Precision 2] Confidence: 0.15, IoU: 0.4830
P2 rep
[Precision 2] Confidence: 0.16, IoU: 0.5470
P2 rep
[Precision 2] Confidence: 0.17, IoU: 0.5553
P2

{'prompt': 'a small blueberry',
 'conf': 0.20999999999999996,
 'iou': 0.5806771778584392}

In [1]:
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
import re

def extract_descriptions(response):
    """
    Extracts only the clean descriptions from the model response, removing metadata, numbering, and unnecessary text.

    :param response: The raw response from the model.
    :return: A list of cleaned descriptions without numbering or unwanted text.
    """
    lines = response.split("\n")  # Split response into lines
    unwanted_keywords = ["user", "assistant", "describe", "text & image output"]  # Keywords to ignore

    descriptions = []
    for line in lines:
        clean_line = line.strip()
        if not clean_line:
            continue  # Skip empty lines
        if any(keyword in clean_line.lower() for keyword in unwanted_keywords):
            continue  # Skip lines with unwanted keywords

        # Remove leading numbering like "1. ", "2)", "3 - ", etc.
        clean_line = re.sub(r"^\s*\d+[\.\)\-]\s*", "", clean_line)

        if clean_line:  # Only add if the line still has content
            descriptions.append(clean_line)

    return descriptions
# Load Model and Processor
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
model.tie_weights()

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Load Image
image_path = r"C:\Users\cmull\DataspellProjects\AutoAnnotate\autoannotate study\bounding-buds-1\train\images\IMG_1067_JPG_jpg.rf.a07350fa4e778c7ae999106ecc5dde24.jpg"
raw_image = Image.open(image_path).convert("RGB")
manual_entry = input("object in image")
# Define Conversation Prompt (Corrected)
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},  # Note: No "image": raw_image here!
            {"type": "text", "text": f"Describe the {manual_entry} of the image in 3 words maximum for prompt use in a zero-shot detection model, and give 5 separate entries, each separated by a new line, and its own separate descriptor of the target. number each prompt. then simply new line. strictly the prompts, no other response is required. use visual description of the target in the image only. no prompts should be the same."},
        ],
    },
]

# Convert to Text Prompt
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Process Inputs Correctly
inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to(model.device)  # Ensure correct parameter order

In [3]:
# Generate Output
output = model.generate(**inputs, temperature=0.7, top_p=0.9, max_new_tokens=512)

# Decode and Print Output
response = processor.decode(output[0], skip_special_tokens=True)
print("Text & Image Output:", response)


Text & Image Output: user

Describe the blueberry buds of the image in 3 words maximum for prompt use in a zero-shot detection model, and give 5 separate entries, each separated by a new line, and its own separate descriptor of the target. number each prompt. then simply new line. strictly the prompts, no other response is required. use visual description of the target in the image only. no prompts should be the same.assistant

1. Small greenish-red, closed flower-like structures at end of twigs
2. Buds of small, closed flower-like structures
3. Small greenish-red, flower-like structures on branches
4. Small, greenish-red, closed flower-like structures on end of branches
5. Closed flower-like structures on end of small branches


In [5]:
# Extracted descriptions
cleaned_descriptions = extract_descriptions(response)

# Print results
for prompt in cleaned_descriptions:
    print(prompt)

Small greenish-red, closed flower-like structures at end of twigs
Buds of small, closed flower-like structures
Small greenish-red, flower-like structures on branches
Small, greenish-red, closed flower-like structures on end of branches
Closed flower-like structures on end of small branches


In [6]:
print(cleaned_descriptions)

['Small greenish-red, closed flower-like structures at end of twigs', 'Buds of small, closed flower-like structures', 'Small greenish-red, flower-like structures on branches', 'Small, greenish-red, closed flower-like structures on end of branches', 'Closed flower-like structures on end of small branches']
