In [1]:
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import gradio as gr
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from torchvision.ops import masks_to_boxes
from sklearn.cluster import KMeans
import cv2
import colorsys
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont

In [3]:
import sam3
from sam3 import build_sam3_image_model
from sam3.model.box_ops import box_xywh_to_cxcywh
from sam3.model.sam3_image_processor import Sam3Processor
from sam3.visualization_utils import draw_box_on_image, normalize_bbox, plot_results

# Parent directory of the installed `sam3` package directory (kept un-normalised, i.e. "<package dir>/..").
sam3_root = os.path.join(os.path.split(sam3.__file__)[0], os.pardir)

In [4]:
# Device the model is moved to. Prefer CUDA, but fall back to the CPU so that
# `.to(DEVICE)` does not raise on machines without a usable GPU.
# NOTE(review): whether SAM3 itself runs acceptably on CPU is not shown here - confirm.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
# Location of the BPE vocabulary archive passed to the model builder, resolved under
# <sam3_root>/sam3/assets.
bpe_path = f"{sam3_root}/sam3/assets/bpe_simple_vocab_16e6.txt.gz"
# Build the SAM3 image model once for the whole notebook; `process_image` reads this
# module-level `model`.
model = build_sam3_image_model(bpe_path=bpe_path)
# Move the model to the configured device.
model.to(DEVICE)
# Switch to evaluation mode. Being the last expression of the cell, its return value
# is also what the notebook displays as the cell output.
model.eval()

In [6]:
def crop_barcode(img, mask):
    """Cut the masked barcode out of `img` as an upright, axis-aligned patch.

    The largest external contour of `mask` is wrapped in its minimum-area
    (rotated) rectangle. `img` is then rotated about that rectangle's centre so
    that the rectangle's longer side lies along the x axis, and a patch of the
    rectangle's size is sampled around the same centre.

    Args:
        img: Image array to crop from (`img.shape[0]` rows, `img.shape[1]`
            columns). The caller in this notebook passes a grayscale array.
        mask: Mask array handed to `cv2.findContours`; the caller passes a
            2-D uint8 array holding 0 / 255.

    Returns:
        The cropped patch returned by `cv2.getRectSubPix`, at least 1x1 pixels.

    Raises:
        ValueError: If no contour is found in `mask` (e.g. an all-zero mask).
    """
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        # Previously surfaced as the opaque "max() arg is an empty sequence".
        raise ValueError("crop_barcode: mask contains no contour to crop")
    largest = max(contours, key=cv2.contourArea)

    # Rotated bounding box: ((cx, cy), (w, h), angle).
    center, (rect_w, rect_h), angle = cv2.minAreaRect(largest)
    center = tuple(center)

    # Make the long side the width so the barcode always comes out landscape.
    if rect_w < rect_h:
        angle = angle + 90
        rect_w, rect_h = rect_h, rect_w

    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)

    # Rotate the full frame about the box centre; uncovered borders are filled with white.
    rotated = cv2.warpAffine(
        img,
        rotation_matrix,
        (img.shape[1], img.shape[0]),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(255, 255, 255),
    )

    # A degenerate contour can yield a side shorter than one pixel; truncating it
    # to 0 would be an invalid patch size, so clamp each side to 1.
    patch_size = (max(1, int(rect_w)), max(1, int(rect_h)))

    return cv2.getRectSubPix(rotated, patch_size, center)

In [7]:
# Tried decoding directly from the image raw data. It didn't work because of the bad image resolution.
def bar_value(img):
    """Estimate the module widths of a cropped barcode from its centre scanline.

    The crop is upscaled (4x horizontally, 2x vertically), binarised with a
    Gaussian adaptive threshold, and the middle row is run-length encoded into
    alternating black / white runs. Border runs are discarded, each run width
    is corrected, narrow runs are filtered out, and every remaining run is
    expressed as a whole number of "unit" widths.

    Args:
        img: Grayscale barcode crop as accepted by `cv2.resize` /
            `cv2.adaptiveThreshold` (the caller passes the output of
            `crop_barcode` on a grayscale image).

    Returns:
        A list of ints in 1..4, one per retained run in left-to-right order.
        An empty list is returned when the scanline does not contain enough
        usable runs (this used to raise IndexError / produce a NaN warning).
    """
    from itertools import groupby

    ink_spread = 4.0   # px added to black runs and removed from white runs
    min_run_width = 5  # corrected runs at or below this width are treated as noise
    max_modules = 4    # widest bar/space, in unit widths, that is reported

    upscaled = cv2.resize(img, None, fx=4, fy=2, interpolation=cv2.INTER_CUBIC)
    binary = cv2.adaptiveThreshold(
        upscaled, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 81, 4
    )

    # Only the middle row of the binarised crop is analysed.
    scanline = binary[binary.shape[0] // 2, :]

    # Run-length encode the scanline as (is_black, run_length); anything that is
    # not pure white (255) counts as black.
    runs = [
        (value != 255, sum(1 for _ in group))
        for value, group in groupby(scanline.tolist())
    ]

    # Drop a black run touching either image border, then the outermost
    # remaining run on each side. Slicing (instead of pop) keeps degenerate
    # scanlines from raising IndexError.
    if runs and runs[0][0]:
        runs.pop(0)
    if runs and runs[-1][0]:
        runs.pop()
    runs = runs[1:-1]

    # Width correction: black runs are widened and white runs narrowed by the
    # same amount (white runs never drop below 0.1).
    # NOTE(review): presumably compensates for bars thinning during thresholding - confirm.
    corrected = [
        length + ink_spread if is_black else max(length - ink_spread, 0.1)
        for is_black, length in runs
    ]

    usable = [width for width in corrected if width > min_run_width]
    if not usable:
        return []

    # The unit (single-module) width is the mean of the narrowest 10% of runs,
    # using at least one run.
    ascending = sorted(usable)
    sample_size = max(1, len(ascending) // 10)
    unit_width = float(np.mean(ascending[:sample_size]))

    # Quantise every run to a whole number of modules, clamped to 1..max_modules.
    return [
        min(max_modules, max(1, int(round(width / unit_width))))
        for width in usable
    ]

In [8]:
def _load_label_font(size=40):
    """Return a font for overlay labels without failing when Arial is missing.

    "arial.ttf" is tried first, then "DejaVuSans.ttf"; if neither can be opened
    Pillow's built-in default font is returned instead of propagating OSError.
    """
    for font_name in ("arial.ttf", "DejaVuSans.ttf"):
        try:
            return ImageFont.truetype(font_name, size)
        except OSError:
            continue
    return ImageFont.load_default()


def _hue_color(index, total):
    """Return the RGB tuple of the `index`-th of `total` evenly spaced, fully saturated hues."""
    red, green, blue = colorsys.hsv_to_rgb(index / total, 1.0, 1.0)
    return int(red * 255), int(green * 255), int(blue * 255)


def _binary_mask(mask_tensor):
    """Convert one predicted mask to a uint8 array holding 255 where it is positive, else 0.

    If the result has more than two dimensions, its first entry along axis 0 is used.
    """
    mask = (mask_tensor.cpu().numpy() > 0).astype("uint8") * 255
    return mask[0] if mask.ndim > 2 else mask


def _draw_detection(draw, layer, box, mask, color, label, font):
    """Draw one detection on `layer`: box outline, translucent mask tint, then the label."""
    x1, y1, x2, y2 = box
    draw.rectangle((x1, y1, x2, y2), outline=color, width=5)

    # Paste a semi-transparent colour wherever the mask is set.
    tint = Image.new("RGBA", layer.size, color + (100,))
    layer.paste(tint, (0, 0), Image.fromarray(mask))

    # White label text on a filled background, anchored just above the box corner.
    text_bbox = draw.textbbox((x1, y1 - 5), label, font=font)
    draw.rectangle(text_bbox, fill=color)
    draw.text((x1, y1 - 5), label, fill=(255, 255, 255), font=font)


def _minor_axis(mask):
    """Return `(unit_vector, std_dev)` along the direction of least spread of a mask.

    The positive pixels of `mask` are treated as (x, y) points; the eigenvector
    of their covariance matrix with the smallest eigenvalue is the direction,
    and the square root of that eigenvalue is the spread. Returns None when the
    mask has fewer than two positive pixels (the covariance is undefined).
    """
    rows, cols = np.nonzero(mask > 0)
    if rows.size < 2:
        return None
    points = np.stack([cols, rows], axis=1)
    # The covariance matrix is symmetric, so eigh applies; it returns real
    # eigenvalues in ascending order, hence index 0 is the smallest.
    evals, evecs = np.linalg.eigh(np.cov(points.T))
    # Clamp tiny negative round-off before taking the square root.
    return evecs[:, 0], float(np.sqrt(max(evals[0], 0.0)))


def process_image(input_image, text_prompt, word):
    """Detect the requested objects and their barcodes and render both as overlays.

    Step 1 prompts the module-level SAM3 `model` once per requested object name
    and draws every returned box, mask and score. Step 2 prompts for barcodes,
    keeps those whose box centre lies strictly inside a detected object box,
    decodes them with `crop_barcode` / `bar_value`, and draws an arrow along the
    barcode mask's minor axis, oriented away from the owning object's centre.
    The results are also printed / displayed in the notebook.

    Args:
        input_image: PIL image to analyse (the UI passes `gr.Image(type="pil")`).
        text_prompt: Comma-separated object names. Empty input or "all" is
            replaced by the single generic prompt "item".
        word: Object name typed in the "Object vs Barcode" field.

    Returns:
        Tuple `(img0, img1, word)`: the object-detection overlay, the barcode
        overlay (both RGBA PIL images) and `word` unchanged.

    Raises:
        gr.Error: If no image was supplied.
    """
    if input_image is None:
        raise gr.Error("Please upload an image before running the analysis.")

    image = input_image.convert("RGB")
    processor = Sam3Processor(model, confidence_threshold=0.5)
    inference_state = processor.set_image(image)
    font = _load_label_font(40)

    # 1. Detect and Identify the objects requested by the user on the image: #
    # Empty fragments (e.g. from a trailing comma) are dropped rather than sent to the model.
    objects = [part.strip() for part in (text_prompt or "").split(",") if part.strip()]
    if not objects or objects == ["all"]:
        objects = ["item"]
    # With the generic prompt every instance gets its own colour; otherwise one colour per class.
    color_per_instance = objects == ["item"]

    img0 = input_image.convert("RGBA")
    layer = Image.new("RGBA", img0.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(layer)

    # Every detected instance is kept as (name, box). Keying by class name alone
    # retained only the last instance of each class, so barcodes on the other
    # instances were never matched.
    item_boxes = []

    for i, obj in enumerate(objects):
        class_color = _hue_color(i, len(objects))

        processor.reset_all_prompts(inference_state)
        inference_state = processor.set_text_prompt(state=inference_state, prompt=obj)
        boxes = inference_state["boxes"]
        scores = inference_state["scores"]
        masks = inference_state["masks"]

        for j, box in enumerate(boxes):
            color = _hue_color(j, len(boxes)) if color_per_instance else class_color

            # Boxes are used directly as pixel-space (x1, y1, x2, y2) corners.
            item_box = tuple(box[n].item() for n in range(4))
            item_boxes.append((obj, item_box))

            label = f"{obj} {j} ({scores[j].item():.2f})"
            _draw_detection(draw, layer, item_box, _binary_mask(masks[j]), color, label, font)

    img0 = Image.alpha_composite(img0, layer)

    # 2. Detect and decode the Code128 barcodes of the requested objects and compute their normal surface vector: #
    processor.reset_all_prompts(inference_state)
    inference_state = processor.set_text_prompt(state=inference_state, prompt="tight crop of black vertical barcode lines, ink pattern only")

    img1 = input_image.convert("RGBA")
    img1_gray = np.array(img1.convert("L"))
    layer = Image.new("RGBA", img1.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(layer)

    boxes = inference_state["boxes"]
    masks = inference_state["masks"]

    # Decoded module widths per drawn barcode label (not yet used in the output).
    barcode_map = {}

    for k, barcode in enumerate(boxes):
        color = _hue_color(k, len(boxes))

        barcode_box = tuple(barcode[n].item() for n in range(4))
        bcx1, bcy1, bcx2, bcy2 = barcode_box
        bc_center_x = (bcx1 + bcx2) / 2
        bc_center_y = (bcy1 + bcy2) / 2

        # A barcode belongs to every item whose box strictly contains its centre.
        owners = [
            (name, item_box)
            for name, item_box in item_boxes
            if item_box[0] < bc_center_x < item_box[2] and item_box[1] < bc_center_y < item_box[3]
        ]
        if not owners:
            continue

        # Per-barcode work is done once, not once per owning item.
        mask_data = _binary_mask(masks[k])
        try:
            barcode_value = bar_value(crop_barcode(img1_gray, mask_data))
        except (ValueError, IndexError, cv2.error) as exc:
            # One undecodable barcode should not abort the whole request.
            print(f"Could not decode barcode {k}: {exc}")
            barcode_value = []
        axis = _minor_axis(mask_data)

        for name, (ix1, iy1, ix2, iy2) in owners:
            label = f"{name} {k}"
            _draw_detection(draw, layer, barcode_box, mask_data, color, label, font)
            barcode_map[label] = barcode_value

            if axis is None:
                continue
            normal_v, magnitude = axis
            arrow_len = magnitude * 2.0

            # Vector from the centre of the item to the centre of its barcode.
            parent_cx = (ix1 + ix2) / 2
            parent_cy = (iy1 + iy2) / 2
            vec_outward = np.array([bc_center_x - parent_cx, bc_center_y - parent_cy])

            # An eigenvector's sign is arbitrary. Going from the item's centre to
            # the barcode on its surface points out of the object, so the arrow is
            # flipped whenever it points against that outward vector.
            direction = -normal_v if np.dot(normal_v, vec_outward) < 0 else normal_v

            p_end_x = float(bc_center_x + direction[0] * arrow_len)
            p_end_y = float(bc_center_y + direction[1] * arrow_len)

            # Without depth information this is only a 2-D approximation of the surface normal.
            draw.line([(bc_center_x, bc_center_y), (p_end_x, p_end_y)], fill="black", width=4)
            draw.ellipse((p_end_x - 8, p_end_y - 8, p_end_x + 8, p_end_y + 8), fill="black")

    img1 = Image.alpha_composite(img1, layer)

    # Hard-coded aliases for a few known names; any other word maps to itself.
    if word == "Croc": rel = "Shoe"
    elif word == "Spanner": rel = "Wrench"
    elif word == "X001BZ87F1": rel = "Box"
    else: rel = word

    print("______")
    print("OUTPUT")
    print("______")
    print()
    print("Detect and Identify the Listed Items")
    display(img0)
    print()
    print("Barcodes's Detection and Normal Vector")
    display(img1)
    print()
    print("Relationship")
    print(word, "->", rel)
    print("______")

    # NOTE(review): `rel` is only printed; the third return value is the raw input
    # `word`, so the UI's output textbox echoes its input. Confirm whether `rel`
    # (or a decoded barcode from `barcode_map`) was meant to be returned instead.
    return img0, img1, word

In [9]:
# Gradio front end: the left column collects the image and the two text inputs,
# the right column shows the two annotated images and the text result.
with gr.Blocks(title = "SAM3 Barcode Detector") as demo:
    gr.Markdown("# 🕵️ SAM3 Object & Barcode Detector")
    gr.Markdown("Upload an image and list the objects")
    
    with gr.Row():
        with gr.Column():
            in_image = gr.Image(type = "pil", label = "Imatge Input")
            in_text1 = gr.Textbox(label = "Objects to detect (first capital letter and separated by a comma)", 
                                 value = "Bottle, Box, Mug, Spanner, Scissors, Screwdriver, Croc")
            in_text2 = gr.Textbox(label = "Object vs Barcode", value = "Bottle")
            btn = gr.Button("Analyze", variant = "primary")
        
        with gr.Column():
            out_image_1 = gr.Image(type = "pil", label = "Object Detection Output")
            out_image_2 = gr.Image(type = "pil", label = "Barcodes and Normal Vector Output")
            out_text = gr.Textbox(label = "Object vs Barcode")


    # The three inputs map positionally onto process_image's parameters and its
    # three return values onto the three outputs.
    btn.click(fn = process_image, inputs = [in_image, in_text1, in_text2], outputs = [out_image_1, out_image_2, out_text])

if __name__ == "__main__":
    # share=True also requests a temporary public link from Gradio; use
    # share=False to keep the demo reachable only from the local machine.
    demo.launch(share = True)