In [37]:
import cv2
import json
import numpy as np
from IPython.display import display
from google import genai
from google.genai import types
from PIL import Image
import os
import re

In [38]:
API_KEY = os.getenv("GEMINI_API_KEY")
OUTPUT_PATH = "./gemini_extracted_images"
INPUT_FOLDER = "./converted_images"

In [None]:
client = genai.Client(api_key=)

In [40]:
def inference(image, prompt, temp=0.5):
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[prompt, image],
        config=types.GenerateContentConfig(
            temperature=temp
        )
    )

    return response.text

In [41]:
def read_image(filename):
    image = cv2.imread(filename)
    h, w = image.shape[:2]
    return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [42]:
prompt = """
Please analyze the uploaded image and do the following:

1. Identify and draw bounding boxes around three categories:

   * **Text regions** (all paragraphs, headings, labels, and sentences)
   * **Table regions** (structured rows/columns with certification details)
   * **Image regions**

"""

output_prompt = "Return just bounding boxes coordinates and labels: text, image, or table"


In [43]:
def clean_results(data):
    """
    Extract only the JSON portion from model output
    """
    results = data.strip().removeprefix("```json").removesuffix("```").strip()
    try:
        return json.loads(results)
    except json.JSONDecodeError:
        print("⚠️ JSON parse failed, returning empty list.")
        return []

In [44]:
def draw_boxes_with_read_image(pil_image, detections):
    # Convert PIL image to NumPy array in RGB
    image_rgb = np.array(pil_image)
    h, w = image_rgb.shape[:2]

    # Convert RGB to BGR for OpenCV
    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)

    for obj in detections:
        y1, x1, y2, x2 = obj["box_2d"]
        label = obj["label"]

        # Scale coordinates (assuming normalized to 1000)
        y1 = y1 / 1000 * h
        x1 = x1 / 1000 * w
        y2 = y2 / 1000 * h
        x2 = x2 / 1000 * w

        # Ensure coordinates are ordered correctly
        if x1 > x2:
            x1, x2 = x2, x1
        if y1 > y2:
            y1, y2 = y2, y1

        # Draw rectangle and label
        cv2.rectangle(image_bgr, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
        cv2.putText(image_bgr, label, (int(x1), int(y1) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)

    # Convert back to PIL
    image_rgb_out = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    return Image.fromarray(image_rgb_out)

In [45]:
def process_folder(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        filepath = os.path.join(input_folder, filename)
        pil_img = read_image(filepath)
        
        raw_response = inference(pil_img, prompt+output_prompt)
        clean_response = clean_results(raw_response)

        output_img = draw_boxes_with_read_image(pil_img, clean_response)
        save_path = os.path.join(output_folder, filename)
        output_img.save(save_path)

        print(f"Saved annotated image to {save_path}")

In [46]:
process_folder(INPUT_FOLDER, OUTPUT_PATH)

Saved annotated image to ./gemini_extracted_images/page_038.png
Saved annotated image to ./gemini_extracted_images/page_033.png
Saved annotated image to ./gemini_extracted_images/page_013.png
Saved annotated image to ./gemini_extracted_images/page_001.png
⚠️ JSON parse failed, returning empty list.
Saved annotated image to ./gemini_extracted_images/page_041.png
Saved annotated image to ./gemini_extracted_images/page_043.png


KeyboardInterrupt: 