In [None]:
from google import genai
from google.genai import types
import cv2 
import numpy as np
import io 


In [None]:
# --- GenAI Setup ---
# No credentials are passed explicitly, so the client has to locate them on its
# own (presumably from the environment -- confirm for the installed SDK version).
client = genai.Client()

# Identifier of the model that the generate_content request is sent to.
MODEL_ID = "gemini-robotics-er-1.5-preview"


In [None]:
# --- Camera Capture (Same as previous solution) ---
# Grab a single frame from the webcam and encode it as JPEG bytes so it can be
# attached to the model request.
CAMERA_INDEX = 0  # device index handed to cv2.VideoCapture

cap = cv2.VideoCapture(CAMERA_INDEX)
try:
    if not cap.isOpened():
        raise IOError("Cannot open webcam")
    ret, frame = cap.read()
finally:
    # Always give the device back -- even when opening/reading fails or the
    # cell is interrupted -- so a re-run does not find the camera still held
    # by this kernel.
    cap.release()
if not ret:
    raise IOError("Failed to capture image from camera")

# imencode returns (success_flag, encoded_buffer); the flag is checked before
# the buffer is turned into raw bytes.
ret, buffer = cv2.imencode('.jpg', frame)
if not ret:
    raise IOError("Failed to encode image to JPEG")
image_bytes = buffer.tobytes()


In [None]:
# --- NEW: Queries for Sorting by Size and Debris ---

# Each string below is one category the model is asked to find in the image.
# General size terms or specific variety names both work as category text.

# Beans, graded by apparent size.
bean_size_queries = [
    "large coffee bean",
    "medium coffee bean",
    "small coffee bean (peaberry or undersized)",
]

# Foreign matter that should be picked out and discarded.
debris_queries = [
    "stick",
    "rock or stone",
    "miscellaneous debris",
]

# Final query list: beans first, then debris (order is what ends up in the prompt).
queries = bean_size_queries + debris_queries



In [None]:
# --- Prompt Construction for Size-Based Sorting and Debris Removal ---
# The prompt is deliberately descriptive to guide the model's reasoning.
# NOTE: this is an f-string, so the literal braces of the JSON example must be
# written doubled ({{ and }}). Backslash-escaping them (\{) does not work: the
# brace then opens a replacement field and the cell fails with a SyntaxError.
prompt = f"""
    You are a coffee bean quality control robot.
    Identify and locate all the following objects in the image: {', '.join(queries)}.
    For the coffee beans, try to categorize them by approximate size.
    For sticks, rocks, and debris, identify them clearly so they can be removed.
    
    The answer should follow the json format:
    [{{"point": <point>, "label": <label1>}}, ...]. The points are in
    [y, x] format normalized to 0-1000.
    """



In [None]:
# --- Model Execution ---
print("Sending captured image to Gemini model for size and debris recognition...")

# The request content is the JPEG frame first, followed by the text prompt.
image_part = types.Part.from_bytes(data=image_bytes, mime_type='image/jpeg')

# Generation settings: temperature 0.5 and a thinking budget of 0.
generation_config = types.GenerateContentConfig(
    temperature=0.5,
    thinking_config=types.ThinkingConfig(thinking_budget=0),
)

image_response = client.models.generate_content(
    model=MODEL_ID,
    contents=[image_part, prompt],
    config=generation_config,
)

# The raw text of the reply is printed as-is; it is not parsed here.
print("\n--- Recognition Results (Visual Sorting/Debris) ---")
print(image_response.text)



In [None]:
# --- POST-PROCESSING & ROBOT ACTION LOGIC (Conceptual) ---
# After receiving the JSON, your robot control code (not shown here) would:
# 1. Parse the JSON points and labels.
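# 2. **Move to every point whose label is "stick", "rock or stone", or "miscellaneous debris" and discard the object.**
#    (Match on keywords rather than exact strings -- the model may not echo the query text verbatim.)
# 3. **Move to every point labeled as a large, medium, or small coffee bean and place the bean in the corresponding bin.**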

# --- ADVANCED SORTING (Requires Additional Hardware/Sensors) ---
# For *precise* size and density sorting, the Gemini model provides the visual
# location (the <point>). Your robot would then use its physical tools:
# - **Precise Sizing:** The robot's gripper could attempt to measure the object
#   (if equipped with force/tactile sensors) or use a calibrated vision system
#   (stereo camera/depth sensor) integrated with the robot's control stack to get
#   a precise 3D size *after* the initial coarse classification from Gemini.
# - **Density Sorting:** Density cannot be determined visually. This requires a 
#   physical process, such as dropping the bean into an air classifier or on an 
#   inclined vibrating table (which is what industrial sorters use), or possibly
#   using a calibrated force sensor on a gripper to measure weight (mass) after 
#   calculating the approximate volume.