In [1]:
from google import genai
from google.genai import types
import os

from dotenv import load_dotenv

# Populate the process environment from a local .env file before the key is read below.
load_dotenv()
from google import genai  # NOTE(review): repeats the import at the top of this cell

# API client, authenticated with whatever GOOGLE_GENERATIVE_AI_API_KEY holds in the
# environment (os.getenv yields None when the variable is unset).
client = genai.Client(
    api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY"),
)

# Upload the clip under analysis; `myfile` is the handle passed to the model in later cells.
# Other clips tried while experimenting: "output.mp4", "output_720p_5fps.mp4".
myfile = client.files.upload(
    file="videos/output_long_again_2.mp4",
)

In [3]:
# Check what kind of object the upload call returned for the video.
type(myfile)

google.genai.types.File

In [12]:
from typing import Optional, List
from pydantic import BaseModel, Field

from enum import Enum

# Closed set of pipette actions; used as the type of PipetteOperation.operation_type.
# The `str` mix-in makes every member a string as well as an Enum member.
# No docstring is added on purpose: only `#` comments, so the class object itself is unchanged.
class OperationType(str, Enum):
    ASPIRATE = "aspirate"  # paired with a source container in PipetteOperation
    DISPENSE = "dispense"  # paired with a destination container in PipetteOperation

class WarningNote(BaseModel):
    """Represents a safety warning or important note for a pipette operation"""
    # The docstring and description text are kept verbatim: pydantic exposes both
    # through the model's JSON schema, so they are runtime text, not just comments.

    # Required free-text body of the warning.
    message: str = Field(
        default=...,
        description="Warning or note message",
    )

class FrameEvent(BaseModel):
    """Represents a significant event observed at a specific timestamp. Used to make sure you have all the data you need for the later parts of the analysis."""
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the JSON schema that is handed to the model.
    #
    # NOTE(review): the analysis prompt asks to "mark whether each event is
    # pipette-related", but this model has no field to carry that mark.

    # Both fields are required (Ellipsis default).
    timestamp: float = Field(
        default=...,
        description="Time in seconds when this event occurs",
    )
    description: str = Field(
        default=...,
        description="Detailed description of what is happening at this moment",
    )

class PipetteOperation(BaseModel):
    """Represents a single aspirate or dispense operation - a subset of frame events"""
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the JSON schema that is handed to the model.

    # --- required fields -------------------------------------------------------
    # `thinking` is declared first, ahead of the structured fields.
    thinking: str = Field(
        default=...,
        description="Thinking behind the operation",
    )
    timestamp: float = Field(
        default=...,
        description="Time in seconds when this operation occurs",
    )
    operation_type: OperationType = Field(
        default=...,
        description="Either 'aspirate' or 'dispense'",
    )
    # One field serves both roles: source when aspirating, destination when dispensing.
    source_destination: str = Field(
        default=...,
        description="Source container for aspirate, destination container for dispense",
    )

    # --- optional fields -------------------------------------------------------
    # Kept as a string so the unit travels with the number; None when no amount is given.
    amount: Optional[str] = Field(
        default=None,
        description="Volume amount with units (e.g., '70µl')",
    )
    # default_factory gives every instance its own empty list.
    warnings: List[WarningNote] = Field(
        default_factory=list,
        description="Safety warnings or important notes for this operation",
    )

# Coarse category for what a procedure is for; used as the type of ExtractedProtocol.objective.
# The `str` mix-in makes every member a string as well as an Enum member.
# No docstring is added on purpose: only `#` comments, so the class object itself is unchanged.
class ObjectiveType(str, Enum):
    PREPARATION = "preparation"
    ANALYSIS = "analysis"
    PURIFICATION = "purification"
    OTHER = "other"  # catch-all when none of the categories above applies

class ExtractedProtocol(BaseModel):
    """General protocol information with thinking"""
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the JSON schema that is handed to the model.

    # All three fields are required; `thinking` is declared before the summary fields.
    thinking: str = Field(
        default=...,
        description="High-level reasoning and purpose of the protocol",
    )
    title: str = Field(
        default=...,
        description="Title or name of the procedure",
    )
    # Restricted to the ObjectiveType categories.
    objective: ObjectiveType = Field(
        default=...,
        description="What this procedure aims to accomplish",
    )

class VideoAnalysis(BaseModel):
    """Complete video analysis with frame-by-frame breakdown and extracted pipette operations"""
    # Top-level response schema for the whole-video analysis request.
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the JSON schema that is handed to the model.
    # Declaration order is kept: thinking, full timeline, protocol summary, pipette subset.

    thinking: str = Field(
        default=...,
        description="High-level reasoning and purpose of the protocol",
    )
    frame_by_frame_analysis: List[FrameEvent] = Field(
        default=...,
        description="Chronological list of all significant events observed in the video",
    )
    extracted_protocol: ExtractedProtocol = Field(
        default=...,
        description="Protocol information with thinking",
    )
    pipette_operations: List[PipetteOperation] = Field(
        default=...,
        description="Subset of frame events that are specifically aspirate/dispense operations with detailed analysis",
    )

In [13]:
# Instruction text for the whole-video analysis. It describes a two-step approach (full
# chronological event list, then the aspirate/dispense subset) and is sent together with
# the uploaded video. The literal is runtime text: edit it only when the instructions
# themselves should change.
prompt = """Analyze this video using a two-step approach to extract ALL directly observable events and pipetting operations.

ANALYSIS APPROACH:
Step 1: Frame-by-frame chronological analysis of ALL significant events
Step 2: Extract pipette operations as a focused subset of the frame events

CRITICAL INSTRUCTIONS:
1. First, document EVERY significant event you observe chronologically
2. Then, identify which of those events are specifically pipette operations (aspirate/dispense)
3. The pipette operations should be a subset of your frame-by-frame analysis
4. Do not skip any actions - capture practice attempts, errors, corrections, and successes

Your tasks:

1. EXTRACT PROTOCOL INFORMATION:
- Only include titles, objectives, or descriptions that are explicitly shown or stated in the video
- Base thinking on what is directly observable, not on assumptions about intent

2. FRAME-BY-FRAME ANALYSIS:
- Document ALL significant events chronologically with precise timestamps
- Include: volume setting changes, pipette handling, liquid transfers, tip changes, errors, corrections
- Mark whether each event is pipette-related (aspirate/dispense) or not
- Capture the complete narrative of what happens in the video
- Include failed attempts, practice runs, and error corrections as separate events

3. PIPETTE OPERATIONS (SUBSET OF FRAME EVENTS):
- Extract only the aspirate/dispense operations from your frame-by-frame analysis
- Each operation should correspond to an event you already documented
- Record exact timestamps for EVERY aspirate/dispense action
- Track complete sequences: aspirate → dispense pairs
- Include successful operations, failed attempts, error corrections, and practice runs
- Only identify containers that are clearly visible and distinguishable
- Only record volume amounts that are explicitly shown on pipette displays
- Note warnings for issues that are visibly demonstrated (air bubbles, incorrect technique)

VALIDATION:
- Every pipette operation timestamp must match a pipette-related event in your frame analysis
- Every timestamp must correspond to a clearly visible action
- Every container identification must be based on what's visually apparent
- Every volume must be readable from pipette display or clearly stated
- Every warning must be based on observable technical errors or safety issues

The goal is comprehensive documentation where pipette operations are clearly identified as the critical subset of all video events."""

In [14]:
from google import genai

# NOTE(review): `client` was already created in the first cell; this rebinds it using
# the same environment variable.
client = genai.Client(
    api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY"),
)

# Request payload: the uploaded video followed by the instruction text. The same list
# is reused by the generation cell.
contents = [myfile, prompt]

# Print the token count the API reports for this payload and model.
print(client.models.count_tokens(contents=contents, model="models/gemini-2.5-pro"))

total_tokens=38885 cached_content_token_count=None


In [None]:
# Run the analysis. The config requests a JSON reply and supplies the VideoAnalysis
# model as the response schema.
response = client.models.generate_content(
    contents=contents,
    model="models/gemini-2.5-pro",
    config=dict(
        response_mime_type="application/json",
        response_schema=VideoAnalysis,
    ),
)

# Show the raw text of the reply.
print(response.text)

{
  "thinking": "The user is performing a pipetting protocol that involves transferring a yellow liquid from a paper cup into a test tube. The video captures multiple steps, including setting the pipette volume, practice aspirations and dispenses, and repeated transfers into the same test tube. Volume is adjusted mid-protocol from 30µl to 20µl. A visual 'thumbs down' cue indicates at least one of the dispense operations was considered an error, suggesting this might be a training or practice session.",
  "frame_by_frame_analysis": [
    {
      "timestamp": 0.0,
      "description": "A person attaches a pipette tip to a micropipette."
    },
    {
      "timestamp": 2.5,
      "description": "The micropipette volume is shown set to '030', which corresponds to 30µl."
    },
    {
      "timestamp": 9.2,
      "description": "The user aspirates an unknown amount of yellow liquid into the tip for practice."
    },
    {
      "timestamp": 17.5,
      "description": "The user dispenses the

In [None]:
# Print the usage metadata attached to the generation response.
print(response.usage_metadata)

In [None]:
# Last expression of the cell, so the notebook renders the response's `parsed` attribute.
# Presumably this is the reply deserialized into the response_schema model — TODO confirm
# against the SDK documentation.
response.parsed

In [None]:
# SIMPLIFIED STEP-BY-STEP APPROACH
# Let's build this thoughtfully, piece by piece

from typing import List, Optional
from pydantic import BaseModel, Field

# Step 2: Reagent model - the smallest building block, a name plus a volume
class Reagent(BaseModel):
    """Simple reagent with name and volume"""
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the model's JSON schema.

    # Both fields are required strings; the volume keeps its unit inside the string.
    name: str = Field(
        default=...,
        description="Name of the reagent (e.g., 'Buffer A', 'Sample 1')",
    )
    volume: str = Field(
        default=...,
        description="Volume with units (e.g., '10µl')",
    )

# Step 3: Well model - an identifier plus the reagents it holds
class Well(BaseModel):
    """Well with reagents inside"""
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the model's JSON schema.

    # Required identifier of the well or tube.
    well_id: str = Field(
        default=...,
        description="Well position (e.g., 'A1', 'tube1')",
    )
    # Optional; default_factory gives every instance its own empty list.
    reagents: List[Reagent] = Field(
        default_factory=list,
        description="List of reagents in this well",
    )

# Step 4: Core structure - every updatable section is paired with a boolean trigger flag
# NOTE(review): this reuses the name of the VideoAnalysis model defined earlier in the
# notebook; once this cell has run, the earlier schema is shadowed.
class VideoAnalysis(BaseModel):
    """Simple base model with core components and triggered flags"""
    # Docstring and descriptions are reproduced verbatim because pydantic publishes
    # them in the model's JSON schema. Field order is unchanged.

    # Always required.
    thinking: str = Field(
        default=...,
        description="Your reasoning about what you observed happening in this video segment. Explain what pipetting operations occurred and how they relate to the protocol.",
    )

    # --- goal state: flag + payload ---------------------------------------------
    goal_state_triggered: bool = Field(
        default=False,
        description="TRIGGER FLAG: Set to true if you learned new information about what the final protocol should achieve, requiring you to update or define the goal state.",
    )
    goal_state: Optional[List[Well]] = Field(
        default=None,
        description="The target end state - what each well should contain when the entire protocol is complete. List all wells with their intended reagents and volumes. Only fill this out if goal_state_triggered is True.",
    )

    # --- current state: flag + payload ------------------------------------------
    current_state_triggered: bool = Field(
        default=False,
        description="TRIGGER FLAG: Set to true if the video segment showed pipetting operations that changed the contents of any wells, requiring you to update the current state.",
    )
    current_state: Optional[List[Well]] = Field(
        default=None,
        description="The actual current state right now - what each well currently contains after the operations observed in this video segment. Update reagent lists and volumes based on what you saw. Only fill this out if current_state_triggered is True.",
    )

    # --- protocol log: flag + payload -------------------------------------------
    protocol_log_triggered: bool = Field(
        default=False,
        description="TRIGGER FLAG: Set to true if you observed significant events that should be documented in the running protocol log for future reference.",
    )
    protocol_log: Optional[str] = Field(
        default=None,
        description="APPEND-ONLY running verbal description of significant events across all video segments. This will be APPENDED to existing log, not replaced. Only add information that is relevant and useful for later protocol steps (e.g., 'Added 10µl Buffer A to A1', 'Observed air bubble in tip during aspiration', 'Changed pipette volume to 20µl'). Be selective - do not add redundant or trivial information that clutters the log. Only fill this out if protocol_log_triggered is True.",
    )

    # --- warnings: flag + payload -----------------------------------------------
    warnings_triggered: bool = Field(
        default=False,
        description="TRIGGER FLAG: Set to true if you observed any problems, errors, or concerning techniques in this video segment that require generating new warnings.",
    )
    warnings: Optional[List[str]] = Field(
        default=None,
        description="List of issues, errors, or concerns you observed in this video segment (e.g., 'Wrong volume dispensed into A1', 'Air bubble in pipette tip', 'Used wrong reagent'). Only fill this out if warnings_triggered is True. Will be immediately displayed to the user.",
    )
    

# Example usage: one segment in which a single transfer was observed and nothing went
# wrong. Sections whose trigger flag is False keep their payload at None.
example_analysis = VideoAnalysis(
    thinking="User aspirated 10µl of Buffer A and dispensed it into well A1. This appears to be the first step of a multi-step protocol.",
    goal_state_triggered=False,
    goal_state=None,  # Not updating goal state this segment
    current_state_triggered=True,  # Well A1 contents changed, so update current state
    current_state=[Well(well_id="A1", reagents=[Reagent(name="Buffer A", volume="10µl")])],
    protocol_log_triggered=True,  # Significant event to log
    protocol_log="Added 10µl Buffer A to well A1.",  # This will be appended to existing log
    warnings_triggered=False,
    warnings=None,  # No problems observed this segment
)

# model_dump() is the Pydantic v2 replacement for the deprecated BaseModel.dict();
# it produces the same plain-dict representation without a deprecation warning.
print("Example with protocol log:", example_analysis.model_dump())

In [None]:
def create_stateful_prompt(current_goal_state, current_current_state, current_protocol_log, current_warnings, num_frames=5):
    """
    Build the lab-copilot prompt that carries the running state into the next model call.

    The four state arguments are interpolated into the "CURRENT STATE" section of the
    prompt. Any falsy value (None, an empty list, an empty string, ...) is replaced by a
    fixed placeholder sentence, so callers can pass None before anything is known.

    Args:
        current_goal_state: Goal state known so far; falsy -> "Not yet defined".
        current_current_state: Current well contents known so far; falsy ->
            "Empty - no reagents added yet".
        current_protocol_log: Running log text; falsy -> "No previous events logged".
        current_warnings: Warnings currently active; falsy -> "No current warnings".
        num_frames: Number of consecutive frames sent alongside the prompt. It is
            interpolated wherever the prompt mentions the frame count. The default of 5
            reproduces the original fixed wording exactly.

    Returns:
        str: The complete prompt text.
    """
    # Render the state section separately so the fallback logic stays readable and the
    # significant trailing whitespace is visible inside a quoted literal.
    state_summary = "\n".join(
        [
            f"- Goal State: {current_goal_state or 'Not yet defined'}",
            # The two trailing spaces belong to the established prompt text; keep them.
            f"- Current State: {current_current_state or 'Empty - no reagents added yet'}  ",
            f"- Protocol Log: {current_protocol_log or 'No previous events logged'}",
            f"- Active Warnings: {current_warnings or 'No current warnings'}",
        ]
    )

    prompt = f"""LAB COPILOT - REAL-TIME PROCEDURE INTERPRETATION

You are a lab copilot helping interpret an ongoing laboratory procedure. You're seeing a snapshot of what's happening right now through {num_frames} consecutive frames.

CURRENT STATE (what you know so far):
{state_summary}

YOUR ROLE:
You're in the middle of helping with a task. Analyze these {num_frames} frames to understand what the person is doing right now and update your knowledge accordingly.

Use the triggered flags to indicate what aspects of your understanding need updating:

1. THINKING: Always interpret what you observe in these frames
   - What is the person trying to accomplish?
   - What pipetting actions are they performing?
   - How does this fit with what you've seen before?

2. GOAL STATE: Update if you learn more about what they're ultimately trying to achieve
   - Set goal_state_triggered=True if you gain new insight into the end goal
   - Leave goal_state_triggered=False if their objective is still unclear

3. CURRENT STATE: Update if their actions changed what's in the wells
   - Set current_state_triggered=True if reagents were added/moved
   - Calculate new volumes based on what you observed
   - Leave current_state_triggered=False if no transfers occurred

4. PROTOCOL LOG: Record significant events that matter for the ongoing task
   - Set protocol_log_triggered=True if something important happened
   - Add brief, useful notes (will be appended to your running memory)
   - Skip routine actions that don't change the situation
   - Leave protocol_log_triggered=False if nothing noteworthy occurred

5. WARNINGS: Alert to problems or issues you notice
   - Set warnings_triggered=True if you spot technique issues or errors
   - Note anything that might affect the procedure's success
   - Leave warnings_triggered=False if everything looks normal

FRAME INTERPRETATION:
- Look across the {num_frames} frames to see the progression of their actions
- Focus on what changed: pipette position, liquid movement, container interactions
- Try to understand the intent behind their actions

IMPORTANT:
- You're helping with an ongoing task - provide relevant, actionable insights
- Only set triggered=True for aspects that actually need updating
- If triggered=False, leave that field as None
- Base everything on what you can directly observe

OUTPUT: Return VideoAnalysis with appropriate triggered flags set."""

    return prompt

# Status banner shown after the prompt builder has been defined; emitted with a single
# print call, one message per line.
print(
    "\n".join(
        (
            "✅ LAB COPILOT PROMPT READY!",
            "🤖 Role: Real-time procedure interpretation",
            "📸 Input: 5 frames from ongoing task",
            "🔄 Output: Updated understanding of what's happening",
        )
    )
)