In [6]:
from google import genai
from google.genai import types
import os

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
from google import genai

client = genai.Client(api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY"))

# myfile = client.files.upload(file="output.mp4")
myfile = client.files.upload(file="videos/output_og.mp4")
# myfile = client.files.upload(file="output_720p_5fps.mp4")

In [7]:
myfile

File(name='files/8lpl7yr81ifd', display_name=None, mime_type='video/mp4', size_bytes=18943012, create_time=datetime.datetime(2025, 6, 21, 23, 37, 24, 376099, tzinfo=TzInfo(UTC)), expiration_time=datetime.datetime(2025, 6, 23, 23, 37, 24, 327499, tzinfo=TzInfo(UTC)), update_time=datetime.datetime(2025, 6, 21, 23, 37, 24, 376099, tzinfo=TzInfo(UTC)), sha256_hash='ZDZlMGQwZTkyNGEzNDZhZWYxZDQ3YjY5OGFjYTJkOWRjMzVlZjZmZmUyNjFmN2U2Y2NkZGNhYzZlOTY1M2I0Yg==', uri='https://generativelanguage.googleapis.com/v1beta/files/8lpl7yr81ifd', download_uri=None, state=<FileState.PROCESSING: 'PROCESSING'>, source=<FileSource.UPLOADED: 'UPLOADED'>, video_metadata=None, error=None)

In [12]:
from typing import Optional, List
from pydantic import BaseModel, Field

from enum import Enum

class OperationType(str, Enum):
    ASPIRATE = "aspirate"
    DISPENSE = "dispense"

class WarningNote(BaseModel):
    """Represents a safety warning or important note for a pipette operation"""
    message: str = Field(..., description="Warning or note message")

class FrameEvent(BaseModel):
    """Represents a significant event observed at a specific timestamp. Used to make sure you have all the data you need for the later parts of the analysis."""
    timestamp: float = Field(..., description="Time in seconds when this event occurs")
    description: str = Field(..., description="Detailed description of what is happening at this moment")

class PipetteOperation(BaseModel):
    """Represents a single aspirate or dispense operation - a subset of frame events"""
    thinking: str = Field(..., description="Thinking behind the operation")
    timestamp: float = Field(..., description="Time in seconds when this operation occurs")
    operation_type: OperationType = Field(..., description="Either 'aspirate' or 'dispense'")
    source_destination: str = Field(..., description="Source container for aspirate, destination container for dispense")
    amount: Optional[str] = Field(None, description="Volume amount with units (e.g., '70µl')")
    warnings: List[WarningNote] = Field(default_factory=list, description="Safety warnings or important notes for this operation")

class ObjectiveType(str, Enum):
    PREPARATION = "preparation"
    ANALYSIS = "analysis"
    PURIFICATION = "purification"
    OTHER = "other"

class ExtractedProtocol(BaseModel):
    """General protocol information with thinking"""
    thinking: str = Field(..., description="High-level reasoning and purpose of the protocol")
    title: str = Field(..., description="Title or name of the procedure")
    objective: ObjectiveType = Field(..., description="What this procedure aims to accomplish")

class VideoAnalysis(BaseModel):
    """Complete video analysis with frame-by-frame breakdown and extracted pipette operations"""
    thinking: str = Field(..., description="High-level reasoning and purpose of the protocol")
    frame_by_frame_analysis: List[FrameEvent] = Field(..., description="Chronological list of all significant events observed in the video")
    extracted_protocol: ExtractedProtocol = Field(..., description="Protocol information with thinking")
    pipette_operations: List[PipetteOperation] = Field(..., description="Subset of frame events that are specifically aspirate/dispense operations with detailed analysis")

In [13]:
prompt = """Analyze this video using a two-step approach to extract ALL directly observable events and pipetting operations.

ANALYSIS APPROACH:
Step 1: Frame-by-frame chronological analysis of ALL significant events
Step 2: Extract pipette operations as a focused subset of the frame events

CRITICAL INSTRUCTIONS:
1. First, document EVERY significant event you observe chronologically
2. Then, identify which of those events are specifically pipette operations (aspirate/dispense)
3. The pipette operations should be a subset of your frame-by-frame analysis
4. Do not skip any actions - capture practice attempts, errors, corrections, and successes

Your tasks:

1. EXTRACT PROTOCOL INFORMATION:
- Only include titles, objectives, or descriptions that are explicitly shown or stated in the video
- Base thinking on what is directly observable, not on assumptions about intent

2. FRAME-BY-FRAME ANALYSIS:
- Document ALL significant events chronologically with precise timestamps
- Include: volume setting changes, pipette handling, liquid transfers, tip changes, errors, corrections
- Mark whether each event is pipette-related (aspirate/dispense) or not
- Capture the complete narrative of what happens in the video
- Include failed attempts, practice runs, and error corrections as separate events

3. PIPETTE OPERATIONS (SUBSET OF FRAME EVENTS):
- Extract only the aspirate/dispense operations from your frame-by-frame analysis
- Each operation should correspond to an event you already documented
- Record exact timestamps for EVERY aspirate/dispense action
- Track complete sequences: aspirate → dispense pairs
- Include successful operations, failed attempts, error corrections, and practice runs
- Only identify containers that are clearly visible and distinguishable
- Only record volume amounts that are explicitly shown on pipette displays
- Note warnings for issues that are visibly demonstrated (air bubbles, incorrect technique)

VALIDATION:
- Every pipette operation timestamp must match a pipette-related event in your frame analysis
- Every timestamp must correspond to a clearly visible action
- Every container identification must be based on what's visually apparent
- Every volume must be readable from pipette display or clearly stated
- Every warning must be based on observable technical errors or safety issues

The goal is comprehensive documentation where pipette operations are clearly identified as the critical subset of all video events."""

In [14]:
from google import genai

client = genai.Client(api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY"))
contents = [myfile, prompt]

print(
    client.models.count_tokens(
        model="models/gemini-2.5-pro", contents=contents
    )
)
# ( e.g., total_tokens: 300 )

total_tokens=38885 cached_content_token_count=None


In [15]:
response = client.models.generate_content(
    model="models/gemini-2.5-pro",
    contents=contents,
    config={
        "response_mime_type": "application/json",
        "response_schema": VideoAnalysis,
    },
)

print(response.text)

{
  "thinking": "The user is performing a pipetting protocol that involves transferring a yellow liquid from a paper cup into a test tube. The video captures multiple steps, including setting the pipette volume, practice aspirations and dispenses, and repeated transfers into the same test tube. Volume is adjusted mid-protocol from 30µl to 20µl. A visual 'thumbs down' cue indicates at least one of the dispense operations was considered an error, suggesting this might be a training or practice session.",
  "frame_by_frame_analysis": [
    {
      "timestamp": 0.0,
      "description": "A person attaches a pipette tip to a micropipette."
    },
    {
      "timestamp": 2.5,
      "description": "The micropipette volume is shown set to '030', which corresponds to 30µl."
    },
    {
      "timestamp": 9.2,
      "description": "The user aspirates an unknown amount of yellow liquid into the tip for practice."
    },
    {
      "timestamp": 17.5,
      "description": "The user dispenses the

In [None]:
print(response.usage_metadata)

In [None]:
response.parsed