In [31]:
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
import os

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
from google import genai

client = genai.Client(api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY"))

myfile = client.files.upload(file="output.mp4")
# myfile = client.files.upload(file="output_720p_5fps.mp4")

In [32]:
myfile

File(name='files/07kdc79wf02m', display_name=None, mime_type='video/mp4', size_bytes=18369575, create_time=datetime.datetime(2025, 6, 21, 22, 43, 17, 95145, tzinfo=TzInfo(UTC)), expiration_time=datetime.datetime(2025, 6, 23, 22, 43, 17, 44407, tzinfo=TzInfo(UTC)), update_time=datetime.datetime(2025, 6, 21, 22, 43, 17, 95145, tzinfo=TzInfo(UTC)), sha256_hash='NmNmMzI0MWZiOTdiZmQ4MzZhZDE1MjQ2ZGU4MGUxMGY4ODYwZDFhMTgwNDk0NzhiNTZmNjczYTU3MDc5NjU0Yw==', uri='https://generativelanguage.googleapis.com/v1beta/files/07kdc79wf02m', download_uri=None, state=<FileState.PROCESSING: 'PROCESSING'>, source=<FileSource.UPLOADED: 'UPLOADED'>, video_metadata=None, error=None)

In [33]:
from typing import Optional, List
from pydantic import BaseModel, Field

from enum import Enum

class OperationType(str, Enum):
    ASPIRATE = "aspirate"
    DISPENSE = "dispense"

class PipetteOperation(BaseModel):
    """Represents a single aspirate or dispense operation"""
    thinking: str = Field(..., description="Thinking behind the operation")
    timestamp: float = Field(..., description="Time in seconds when this operation occurs")
    operation_type: OperationType = Field(..., description="Either 'aspirate' or 'dispense'")
    source_destination: str = Field(..., description="Source container for aspirate, destination container for dispense")
    amount: Optional[str] = Field(None, description="Volume amount with units (e.g., '70Âµl')")
    warnings: List[Warning] = Field(default_factory=list, description="Safety warnings or important notes for this operation")

class ObjectiveType(str, Enum):
    PREPARATION = "preparation"
    ANALYSIS = "analysis"
    PURIFICATION = "purification"
    OTHER = "other"

class ExtractedProtocol(BaseModel):
    """General protocol information with thinking"""
    thinking: str = Field(..., description="High-level reasoning and purpose of the protocol")
    title: str = Field(..., description="Title or name of the procedure")
    objective: ObjectiveType = Field(..., description="What this procedure aims to accomplish")

class VideoAnalysis(BaseModel):
    """Analysis focused on pipetting operations with timestamps"""
    thinking: str = Field(..., description="High-level reasoning and purpose of the protocol")
    extracted_protocol: ExtractedProtocol = Field(..., description="Protocol information with thinking")
    pipette_operations: List[PipetteOperation] = Field(..., description="Time-stamped aspirate/dispense operations")

In [34]:
prompt = """Analyze this video to extract only directly observable information about pipetting operations. 

CRITICAL: Only report what is clearly visible and verifiable in the video. Do not infer, assume, or add interpretations that could be disputed. Users are concerned about AI hallucinations and need factual, observable data only.

Your task is to:
1. Extract basic protocol information that is directly shown or stated
2. Record precise timestamps of visible aspirate/dispense operations

For protocol extraction:
- Only include titles, objectives, or descriptions that are explicitly shown or stated in the video
- Base thinking on what is directly observable, not on assumptions about intent

For pipette operations:
- Record exact timestamps (in seconds) only when aspirate/dispense actions are clearly visible
- Only specify operation type if the action is unambiguously observable
- Only identify containers that are clearly visible and distinguishable  
- Only record volume amounts that are explicitly shown on pipette displays or stated
- Only note warnings for issues that are visibly demonstrated (e.g., air bubbles, incorrect technique)

Validation requirements:
- Every timestamp must correspond to a clearly visible pipetting action
- Every container identification must be based on what's visually apparent
- Every volume must be readable from pipette display or clearly stated
- Every warning must be based on observable technical errors or safety issues
- Thinking should explain what specific visual evidence supports each observation

Focus strictly on documenting what any viewer could independently verify by watching the video."""

In [36]:
# video_file_name = "output.mp4"
# video_bytes = open(video_file_name, 'rb').read()

contents=types.Content(
    parts=[
        types.Part(
            file_data=types.FileData(file_uri=myfile.uri, mime_type="video/mp4"),
            video_metadata=types.VideoMetadata(fps=10)
        ),
        types.Part(text=prompt)
    ]
)

In [37]:
from google import genai
import time

client = genai.Client(api_key=os.getenv("GOOGLE_GENERATIVE_AI_API_KEY"))
# contents = [myfile.uri, prompt]

print(
    client.models.count_tokens(
        model="models/gemini-2.5-pro", contents=contents
    )
)
# ( e.g., total_tokens: 300 )

total_tokens=165993 cached_content_token_count=None


In [38]:
response = client.models.generate_content(
    model="models/gemini-2.5-pro",
    contents=contents,
    config={
        "response_mime_type": "application/json",
        "response_schema": VideoAnalysis,
    },
)

ServerError: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}

In [None]:
print(response.usage_metadata)