In [None]:
!pip install google-genai Pillow termcolor




Load the GEMINI_API_KEY secret into an env var

In [14]:
from google.colab import userdata
import os
# Copy the Colab secret into the process environment; BrowserAgent later reads
# GEMINI_API_KEY from os.environ when it creates its Gemini client.
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')

Define the class that interacts with the API server. The server can be hosted on Cloud Run or run locally.

In [24]:
import base64
import termcolor
import time
from typing import Any
import requests
from typing import TypeAlias

Screenshot: TypeAlias = bytes

class CloudRunComputer:
    """Connects to a Cloud Run server and uses Chromium there."""

    def __init__(self, api_server: str, screen_size: tuple[int, int] = (1000, 1000)):
        self.api_server = api_server
        if screen_size != (1000, 1000):
            raise ValueError("Only (1000, 1000) screen size is supported.")

    def __enter__(self):
        print("Creating session...")
        start_time = time.time()
        response = requests.post(self.api_server + "sessions", json={"type": "browser"})
        end_time = time.time()
        termcolor.cprint(
            f"Session created in {end_time - start_time:.2f} seconds.",
            color="green",
            attrs=["bold"],
        )
        self._session_id = response.json()["id"]
        print("Session ready.")
        print(
            f"Follow along at: {self.api_server}session.html?session_id={self._session_id}"
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        requests.delete(self.api_server + "sessions/" + self._session_id)

    def _run_command(
        self, command: str, args: dict[str, Any] | None = None
    ) -> Screenshot:
        print(f"Running command: {command} with args: {args}")
        response = requests.post(
            self.api_server + "sessions/" + self._session_id + "/commands",
            json={
                "name": command,
                "args": args,
            },
        )
        response.raise_for_status()
        screenshot_str: str = response.json()["screenshot"]
        screenshot_bytes = base64.b64decode(screenshot_str)
        return screenshot_bytes

    def open_web_browser(self) -> Screenshot:
        return self._run_command("open_web_browser")

    def click_at(self, y, x):
        return self._run_command("click_at", args={"x": x, "y": y})

    def hover_at(self, y, x):
        return self._run_command("hover_at", args={"x": x, "y": y})

    def type_text_at(self, x: int, y: int, text: str) -> Screenshot:
        return self._run_command(
            "type_text_at",
            args={
                "x": x,
                "y": y,
                "text": text,
            },
        )

    def scroll_document(self, direction: str) -> Screenshot:
        return self._run_command("scroll_document", args={"direction": direction})

    def wait_5_seconds(self) -> Screenshot:
        return self._run_command("wait_5_seconds")

    def go_back(self) -> Screenshot:
        return self._run_command("go_back")

    def go_forward(self) -> Screenshot:
        return self._run_command("go_forward")

    def search(self) -> Screenshot:
        return self._run_command("search")

    def navigate(self, url: str) -> Screenshot:
        return self._run_command("navigate", args={"url": url})

    def key_combination(self, keys: str) -> Screenshot:
        return self._run_command("key_combination", args={"keys": keys})

    def screenshot(self) -> Screenshot:
        return self._run_command("screenshot")

Define the system prompts for influencing Gemini behavior

In [17]:
# System prompt passed to the model as `system_instruction`. The screenshot names
# used here (last_action_location_screenshot, current_screenshot) must match the
# labels BrowserAgent.run puts into each WebAgentState(...) turn.
DEVELOPER_INSTRUCTIONS = """
# Action guidelines
Key Guidelines You MUST follow:
- You should use the y x coordinate to interact with the elements you want. You are clicking on the bottom image (current_screenshot) if there are two. Aim for the CENTER of the element.
- If after clicking or typing, the page is the same, the x and y were WRONG. Use DIFFERENT values next time. You will see a red dot on the top image (last_action_location_screenshot) where you clicked or typed. Adjust your x and y to get closer to the correct spot!
- NEVER repeat the exact same action twice. You should almost never scroll twice in a row.
- To input text, *NO* need to click the input element first, directly type content. After typing, the system automatically hits `ENTER` key. Rarely you should click the search button to apply search filters. Try to use simple language when searching. Do not end your search with a special character like `!`.
- You must ensure you type into the box, never type into a button! Sometimes, a search box will only display after clicking a button.
- If the page is the same, your coordinates (y,x) were incorrect. Try different values. Adjust based on the red dot in last_action_location_screenshot.
- Execute only one action per iteration.
- ALWAYS choose a new action if the webpage did not change.
- When a complex Task involves multiple questions, provide an answer only at the very end, after addressing all questions. Flexibly combine your own abilities with the information in the web page. Double check the formatting requirements in the task when providing an answer.
- When a task involves going to another website use the Navigate action to go there directly without going to Google.
- Whenever you need to apply a filter, *make sure you first click on the filter*, and *then click on the appropriate filter option*. *DO NOT assume that the filter will apply automatically after clicking the dropdown menu button. You have to select the option from the dropdown menu*.
- Always make sure you double check the dates whenever you are booking something. Do not move forward until you have the correct date.
- When you are done with the task and have answered the question, give your answer in a new paragraph.

# Thought Guidelines
- Carefully read and reflect on errors. If there was an error, you did something wrong. The system did not fail. You chose the wrong action.
- If you encounter an error on a webpage, think about how to get around it.
- If the webpage did not change, reflect what mistake YOU made. Do not assume a webpage is still loading. Think about changing the coordinates (y,x) based on last_action_location_screenshot.
- Read the goal reminder carefully. Reflect on what you still need to do to reach that goal from the current page.
- If your thought includes 'again', *STOP*. Reflect carefully and do something else.

# General Hints
- If you know the direct URL, always use the Navigate action. (e.g. navigate(www.espn.com)). Avoid unnecessary steps.
- Sometimes, it is easier to directly navigate to a URL to apply search filters instead of manually applying them.
- Don't interact with web elements like Login, Sign-in, donation that appear in Webpages. Pay attention to Key Web Elements like search textbox and menu.
- You can search multiple times if the results are not good. If you get no results, you are likely not searching correctly. Avoid searching again in this case.
- Sometimes search capabilities available on the page will not yield good results. Try using Google in this case.
- Visiting video websites like YouTube is allowed BUT you can't play videos. Clicking to download PDFs is also not allowed.
- When you are done with the task, and have answered all the questions, make sure you double check that you have answered all the questions. If you have not answered all the questions, you should not give a final answer.
- When prompted on privacy preferences on a new website, always first Accept the cookies before proceeding with the given task.
- Some websites might block you. If you get an error or blank page, then try the navigate or search actions.

# Final answer
The final answer should be precise and be its own paragraph. Do not use any tools to provide your answer.

*IMPORTANT*: After each step take you'll be provided with a screenshot of the browser. Carefully evaluate if the right outcome was present. Explicitly show your thinking: "I have evaluated step X…". If not correct, try again. Only when you confirm the step was executed correctly move on to the next step.

Remember: break down your reasoning into manageable parts step by step, summarize and analyze the information within each segment, and ensure a clear and logical flow between segments! Spell out all of your thoughts to avoid ambiguity.

VERY IMPORTANT: Always surround tool calls with print(), for example:

```
print(default_api.open_web_browser())
```
""".strip()

Define the tool / function for browser control

In [None]:
from google.genai.types import FunctionDeclaration, Tool, Schema, Type

# Declaration of the parameterless "open_web_browser" tool; BrowserAgent.handle_action
# dispatches calls with this name to browser_computer.open_web_browser().
OPEN_WEB_BROWSER_TOOL = FunctionDeclaration(
    name="open_web_browser",
    description="""Opens the web browser.
Returns:
    Image: Screenshot of the opened webpage.
""",
)

# Declaration of the "click_at" tool. Coordinates are on a 0-1000 scale;
# BrowserAgent.handle_action rescales them before calling the browser.
CLICK_AT_TOOL = FunctionDeclaration(
    name="click_at",
    description="""Clicks at a specific y, x coordinate on the webpage.

    Make sure to click in the middle of the element.

    Returns:
        Image: Screenshot after clicking.
    """,
    parameters=Schema(
        type=Type.OBJECT,
        properties={
            "y": Schema(
                type=Type.INTEGER,
                description="The y-coordinate on the webpage (0-1000).",
            ),
            "x": Schema(
                type=Type.INTEGER,
                description="The x-coordinate on the webpage (0-1000).",
            ),
        },
        # handle_action indexes args["y"] and args["x"] unconditionally, so a call
        # that omits either one would raise KeyError.
        required=["y", "x"],
    ),
)

# Declaration of the "hover_at" tool. Coordinates are on a 0-1000 scale;
# BrowserAgent.handle_action rescales them before calling the browser.
HOVER_AT_TOOL = FunctionDeclaration(
    name="hover_at",
    description="""Hovers at a specific y, x coordinate on the webpage.

    May be used to explore sub-menus that appear on hover.

    Returns:
        Image: Screenshot showing hover effects.
    """,
    parameters=Schema(
        type=Type.OBJECT,
        properties={
            "y": Schema(
                type=Type.INTEGER,
                description="The y-coordinate on the webpage (0-1000).",
            ),
            "x": Schema(
                type=Type.INTEGER,
                description="The x-coordinate on the webpage (0-1000).",
            ),
        },
        # handle_action indexes args["y"] and args["x"] unconditionally, so a call
        # that omits either one would raise KeyError.
        required=["y", "x"],
    ),
)

# Declaration of the "type_text_at" tool. Coordinates are on a 0-1000 scale;
# BrowserAgent.handle_action rescales them before calling the browser.
# NOTE(review): the description mentions "optionally clearing existing content",
# but no such parameter is declared here; confirm what the server actually does.
TYPE_TEXT_AT_TOOL = FunctionDeclaration(
    name="type_text_at",
    description="""Types text at a specific y, x coordinate, optionally clearing existing content.

    The system automatically presses ENTER after typing.

    When typing on an element, make sure to type in the middle of the element.

    Returns:
        Image: Screenshot after typing.
    """,
    parameters=Schema(
        type=Type.OBJECT,
        properties={
            "y": Schema(
                type=Type.INTEGER,
                description="The y-coordinate on the webpage (0-1000).",
            ),
            "x": Schema(
                type=Type.INTEGER,
                description="The x-coordinate on the webpage (0-1000).",
            ),
            "text": Schema(
                type=Type.STRING,
                description="The text to type.",
            ),
        },
        # handle_action indexes all three arguments unconditionally, so a call
        # that omits any of them would raise KeyError.
        required=["y", "x", "text"],
    ),
)

# Declaration of the "scroll_document" tool; handle_action forwards the
# "direction" argument to browser_computer.scroll_document().
SCROLL_DOCUMENT_TOOL = FunctionDeclaration(
    name="scroll_document",
    description="""Scrolls the entire webpage up or down.

    Returns:
        Image: Screenshot after scrolling.
    """,
    parameters=Schema(
        type=Type.OBJECT,
        properties={
            "direction": Schema(
                type=Type.STRING,
                description='The scroll direction ("up" or "down").',
            )
        },
        # handle_action indexes args["direction"] unconditionally.
        required=["direction"],
    ),
)

# Declaration of the parameterless "wait_5_seconds" tool; handle_action dispatches
# calls with this name to browser_computer.wait_5_seconds().
WAIT_5_SECONDS_TOOL = FunctionDeclaration(
    name="wait_5_seconds",
    description="""Waits for 5 seconds to allow unfinished webpage processes to complete.""",
)

# Declaration of the parameterless "go_forward" tool. The description must differ
# from go_back so the model can tell the two history actions apart.
GO_FORWARD_TOOL = FunctionDeclaration(
    name="go_forward",
    description="""Navigates forward to the next webpage in the browser history.

    Returns:
        Image: Screenshot of the next page.
    """,
)

# Declaration of the parameterless "go_back" tool; handle_action dispatches calls
# with this name to browser_computer.go_back().
GO_BACK_TOOL = FunctionDeclaration(
    name="go_back",
    description="""Navigates back to the previous webpage in the browser history.

    Returns:
        Image: Screenshot of the previous page.
    """,
)

# Declaration of the parameterless "search" tool; handle_action dispatches calls
# with this name to browser_computer.search().
SEARCH_TOOL = FunctionDeclaration(
    name="search",
    description="""Directly jumps to a search engine home page.

    Used when you need to start with a search. For example, this is used when
    the current website doesn't have the information needed or because a new
    task is being started .

    Returns:
        Image: Screenshot of the search engine.
    """,
)

# Declaration of the "navigate" tool; handle_action forwards the "url" argument
# to browser_computer.navigate().
NAVIGATE_TOOL = FunctionDeclaration(
    name="navigate",
    description="""Navigates directly to a specified URL.

    Preferred over search() when you know the exact destination URL.

    Returns:
        Image: Screenshot of the navigated webpage.
    """,
    parameters=Schema(
        type=Type.OBJECT,
        properties={
            "url": Schema(
                type=Type.STRING,
                description="The URL to navigate to.",
            )
        },
        # handle_action indexes args["url"] unconditionally.
        required=["url"],
    ),
)

# Declaration of the "key_combination" tool; handle_action forwards the "keys"
# argument to browser_computer.key_combination().
KEY_COMBINATION_TOOL = FunctionDeclaration(
    name="key_combination",
    description="""Presses keyboard keys and combinations, such as "control+c" or "enter".""",
    parameters=Schema(
        type=Type.OBJECT,
        properties={
            "keys": Schema(
                type=Type.STRING,
                description="The keys to press",
            )
        },
        # handle_action indexes args["keys"] unconditionally.
        required=["keys"],
    ),
)

# Bundles every browser-control declaration into the single Tool that
# BrowserAgent passes to the model via GenerateContentConfig(tools=[...]).
# Each name listed here has a matching case in BrowserAgent.handle_action.
FUNCTION_CALLING_TOOL = Tool(
    function_declarations=[
        OPEN_WEB_BROWSER_TOOL,
        CLICK_AT_TOOL,
        HOVER_AT_TOOL,
        TYPE_TEXT_AT_TOOL,
        SCROLL_DOCUMENT_TOOL,
        WAIT_5_SECONDS_TOOL,
        GO_FORWARD_TOOL,
        GO_BACK_TOOL,
        SEARCH_TOOL,
        NAVIGATE_TOOL,
        KEY_COMBINATION_TOOL,
    ]
)

Now, let's define the Agent that puts it all together. The agent connects to Gemini and uses the tools to interact with the browser / computer via the API Server.

In [25]:
import copy
import time
import os
import io
from typing import Tuple
from google import genai
from google.genai import types
from PIL import Image
from PIL import ImageDraw
import termcolor
import pathlib
from google.genai.types import Part, GenerateContentConfig, Content, Blob, Candidate

# (width, height) that normalize_x / normalize_y scale the model's 0-1000
# coordinates to.
SCREEN_SIZE = (1000, 1000)
# Running counter used by save_screenshot to number /tmp/screenshot-<n>.png files.
IMG_SAVE_COUNT = 0
# File that save_screenshot overwrites with the most recent screenshot.
LATEST_SCREENSHOT_PATH = pathlib.Path("/tmp/live.png")


def normalize_x(x: int) -> int:
    """Scales an x-coordinate on the 0-1000 range to the width in SCREEN_SIZE.

    The scaled value is truncated toward zero by ``int``.
    """
    screen_width = SCREEN_SIZE[0]
    scaled = x / 1000 * screen_width
    return int(scaled)


def normalize_y(y: int) -> int:
    """Scales a y-coordinate on the 0-1000 range to the height in SCREEN_SIZE.

    The scaled value is truncated toward zero by ``int``.
    """
    screen_height = SCREEN_SIZE[1]
    scaled = y / 1000 * screen_height
    return int(scaled)


def _get_colorable_grayscale_image(img: Image.Image) -> Image.Image:
    """Builds an RGBA copy of ``img`` whose pixels are its grayscale version.

    The result looks gray but has color channels, so colored overlays can be
    drawn on it.
    """
    gray = img.convert("L")

    # Paste the single-channel image onto a fresh RGBA canvas of the same size.
    canvas = Image.new("RGBA", gray.size)
    canvas.paste(gray)
    return canvas


def _render_circle(
    image: Image.Image,
    x: int,
    y: int,
    radius: int = 10,
) -> Image.Image:
    """Draws a filled red circle centered on (x, y) directly onto ``image``.

    The image is modified in place and also returned.
    """
    top_left = (x - radius, y - radius)
    bottom_right = (x + radius, y + radius)
    # Red with an alpha of 200 out of 255.
    red = (255, 0, 0, 200)
    ImageDraw.Draw(image).ellipse(xy=[top_left, bottom_right], fill=red)
    return image


def _bytes_to_image(image_bytes: bytes) -> Image.Image:
    """Opens encoded image bytes as a PIL image."""
    buffer = io.BytesIO(image_bytes)
    return Image.open(buffer)


def _image_to_bytes(image: Image.Image) -> bytes:
    """Encodes a PIL image as PNG and returns the encoded bytes."""
    buffer = io.BytesIO()
    try:
        image.save(buffer, format="PNG")
        return buffer.getvalue()
    finally:
        buffer.close()


def visualize_action_from_image(image: Image.Image, x: int, y: int) -> Image.Image:
    """Returns a grayscale copy of ``image`` with a red circle marking (x, y)."""
    return _render_circle(_get_colorable_grayscale_image(image), x, y)


def visualize_action(screenshot_png: bytes, x: int, y: int) -> bytes:
    """Marks (x, y) with a red circle on a grayscale copy of the screenshot.

    Takes encoded image bytes and returns the annotated image as PNG bytes.
    """
    source_image = _bytes_to_image(screenshot_png)
    annotated = visualize_action_from_image(source_image, x, y)
    return _image_to_bytes(annotated)


def save_screenshot(image: bytes | Image.Image) -> bytes:
    """Writes a screenshot to disk and returns its bytes.

    A PIL image is first encoded as PNG; bytes are written unchanged. The data
    goes to LATEST_SCREENSHOT_PATH and to /tmp/screenshot-<IMG_SAVE_COUNT>.png,
    after which the counter is incremented.
    """
    global IMG_SAVE_COUNT

    png_bytes = image
    if isinstance(image, Image.Image):
        buffer = io.BytesIO()
        image.save(buffer, format="png")
        png_bytes = buffer.getvalue()

    numbered_path = pathlib.Path(f"/tmp/screenshot-{IMG_SAVE_COUNT}.png")
    LATEST_SCREENSHOT_PATH.write_bytes(png_bytes)
    numbered_path.write_bytes(png_bytes)
    IMG_SAVE_COUNT += 1

    return png_bytes


class BrowserAgent:
    def __init__(
        self,
        browser_computer: ComputerUseEnvironment,
        query: str,
        model_name: str = "gemini-2.5-flash-jarvis",
    ):
        self.browser_computer = browser_computer
        self.query = query
        self.model_name = model_name
        self.done = False
        self.client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
        self.contents: list[Content] = [
            Content(
                role="user",
                parts=[
                    Part(text=self.query),
                ],
            )
        ]
        self.generate_content_config = GenerateContentConfig(
            temperature=1,
            top_p=0.95,
            top_k=40,
            max_output_tokens=8192,
            tools=[FUNCTION_CALLING_TOOL],
            system_instruction=types.Content(
                parts=[
                    Part(text=DEVELOPER_INSTRUCTIONS),
                ]
            ),
        )

    def handle_action(
        self, action: types.FunctionCall, screenshot_bytes: bytes
    ) -> Tuple[Screenshot, Screenshot | None]:
        """Handles the action and returns the screenshot.

        Returns:
            Screenshot: The screenshot after the action.
            Screenshot | None: The feedback screenshot (if applicable).
        """
        screenshot = None
        feedback_screenshot = None
        match action.name:
            case "open_web_browser":
                screenshot = self.browser_computer.open_web_browser()
            case "click_at":
                y = action.args["y"]
                x = action.args["x"]
                x = normalize_x(x)
                y = normalize_y(y)
                feedback_screenshot = visualize_action(screenshot_bytes, x, y)
                screenshot = self.browser_computer.click_at(
                    x=x,
                    y=y,
                )
            case "hover_at":
                y = action.args["y"]
                x = action.args["x"]
                x = normalize_x(x)
                y = normalize_y(y)
                feedback_screenshot = visualize_action(screenshot_bytes, x, y)
                screenshot = self.browser_computer.hover_at(
                    x=x,
                    y=y,
                )
            case "type_text_at":
                y = action.args["y"]
                x = action.args["x"]
                x = normalize_x(x)
                y = normalize_y(y)
                feedback_screenshot = visualize_action(screenshot_bytes, x, y)
                screenshot = self.browser_computer.type_text_at(
                    x=x,
                    y=y,
                    text=action.args["text"],
                )
            case "scroll_document":
                screenshot = self.browser_computer.scroll_document(
                    action.args["direction"]
                )
            case "wait_5_seconds":
                screenshot = self.browser_computer.wait_5_seconds()
            case "go_back":
                screenshot = self.browser_computer.go_back()
            case "go_forward":
                screenshot = self.browser_computer.go_forward()
            case "search":
                screenshot = self.browser_computer.search()
            case "navigate":
                screenshot = self.browser_computer.navigate(action.args["url"])
            case "key_combination":
                screenshot = self.browser_computer.key_combination(action.args["keys"])
            case _:
                raise ValueError(f"Unsupported function: {action}")
        return screenshot, feedback_screenshot

    def get_text(self, candidate: Candidate) -> str | None:
        """Extracts the text from the candidate."""
        text = []
        for part in candidate.content.parts:
            if part.text:
                text.append(part.text)
        return " ".join(text) or None

    def get_function_call(self, candidate: Candidate) -> types.FunctionCall | None:
        """Extracts the function call from the candidate."""
        for part in candidate.content.parts:
            if part.function_call:
                return part.function_call
        return None

    def run(self):
        screenshot_bytes = copy.deepcopy(self.browser_computer.screenshot())
        while not self.done:
            # Generate a response from the model.
            response = self.client.models.generate_content(
                model=self.model_name,
                contents=self.contents,
                config=self.generate_content_config,
            )

            # Extract the text and function call from the response.
            candidate = response.candidates[0]
            text = self.get_text(candidate)
            function_call = self.get_function_call(candidate)

            # Append the model turn.
            self.contents.append(candidate.content)

            if text:
                termcolor.cprint(
                    "Agent Reasoning",
                    color="magenta",
                    attrs=["bold"],
                )
                print(text)
                print()

            feedback_screenshot = None
            if function_call:
                termcolor.cprint(
                    "Agent Function Call",
                    color="yellow",
                    attrs=["bold"],
                )
                print(function_call.model_dump_json())
                print()
                screenshot, feedback_screenshot = self.handle_action(
                    function_call, screenshot_bytes
                )
            else:
                print("No function call found.")
                screenshot = self.browser_computer.screenshot()

            self.contents.append(
                Content(
                    role="user",
                    parts=[
                        Part(text="WebAgentState("),
                    ],
                )
            )
            if feedback_screenshot is not None:
                self.contents.append(
                    Content(
                        role="user",
                        parts=[
                            Part(text="\n  last_action_location_screenshot="),
                            Part(
                                inline_data=Blob(
                                    mime_type="image/png",
                                    data=feedback_screenshot,
                                ),
                            ),
                        ],
                    )
                )
            else:
                self.contents.append(
                    Content(
                        role="user",
                        parts=[Part(text="\n  last_action_location_screenshot=None")],
                    )
                )
            self.contents.append(
                Content(
                    role="user",
                    parts=[
                        Part(text=",\n  current_screenshot="),
                        Part(
                            inline_data=Blob(
                                mime_type="image/png",
                                data=screenshot,
                            ),
                        ),
                        Part(text=")"),
                    ],
                )
            )
            save_screenshot(screenshot)


Finally, run the agent and see it all in action

In [None]:
# The environment class to use; it is instantiated in the `with` statement below.
env = CloudRunComputer

# The trailing `#@param` comments expose these two values as Colab form fields.
query = "Find the temperature in Seattle for the next 3 days" #@param {type:"string"}
api_server = "https://computeruse-363745922778.us-central1.run.app/" #@param {type:"string"}

# Entering the context creates the remote browser session; leaving it, including
# on error, deletes the session again.
with env(api_server=api_server) as browser_computer:
    agent = BrowserAgent(
        browser_computer=browser_computer,
        query=query,
    )
    agent.run()

Creating session...
