<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/GUSE_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# GUSE_DEMO.ipynb
import time
import json
from google.colab import userdata
import base64
import google.generativeai as genai
from google.generativeai import GenerativeModel
from google.generativeai.protos import Tool, FunctionDeclaration, Schema

# --- API Key Setup ---
# Read the Gemini API key from the Colab secret named 'GEMINI'.
# userdata.get() signals a missing secret (or a notebook that was not granted
# access to it) by raising rather than by returning an empty value, so those
# errors are caught here; otherwise the warning branch below would never run
# for a missing secret and the cell would stop with a traceback instead.
try:
    GOOGLE_API_KEY = userdata.get('GEMINI')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as secret_error:
    print(f"Could not read Colab secret 'GEMINI': {secret_error!r}")
    GOOGLE_API_KEY = None

# client_initialized records whether genai.configure() was called; the agent
# loop and the main section check it before attempting any API call.
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
    print("Google Generative AI configured successfully using Colab Secrets.")
    client_initialized = True
else:
    print("WARNING: GOOGLE_API_KEY not found in Colab Secrets. Please ensure 'GEMINI' secret is set.")
    print("API calls will likely fail. Proceeding with unconfigured API.")
    client_initialized = False

# --- Agent Configuration ---
class AgentConfig:
    """Static settings for the simulated browser agent."""

    # URL that execute_action reports when an action supplies none of its own.
    BROWSER_URL: str = "https://www.google.com"
    # Model name handed to GenerativeModel by run_computer_use_agent.
    LLM_MODEL_NAME: str = "gemini-2.5-flash"

# --- SIMULATION HELPER FUNCTIONS ---
def get_initial_screenshot_part(file_path: str = "placeholder_screenshot.png") -> dict:
    """Return a placeholder screenshot as an inline-image content part.

    No file is read or written: ``file_path`` is only echoed in the log line
    so that successive simulated captures can be told apart. The payload is
    always the same hard-coded base64 PNG string.

    Args:
        file_path: Label printed in the simulation log message.

    Returns:
        A dict of the form
        ``{"inline_data": {"mime_type": "image/png", "data": <base64 str>}}``.
    """
    # Base64 text of a tiny placeholder PNG. It is returned as text, exactly
    # as before; the previously computed (and never used) decoded bytes have
    # been dropped.
    base64_image_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
    print(f"\n[SIMULATION] Using placeholder screenshot for {file_path}.")
    return {"inline_data": {"mime_type": "image/png", "data": base64_image_data}}

def execute_action(function_call: dict) -> dict:
    """Simulate a browser action requested by the model and report the result.

    Nothing is actually executed in a browser; the function only fabricates
    the result text and the URL the page would supposedly be on afterwards.

    Args:
        function_call: Mapping with a ``"name"`` key (the function the model
            asked for) and an optional ``"args"`` mapping of its arguments.

    Returns:
        ``{"function_response": {"name": <name>, "response": {...}}}`` where
        the inner response carries ``"result"`` (status text) and ``"url"``.

    Raises:
        KeyError: If ``function_call`` has no ``"name"`` entry.
    """
    # Imported here so the function carries its own dependency.
    from urllib.parse import quote_plus

    name = function_call["name"]
    # A missing or null argument mapping is treated as "no arguments".
    args = function_call.get("args") or {}
    print(f"\n[ACTION EXECUTOR] Executing: {name}({args})")
    if name == "type_text_at":
        typed_text = str(args.get("text") or "").strip("\n")
        # quote_plus turns spaces into '+' (as before) and also escapes
        # characters such as '&', '=', '#' and '+' that would otherwise
        # corrupt the query string.
        query = quote_plus(typed_text)
        simulated_result = {
            "result": "Text typed successfully.",
            "url": f"https://www.google.com/search?q={query}",
        }
    elif name == "press_enter":
        simulated_result = {
            "result": "Enter key pressed successfully.",
            # Fall back to the configured start URL when none (or an empty
            # one) was supplied by the model.
            "url": args.get("url") or AgentConfig.BROWSER_URL,
        }
    else:
        simulated_result = {
            "result": f"Successfully performed action: {name}.",
            "url": AgentConfig.BROWSER_URL,
        }
    print(f"[ACTION EXECUTOR] Action completed. Simulating new state: {simulated_result['url']}")
    return {"function_response": {"name": name, "response": simulated_result}}

# --- AGENT LOOP IMPLEMENTATION ---
def _build_browser_tools() -> list:
    """Declare the two simulated browser functions the model may call."""
    return [
        Tool(
            function_declarations=[
                FunctionDeclaration(
                    name="type_text_at",
                    description="Types text into a specified field in a browser",
                    parameters=Schema(
                        type_="OBJECT",
                        properties={
                            "text": Schema(type_="STRING", description="Text to type"),
                            "selector": Schema(type_="STRING", description="CSS selector for the input field"),
                        },
                        required=["text", "selector"],
                    ),
                ),
                FunctionDeclaration(
                    name="press_enter",
                    description="Presses the enter key in the current input field",
                    parameters=Schema(
                        type_="OBJECT",
                        properties={
                            "url": Schema(type_="STRING", description="Current URL of the page"),
                        },
                        required=[],  # url is optional
                    ),
                ),
            ]
        )
    ]


def run_computer_use_agent(user_goal: str, max_iterations: int = 3):
    """Run the simulated computer-use loop for ``user_goal``.

    Each iteration sends the conversation so far to the model, prints any
    text it returned, executes the function calls it requested through
    ``execute_action`` and appends the simulated results plus a placeholder
    screenshot to the conversation. The loop ends when the model returns no
    function call or when ``max_iterations`` iterations have run.

    Args:
        user_goal: Natural-language task given to the model.
        max_iterations: Upper bound on model round trips.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    if not client_initialized:
        print("Cannot run agent simulation: API client not initialized.")
        return

    history = []
    print("--- START OF AGENT LOOP ---")
    print(f"USER GOAL: {user_goal}")
    enhanced_goal = f"{user_goal}\nFor each step, provide a detailed step-by-step explanation of your plan, including why each action is necessary, before performing it."
    history.append({
        "role": "user",
        "parts": [{"text": enhanced_goal}, get_initial_screenshot_part()],
    })
    try:
        # The model object does not change between iterations, so build it
        # once instead of on every pass through the loop.
        model = GenerativeModel(
            model_name=AgentConfig.LLM_MODEL_NAME,
            tools=_build_browser_tools(),
        )
        for iteration in range(1, max_iterations + 1):
            print(f"\n--- Iteration {iteration} ---")
            response = model.generate_content(history)
            parts = response.parts
            reasoning_text = "".join(
                part.text for part in parts if hasattr(part, "text") and part.text
            )
            history.append({"role": "model", "parts": parts})
            print(f"\n--- Model Response (Iteration {iteration}) ---")
            print(f"Reasoning: {reasoning_text or 'No text reasoning provided; function call likely returned.'}")

            function_calls = [
                {
                    "name": part.function_call.name,
                    "args": dict(part.function_call.args),
                }
                for part in parts
                if hasattr(part, "function_call") and part.function_call
            ]
            if not function_calls:
                print(f"[TASK COMPLETE] Model did not suggest further action in iteration {iteration}. Task likely finished.")
                break

            # Answer every function call of this turn, not only the first:
            # when the model requests several calls at once, the Gemini
            # function-calling guide describes sending back one function
            # response per call.
            reply_parts = [execute_action(call) for call in function_calls]
            print(f"\n--- Agent reports back to the Model (Iteration {iteration}) ---")
            reply_parts.append(
                get_initial_screenshot_part(f"iteration_{iteration}_screenshot.png")
            )
            history.append({"role": "user", "parts": reply_parts})
        else:
            # Reached only when the loop was never left via `break`, i.e. the
            # model was still requesting actions when the budget ran out.
            print(f"[WARNING] Reached maximum iterations ({max_iterations}) without task completion.")
        print("--- END OF AGENT LOOP SIMULATION ---")
    except Exception as e:
        # Demo boundary: any client/API failure is reported rather than
        # propagated so the notebook cell finishes cleanly.
        print(f"[ERROR] Failed to run agent loop: {e}")

# --- Main Execution ---
# Runs the demo goal when the cell/script is executed directly; without a
# configured client only a notice is printed.
if __name__ == "__main__":
    GOAL = "Go to the search bar and type the phrase 'Gemini agentic automation' then press enter."
    if not client_initialized:
        print("\nCannot run agent simulation due to missing or failed API client initialization.")
    else:
        run_computer_use_agent(GOAL, max_iterations=3)

Google Generative AI configured successfully using Colab Secrets.
--- START OF AGENT LOOP ---
USER GOAL: Go to the search bar and type the phrase 'Gemini agentic automation' then press enter.

[SIMULATION] Using placeholder screenshot for placeholder_screenshot.png.

--- Iteration 1 ---

--- Model Response (Iteration 1) ---
Reasoning: Okay, I will type 'Gemini agentic automation' into the search bar and press Enter.

Here's the plan:
1. **Type 'Gemini agentic automation' into the search bar:** I will use the `type_text_at` function to input the text. I am assuming a standard search input field, and will use the CSS selector `input[name="q"]` which is commonly used for search boxes, especially on Google and similar search pages. This action is necessary to input the desired search query.


[ACTION EXECUTOR] Executing: type_text_at({'text': 'Gemini agentic automation', 'selector': 'input[name="q"]'})
[ACTION EXECUTOR] Action completed. Simulating new state: https://www.google.com/search?