# Agent Chat with Multimodal Models: GPT-4V

### Before you begin, install AutoGen with the `lmm` option
```bash
pip install "pyautogen[lmm]~=0.2.0b4"
```

In [3]:
import requests
import json
import os

from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import autogen
from autogen import AssistantAgent, Agent, UserProxyAgent, ConversableAgent
from termcolor import colored
import random

The user proxy agent's `human_input_mode` setting decides whether a human is asked for input during the chat. In this notebook we use `human_input_mode="NEVER"` for conciseness, so the conversation runs without prompting for human input. Activating human input instead lets you interact with the LMM in a multi-round dialogue and provide feedback as the conversation unfolds.

In [4]:
from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent

# Both config lists are loaded from the same "OAI_CONFIG_LIST" source; they
# differ only in which model names the filter keeps.
_vision_model_filter = {"model": ["gpt-4-vision-preview"]}
_gpt4_model_filter = {
    "model": [
        "gpt-4",
        "gpt-4-0314",
        "gpt4",
        "gpt-4-32k",
        "gpt-4-32k-0314",
        "gpt-4-32k-v0314",
    ],
}

# Configurations for the vision model.
config_list_4v = autogen.config_list_from_json("OAI_CONFIG_LIST", filter_dict=_vision_model_filter)

# Configurations for the text-only GPT-4 family.
config_list_gpt4 = autogen.config_list_from_json("OAI_CONFIG_LIST", filter_dict=_gpt4_model_filter)

# LLM settings built on the GPT-4 config list, with a fixed cache seed.
gpt4_llm_config = dict(config_list=config_list_gpt4, cache_seed=42)

In [5]:
# Remove the `api_type` param as it is not needed for 4V.
# The comprehension is evaluated for its side effect: each config dict in
# config_list_4v is mutated in place. pop(..., None) tolerates configs that have
# no "api_type" key, and the list of popped values is what the cell displays.
[config.pop("api_type", None) for config in config_list_4v]

[None]

In [6]:
from utils import workspaceId, token, extract_element_bbox, \
                request_image_annotation, draw_bboxes, encode_image, send_to_gpt, \
                extract_numbers
from PIL import Image

In [7]:
import subprocess
import time

class Controller:
    """Drive a desktop window through ``xdotool`` and capture annotated screenshots.

    The window is looked up once, by name, when the controller is created. All
    pointer and keyboard actions are issued as ``xdotool`` subprocess calls that
    target that window. Screenshots are written to ``screenshot.png`` and their
    annotated version to ``screenshot_annotated.png`` in the current directory.

    Attributes:
        window_name: Name pattern used to search for the window.
        get_window_id: The ``xdotool search`` command used for the lookup.
        window_id: Id of the first matching window, as bytes.
        image: Last screenshot as an RGB PIL image (set by ``take_screenshot``).
        raw_data: Decoded JSON returned by the annotation request
            (set by ``aui_annotate``).
        image_with_bboxes: Image returned by ``draw_bboxes``
            (set by ``aui_annotate``).
    """

    def __init__(self, window_name="Mozilla Firefox") -> None:
        """Look up the target window by name.

        Args:
            window_name: Name passed to ``xdotool search --name``.

        Raises:
            subprocess.CalledProcessError: If the ``xdotool search`` command
                exits with a non-zero status.
            RuntimeError: If the command succeeds but prints no window id.
        """
        self.window_name = window_name
        self.get_window_id = ["xdotool", "search", "--name", self.window_name]
        # The search may print several ids (one per line) when more than one
        # window matches; keep only the first so that a single id is passed to
        # the later `--window` / `-window` arguments.
        matches = subprocess.check_output(self.get_window_id).split()
        if not matches:
            raise RuntimeError(f"No window found matching name {window_name!r}")
        self.window_id = matches[0]

    def _move_pointer(self, x, y):
        """Move the pointer to (x, y) relative to the target window."""
        # Coordinates are truncated to integers before being handed to xdotool;
        # extract_location_from_index produces floats (it divides by 2).
        subprocess.run(["xdotool", "mousemove", "--window", self.window_id, str(int(x)), str(int(y))])

    def _left_click(self):
        """Send a single left-button click to the target window."""
        subprocess.run(["xdotool", "click", "--window", self.window_id, "1"])

    def move_mouse(self, x, y, click=1):
        """Move the pointer to (x, y) and optionally left-click there.

        Args:
            x: x coordinate inside the window.
            y: y coordinate inside the window.
            click: Truthy to left-click after moving; falsy to only move.
        """
        # The move always happens; `click` only controls the click itself.
        self._move_pointer(x, y)
        if click:
            self._left_click()

        # Print the action in the terminal.
        print(f"Moved to {x}, {y} and clicked: {click}")
        # Wait before the next action.
        time.sleep(2)

    def double_click_at_location(self, x, y):
        """Move the pointer to (x, y) and send two left clicks 0.1 s apart."""
        self._move_pointer(x, y)
        subprocess.run(["xdotool", "click", "--repeat", "1", "--window", self.window_id, "1"])
        time.sleep(0.1)
        subprocess.run(["xdotool", "click", "--repeat", "1", "--window", self.window_id, "1"])

    def enter_text_at_location(self, text, x, y):
        """Click at (x, y) to focus that spot, then type ``text`` into the window."""
        self._move_pointer(x, y)
        # Click to focus at the location.
        self._left_click()
        # Type the text.
        subprocess.run(["xdotool", "type", "--window", self.window_id, text])

    def press_enter(self):
        """Send the Return key to the target window."""
        subprocess.run(["xdotool", "key", "--window", self.window_id, "Return"])

    def take_screenshot(self):
        """Capture the window to ``screenshot.png`` and annotate it.

        Returns:
            A status string naming the annotated image file.
        """
        print("Taking screenshot....")
        subprocess.run(["import", "-window", self.window_id, "screenshot.png"])
        # Wait before the next action.
        time.sleep(1)
        # convert() returns an independent in-memory image, so the file handle
        # can be released as soon as the conversion is done.
        with Image.open("screenshot.png") as raw_image:
            self.image = raw_image.convert("RGB")
        self.aui_annotate()
        return "screenshot taken with UI elements numbered at screenshot_annotated.png "

    def aui_annotate(self):
        """Request annotations for ``screenshot.png`` and save the drawn result.

        Stores the decoded JSON response in ``self.raw_data`` and the image
        returned by ``draw_bboxes`` in ``self.image_with_bboxes``, which is also
        saved as ``screenshot_annotated.png``. Reads ``self.image``, so
        ``take_screenshot`` must have set it first.
        """
        print("aui_annotate function is called...")
        assert os.path.exists("screenshot.png"), "Screenshot not taken"
        self.raw_data = (request_image_annotation("screenshot.png", workspaceId, token)).json()
        self.image_with_bboxes = draw_bboxes(self.image, self.raw_data)
        self.image_with_bboxes.save("screenshot_annotated.png")

    def extract_location_from_index(self, index):
        """Return the ``[x, y]`` midpoint of the box found for element ``index``.

        Args:
            index: Number of the UI element in the latest annotation data.

        Returns:
            ``[(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]``.
        """
        print("Extract location function is called...")
        # NOTE(review): presumably bbox is (x_min, y_min, x_max, y_max) - confirm
        # against utils.extract_element_bbox.
        bbox = extract_element_bbox([index], self.raw_data)
        return [(bbox[0]+bbox[2])/2, (bbox[1]+bbox[3])/2]

    def convert_image_to_base64(self):
        """Return ``encode_image`` applied to ``screenshot_annotated.png``."""
        return encode_image("screenshot_annotated.png")

    def get_target_UIelement_number(self, query):
        """Ask the model which numbered element matches ``query``.

        Sends the encoded annotated screenshot together with ``query`` through
        ``send_to_gpt`` and returns the first number that ``extract_numbers``
        finds in the reply text.

        Args:
            query: Question describing the UI element of interest.

        Returns:
            The first item of ``extract_numbers(reply)``. Indexing fails if no
            number was found.
        """
        base64_image = self.convert_image_to_base64()
        gpt_response = send_to_gpt(base64_image, query)
        gpt_response = gpt_response["choices"][0]["message"]["content"]
        print(f"GPT resposne is {gpt_response}")
        # Extract numbers from the GPT response.
        numbers = extract_numbers(gpt_response)
        return numbers[0]



In [8]:
def _number_param(description):
    """Schema for a numeric parameter with the given description."""
    return {"type": "number", "description": description}


def _string_param(description):
    """Schema for a string parameter with the given description."""
    return {"type": "string", "description": description}


def _coordinate_param(axis):
    """Schema for the x or y coordinate parameter shared by the pointer functions."""
    return _number_param(f"{axis} coordinate of the x,y point")


def _function_schema(name, description, properties=None, required=()):
    """Build one function entry whose parameters form an object schema."""
    return {
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": dict(properties or {}),
            "required": list(required),
        },
    }


# Function-calling configuration: one schema per function the model may suggest,
# plus the GPT-4 config list. No sampling temperature is set here.
llm_config = {
    "functions": [
        _function_schema(
            "take_screenshot",
            "take screenshot of the UI window. Annotate UI elements - Draw bounding boxes of the UI elements with numbers on the image and save under screenshot_annotated.png",
        ),
        _function_schema(
            "move_mouse",
            "move the mouse to a position (x,y) in the UI window",
            properties={
                "x": _coordinate_param("x"),
                "y": _coordinate_param("y"),
                "click": _number_param(
                    "Bool flag to know whether to make the left click action after moving to x,y"
                ),
            },
            required=["x", "y", "click"],
        ),
        _function_schema(
            "double_click_at_location",
            "Double click on a location",
            properties={
                "x": _coordinate_param("x"),
                "y": _coordinate_param("y"),
            },
            required=["x", "y"],
        ),
        _function_schema(
            "extract_location_from_index",
            "Extract the position (x,y) from the index/number of the UI element detected",
            properties={
                "index": _number_param("number of the UI element"),
            },
            required=["index"],
        ),
        _function_schema(
            "get_target_UIelement_number",
            "Given an annotated image and query about specific UI element, get the index/number of the desired element from an advanced model",
            properties={
                "query": _string_param("Query considering the UI element to click on"),
            },
            required=["query"],
        ),
        _function_schema(
            "enter_text_at_location",
            "Enter text at a specified location in the UI window",
            properties={
                "text": _string_param("Text to enter"),
                "x": _coordinate_param("x"),
                "y": _coordinate_param("y"),
            },
            required=["x", "y", "text"],
        ),
        _function_schema(
            "press_enter",
            "Press enter key",
        ),
    ],
    "config_list": config_list_gpt4,
}



# System prompt for the Planner: it works only through the provided UI functions,
# starts with a screenshot, re-screenshots after each action, and replies
# TERMINATE once the task is done.
PLANNER_SYSTEM_MESSAGE = """You are the orchestrator that must achieve the given task. You are given functions to handle the UI window. Remember that you are given a UI window and you start the task by taking a screenshot and take screenshot after each action. For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done. Take a deep breath and think step-by-step"""

# Assistant agent that uses the function-calling LLM configuration.
planner = autogen.AssistantAgent(
    name="Planner",
    system_message=PLANNER_SYSTEM_MESSAGE,
    llm_config=llm_config,
)


def _is_termination_msg(message):
    """Truthy when the message has content whose stripped text ends with TERMINATE."""
    content = message.get("content", "")
    # Short-circuits on empty/None content, returning that falsy value unchanged.
    return content and content.rstrip().endswith("TERMINATE")


# UserProxyAgent named "user_proxy": never asks for human input, auto-replies at
# most 10 times in a row, and uses the "coding" directory for code execution.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    is_termination_msg=_is_termination_msg,
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config={"work_dir": "coding"},
    llm_config=llm_config,
)


# Controller bound to the Firefox window.
controller = Controller(window_name="Mozilla Firefox")

# Every registered name maps to the Controller method of the same name.
_proxy_function_names = (
    "take_screenshot",
    "move_mouse",
    "extract_location_from_index",
    "convert_image_to_base64",
    "get_target_UIelement_number",
    "enter_text_at_location",
    "press_enter",
    "double_click_at_location",
)

# Register the functions on user_proxy.
user_proxy.register_function(
    function_map={name: getattr(controller, name) for name in _proxy_function_names}
)



In [9]:
# Register the same Controller methods on the planner, each under its own name.
_planner_function_names = (
    "take_screenshot",
    "move_mouse",
    "extract_location_from_index",
    "convert_image_to_base64",
    "get_target_UIelement_number",
    "enter_text_at_location",
    "press_enter",
    "double_click_at_location",
)

planner.register_function(
    function_map={name: getattr(controller, name) for name in _planner_function_names}
)



In [10]:
def start_agents(query):
    """Start a chat from ``user_proxy`` to ``planner`` for the given task.

    Args:
        query: Task description; it is embedded in the opening message, which
            also asks the planner to check the window before finishing.
    """
    # The prompt text (including its spelling) is kept exactly as it was.
    task_message = (
        f"Task is to: {query}. "
        "Check if the task is acheived by looking at the window. "
        "Don't quit immediately"
    )
    user_proxy.initiate_chat(planner, message=task_message)


In [11]:
# Natural-language task for the agents; start_agents() embeds it in the opening chat message.
query = "click on the github icon and click on 'blogs' repository"

In [12]:
# Start the user_proxy -> Planner conversation for the task held in `query`.
start_agents(query)

[33muser_proxy[0m (to Planner):

Task is to: click on the github icon and click on 'blogs' repository. Check if the task is acheived by looking at the window. Don't quit immediately

--------------------------------------------------------------------------------
[33mPlanner[0m (to user_proxy):

[32m***** Suggested function Call: take_screenshot *****[0m
Arguments: 
{}
[32m****************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION take_screenshot...[0m
Taking screenshot....
aui_annotate function is called...
[33muser_proxy[0m (to Planner):

[32m***** Response from calling function "take_screenshot" *****[0m
screenshot taken with UI elements numbered at screenshot_annotated.png 
[32m************************************************************[0m

--------------------------------------------------------------------------------
[33mPlanner[0m (to user_proxy):

