In [2]:
import os

from groq import Groq

# Paste a Groq API key here for a local session, or leave it blank to rely on
# a GROQ_API_KEY that is already exported in the environment.
# Never commit a real key in this cell.
api_key=''
import os
# Only export a non-empty value: assigning the blank placeholder would wipe a
# key that was configured outside the notebook.
if api_key:
    os.environ['GROQ_API_KEY'] = api_key


In [6]:
import asyncio
from playwright.async_api import async_playwright

import playwright
import pytesseract
from PIL import Image
import asyncio
from playwright.async_api import async_playwright
from PIL import Image, ImageDraw
import asyncio
from playwright.async_api import async_playwright


import base64

import json


In [7]:
# Instruction text that get_information1 prepends to every agent query; the
# action keywords listed under "Action should STRICTLY follow the format" are
# the ones parse_commands() looks for in the model's reply.
# The doubled braces are sent to the model literally: in the code visible here
# this string is never passed through str.format or an f-string.
# NOTE(review): the text promises a screenshot, but get_information1 only sends
# the bounding-box descriptions as text; the numbered action list skips 6, and
# item 7 says "google" while the matching keyword is "Bing" - confirm the
# intent before editing the wording, since any change alters model behaviour.
prompt='''Imagine you are a robot browsing the web, just like humans. Now you need to complete a task. In each iteration,
you will receive an Observation that includes a screenshot of a webpage and some texts. 
Carefully analyze the bounding box information and the web page contents to identify the Numerical Label corresponding 
to the Web Element that requires interaction, then follow
the guidelines and choose one of the following actions:

1. Click a Web Element.
2. Delete existing content in a textbox and then type content.
3. Scroll up or down.
4. Wait 
5. Go back
7. Return to google to start over.
8. Respond with the final answer

Correspondingly, Action should STRICTLY follow the format:

- Click [Numerical_Label] 
- Type [Numerical_Label]; [Content] 
- Scroll [Numerical_Label or WINDOW]; [up or down] 
- Wait 
- GoBack
- Bing
- ANSWER; [content]

Key Guidelines You MUST follow:

* Action guidelines *
1) Execute only one action per iteration.
2) Always click close on the popups.
3) When clicking or typing, ensure to select the correct bounding box.
4) Numeric labels lie in the top-left corner of their corresponding bounding boxes and are colored the same.
5) If the desired target is to do something like taking a action, you need to answer with ANSWER; FINISHED. For exmaple if i ask to write something or play a video

* Web Browsing Guidelines *
1) Don't interact with useless web elements like Login, Sign-in, donation that appear in Webpages
2) Select strategically to minimize time wasted.

Your reply should strictly follow the format:

Thought: {{Your brief thoughts (briefly summarize the info that will help ANSWER)}}
Action: {{One Action format you choose}}
Then the User will provide:
Observation: {{A labeled bounding boxes and contents given by User}}'''

In [8]:
import os


In [9]:
import nest_asyncio

# This is just required for running async playwright in a Jupyter notebook
nest_asyncio.apply()


In [10]:
import re

def parse_commands(text):
    """Extract agent actions from a model reply.

    Every known action pattern is searched for anywhere in ``text``. Results
    are grouped by pattern in the order the patterns are declared below (all
    ``Click`` matches first, then ``Type``, ...), not by position in the text.

    Args:
        text: Raw reply from the model (typically "Thought: ... Action: ...").

    Returns:
        A list whose items are either
        * ``{name: groups}`` for actions with arguments (``Click``, ``Type``,
          ``Scroll``, ``ANSWER``), where ``groups`` is a tuple of strings, or
        * the bare action name (``'Wait'``, ``'GoBack'``, ``'Bing'``,
          ``'Google'``, ``'FINISHED'``) for actions without arguments.
        The list is empty when nothing matches.
    """
    # Define patterns for each command. 'Scroll1' accepts the arguments in
    # swapped order ("Scroll up; WINDOW") and is normalised to 'Scroll' below.
    command_patterns = {
        'Click': r'Click \[?(\d+)\]?',
        'Type': r'Type \[?(\d+)\]?; (.*)',
        'Scroll': r'Scroll \[?(\d+|WINDOW)\]?; \[?(up|down)\]?',
        'Scroll1': r'Scroll \[?(up|down)\]?; \[?(\d+|WINDOW)\]?',
        'Wait': r'Wait',
        'GoBack': r'GoBack',
        'Bing': r'Bing',
        'Google': r'Google',
        'ANSWER': r'ANSWER; (.*)',
        'FINISHED': r'FINISHED'
    }
    commands_with_args = ('Click', 'Type', 'Scroll', 'ANSWER')

    print("***************", text)
    parsed_commands = []

    for command, pattern in command_patterns.items():
        for match in re.finditer(pattern, text):
            if command == 'Scroll1':
                # Emit under the canonical name with (target, direction) order.
                # The loop variable is deliberately NOT rebound here, so every
                # swapped-order match is reversed, not just the first one.
                parsed_commands.append({'Scroll': match.groups()[::-1]})
            elif command in commands_with_args:
                parsed_commands.append({command: match.groups()})
            else:
                # Commands without specific parameters
                parsed_commands.append(command)

    return parsed_commands

# Example text input
text = '''Thought: The screenshot shows a search result for iPhones on Amazon.in.
The price of an iPhone is visible in the search results, specifically for an iPhone 13 (128GB) - Blue.\n\nAction: ANSWER; ₹51,990'''

# Parse the example text
parsed_commands = parse_commands(text)
parsed_commands


*************** Thought: The screenshot shows a search result for iPhones on Amazon.in.
The price of an iPhone is visible in the search results, specifically for an iPhone 13 (128GB) - Blue.

Action: ANSWER; ₹51,990


[{'ANSWER': ('₹51,990',)}]

In [11]:
async def type_command(query, location):
    """Replace the contents of an annotated element with ``query`` and submit.

    Clicks ``annot_elements[location]``, selects everything in the focused
    field, deletes it, types ``query`` and presses Enter.

    NOTE(review): this relies on module-level ``annot_elements`` and ``page``;
    ``annot_elements`` is not defined in the part of the notebook visible here
    - confirm it exists before calling this helper.
    """
    import platform

    await annot_elements[location].click()

    # macOS uses the Command key for "select all"; everything else uses Ctrl.
    shortcut = {"Darwin": "Meta+A"}.get(platform.system(), "Control+A")
    for key in (shortcut, "Backspace"):
        await page.keyboard.press(key)

    await page.keyboard.type(query)
    await page.keyboard.press("Enter")
    

In [12]:
def llama3_agent(q, model="llama3-70b-8192"):
    """Send a single-turn user message to Groq and return the reply text.

    Args:
        q: Prompt text, sent as the only ``user`` message.
        model: Groq model identifier. Defaults to the model this notebook was
            written against; pass another name to switch models without
            editing the function.

    Returns:
        The ``content`` of the first choice in the completion. The same text
        is also printed so the notebook shows each model reply.

    The API key is read from the ``GROQ_API_KEY`` environment variable.
    Errors raised by the client (for example rate limiting) propagate to the
    caller.
    """
    client = Groq(
        api_key=os.environ.get("GROQ_API_KEY"),
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": q,
            }
        ],
        model=model,
    )

    reply = chat_completion.choices[0].message.content
    print(reply)
    return reply

In [13]:
parsed_commands = parse_commands('Action: {{Scroll WINDOW; down}}')
parsed_commands

*************** Action: {{Scroll WINDOW; down}}


[{'Scroll': ('WINDOW', 'down')}]

In [14]:
import asyncio
import base64

from langchain_core.runnables import chain as chain_decorator

# Some javascript we will run on each step
# to take a screenshot of the page, select the
# elements to annotate, and add bounding boxes
with open("mark_page.js") as f:
    mark_page_script = f.read()



async def mark_page(page, retries=10, retry_delay=3):
    """Annotate the page with numbered bounding boxes and capture it.

    Injects the contents of ``mark_page.js`` into the page, calls its
    ``markPage()`` function (retrying while the page may still be loading),
    takes a screenshot, then calls ``unmarkPage()`` so the overlay does not
    persist.

    Args:
        page: Playwright page (anything exposing async ``evaluate`` and
            ``screenshot``).
        retries: Maximum number of ``markPage()`` attempts.
        retry_delay: Seconds to pause between failed attempts.

    Returns:
        ``{"img": <base64-encoded screenshot>, "bboxes": <markPage() result>}``.
        The screenshot is also written to
        ``annotated_screenshot_with_numbers.png``.

    Raises:
        RuntimeError: if ``markPage()`` failed on every attempt.
    """
    with open("mark_page.js") as f:
        mark_page_script = f.read()
    await page.evaluate(mark_page_script)

    last_error = None
    for _ in range(retries):
        try:
            bboxes = await page.evaluate("markPage()")
            break
        except Exception as exc:
            # May be loading... The sleep must be awaited, otherwise the
            # coroutine is discarded and the retries fire back-to-back.
            last_error = exc
            await asyncio.sleep(retry_delay)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise RuntimeError(
            f"markPage() did not succeed after {retries} attempts"
        ) from last_error

    screenshot = await page.screenshot(path='annotated_screenshot_with_numbers.png')
    # Ensure the bboxes don't follow us around
    await page.evaluate("unmarkPage()")
    return {
        "img": base64.b64encode(screenshot).decode(),
        "bboxes": bboxes,
    }

In [15]:
def format_descriptions(state):
    """Add a textual listing of the bounding boxes to ``state``.

    Each entry of ``state["bboxes"]`` becomes one line of the form
    ``<index> (<<type>/>): "<label>"``, where the label is the box's
    ``ariaLabel`` or, when that is missing or blank, its ``text``.

    Returns:
        A new dict with every key of ``state`` plus ``"bbox_descriptions"``;
        the input mapping is not modified.
    """
    def describe(index, box):
        label = box.get("ariaLabel") or ""
        if label.strip() == "":
            # Fall back to the visible text when there is no usable aria label.
            label = box["text"]
        return f'{index} (<{box.get("type")}/>): "{label}"'

    lines = [describe(index, box) for index, box in enumerate(state["bboxes"])]
    enriched = dict(state)
    enriched["bbox_descriptions"] = "\nValid Bounding Boxes:\n" + "\n".join(lines)
    return enriched

In [16]:
async def click(page, state, query, location):
    """Click the element identified by a numeric bounding-box label.

    Args:
        page: Playwright page whose ``mouse`` is used for the click.
        state: Mapping holding ``"bboxes"``, a list of dicts with ``x``/``y``.
        query: Unused; kept so existing callers keep working.
        location: Label of the box to click (int or numeric string).

    Returns:
        The result of ``page.mouse.click`` on success, or an
        ``"Error: no bbox for : ..."`` string when ``location`` is not a valid
        label (non-numeric, negative, or out of range).
    """
    # - Click [Numerical_Label]
    try:
        bbox_id = int(location)
        if bbox_id < 0:
            # A negative index would silently wrap around to another element.
            raise IndexError(bbox_id)
        bbox = state["bboxes"][bbox_id]
    except (IndexError, KeyError, TypeError, ValueError):
        return f"Error: no bbox for : {location}"
    x, y = bbox["x"], bbox["y"]
    res = await page.mouse.click(x, y)
    # TODO: In the paper, they automatically parse any downloaded PDFs
    # We could add something similar here as well and generally
    # improve response format.
    return res

In [17]:
async def type_text(page, location,text_content, state):
    """Replace the contents of a labelled input with text and press Enter.

    Clicks the box's coordinates to focus it, selects everything, deletes it,
    types ``text_content`` and submits with Enter.

    Args:
        page: Playwright page providing ``mouse`` and ``keyboard``.
        location: Label of the target box (int or numeric string).
        text_content: Text to type.
        state: Mapping holding ``"bboxes"``, a list of dicts with ``x``/``y``.

    Returns:
        The result of the final ``keyboard.press("Enter")`` on success, or an
        ``"Error: no bbox for : ..."`` string when ``location`` is not a valid
        label - the same convention ``click`` uses - instead of raising.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having executed `import platform`.
    import platform

    try:
        bbox_id = int(location)
        if bbox_id < 0:
            # A negative index would silently wrap around to another element.
            raise IndexError(bbox_id)
        bbox = state["bboxes"][bbox_id]
    except (IndexError, KeyError, TypeError, ValueError):
        return f"Error: no bbox for : {location}"

    x, y = bbox["x"], bbox["y"]
    await page.mouse.click(x, y)
    # Check if MacOS
    select_all = "Meta+A" if platform.system() == "Darwin" else "Control+A"
    await page.keyboard.press(select_all)
    await page.keyboard.press("Backspace")
    await page.keyboard.type(text_content)
    return await page.keyboard.press("Enter")

In [18]:
async def scroll(page, state, scroll_args):
    """Scroll the whole window or the element under a labelled box.

    Args:
        page: Playwright page.
        state: Mapping holding ``"bboxes"``, a list of dicts with ``x``/``y``.
        scroll_args: ``(target, direction)`` where ``target`` is ``"WINDOW"``
            (any case) or a numeric label, and ``direction`` is ``"up"``;
            any other direction scrolls down.

    Returns:
        A short status string describing what was scrolled, or a failure
        message when ``scroll_args`` is missing or not a pair.
    """
    if scroll_args is None or len(scroll_args) != 2:
        return "Failed to scroll due to incorrect arguments."

    target, direction = scroll_args
    whole_window = target.upper() == "WINDOW"

    if whole_window:
        # Not sure the best value for this:
        step = 500
        delta = -step if direction.lower() == "up" else step
        await page.evaluate(f"window.scrollBy(0, {delta})")
        return f"Scrolled {direction} in window"

    # Scrolling within a specific element: hover it, then use the wheel.
    step = 200
    box = state["bboxes"][int(target)]
    x, y = box["x"], box["y"]
    delta = -step if direction.lower() == "up" else step
    await page.mouse.move(x, y)
    await page.mouse.wheel(0, delta)
    return f"Scrolled {direction} in element"

In [19]:
async def wait(sleep_time=5):
    """Pause the agent without blocking the event loop.

    Args:
        sleep_time: Seconds to sleep. Defaults to 5, the delay this helper
            always used before it became configurable.

    Returns:
        A status string reporting how long the agent waited.
    """
    await asyncio.sleep(sleep_time)
    return f"Waited for {sleep_time}s."


async def go_back(page):
    """Navigate one step back in the page history and report the new URL."""
    await page.go_back()
    # Read the URL only after navigating so the message shows the destination.
    landed_on = page.url
    return f"Navigated back a page to {landed_on}."


async def to_google(page):
    """Load the Google home page in ``page`` and return a status message."""
    google_home = "https://www.google.com/"
    await page.goto(google_home)
    return "Navigated to google.com."

In [20]:
def get_prompt(text, question):
    """Build the prompt that asks the model to answer directly from page text.

    Args:
        text: Content to search for the answer (get_information1 passes the
            page body's inner text).
        question: The user's task or question.

    Returns:
        A prompt string instructing the model to reply with the answer when it
        is present in ``text``, or with ``NO`` when it is unknown or the task
        is an action rather than a question. get_information1 compares the
        reply against ``'NO'``, so the wording of those instructions matters.
    """
    return f'''
        Given the following information 
        Content:
        {text}
        and a task
        Question:
        {question}
        Reply with answer if it is not an action and answer is available in the text.
        Reply with NO If you do not know the answer
        Reply with NO If your response is a set of actions to be taken
        Reply  with NO If the question involves an action 
        
        example:
        Question:
        Play X on youtube
        Answer: NO
        Dont add any preamble
        Answer:
    '''

In [22]:
p = get_prompt('Apple iPhone 15 128GB Green  ₹70,999', 'iphone  price')
llama3_agent(p)

70,999


'70,999'

In [28]:
async def get_information1(query, max_steps=None):
    """Drive the browser with the model until the task in ``query`` is done.

    Each iteration annotates the current page, asks the model whether the
    page text already answers ``query`` (skipped while the URL contains
    ``google``), and otherwise asks for the next action and executes it.

    Relies on module-level ``page`` (the active Playwright page) and
    ``prompt`` (the agent instructions).

    Args:
        query: Natural-language task or question.
        max_steps: Optional cap on loop iterations. ``None`` (the default)
            keeps the original unbounded behaviour.

    Returns:
        The model's direct answer, the content of an ``ANSWER`` action, or an
        empty string when the loop ends via ``FINISHED`` or ``max_steps``.
    """
    answer = ''
    steps_taken = 0
    while max_steps is None or steps_taken < max_steps:
        steps_taken += 1

        try:
            obj = await mark_page(page)
        except Exception:
            # Page is probably mid-navigation; pause and retry. Catching
            # Exception (not everything) keeps the loop interruptible.
            await wait()
            continue
        obj = format_descriptions(obj)

        text = await page.inner_text('body')

        if 'google' not in page.url:
            p = get_prompt(text, query)
            p = llama3_agent(p)
            # Tolerate cosmetic variations such as "NO.", " no" or "No\n".
            is_no = isinstance(p, str) and p.strip().rstrip('.').upper() == 'NO'
            if not is_no:
                return p

        new_query =  prompt + "\n Valid Bounding boxes: " + obj['bbox_descriptions']  \
        + '\nQuestion:'+query
        resp = llama3_agent(new_query)

        print("********************", resp)

        commands = parse_commands(resp)
        if not commands:
            # Nothing recognisable in the reply: ask again instead of
            # crashing on an empty list.
            print("No recognised action in model reply; retrying.")
            continue

        parsed_commands = commands[0]
        print(parsed_commands)

        # Actions with arguments arrive as {name: args}; the rest as bare names.
        if isinstance(parsed_commands, dict):
            name, args = next(iter(parsed_commands.items()))
        else:
            name, args = parsed_commands, ()

        if name == 'Type':
            location = int(args[0])
            await type_text(page, location, args[1], obj)
        elif name == 'Click':
            location = int(args[0])
            await click(page, obj, query, location)
        elif name == 'Bing':
            await page.goto('https://www.bing.com/')
        elif name == 'Scroll':
            await scroll(page, obj, args)
        elif name == 'Wait':
            await wait()
        elif name == 'GoBack':
            await go_back(page)
        elif name == 'Google':
            await to_google(page)
        elif name == 'ANSWER':
            answer = args[0]
            print(answer)
            break
        elif name == 'FINISHED':
            break
    return answer
    

In [25]:
# Demo run: start Playwright, open a visible Chromium window on google.in and
# ask the agent for a joke.
# NOTE(review): `page` created here is the module-level object that
# get_information1 reads; `playwright` rebinds the name imported as a module
# earlier in the notebook; nothing in this cell closes the browser or stops
# Playwright afterwards.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')

await get_information1('tell me a good joke')

Thought: I need to find a good joke, so I'll use the search bar to search for one.

Action: Type 5; tell me a good joke
******************** Thought: I need to find a good joke, so I'll use the search bar to search for one.

Action: Type 5; tell me a good joke
*************** Thought: I need to find a good joke, so I'll use the search bar to search for one.

Action: Type 5; tell me a good joke
{'Type': ('5', 'tell me a good joke')}
Thought: I've analyzed the webpage and noticed that the user has asked for a good joke. I'll click on the first search result, "80 Hilariously Funny Jokes 2023 - Funniest Jokes to Tell" from Country Living Magazine.

Action: Click 26
******************** Thought: I've analyzed the webpage and noticed that the user has asked for a good joke. I'll click on the first search result, "80 Hilariously Funny Jokes 2023 - Funniest Jokes to Tell" from Country Living Magazine.

Action: Click 26
*************** Thought: I've analyzed the webpage and noticed that the use

'What falls, but never needs a bandage? The rain.'

In [29]:
# Demo run: fresh Playwright/Chromium session starting at google.in; the task
# is a price lookup that the agent answers from page text.
# NOTE(review): a new browser is launched without closing any earlier one.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')
await get_information1('iphone price on amazon')

Thought: The webpage appears to be the Google search engine homepage, and I need to find the price of an iPhone on Amazon.

Action: Type 4; iphone price on amazon
******************** Thought: The webpage appears to be the Google search engine homepage, and I need to find the price of an iPhone on Amazon.

Action: Type 4; iphone price on amazon
*************** Thought: The webpage appears to be the Google search engine homepage, and I need to find the price of an iPhone on Amazon.

Action: Type 4; iphone price on amazon
{'Type': ('4', 'iphone price on amazon')}
Thought: I understand that I need to find the iPhone price on Amazon. I will navigate to the relevant webpage and extract the price information.

Action: Click 26
******************** Thought: I understand that I need to find the iPhone price on Amazon. I will navigate to the relevant webpage and extract the price information.

Action: Click 26
*************** Thought: I understand that I need to find the iPhone price on Amazon.

'The prices of iPhones on Amazon are:\n\n* Apple iPhone 13 (128GB) - ₹51,900\n* Apple iPhone 14 (128 GB) - ₹58,999\n* Apple iPhone 15 Plus (128 GB) - ₹80,990\n* Apple iPhone 15 Pro Max (256 GB) - ₹1,48,900\n* Apple iPhone 15 (128 GB) - ₹70,999\n* Apple iPhone 14 Plus (256 GB) - ₹76,999\n* Apple iPhone 15 Pro (128 GB) - ₹1,27,990'

In [30]:
# Demo run: fresh Playwright/Chromium session starting at google.in; the task
# is an action (playing a video) rather than a question.
# NOTE(review): a new browser is launched without closing any earlier one.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')

await get_information1('play "in the end song" on youtube')

Thought: I need to play "in the end song" on youtube.

Action: Type 5; in the end song youtube
******************** Thought: I need to play "in the end song" on youtube.

Action: Type 5; in the end song youtube
*************** Thought: I need to play "in the end song" on youtube.

Action: Type 5; in the end song youtube
{'Type': ('5', 'in the end song youtube')}
Thought: I need to play "In The End" song on YouTube.

Action: Click 22
******************** Thought: I need to play "In The End" song on YouTube.

Action: Click 22
*************** Thought: I need to play "In The End" song on YouTube.

Action: Click 22
{'Click': ('22',)}
In The End [Official HD Music Video] - Linkin Park


'In The End [Official HD Music Video] - Linkin Park'

In [34]:
# Demo run: fresh Playwright/Chromium session starting at google.in; open-ended
# travel question.
# NOTE(review): in the recorded output this run stopped with a Groq
# RateLimitError (HTTP 429, tokens per minute), which get_information1 does
# not catch. A new browser is launched without closing any earlier one.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')

await get_information1('What are the places to visit in India')

Thought: I've been asked to find places to visit in India. I'm on the Google homepage, so I'll start by typing my query in the search box.

Action: Type 4; places to visit in India
******************** Thought: I've been asked to find places to visit in India. I'm on the Google homepage, so I'll start by typing my query in the search box.

Action: Type 4; places to visit in India
*************** Thought: I've been asked to find places to visit in India. I'm on the Google homepage, so I'll start by typing my query in the search box.

Action: Type 4; places to visit in India
{'Type': ('4', 'places to visit in India')}
Thought: Analyzing the webpage, I see a search bar and various options like Images, Maps, Videos, and News. The question is about finding tourist places to visit in India.

Action: Type 1; What are the places to visit in India
******************** Thought: Analyzing the webpage, I see a search bar and various options like Images, Maps, Videos, and News. The question is abou

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01hqjr85w6ffh89gthy5wjn0wy` on tokens per minute (TPM): Limit 3500, Used 2880, Requested ~5718. Please try again in 1m27.378714285s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [35]:


# Demo run: fresh Playwright/Chromium session starting at google.in; the task
# is to list recent titles from a specific site section.
# NOTE(review): a new browser is launched without closing any earlier one.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')

await get_information1('What latest movies on decider stream-it-or-skip-it')

Thought: The webpage appears to be the Google homepage, and the task is to search for the latest movies on Decider Stream-It-Or-Skip-It.

Action: Type 9; What latest movies on decider stream-it-or-skip-it
******************** Thought: The webpage appears to be the Google homepage, and the task is to search for the latest movies on Decider Stream-It-Or-Skip-It.

Action: Type 9; What latest movies on decider stream-it-or-skip-it
*************** Thought: The webpage appears to be the Google homepage, and the task is to search for the latest movies on Decider Stream-It-Or-Skip-It.

Action: Type 9; What latest movies on decider stream-it-or-skip-it
{'Type': ('9', 'What latest movies on decider stream-it-or-skip-it')}
Thought: I understand the task is to find the latest movies on Decider Stream-It-Or-Skip-It. 

Action: Type 5; What latest movies on decider stream-it-or-skip-it
******************** Thought: I understand the task is to find the latest movies on Decider Stream-It-Or-Skip-It. 



'Here are the latest movies on Decider\'s "Stream It Or Skip It" list:\n\n1. \'Broken Horses\' on Hulu\n2. \'Hack Your Health: The Secrets of Your Gut\' on Netflix\n3. \'Thank You, Goodnight: The Bon Jovi Story\' on Hulu\n4. \'The Asunta Case\' on Netflix\n5. \'Curious Caterer: Foiled Plans\' on Hallmark Mystery\n6. \'Knuckles\' on Paramount+\n7. \'Dead Boy Detectives\' on Netflix\n8. \'Them: The Scare\' on Prime Video\n9. \'TLC Forever\' on Netflix\n10. \'Miller\'s Girl\' on Netflix\n11. \'King Richard\' on Netflix\n12. \'Deliver Me\' on Netflix\n13. \'Arthur the King\' on VOD\n14. \'Amar Singh Chamkila\' on Netflix\n15. \'The Big Door Prize\' Season 2 on Apple TV+\n16. \'Tiger\' on Disney+\n17. \'Fight For Paradise: Who Can You Trust?\' on Netflix\n18. \'The Express Way With Dulé Hill\' on PBS\n19. \'Brigands: The Quest For Gold\' on Netflix'

In [299]:

    

# Demo run: fresh Playwright/Chromium session starting at google.in; the task
# is to find the verdict for a single review title.
# NOTE(review): a new browser is launched without closing any earlier one.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')

await get_information1('Stream It Or Skip It: ‘The Asunta Case’')

Thought: This is the Google homepage with various links and a search bar. 
Action: Type 9; Stream It Or Skip It: ‘The Asunta Case’
******************** Thought: This is the Google homepage with various links and a search bar. 
Action: Type 9; Stream It Or Skip It: ‘The Asunta Case’
*************** Thought: This is the Google homepage with various links and a search bar. 
Action: Type 9; Stream It Or Skip It: ‘The Asunta Case’
{'Type': ('9', 'Stream It Or Skip It: ‘The Asunta Case’')}
Thought: It appears to be the Google homepage, and the task is to search for something.

Action: Type 5; Stream It Or Skip It: ‘The Asunta Case’
******************** Thought: It appears to be the Google homepage, and the task is to search for something.

Action: Type 5; Stream It Or Skip It: ‘The Asunta Case’
*************** Thought: It appears to be the Google homepage, and the task is to search for something.

Action: Type 5; Stream It Or Skip It: ‘The Asunta Case’
{'Type': ('5', 'Stream It Or Skip It: ‘

'STREAM IT.'

In [36]:

    

# Demo run: fresh Playwright/Chromium session starting at google.in; the task
# is to summarise a long article.
# NOTE(review): the recorded output shows driver "Connection closed" errors
# followed by a Groq RateLimitError (HTTP 429, tokens per minute); the full
# page text is sent to the model on every step. A new browser is launched
# without closing any earlier one.
import platform
playwright = await async_playwright().start()

browser = await playwright.chromium.launch(headless=False)  # Launch Chromium browser
page = await browser.new_page()  # Open a new page
await page.goto('https://www.google.in')

await get_information1('summarize wikipedia article about India')

Thought: The webpage appears to be the Google homepage. To summarize a Wikipedia article about India, I need to search for "India Wikipedia" and then interact with the resulting webpage.

Action: Type 5; India Wikipedia
******************** Thought: The webpage appears to be the Google homepage. To summarize a Wikipedia article about India, I need to search for "India Wikipedia" and then interact with the resulting webpage.

Action: Type 5; India Wikipedia
*************** Thought: The webpage appears to be the Google homepage. To summarize a Wikipedia article about India, I need to search for "India Wikipedia" and then interact with the resulting webpage.

Action: Type 5; India Wikipedia
{'Type': ('5', 'India Wikipedia')}
Thought: The webpage is a Wikipedia article about India, and I need to summarize it.

Action: Wait
******************** Thought: The webpage is a Wikipedia article about India, and I need to summarize it.

Action: Wait
*************** Thought: The webpage is a Wikiped

Future exception was never retrieved
future: <Future finished exception=Exception('Connection closed while reading from the driver')>
Exception: Connection closed while reading from the driver
Future exception was never retrieved
future: <Future finished exception=BrokenPipeError(32, 'Broken pipe')>
Traceback (most recent call last):
  File "/Users/pradeep.borado/.conda/envs/cv/lib/python3.11/asyncio/unix_events.py", line 686, in write
    n = os.write(self._fileno, data)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
BrokenPipeError: [Errno 32] Broken pipe
Future exception was never retrieved
future: <Future finished exception=Exception('Connection closed while reading from the driver')>
Exception: Connection closed while reading from the driver
Future exception was never retrieved
future: <Future finished exception=Exception('Connection closed while reading from the driver')>
Exception: Connection closed while reading from the driver
Future exception was never retrieved
future: <Future finishe

Thought: This webpage appears to be the Wikipedia page for India. I will start by clicking on the article title to ensure I'm on the correct page.

Action: Click 34
******************** Thought: This webpage appears to be the Wikipedia page for India. I will start by clicking on the article title to ensure I'm on the correct page.

Action: Click 34
*************** Thought: This webpage appears to be the Wikipedia page for India. I will start by clicking on the article title to ensure I'm on the correct page.

Action: Click 34
{'Click': ('34',)}


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01hqjr85w6ffh89gthy5wjn0wy` on tokens per minute (TPM): Limit 3500, Used 2234, Requested ~8356. Please try again in 2m1.538428571s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}