# Web Agent: Intro to Playwright 
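This notebook demonstrates building a web scraping agent using Playwright (async) and integrating it with OpenAI for content processing and visualization of the extracted data (e.g., blog posts). It also contains an experiment that sends browser actions to a Playwright MCP server over SSE.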


## Workflow
- Set up a Playwright-based scraping agent
- Visit a webpage and extract HTML
- Use OpenAI to process and summarize/extract structured data
- Visualize the scraped blog posts using helper functions


In [1]:
import asyncio
import requests
import json
import os
import nest_asyncio
import pprint
import base64

In [2]:
from io import BytesIO
import pandas as pd
from playwright.async_api import async_playwright
from openai import OpenAI
from PIL import Image
from tabulate import tabulate
from IPython.display import display, HTML, Markdown
from pydantic import BaseModel
from helper import get_openai_api_key, visualizeBlogs
from display_completion_usage import print_usage

In [3]:
# OpenAI client used by the LLM cells below; the key comes from the project helper
# `get_openai_api_key` rather than being hard-coded here.
client = OpenAI(api_key=get_openai_api_key())
# Presumably needed so `await`/event-loop code can run inside the already-running
# Jupyter loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

In [4]:
# !npx @playwright/mcp@latest --port 8931

In [5]:
# Address of the local Playwright MCP server (the commented-out launch command above uses --port 8931).
# NOTE(review): a later cell rebinds `base_url` to the blog site's origin, so this value
# only holds until that cell runs.
base_url = "http://localhost:8931"

In [6]:
# response = requests.post(f"{base_url}/sse", json={
#     "capabilities": {
#         "browserName": "chromium"
#     }
# })


In [9]:
import requests
import json
from sseclient import SSEClient    # Uncomment if using SSEClient

def translate_to_actions(instruction):
    """
    Mock translation of a natural language instruction to MCP action JSONs.
    In real use, call an LLM here (e.g. OpenAI) to parse the instruction.

    Args:
        instruction: Free-text step such as "Navigate to https://example.com."
            or "Click submit button".

    Returns:
        A list of JSON-RPC fragments (dicts with "method" and "params").
        The list is empty when the instruction matches no known pattern.
    """
    # Example hard-coded mapping for illustration:
    navigate_prefix = "Navigate to"
    if instruction.startswith(navigate_prefix):
        # Slice off the prefix instead of splitting on "to": a split would cut
        # the URL at its first "to" (e.g. "https://www.toyota.com").
        url = instruction[len(navigate_prefix):].strip().strip('.')
        return [{
            "method": "tools/call",
            "params": {"name": "browser_navigate", "arguments": {"url": url}}
        }]
    if instruction.startswith("Click"):
        # This is a simplification; actual ref IDs come from snapshot.
        return [{
            "method": "tools/call",
            "params": {"name": "browser_click", "arguments": {"element": instruction, "ref": "dummy-ref"}}
        }]
    # Add more cases or call an LLM...
    return []



In [14]:
# 1. Connect to the MCP server via SSE
sse_url = "http://localhost:8931/sse"
# resp = requests.get(sse_url, stream=True)
# NOTE(review): this rebinds the module-level `client`, which an earlier cell set to the
# OpenAI client. Cells that later call `client.beta.chat.completions.parse` or
# `client.responses.parse` need the OpenAI client, so the OpenAI setup cell must be
# re-run after this one (or this variable renamed together with its later uses).
# NOTE(review): whether SSEClient accepts a URL string (vs. a streamed response like the
# commented-out `resp` above) depends on which `sseclient` package is installed — confirm.
client = SSEClient(sse_url)   # Would be used to read SSE events



In [16]:
# Display the object currently bound to `client` (the SSE client created in the previous cell).
client

<sseclient.SSEClient at 0x11ff15df0>

In [23]:
# For simplicity, assume we get a sessionId from the response.
# In practice, parse the first SSE event of type 'endpoint'.
session_id = "b80173aa-204c-45f8-8cfa-69f99735ac7e"  # Replace with real session ID from SSE.

# 2. Define the user steps
# NOTE(review): the second step embeds what looks like a phone number; confirm it is
# test data before sharing this notebook or its outputs.
steps = [
    "Navigate to https://www.makemytrip.com",
    "Type 8754430833 mobile number in the login modal",
    "Click submit button",
    "Wait for the user to enter the one time passowrd",
    "Click the Submit button.",
    "Search for one-way flight from Delhi to Bangalore.",
]

# 3. Iterate steps: translate and send to MCP
# Request settings do not change between actions, so build them once. `headers`
# stays a module-level name because the snapshot cell below reuses it, and it is
# now defined even when no step produces an action.
url = f"http://localhost:8931/message?sessionId={session_id}"
headers = {"Content-Type": "application/json"}
REQUEST_TIMEOUT_S = 30  # without a timeout, requests can block the kernel indefinitely

id_counter = 1
for step in steps:
    print(step)
    # Steps the mock translator does not recognise yield no actions and are skipped.
    actions = translate_to_actions(step)
    for action in actions:
        # Build JSON-RPC payload
        payload = {
            "jsonrpc": "2.0",
            "id": id_counter,
            "method": action["method"],
            "params": action["params"]
        }
        id_counter += 1

        # Send the action to the MCP server
        res = requests.post(url, headers=headers, data=json.dumps(payload),
                            timeout=REQUEST_TIMEOUT_S)
        # Check basic HTTP response
        print(f"Sent action {payload['params']['name']} (HTTP {res.status_code})")

        # (Optional) Wait and parse result from SSE
        # event = next(client.events())
        # print("Result:", event.data)



Navigate to https://www.makemytrip.com
Sent action browser_navigate (HTTP 202)
Type 8754430833 mobile number in the login modal
Click submit button
Sent action browser_click (HTTP 202)
Wait for the user to enter the one time passowrd
Click the Submit button.
Sent action browser_click (HTTP 202)
Search for one-way flight from Delhi to Bangalore.


In [None]:
# 4. (Optional) Example: take a final snapshot after all actions
snapshot_id = id_counter
id_counter += 1  # keep JSON-RPC ids unique if more requests are sent afterwards
snapshot_payload = {
    "jsonrpc": "2.0",
    "id": snapshot_id,
    "method": "tools/call",
    "params": {"name": "browser_snapshot"}
}
# Defined here so this cell does not depend on a variable left over from the action loop.
headers = {"Content-Type": "application/json"}
res = requests.post(f"http://localhost:8931/message?sessionId={session_id}",
                    headers=headers, data=json.dumps(snapshot_payload))
print("Snapshot command sent, HTTP status", res.status_code)

# The stream may deliver other events first (e.g. the initial 'endpoint' event,
# whose data is not JSON, or replies to earlier actions), so scan for the reply
# whose id matches this request instead of trusting the very next event.
MAX_EVENTS_TO_SCAN = 50
snapshot_data = None
snapshot_error = None
# `range` is the first zip argument, so no extra event is consumed once the cap is hit.
for _, evt in zip(range(MAX_EVENTS_TO_SCAN), client.events()):
    try:
        message = json.loads(evt.data)
    except (TypeError, ValueError):
        continue  # not a JSON-RPC message
    if isinstance(message, dict) and message.get("id") == snapshot_id:
        snapshot_data = message.get("result")
        snapshot_error = message.get("error")
        break

if snapshot_error is not None:
    print("Snapshot failed:", snapshot_error)
elif snapshot_data is None:
    print(f"Snapshot: no matching response within {MAX_EVENTS_TO_SCAN} events")
else:
    print("Snapshot:", snapshot_data)

# 5. Summarize results
# In a real agent, we would parse snapshot_data or gather tool outputs to answer the user.
print("Actions complete. (In production code, parse SSE responses for detailed results.)")


### Web Scraper Agent 
This agent:

- Initializes a headless Chromium browser
- Navigates to a URL and scrapes the page content
- Captures screenshots (optionally as a file or in-memory buffer)

The browser is configured with several performance/security-related flags (e.g., disabling GPU, web security, background networking).

In [17]:
class WebScraperAgent:
    """Thin async wrapper around one headless Chromium page.

    The browser is started lazily by `scrape_content` and must be released
    with `close`. `close` is safe to call at any time, including before the
    browser has been started or after it has already been closed.
    """

    def __init__(self):
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright, launch headless Chromium and open a fresh page.

        Any previously started browser/driver is closed first, so calling this
        again (e.g. after the page was closed) does not leak the old process.
        """
        await self.close()
        self.playwright = await async_playwright().start()
        # NOTE(review): "--no-sandbox" and "--disable-web-security" weaken browser
        # isolation; confirm they are acceptable for the pages being scraped.
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                "--disable-dev-shm-usage",
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-accelerated-2d-canvas",
                "--disable-gpu",
                "--no-zygote",
                "--disable-audio-output",
                "--disable-software-rasterizer",
                "--disable-webgl",
                "--disable-web-security",
                "--disable-features=LazyFrameLoading",
                "--disable-features=IsolateOrigins",
                "--disable-background-networking"
            ]
        )
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to `url` and return the page's HTML as a string.

        Starts the browser first when there is no open page.
        """
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until='domcontentloaded')
        await self.page.wait_for_timeout(2000)  # Wait for dynamic content
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page screenshot of the current page to `path` and return `path`."""
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as bytes."""
        screenshot_bytes = await self.page.screenshot(type="png", full_page=False)
        return screenshot_bytes

    async def close(self):
        """Close the browser and stop Playwright, tolerating a never-started agent."""
        browser, playwright = self.browser, self.playwright
        # Reset state first so the agent is reusable even if shutdown fails.
        self.playwright = None
        self.browser = None
        self.page = None
        try:
            if browser is not None:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            if playwright is not None:
                await playwright.stop()

In [18]:


# openai.api_key = "YOUR_OPENAI_API_KEY"  # Set this securely in practice

# Reusable agent class
class AgentBrowser:
    """Runs a list of natural-language steps past an LLM while a browser is open.

    NOTE(review): `execute_step` only sends the step text and the current page
    URL to the model and parses its JSON reply; nothing in this class performs
    the described action on the page. Confirm whether that is intended.

    Args:
        model: Chat model used for every step. The default is the model this
            class previously used unconditionally.
    """

    def __init__(self, model="gpt-4o-mini-2024-07-18"):
        self.model = model
        self.playwright = None
        self.browser = None
        self.page = None

    async def setup(self):
        """Start Playwright, launch a visible (non-headless) Chromium and open a page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=False)
        self.page = await self.browser.new_page()

    async def teardown(self):
        """Close the browser and stop Playwright; safe to call if setup never ran."""
        browser, playwright = self.browser, self.playwright
        self.playwright = None
        self.browser = None
        self.page = None
        try:
            if browser is not None:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            if playwright is not None:
                await playwright.stop()

    async def execute_step(self, step_instruction: str, step_number: int):
        """Ask the model to handle one step and parse its JSON reply.

        Returns:
            A `(result, usage)` tuple. `result` is the decoded JSON reply, or a
            dict with status "error" when the reply is not valid JSON. `usage`
            is the usage object attached to the API response.
        """
        # Screenshot or DOM snapshot can be passed here if needed
        context = f"""
You are controlling a browser using Playwright.
Here is your next step:
"{step_instruction}"

Use only what is visible on the page. You can describe what selectors to click, type into, or extract.

Respond ONLY with a JSON like this:
{{ "step": {step_number}, "status": "done", "message": "<summary or result>" }}
"""

        # Optionally, send partial page content or URL for context
        page_url = self.page.url if self.page is not None and self.page.url else "about:blank"
        context += f"\nCurrent URL: {page_url}"

        # LLM call (honours the model chosen at construction time)
        response = client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a Playwright automation agent."},
                {"role": "user", "content": context}
            ]
        )

        # `content` can be None (e.g. a refusal); treat that as an empty reply.
        reply = (response.choices[0].message.content or "").strip()
        usage = response.usage
        try:
            result = json.loads(reply)
        except ValueError:
            result = {
                "step": step_number,
                "status": "error",
                "message": f"Invalid JSON response: {reply}"
            }
        return result, usage

    async def run_workflow(self, steps: list):
        """Run every step in order and return a list of result/usage dicts.

        Each entry has the keys "result" and "usage". The browser is always
        torn down, even when a step raises.
        """
        results = []
        await self.setup()
        try:
            for i, step in enumerate(steps, start=1):
                print(f"\n➡️ Step {i}: {step}")
                result, usage = await self.execute_step(step, i)
                print(f"✅ Response: {result}")
                print_usage(usage)
                results.append({
                    "result": result,
                    "usage": usage
                })

                # # Optional wait before continuing
                # input("🔄 Press Enter to continue to next step...")
        finally:
            await self.teardown()
        return results


In [19]:
# Shared module-level scraper; the `webscraper` coroutine defined further down reads this
# global and closes it when it finishes.
scraper = WebScraperAgent()

### Structured Data Format

In [20]:
import asyncio
# from agent_playwright import AgentBrowser

steps = [
    "Navigate to https://www.skyscanner.com.",
    "Search for a one-way flight from Delhi (DEL) to Bangalore (BLR) on May 5, 2025.",
    "Apply filters: non-stop flights only, departure time between 6 AM and 12 PM.",
    "Choose the cheapest available option.",
    "Proceed to booking and fill in the passenger details: Name: Gary Gupta, Gender: Male, Age: 35, Email: gary@example.com, Phone: 9999999999.",
    "Stop before making payment and summarize flight number, timing, and price."
]


agent = AgentBrowser()
results = await agent.run_workflow(steps)
print("\n📋 Final Summary:")
for res in results:
    print(res.results)
    print_usage(res.usage)

# if __name__ == "__main__":
#     asyncio.run(main())


➡️ Step 1: Navigate to https://www.skyscanner.com.
✅ Response: {'step': 1, 'status': 'done', 'message': 'Navigated to https://www.skyscanner.com.'}
🔍 Chat Completion API Usage Summary
🧠 Prompt Tokens        : 105
💬 Completion Tokens    : 30
🧾 Total Tokens         : 135

🔍 Prompt Token Details:
  🎵 Audio Tokens       : 0
  🧊 Cached Tokens      : 0

✍️ Completion Token Details:
  ✅ Accepted Predictions : 0
  ❌ Rejected Predictions : 0
  🧠 Reasoning Tokens     : 0
  🎧 Audio Tokens         : 0

➡️ Step 2: Search for a one-way flight from Delhi (DEL) to Bangalore (BLR) on May 5, 2025.
✅ Response: {'step': 2, 'status': 'done', 'message': 'The browser is currently on a blank page. You will need to navigate to a flight search website and then perform the search for a one-way flight from Delhi (DEL) to Bangalore (BLR) on May 5, 2025.'}
🔍 Chat Completion API Usage Summary
🧠 Prompt Tokens        : 121
💬 Completion Tokens    : 68
🧾 Total Tokens         : 189

🔍 Prompt Token Details:
  🎵 Audio Tok

AttributeError: 'dict' object has no attribute 'results'

In [None]:
# Schema for a single blog entry; it is the element type of
# GowriShankarsBlogPostList, which the LLM calls use as their structured output format.
# All fields are plain strings. NOTE(review): the expected formats of `date` and
# `readTime` are not constrained here — confirm what the visualisation helper expects.
class GowriShankarsBlogPost(BaseModel):
    title: str
    description: str
    date: str
    readTime: str
    imageUrl: str
    blogURL: str

# Top-level structured-output schema passed to the OpenAI parse calls
# (`text_format=` / `response_format=`): a wrapper holding the list of blog entries.
class GowriShankarsBlogPostList(BaseModel):
    blogs: list[GowriShankarsBlogPost]

### LLM Client for OpenAI


In [None]:
def get_system_prompt(instructions):
    """Build the system prompt for the HTML-to-JSON extraction call.

    The field list matches the blog-post schema used as the structured output
    format (title, description, date, read time, image URL, blog URL). The
    earlier wording asked for deeplearning.ai courses and a presenter, which
    that schema cannot hold.

    Args:
        instructions: Caller-specific extraction instructions, embedded verbatim.

    Returns:
        The complete system prompt as a single string.
    """
    prompt_lines = [
        "You are an expert web scraping agent. Your task is to:",
        "Extract relevant information from this HTML to JSON",
        "following these instructions:",
        f"{instructions}",
        "",
        "For every blog post on the page, extract the title, description,",
        "publication date, read time, image URL and blog post URL.",
        "",
        "Return ONLY valid JSON, no markdown or extra text.",
    ]
    return "\n".join(prompt_lines)

In [None]:
async def process_with_llm(html, instructions, truncate = False):
    """Send scraped HTML to the Responses API and return the raw response.

    Args:
        html: Page HTML; only its first 150000 characters are sent.
        instructions: Extraction instructions placed in the system prompt.
        truncate: Accepted but not read by this function (the HTML is always
            capped, regardless of this flag).

    Returns:
        The object returned by `client.responses.parse`, requested with
        `GowriShankarsBlogPostList` as the text format.
    """
    system_message = {
        "role": "system",
        "content": get_system_prompt(instructions),
    }
    user_message = {
        "role": "user",
        "content": html[:150000],
    }
    return client.responses.parse(
        model="gpt-4o-mini-2024-07-18",
        input=[system_message, user_message],
        temperature=0.1,
        text_format=GowriShankarsBlogPostList,
    )

async def process_with_llm_2(html, instructions, truncate = False):
    """Extract structured blog data from HTML via the chat-completions parse helper.

    The system prompt comes from `get_system_prompt`, so this function and
    `process_with_llm` always send the same prompt instead of keeping two
    copies of the text.

    Args:
        html: Page HTML; only its first 150000 characters are sent.
        instructions: Extraction instructions placed in the system prompt.
        truncate: Accepted but not read by this function (the HTML is always
            capped, regardless of this flag).

    Returns:
        A `(completion, parsed)` tuple: the full completion object and the
        `parsed` attribute of the first choice's message.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": get_system_prompt(instructions)},
            # Truncate to stay under token limits
            {"role": "user", "content": html[:150000]},
        ],
        temperature=0.1,
        response_format=GowriShankarsBlogPostList,
    )
    return completion, completion.choices[0].message.parsed

In [None]:
async def webscraper(target_url, instructions):
    """Scrape `target_url`, screenshot it and extract structured data with the LLM.

    Errors are reported on stdout rather than raised. Whatever was produced
    before the failure is returned, and the remaining values are None.

    Args:
        target_url: Page to load with the shared `scraper`.
        instructions: Extraction instructions forwarded to the LLM call.

    Returns:
        A `(completion, result, screenshot)` tuple: the raw completion, the
        parsed structured result, and the screenshot bytes.
    """
    # Initialise every returned name up front; otherwise an early failure left
    # `completion`/`screenshot` unbound and the return raised UnboundLocalError.
    completion = None
    result = None
    screenshot = None
    try:
        # Scrape content and capture screenshot
        print("Extracting HTML Content \n")
        html_content = await scraper.scrape_content(target_url)

        print("Taking Screenshot \n")
        screenshot = await scraper.screenshot_buffer()
        # Process content

        print("Processing..")
        completion, result = await process_with_llm_2(html_content, instructions, False)
        print("\nGenerated Structured Response")
    except Exception as e:
        print(f"❌ Error: {str(e)}")
    finally:
        # Cleanup is best-effort too: a failing close (e.g. the browser never
        # started) must not hide the values gathered above.
        try:
            await scraper.close()
        except Exception as close_error:
            print(f"❌ Error while closing browser: {str(close_error)}")
    return completion, result, screenshot

### Blogs

In [None]:
target_url = "https://gowrishankar.info/blog"  # Blog page to scrape
# NOTE(review): this rebinds `base_url`, which an earlier cell set to the local MCP
# server address; here it is the site origin that is passed to visualizeBlogs.
base_url="https://gowrishankar.info"

In [None]:
instructions = """
    Get all the courses
"""
completion, result, screenshot = await webscraper(target_url, instructions)


In [None]:
from helper import visualizeBlogs

In [None]:
# Hand the parsed result, the screenshot bytes and the run parameters to the project
# helper `visualizeBlogs` (imported from `helper`). Its rendering behaviour is not
# visible in this notebook.
await visualizeBlogs(result=result, 
                       screenshot=screenshot, 
                       target_url=target_url, 
                       instructions=instructions, 
                       base_url=base_url)

In [None]:
# Usage object attached to the completion returned by `webscraper`.
# NOTE(review): `completion` can be None when scraping or the LLM call failed — confirm before use.
usage = completion.usage

In [None]:
from display_completion_usage import print_usage

In [None]:
# Print the token-usage summary via the project helper `print_usage`.
print_usage(usage)

In [None]:
# Display the parsed structured result returned by `webscraper`.
result