# Website Content Summarizer with OpenAI

A Playwright-based web scraper that fetches JavaScript-rendered pages, strips navigation/boilerplate, and uses OpenAI to generate concise summaries.

In [None]:
# imports
import os
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from openai import OpenAI
from IPython.display import Markdown, display

In [None]:
async def fetch_website_contents_js(url, *, timeout_ms=15000, settle_ms=3000):
    """Fetch a JavaScript-rendered page and return its visible text.

    Loads ``url`` in headless Chromium through Playwright, waits for
    ``domcontentloaded`` plus a fixed settle delay, snapshots the rendered
    HTML, removes non-content tags and returns the remaining text.

    Args:
        url: Address of the page to load.
        timeout_ms: Navigation timeout, in milliseconds, passed to
            ``page.goto``.
        settle_ms: Extra delay, in milliseconds, between ``domcontentloaded``
            and the HTML snapshot.

    Returns:
        The text left after stripping script, style, nav, footer, header,
        img and svg elements, with text fragments joined by newlines and
        stripped of surrounding whitespace.

    Raises:
        Navigation failures and timeouts are not caught here; whatever
        Playwright raises propagates to the caller.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            # A desktop Chrome user agent and viewport, so the page is
            # requested like a regular browser visit.
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 720}
            )
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
            # Extra time for client-side rendering / a Cloudflare challenge.
            await page.wait_for_timeout(settle_ms)
            html = await page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()

    soup = BeautifulSoup(html, "html.parser")
    # Drop elements that carry no readable article text before extraction.
    for tag in soup.find_all(["script", "style", "nav", "footer", "header", "img", "svg"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)

### Initialize OpenAI

In [None]:
# Load variables from a .env file into the environment, then read the key.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs before the prefix test: a key with leading
# spaces/tabs also fails startswith("sk-proj-"), and would otherwise be
# reported with the misleading "doesn't start sk-proj-" message.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

In [None]:
# Shared client used by summarize_js. No key is passed explicitly, so the
# client presumably reads OPENAI_API_KEY from the environment loaded above
# (confirm against the OpenAI SDK documentation).
openai = OpenAI()

### Prompts

In [None]:
# System message sent with every request: sets the assistant's role and asks
# for plain markdown output that is not wrapped in a code block.
system_prompt = """
You are an informative assistant that analyzes the contents of a website,
and provides a concise, clear, and factual summary, ignoring text that might be navigation related.
Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.
"""

# Start of the user message. messages_for appends the scraped page text
# directly after it, so the trailing blank line separates the instructions
# from the page contents.
user_prompt_prefix = """
Here are the contents of a website.
Provide a short summary of this website.
If it includes news or announcements, then summarize these too.

"""


### Helpers

In [None]:
def messages_for(website):
    """Build the two-message chat payload for summarizing a page.

    Args:
        website: Text of the page to summarize.

    Returns:
        A list with the system message followed by the user message; the
        user message is ``user_prompt_prefix`` with ``website`` appended.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_prefix + website}
    return [system_message, user_message]

In [None]:
async def summarize_js(url):
    """Scrape ``url`` and return the model's summary of its text.

    Args:
        url: Address of the page to summarize.

    Returns:
        The content of the first choice in the chat completion response.
    """
    page_text = await fetch_website_contents_js(url)
    chat_messages = messages_for(page_text)
    completion = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
async def display_summary_js(url):
    """Summarize ``url`` and show the result as rendered Markdown.

    Args:
        url: Address of the page to summarize.
    """
    markdown_text = await summarize_js(url)
    rendered = Markdown(markdown_text)
    display(rendered)

## Summarize website

In [None]:
# Run the whole pipeline on an example URL. The bare top-level await assumes
# this executes in a notebook/IPython cell with a running event loop
# (TODO confirm); in a plain script, wrap the call in asyncio.run(...).
await display_summary_js("https://openai.com")