# Website URL Summarizer (Selenium)

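Uses a real (headless Chrome) browser via Selenium to fetch webpages that block simple HTTP requests, then summarizes their text using the OpenAI API. Requires `OPENAI_API_KEY` to be set in the environment or in a `.env` file.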

In [6]:
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

True

In [7]:
# Elements that extract_text deletes from the parsed page before reading its text.
REMOVE_TAGS = [
    # non-visible / executable content
    "script",
    "style",
    # page chrome surrounding the main content
    "nav",
    "footer",
    "header",
    "aside",
    # interactive or embedded content
    "form",
    "noscript",
    "iframe",
]

# Upper bound on the number of characters of page text that extract_text returns.
MAX_TEXT_LENGTH = 15_000


def extract_text(url: str, timeout: float = 15) -> str:
    """Render ``url`` in headless Chrome and return the page's text.

    The page is loaded in a real browser, its HTML is parsed with
    BeautifulSoup, every tag listed in ``REMOVE_TAGS`` is removed, and the
    remaining text is returned truncated to ``MAX_TEXT_LENGTH`` characters.

    Args:
        url: Address of the page to load.
        timeout: Seconds allowed for the navigation itself and, separately,
            for the document to report ``readyState == "complete"``.

    Returns:
        The extracted text, with text fragments separated by newlines, at
        most ``MAX_TEXT_LENGTH`` characters long.

    Raises:
        TimeoutException: If navigation or the readiness wait exceeds
            ``timeout`` seconds.
        WebDriverException: For other browser/driver failures reported by
            Selenium.
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Present a regular desktop Chrome user agent instead of the headless default.
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        # Without an explicit limit, driver.get() can block for WebDriver's
        # default page-load timeout (300 s) before the readiness wait below
        # ever starts. Bounding it makes slow pages raise TimeoutException.
        driver.set_page_load_timeout(timeout)
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading fails.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")

    # Drop the unwanted elements (and everything nested in them) in place.
    for tag in soup(REMOVE_TAGS):
        tag.decompose()

    text = soup.get_text(separator="\n", strip=True)
    return text[:MAX_TEXT_LENGTH]

In [12]:
def summarize(text):
    """Ask an OpenAI chat model for a summary of ``text``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable and
    the model name from ``OPENAI_MODEL`` (falling back to ``"gpt-4o-mini"``).

    Args:
        text: Webpage text to summarize.

    Returns:
        The message content of the first choice in the API response.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise ValueError("OPENAI_API_KEY not set in .env file.")

    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

    system_prompt = (
        "You are a concise and accurate web content summarizer. "
        "Your task is to extract and summarize the main content of a webpage. "
        "Rules:\n"
        "- Produce a clear summary in 3-5 paragraphs.\n"
        "- Focus on the key points, facts, and arguments.\n"
        "- Preserve important names, dates, and figures.\n"
        "- Ignore boilerplate such as cookie notices, navigation menus, "
        "ads, footers, and sidebar content.\n"
        "- If the content is too short or mostly boilerplate, "
        "state that no meaningful content was found."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Summarize this webpage content:\n\n{text}"},
    ]

    client = OpenAI(api_key=key)
    completion = client.chat.completions.create(
        model=model_name,
        temperature=0.3,
        messages=conversation,
    )
    return completion.choices[0].message.content

In [None]:
# Page to fetch and summarize; read by the fetch cell that calls extract_text(url).
# NOTE(review): the saved outputs in this notebook show "Fetching https://openai.com ...",
# so they were not produced with this value; re-run all cells after editing it.
url = "https://example.com"

In [14]:
# Reset first so a failed fetch leaves `text` as None for the summarize step.
text = None

print(f"Fetching {url} ...")
try:
    text = extract_text(url)
    print(f"Extracted {len(text)} characters.")
except TimeoutException:
    # Must be handled before WebDriverException, which is caught next.
    print("Page took too long to load.")
except WebDriverException as e:
    # The message may span several lines; report only the first one.
    msg = str(e).partition("\n")[0]
    print(f"Browser error: {msg}")
except Exception as e:
    # Last-resort handler so the cell reports the failure instead of raising.
    print(f"Unexpected error: {e}")

Fetching https://openai.com ...
Extracted 1735 characters.


In [15]:
# Only call the model when the fetch step produced non-blank text.
if text and text.strip():
    print("Summarizing...")
    try:
        summary = summarize(text)
        print(f"\n{summary}")
    except ValueError as config_err:
        # summarize() raises ValueError when OPENAI_API_KEY is missing.
        print(config_err)
    except Exception as api_err:
        print(f"OpenAI API error: {api_err}")
else:
    print("No text to summarize. Check the fetch step above.")

Summarizing...

The webpage from OpenAI emphasizes the accessibility and potential for innovation using their technology, particularly for a diverse audience that includes developers, scientists, and students. It highlights a transformative moment where individuals can bring their ideas to life with fewer barriers, suggesting that the possibilities for creation are expanding rapidly.

Key announcements include the introduction of various products such as the Codex app, GPT-5.3-Codex, and OpenAI Frontier, which are designed to enhance user experience and application development. The content also notes recent initiatives aimed at expanding access to ChatGPT, including a focus on healthcare applications and the ability for developers to submit apps to the platform.

The webpage features stories of organizations and individuals leveraging OpenAI's technology, such as Taisei Corporation and Higgsfield, showcasing practical applications in fields like creative writing, medical research, and 