# Website URL Summarizer

Fetch a webpage, extract meaningful text, and summarize it using OpenAI's API.

In [12]:
import os

import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# environment lookups in summarize() (OPENAI_API_KEY, OPENAI_MODEL) can find them.
load_dotenv()

True

In [13]:
# Elements dropped from the parsed page before its text is extracted.
REMOVE_TAGS = [
    "script",
    "style",
    "nav",
    "footer",
    "header",
    "aside",
    "form",
    "noscript",
    "iframe",
]

# Upper bound, in characters, on the text returned by extract_text().
MAX_TEXT_LENGTH = 15000


def extract_text(url):
    """Download a web page and return its text content, truncated.

    Sends a GET request with a desktop-browser User-Agent, removes every
    element listed in ``REMOVE_TAGS`` from the parsed document, and returns
    the remaining text with one fragment per line.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: At most ``MAX_TEXT_LENGTH`` characters of newline-separated text.

    Raises:
        requests.exceptions.RequestException: On connection failure, on
            timeout (15 seconds), or as ``HTTPError`` when the server
            answers with a 4xx/5xx status.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()

    # Parse the raw bytes instead of response.text: when a text/* response has
    # no charset in its Content-Type header, Requests falls back to ISO-8859-1,
    # which garbles UTF-8 pages. Trust the header encoding only when a charset
    # was actually declared; otherwise let BeautifulSoup detect it from the
    # document itself (e.g. a <meta charset> tag).
    content_type = response.headers.get("Content-Type", "").lower()
    declared_encoding = response.encoding if "charset" in content_type else None
    soup = BeautifulSoup(
        response.content, "html.parser", from_encoding=declared_encoding
    )

    # Drop non-content elements (scripts, navigation, forms, ...) entirely.
    for tag in soup(REMOVE_TAGS):
        tag.decompose()

    text = soup.get_text(separator="\n", strip=True)
    return text[:MAX_TEXT_LENGTH]

In [14]:
def summarize(text):
    """Ask an OpenAI chat model for a 3-5 paragraph summary of ``text``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable and
    the model name from ``OPENAI_MODEL`` (falling back to ``gpt-4o-mini``).

    Args:
        text: Page content, sent as the user message.

    Returns:
        The message content of the first choice in the completion response.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise ValueError("OPENAI_API_KEY not set in .env file.")

    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

    instructions = (
        "Summarize the following webpage content in 3-5 paragraphs. "
        "Focus on the main points and key information. "
        "Ignore boilerplate text like cookie notices or navigation items."
    )
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": text},
    ]

    completion = OpenAI(api_key=key).chat.completions.create(
        model=model_name,
        temperature=0.3,
        messages=conversation,
    )
    return completion.choices[0].message.content

In [None]:
# Page to fetch and summarize; passed to extract_text() in the fetch step below.
url = "https://github.com/faridraisi/website-summary"

In [16]:
# Stays None when the fetch fails, so the summarize step can tell there is
# nothing to work with.
text = None

print(f"Fetching {url} ...")
try:
    text = extract_text(url)
    print(f"Extracted {len(text)} characters.")
except requests.exceptions.ConnectionError:
    print(f"Could not connect to {url}")
except requests.exceptions.Timeout:
    print("Request timed out.")
except requests.exceptions.HTTPError as http_err:
    code = http_err.response.status_code
    # Dedicated messages for the common statuses; generic fallback otherwise.
    print(
        {
            403: "Access denied (403). This site blocks automated requests.",
            404: "Page not found (404).",
        }.get(code, f"HTTP error {code}.")
    )
except Exception as err:
    print(f"Unexpected error: {err}")

Fetching https://openai.com ...
Access denied (403). This site blocks automated requests.


In [None]:
# Only call the API when the fetch step produced some non-blank text.
if text and text.strip():
    print("Summarizing...")
    try:
        summary = summarize(text)
        print(f"\n{summary}")
    except ValueError as config_err:
        # Raised by summarize() when the API key is missing.
        print(config_err)
    except Exception as api_err:
        print(f"OpenAI API error: {api_err}")
else:
    print("No text to summarize. Check the fetch step above.")