# In [None]:
import time
import os
from dotenv import load_dotenv
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

"""
This program loads any webpage using Selenium, extracts its readable text content,
and summarizes it using an OpenAI model.

WHY SELENIUM?
-------------
- Selenium controls a real browser (Chrome), which allows full JavaScript rendering.
- Works reliably on Windows, UV, and Jupyter/VSCode Notebook environments.
- Can handle dynamic content, infinite scrolling, AJAX, React/Vue/Next.js sites.
- Unlike Playwright, it has no issues with subprocess creation on Windows/Jupyter.

WHAT THIS PROGRAM DOES:
-----------------------
1. Opens a webpage in a real headless Chrome browser.
2. Loads dynamic content by automatically scrolling.
3. Extracts visible text while removing scripts/images/styles.
4. Sends cleaned content to an OpenAI model for summarization.
5. Displays the summary in Markdown format.

In short, Selenium solves the rendering problem, and OpenAI handles the NLP.
"""

# ---------------------------------------------------------
# Environment setup
# ---------------------------------------------------------
# Pull variables from a local .env file, letting them override values that
# are already present in the process environment.
load_dotenv(override=True)
api_key = os.environ.get("OPENAI_API_KEY")

# A missing or empty key is treated as a hard error rather than deferred
# until the first API call.
if api_key is None or api_key == "":
    raise RuntimeError("OPENAI_API_KEY not found. Please set it in your environment.")

# Module-level client used by summarize().
openai = OpenAI()

# ---------------------------------------------------------
# Logging Setup
# ---------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    # Each record is prefixed with a marker emoji and its level name.
    # (The prefix was previously stored as mis-decoded bytes.)
    format="🟦 [%(levelname)s] %(message)s",
)


class WebsiteError(Exception):
    """Signals a failure while working with a Website (for example, scraping)."""


class Website:
    """
    Represents a web page rendered in headless Chrome through Selenium.

    Attributes:
        url: Address passed to the constructor.
        title: Page title, or "Untitled Page" when the browser reports none.
            ``None`` until :meth:`initialize` has run.
        text: Text of the page body with scripts, styles, images and similar
            tags removed and blank lines dropped. ``None`` until
            :meth:`initialize` has run.
        max_scrolls: Upper bound on scroll-to-bottom attempts.
        scroll_pause: Seconds to wait after each scroll.
        load_wait: Seconds to wait after the initial page load.
    """

    def __init__(self, url, *, max_scrolls=6, scroll_pause=1.0, load_wait=2.0):
        """
        Store the target URL and scraping parameters; no browser is started.

        Args:
            url: Address of the page to load.
            max_scrolls: Maximum number of scroll-to-bottom attempts.
            scroll_pause: Seconds to sleep after each scroll.
            load_wait: Seconds to sleep right after the page is requested.
        """
        self.url = url
        self.title = None
        self.text = None
        self.max_scrolls = max_scrolls
        self.scroll_pause = scroll_pause
        self.load_wait = load_wait

    @classmethod
    def create(cls, url, **kwargs):
        """
        Factory method to instantiate and initialize a Website.

        Args:
            url: Address of the page to load.
            **kwargs: Optional keyword arguments forwarded to the constructor
                (``max_scrolls``, ``scroll_pause``, ``load_wait``).

        Returns:
            A Website whose ``title`` and ``text`` are populated.

        Raises:
            WebsiteError: If loading or extracting the page fails.
        """
        website = cls(url, **kwargs)
        website.initialize()
        return website

    @staticmethod
    def _build_options():
        """Return the Chrome options used for headless scraping."""
        options = Options()
        for argument in (
            "--headless=new",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--window-size=1920,1080",
            "--disable-blink-features=AutomationControlled",
            "start-maximized",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        ):
            options.add_argument(argument)
        return options

    def _scroll_to_bottom(self, driver):
        """Scroll down until the page height stops growing or the limit is hit."""
        last_height = driver.execute_script("return document.body.scrollHeight")

        for i in range(self.max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(self.scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")

            # An unchanged height means the scroll loaded no further content.
            if new_height == last_height:
                logging.info("✔ Reached bottom of page.")
                break

            last_height = new_height
            logging.info("Scrolled page (%d/%d)", i + 1, self.max_scrolls)

    @staticmethod
    def _extract_text(html):
        """Return the body text of ``html`` without non-content tags or blank lines."""
        soup = BeautifulSoup(html, "html.parser")

        # Remove tags that carry no readable text.
        for tag in soup.find_all(["script", "style", "noscript", "img", "svg", "input", "meta"]):
            tag.decompose()

        body = soup.body.get_text(separator="\n", strip=True) if soup.body else ""
        return "\n".join(line for line in body.splitlines() if line.strip())

    def initialize(self):
        """
        Loads the webpage, scrolls for dynamic content, and extracts cleaned text.

        Populates ``self.title`` and ``self.text``.

        Raises:
            WebsiteError: If anything fails after the browser has started; the
                original exception is attached as ``__cause__``. Errors raised
                while installing or launching ChromeDriver propagate unchanged.
        """
        logging.info("🌐 Loading webpage: %s", self.url)

        # Create driver (ChromeDriver auto-installs if missing)
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=self._build_options(),
        )

        try:
            driver.get(self.url)
            time.sleep(self.load_wait)

            self._scroll_to_bottom(driver)

            self.title = driver.title or "Untitled Page"
            logging.info("📝 Extracted title: %s", self.title)

            self.text = self._extract_text(driver.page_source)
            logging.info("✔ Text extraction completed.")

        except Exception as e:
            # Chain the cause so the underlying traceback stays attached.
            raise WebsiteError(f"Failed to scrape {self.url}: {e}") from e

        finally:
            # Always shut the browser down, even when scraping failed.
            driver.quit()


# ---------------------------------------------------------
# Build OpenAI messages
# ---------------------------------------------------------
def messages_for(website, max_chars=None):
    """
    Build the chat messages asking the model to summarize a page.

    Args:
        website: Object exposing ``title`` and ``text`` attributes.
        max_chars: Optional cap on how many characters of ``website.text``
            are sent. ``None`` (the default) sends the text unmodified.

    Returns:
        A two-element list: the system prompt followed by the user message
        containing the page title and content.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be non-negative")

    # A Website that was never initialized has None here; avoid sending the
    # literal string "None" to the model.
    title = website.title or "Untitled Page"
    text = website.text or ""

    if max_chars is not None:
        text = text[:max_chars]

    return [
        {"role": "system", "content": "Provide a fun, friendly, and respectful summary of the webpage. "
                "Use light humor and a playful tone, but stay accurate and do not insult "
                "the author, the content, or the subject. Make it enjoyable to read."},
        {
            "role": "user",
            "content": f"Title: {title}\n\nContent:\n{text}"
        }
    ]


# ---------------------------------------------------------
# Summarization
# ---------------------------------------------------------
def summarize(url, model="gpt-4o-mini"):
    """
    Scrape ``url`` and return a model-written summary of the page.

    Args:
        url: Address of the page to summarize.
        model: Name of the OpenAI chat model to use.

    Returns:
        The summary text from the first completion choice, or an empty string
        when the response carries no text content.
    """
    website = Website.create(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website)
    )
    # The message content may be None; normalize so callers always get a str.
    return response.choices[0].message.content or ""


# ---------------------------------------------------------
# Display in Notebook
# ---------------------------------------------------------
def display_summary(url):
    """Summarize the page at ``url`` and render the result as Markdown."""
    display(Markdown(summarize(url)))


# Example usage: executing this line runs the whole pipeline, starting
# headless Chrome to scrape the URL, sending the extracted text to the OpenAI
# API, and rendering the returned summary as Markdown.
display_summary("https://udemy.com")
