In [None]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [None]:
# Load variables from a local .env file into the process environment, then read
# the OpenAI key from it.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs BEFORE the prefix test: a key with leading whitespace
# also fails startswith("sk-proj-"), so with the opposite order the user would be
# told the prefix is wrong and never see the (actual) whitespace problem.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


In [None]:
# Create the OpenAI client used by summarize_js_website below.
# No key is passed explicitly - presumably the client reads OPENAI_API_KEY from the
# environment populated by load_dotenv above; confirm against the openai package docs.
openai = OpenAI()

In [None]:
# Define our system prompt - you can experiment with this later, e.g. by adding a
# final sentence such as "Respond in markdown in Spanish."
# Adjacent string literals are joined by the parser; this avoids a backslash
# continuation inside the string, where any trailing whitespace after the
# backslash turns into a syntax error.

system_prompt = (
    "You are an assistant that trashes websites after analyzing the contents of a website "
    "and provides a hilarious short summary"
)

In [None]:
def user_prompt_for(website):
    """
    Build the user-role prompt for a scraped website.

    Args:
        website: any object exposing ``title`` and ``text`` attributes
            (``text`` must be a string).

    Returns:
        A single string: a line naming the page title, the summarization
        instructions, a blank line, and then the page text.
    """
    heading = f"You are looking at a website titled {website.title}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return heading + instructions + website.text

In [None]:
# Build the chat messages list: one system message followed by one user message

def messages_for(website):
    """
    Assemble the chat messages for summarizing ``website``.

    Returns a two-element list of ``{"role", "content"}`` dicts: the system
    message carrying ``system_prompt`` first, then the user message produced
    by ``user_prompt_for(website)``.
    """
    system_message = dict(role="system", content=system_prompt)
    user_message = dict(role="user", content=user_prompt_for(website))
    return [system_message, user_message]

In [None]:
# A class to represent a Webpage
# If you're not familiar with Classes, check out the "Intermediate Python" notebook

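# Some websites build their content with JavaScript or reject obvious bots, so the page is
# loaded in headless Chrome via Selenium (with a browser-like user agent) before parsing.
# Import the modules needed for scraping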
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

class ScrapeWebsite:
    """
    A web page loaded in headless Chrome (Selenium) and parsed with BeautifulSoup.

    JavaScript-heavy and plain websites are handled the same way. Scraping
    failures do not raise: they are printed, ``title`` becomes
    "Error loading page" and ``text`` carries the error message.

    Attributes:
        url:   the URL given to the constructor.
        title: text of the page's <title>; "No title found" when the tag is
               missing, empty or whitespace-only.
        text:  text of <body> with script/style/img/input/noscript elements
               removed, fragments separated by newlines; "" when the page has
               no <body>.
    """

    # Elements removed from <body> before its text is extracted
    IRRELEVANT_TAGS = ["script", "style", "img", "input", "noscript"]
    # Browser-like user agent sent to avoid bot detection
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )
    PAGE_LOAD_TIMEOUT = 20   # max seconds to wait for <body> to be present
    JS_SETTLE_SECONDS = 3    # pause after load so page scripts can run
    SCROLL_DOWN_SECONDS = 2  # pause after scrolling to the bottom (lazy-loaded content)
    SCROLL_UP_SECONDS = 1    # pause after scrolling back to the top

    def __init__(self, url):
        """
        Create this Website object from the given URL using Selenium + BeautifulSoup
        Supports JavaScript-heavy and normal websites uniformly.

        Args:
            url: address of the page to load.

        Any exception raised while starting Chrome, loading or parsing the page
        is caught, printed, and recorded in ``title`` / ``text``; the browser is
        always shut down.
        """
        self.url = url
        driver = None

        try:
            options = self._build_options()

            # Use webdriver-manager to manage ChromeDriver
            service = Service(ChromeDriverManager().install())

            # Bind the driver here, before any further call that can fail, so the
            # `finally` clause is always able to quit the browser.
            driver = webdriver.Chrome(service=service, options=options)

            # Execute script to hide webdriver property
            driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
            })

            print(f"Loading {url}...")
            self._load(driver, url)

            # Fetch the page source after JS execution and parse it
            self.title, self.text = self._parse(driver.page_source)

            print(f"Successfully scraped: {self.title[:50]}...")

        except Exception as e:
            # Deliberate best-effort: report the failure through the attributes
            # instead of raising, so callers always get a usable object.
            print(f"Error scraping {url}: {e}")
            self.title = "Error loading page"
            self.text = f"Failed to load content: {str(e)}"
        finally:
            if driver:
                driver.quit()

    @classmethod
    def _build_options(cls):
        """Return the headless-Chrome options: fixed window size, browser-like user agent, automation flags off."""
        options = Options()
        for argument in (
            '--headless',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
            f'user-agent={cls.USER_AGENT}',
        ):
            options.add_argument(argument)
        # Disable automation flags
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        return options

    @classmethod
    def _load(cls, driver, url):
        """Navigate ``driver`` to ``url``, wait for <body>, then scroll down and back up."""
        driver.get(url)

        # Wait for body element to be present (indicates page has loaded)
        WebDriverWait(driver, cls.PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Additional wait for JavaScript to execute
        time.sleep(cls.JS_SETTLE_SECONDS)

        # Scroll down to trigger lazy-loaded content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(cls.SCROLL_DOWN_SECONDS)

        # Scroll back up
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(cls.SCROLL_UP_SECONDS)

    @classmethod
    def _parse(cls, page_source):
        """Parse rendered HTML and return ``(title, text)`` as plain strings."""
        soup = BeautifulSoup(page_source, 'html.parser')

        # `.string` is None for an empty <title> or one with several children;
        # slicing that None later made a successful scrape look like a failure.
        # get_text() always returns a str, so fall back explicitly when it is blank.
        title = soup.title.get_text() if soup.title else ""
        if not title.strip():
            title = "No title found"

        # Remove unnecessary elements
        if soup.body:
            for irrelevant in soup.body(cls.IRRELEVANT_TAGS):
                irrelevant.decompose()
            text = soup.body.get_text(separator="\n", strip=True)
        else:
            text = ""

        return title, text


In [None]:
def summarize_js_website(url, model="gpt-4o-mini"):
    """
    Scrape ``url`` with ScrapeWebsite and return the model's summary of it.

    Args:
        url: address of the page to scrape and summarize.
        model: chat model name passed to the OpenAI API. Defaults to
            "gpt-4o-mini", the value that used to be hard-coded here.

    Returns:
        The message content of the first choice in the chat completion response.
    """
    website = ScrapeWebsite(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website),
    )
    return response.choices[0].message.content

In [None]:
# Scrape https://openai.com and ask the model for a summary of it.
# This launches a browser and calls the OpenAI API, so the cell takes a while to run.
summary = summarize_js_website("https://openai.com")

In [None]:
# Render the returned summary as Markdown in the notebook output
display(Markdown(summary))