In [None]:
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

In [None]:
# Load environment variables from a file called .env
# (override=True lets values in .env replace variables already set in the process)

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs BEFORE the prefix test: a key with leading spaces or
# tabs does not start with "sk-proj-", so checking the prefix first would report
# the wrong problem and hide the real one.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


In [None]:
# Model name passed to the chat-completion calls made later in this notebook.
MODEL = "gpt-4o-mini"
# Shared API client used by the cells below.
# NOTE(review): constructed with no arguments, so it presumably picks up
# OPENAI_API_KEY from the environment loaded earlier — confirm.
openai = OpenAI()

In [None]:
# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.

# A single user-role message is all a chat completion request needs.
message = "Hello model! This is my first ever message to you! Hi!"
response = openai.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "user", "content": message},
    ],
)
# The reply text is on the first returned choice.
print(response.choices[0].message.content)

In [None]:
def get_webpage_content(url, wait_time=5):
    """
    Load a page in headless Chrome and return its visible text.

    The page is opened with Selenium, given `wait_time` seconds to finish
    rendering, and the resulting HTML is reduced to whitespace-normalised text
    with BeautifulSoup.

    Args:
        url (str): Address of the page to load.
        wait_time (int): Seconds to sleep after navigation before reading the page.

    Returns:
        dict: Keys 'title', 'content' and 'url'. If anything inside the scrape
        raises, 'title' is "Error" and 'content' carries the failure message
        instead of page text.
    """
    # Command-line switches handed to Chrome, applied in this order.
    browser_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )
    # Tags whose text is dropped before extraction.
    unwanted_tags = ["script", "style", "nav", "footer", "header", "aside", "noscript"]

    chrome_options = Options()
    for flag in browser_flags:
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)

        # Fixed pause so client-side rendering has a chance to complete.
        time.sleep(wait_time)

        page_title = driver.title

        # page_source is read after the pause, so it reflects the rendered DOM.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        for tag in soup(unwanted_tags):
            tag.decompose()

        # Trim every line, break it on double spaces, and keep the non-empty
        # pieces so the result is a single space-separated string.
        fragments = []
        for raw_line in soup.get_text().splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        page_text = " ".join(fragments)

        return {"title": page_title, "content": page_text, "url": url}

    except Exception as exc:
        # Failures are reported in-band using the same dict shape as a success.
        return {
            "title": "Error",
            "content": f"Failed to scrape content: {str(exc)}",
            "url": url,
        }

    finally:
        # Always shut the browser down, whether the scrape succeeded or not.
        driver.quit()

In [None]:
# Define our system prompt - you can experiment with this later, changing the last sentence to "Respond in markdown in Spanish."

# Adjacent string literals are concatenated into one prompt; note the trailing
# space inside each piece that separates it from the next.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [None]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt(content):
    """
    Build the user-role prompt asking for a summary of a scraped page.

    Args:
        content (dict): Must provide 'title' and 'content' entries.

    Returns:
        str: A title line, the summary instructions, a blank line, then the
        page text.
    """
    heading = f"You are looking at a website titled {content['title']}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return heading + instructions + content['content']

In [None]:
# See how this function creates exactly the format above

def messages_for(content):
    """
    Build the chat messages for summarizing one scraped page.

    Args:
        content (dict): Scraped page with 'title' and 'content' entries.

    Returns:
        list[dict]: The system prompt followed by a user message produced by
        user_prompt(content).
    """
    return [
        {"role": "system", "content": system_prompt},
        # Route the page through user_prompt() so the title and the summary
        # instructions reach the model, not just the bare page text.
        {"role": "user", "content": user_prompt(content)}
    ]

In [None]:
# And now: call the OpenAI API. You will get very familiar with this!

def summarize(url):
    """
    Scrape `url` and return the model's summary text.

    Args:
        url (str): Address of the page to summarize.

    Returns:
        The content of the first choice in the chat-completion response.
    """
    page = get_webpage_content(url)
    completion = openai.chat.completions.create(model=MODEL, messages=messages_for(page))
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Bare last expression: the notebook shows the returned summary as a raw string.
summarize("https://edwarddonner.com")

In [None]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url):
    """
    Summarize `url` and render the result as Markdown in the cell output.

    Args:
        url (str): Address of the page to summarize.
    """
    display(Markdown(summarize(url)))

In [None]:
# Same page as the raw summarize() call earlier, now rendered as Markdown.
display_summary("https://edwarddonner.com")

In [None]:
# Summarize and render the CNN home page.
display_summary("https://cnn.com")

In [None]:
# Summarize and render the Anthropic home page.
display_summary("https://anthropic.com")

In [None]:
# Summarize and render a GitHub repository page (URL selects the README tab).
display_summary("https://github.com/langchain-ai/open-canvas?tab=readme-ov-file")