# How-To

Press `Shift` + `Return` to run a Cell.


In [None]:
# imports

import os, requests, time
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# Read settings from the local .env file (override=True is passed so the
# file's values win over variables already set in the environment)
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Report whether a key was picked up; this only checks that it is non-empty
print("API key found and looks good so far!" if api_key else "No API key was found")

# Client object used for the OpenAI calls in this notebook
openai = OpenAI()

# Make a test call to a Frontier model (OpenAI) to get started:

In [None]:
# Smoke test: send a single user message and print the model's reply
message = "Hello, GPT! Holla back to this space probe!"
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": message}],
)
print(response.choices[0].message.content)

## Summarization project

In [None]:
# Some websites need proper headers when fetching them:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


class Website:
    """
    A class to represent a Webpage

    Attributes:
        url: the address the page was requested from
        title: text of the page's <title>, or "No title found"
        text: text of the page's <body> with script/style/img/input
            elements removed, or "" when the page has no <body>
    """

    def __init__(self, url, timeout=30):
        """
        Create this Website object from the given url using the BeautifulSoup library

        Args:
            url: address of the page to download
            timeout: seconds to wait for the server before requests gives up

        Raises:
            requests.exceptions.RequestException: if the request fails or
                times out. HTTP error statuses are not raised; whatever
                body the server returned is parsed.
        """
        self.url = url
        # Without an explicit timeout an unresponsive server would block
        # this call indefinitely
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # .string is None when <title> is empty or contains nested tags
        title = soup.title.string if soup.title else None
        self.title = title if title is not None else "No title found"

        body = soup.body
        if body is None:
            # Nothing to extract (e.g. an empty or non-HTML response)
            self.text = ""
            return
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)

In [None]:
# Fetch and parse the page that the cells below summarize
website = Website("https://rwothoromo.wordpress.com/")
# print(website.title, "\n", website.text)

In [None]:
# The system prompt sets the task and tone for a model like GPT-4o;
# the user prompt is the conversation starter the model replies to.

system_prompt = (
    "You are an assistant that analyzes the contents of a given website, "
    "and returns a brief summary, ignoring text that might be navigation-related. "
    "Respond in markdown."
)

In [None]:
# Build the User Prompt that asks the model to summarize a website:

def user_prompt_for(website):
    """
    Return the user prompt for `website`: a line naming its title, the
    summary request, a blank line, and then the website's text.
    """
    intro = f"You are looking at a website titled {website.title}"
    request = (
        "The contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too."
    )
    return intro + "\n" + request + "\n\n" + website.text

In [None]:
# Inspect the exact user prompt that will be sent for this website
print(user_prompt_for(website))

In [None]:
# OpenAI's chat API takes a list of messages, each a dict with a "role" and
# a "content" entry. Many of the other APIs share this structure:
messages = [
    dict(role="system", content="You are a snarky assistant"),  # system message
    dict(role="user", content="What is 2 + 2?"),  # user message
]
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)
print(response.choices[0].message.content)

In [None]:
# Assemble the chat messages for GPT-4o-mini

def messages_for(website):
    """Return the [system, user] message list asking for a summary of `website`."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

messages_for(website)

In [None]:
# Call the OpenAI API.

url = "https://rwothoromo.wordpress.com/"
website = Website(url)

def summarize(website, model="gpt-4o-mini"):
    """
    Ask an OpenAI chat model to summarize a website.

    Args:
        website: object exposing `title` and `text` (e.g. a Website instance);
            it is turned into chat messages by messages_for()
        model: name of the chat model to call; defaults to "gpt-4o-mini"

    Returns:
        The content of the first choice in the model's response.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website),
    )
    return response.choices[0].message.content

In [None]:
# Evaluate the summary for the fetched website; as the cell's last expression
# the returned string is echoed unrendered
summarize(website)

In [None]:
# Render a summary nicely in the Jupyter output, using markdown

def display_summary(summary):
    """Display the markdown string `summary` as rendered output."""
    rendered = Markdown(summary)
    display(rendered)

summary = summarize(website)

In [None]:
# Render the summary held in `summary`; the commented-out lines are other
# sites to try by uncommenting them
display_summary(summary)
# display_summary(summarize(Website("https://edwarddonner.com")))
# display_summary(summarize(Website("https://cnn.com")))
# display_summary(summarize(Website("https://anthropic.com")))

In [None]:
# Websites protected by CloudFront (or similar), or rendered with JavaScript, cannot be fetched with plain requests (they return 403); they need a Selenium or Playwright implementation.

# display_summary(summarize(Website("https://openai.com")))

In [None]:
# To generate the above summary, use selenium

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class WebsiteSelenium:
    """
    A Webpage fetched through headless Chrome driven by Selenium.

    Attributes:
        url: the address that was requested
        title: text of the page's <title>, or "No title found"
        text: text of the page's <body> with script/style/img/input
            elements removed, or "" if nothing could be extracted

    Any error while driving the browser is printed rather than raised, and
    `title` / `text` keep the defaults listed above.
    """

    def __init__(self, url,
                 driver_path='./drivers/chromedriver-mac-x64/chromedriver',
                 page_timeout=10, render_wait=3):
        """
        Load `url` in headless Chrome and extract its title and body text.

        Args:
            url: address of the page to load
            driver_path: path to the chromedriver executable; the default is
                the location this notebook originally hard-coded. You might
                need to download it from
                https://chromedriver.chromium.org/downloads
            page_timeout: seconds to wait for a <body> element to appear
            render_wait: extra seconds to sleep so JavaScript can execute
        """
        self.url = url
        self.title = "No title found"
        self.text = ""

        # Configure Chrome options (headless mode is recommended for server environments)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")  # Run Chrome in headless mode (without a UI)
        chrome_options.add_argument("--no-sandbox")  # Required for running as root in some environments
        chrome_options.add_argument("--disable-dev-shm-usage")  # Overcomes limited resource problems

        service = Service(driver_path)

        driver = None
        try:
            driver = webdriver.Chrome(service=service, options=chrome_options)
            driver.get(url)

            # Wait for the page to load and dynamic content to render
            # You might need to adjust the wait condition based on the website
            WebDriverWait(driver, page_timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            time.sleep(render_wait)  # Give more time for JavaScript to execute

            # Parse the page source after dynamic content has loaded
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # .string is None for an empty <title> or one with nested tags;
            # keep the "No title found" default in that case
            title = soup.title.string if soup.title else None
            if title is not None:
                self.title = title

            # A page without <body> leaves text empty instead of being
            # reported as a Selenium access error
            body = soup.body
            if body is not None:
                for irrelevant in body(["script", "style", "img", "input"]):
                    irrelevant.decompose()
                self.text = body.get_text(separator="\n", strip=True)

        except Exception as e:
            # Deliberate best-effort: report the failure and keep the defaults
            print(f"Error accessing {url} with Selenium: {e}")
        finally:
            if driver is not None:
                driver.quit()  # Always close the browser

# Load openai.com through headless Chrome, summarize it with the model and
# render the result as markdown
display_summary(summarize(WebsiteSelenium("https://openai.com")))

In [None]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

# Apply nest_asyncio to allow asyncio.run in Jupyter; WebsitePlaywright's
# constructor below calls asyncio.run
nest_asyncio.apply()

class WebsitePlaywright:
    """
    A Webpage fetched through headless Chromium driven by Playwright.

    Attributes:
        url: the address that was requested
        title: text of the page's <title>, or "No title found"
        text: text of the page's <body> with script/style/img/input
            elements removed, or "" if nothing could be extracted

    Any error while driving the browser is printed rather than raised, and
    `title` / `text` keep the defaults listed above.
    """

    def __init__(self, url, wait_selector='div.duration-short',
                 timeout_ms=60000, settle_seconds=5):
        """
        Load `url` with Playwright and extract its title and body text.

        Args:
            url: address of the page to load
            wait_selector: CSS selector that must appear before the page is
                read. The default is the selector this notebook originally
                hard-coded (used for openai.com); pass None for pages that
                do not contain it, otherwise the fetch fails after
                `timeout_ms`.
            timeout_ms: timeout in milliseconds for navigation and each wait
            settle_seconds: extra seconds to sleep for final rendering
        """
        self.url = url
        self.title = "No title found"
        self.text = ""
        self._wait_selector = wait_selector
        self._timeout_ms = timeout_ms
        self._settle_seconds = settle_seconds
        # The fetch runs to completion inside the constructor
        asyncio.run(self._fetch_content())

    async def _fetch_content(self):
        """Drive the browser, then fill in `title` and `text` from the rendered page."""
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                await page.goto(self.url, timeout=self._timeout_ms)
                print(f"Accessing {self.url} with Playwright - goto()")

                # You might need to adjust or add more specific waits
                await page.wait_for_load_state('domcontentloaded', timeout=self._timeout_ms)  # Wait for basic HTML
                if self._wait_selector:
                    # Site-specific readiness marker, used instead of waiting for 'networkidle'
                    await page.wait_for_selector(self._wait_selector, timeout=self._timeout_ms)
                await page.wait_for_selector('body', timeout=self._timeout_ms)  # Wait for the body to be present
                await asyncio.sleep(self._settle_seconds)  # Give a bit more time for final rendering

                content = await page.content()
                soup = BeautifulSoup(content, 'html.parser')

                # .string is None for an empty <title> or one with nested tags;
                # keep the "No title found" default in that case
                title = soup.title.string if soup.title else None
                if title is not None:
                    self.title = title

                body = soup.body
                if body is not None:
                    for irrelevant in body(["script", "style", "img", "input"]):
                        irrelevant.decompose()
                    self.text = body.get_text(separator="\n", strip=True)
                print(f"Accessed {self.url} with Playwright")

            except Exception as e:
                # Deliberate best-effort: report the failure and keep the defaults
                print(f"Error accessing {self.url} with Playwright: {e}")
            finally:
                if browser is not None:
                    await browser.close()

# Load openai.com through headless Chromium (Playwright), summarize it with
# the model and render the result as markdown
display_summary(summarize(WebsitePlaywright("https://openai.com/")))

In [None]:
# Step 1: Create your prompts

# Named review_system_prompt so that this cell does not rebind the global
# system_prompt that messages_for() reads when summarizing websites
review_system_prompt = "You are a professional assistant. Review this conversation and provide a comprehensive summary. Also, suggest how much better the converation could have gone:"
user_prompt = """

Dear Email Contact,

I hope this message finds you well.
I would like to share that I have proficiency in front-end design tools, particularly Figma, react and Angular. At this stage, I am keenly interested in finding opportunities to apply these skills professionally.

If you are aware of any companies, projects, or platforms seeking enterprise in front-end design, I would be grateful for any advice or recommendations you might kindly provide.

Thank you very much for your time and consideration.

Hello Job Seeker,

I hope you are doing well.

The last role (3 months gig) I saw was looking for a junior PHP Developer. Does your CV include that?

Hello Email Contact,
Thank you for your feedback.
Yes my CV has PHP as one of my skill set. Can I share it with you?

Email Contact: They said "It's late. Interviews were on Monday"

Hello Email Contact

Thanks for the update. When you hear of any opportunity please let me know.

Email Contact: For now, check out https://refactory.academy/courses/refactory-apprenticeship/
"""

# Step 2: Make the messages list

messages = [
    {"role": "system", "content": review_system_prompt},
    {"role": "user", "content": user_prompt},
]

# Step 3: Send the messages list to OpenAI

response = openai.chat.completions.create(model="gpt-4o-mini", messages=messages)

# Step 4: Show the text of the first choice

print(response.choices[0].message.content)

In [None]:
# To perform summaries using a model running locally
import ollama

# Unused: presumably the endpoint and headers for calling Ollama over raw
# HTTP instead of through the ollama package (confirm before re-enabling)
# OLLAMA_API = "http://localhost:11434/api/chat"
# HEADERS = {"Content-Type": "application/json"}
# Model name passed to ollama.chat by summarize_with_local_model
MODEL = "llama3.2"

def summarize_with_local_model(url):
    """
    Fetch `url` with Website and return the summary text produced by the
    local Ollama model named in MODEL.
    """
    chat_messages = messages_for(Website(url))
    # stream=False: just get the results, don't stream them
    reply = ollama.chat(model=MODEL, messages=chat_messages, stream=False)
    return reply['message']['content']

# Summarize the blog with the local model and render the result as markdown
display(Markdown(summarize_with_local_model("https://rwothoromo.wordpress.com/")))