In [1]:
import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display

In [2]:
OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}
MODEL = "llama3.2"

In [3]:
# A class to represent a Webpage
# Some websites need you to use proper headers when fetching them:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        """
        Create this Website object from the given url using the BeautifulSoup library
        """
        self.url = url
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = soup.body.get_text(separator="\n", strip=True)

In [4]:
# Define our system prompt
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [5]:
# A function that writes a User Prompt that asks for summaries of websites:
def user_prompt_for(website):
    user_prompt = f"You are looking at a website titled {website.title}.\n"
    user_prompt += (
        "The contents of this website are as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    user_prompt += website.text
    return user_prompt

In [6]:
# A function to format the messages for Ollama
def messages_for(website):
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_for(website)}
    ]


In [10]:
# Function to summarize a website using Ollama
def summarize(url):
    website = Website(url)
    payload = {
        "model": MODEL,
        "messages": messages_for(website),
        "stream": False
    }
    response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)

    # Debug: Print the raw response for verification
    try:
        response_data = response.json()
        print("Raw API Response:", response_data)  # Debugging line
    except Exception as e:
        print("Error parsing JSON:", e)
        print("Raw Response Text:", response.text)
        return "Failed to parse response."

    # Access the nested 'content' key
    try:
        summary = response_data["message"]["content"]
        # Clean the summary by removing mentions of navigation or ignored content
        lines = summary.split("\n")
        cleaned_summary = "\n".join(
            line for line in lines if "navigation menu" not in line.lower() and "ignored for this analysis" not in line.lower()
        )
        return cleaned_summary.strip()
    except KeyError:
        error_message = response_data.get("error", "Unknown error occurred.")
        return f"API Error: {error_message}"

In [11]:
# Function to display the summary in markdown
def display_summary(url):
    summary = summarize(url)
    display(Markdown(summary))

In [13]:
# Example usage
ed = Website("https://edition.cnn.com/")
display_summary("https://edition.cnn.com/")

Raw API Response: {'model': 'llama3.2', 'created_at': '2024-12-18T09:44:02.0839403Z', 'message': {'role': 'assistant', 'content': 'According to the article, a Freed prisoner who said he was a victim of the Assad regime was an intelligence officer, locals say.'}, 'done_reason': 'stop', 'done': True, 'total_duration': 119274829600, 'load_duration': 52876800, 'prompt_eval_count': 2048, 'prompt_eval_duration': 116156000000, 'eval_count': 27, 'eval_duration': 3051000000}


According to the article, a Freed prisoner who said he was a victim of the Assad regime was an intelligence officer, locals say.