In [9]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI


In [10]:
import pandas as pd 

In [11]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [12]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"

        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
 

In [15]:
def get_condition_links_from_topics_page():
    topics_url = "https://www.thuisarts.nl/overzicht/onderwerpen"
    response = requests.get(topics_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all <a> tags that look like condition pages
    links = soup.find_all("a", href=True)
    condition_links = []

    for link in links:
        href = link['href']
        if href.startswith("/"):
            href = "https://www.thuisarts.nl" + href
        if href.startswith("https://www.thuisarts.nl/") and len(href.split("/")) > 3:
            condition_links.append(href)

    # Remove duplicates and return
    return list(set(condition_links))


In [16]:
link_system_prompt = """You are an assistant that filters URLs for patient education content. 

Only return links that lead to pages about symptoms, health conditions, treatments, or diseases — for example: pages on 'headache', 'diarrhea', 'stomach pain', 'asthma', etc.

DO NOT return:
- contact pages
- overview/video/image/keuzekaart lists unless they directly link to medical complaints
- navigation or privacy/cookie/social media links

Respond only with full https links in JSON format, like this:
{
  "links": [
    {"type": "symptom or condition page", "url": "https://www.thuisarts.nl/hoofdpijn"},
    {"type": "symptom or condition page", "url": "https://www.thuisarts.nl/buikpijn"}
  ]
}
"""


In [17]:
print(link_system_prompt)

You are an assistant that filters URLs for patient education content. 

Only return links that lead to pages about symptoms, health conditions, treatments, or diseases — for example: pages on 'headache', 'diarrhea', 'stomach pain', 'asthma', etc.

DO NOT return:
- contact pages
- overview/video/image/keuzekaart lists unless they directly link to medical complaints
- navigation or privacy/cookie/social media links

Respond only with full https links in JSON format, like this:
{
  "links": [
    {"type": "symptom or condition page", "url": "https://www.thuisarts.nl/hoofdpijn"},
    {"type": "symptom or condition page", "url": "https://www.thuisarts.nl/buikpijn"}
  ]
}



In [18]:
condition_links = get_condition_links_from_topics_page()
print(f"✅ Found {len(condition_links)} condition pages.")

# Format for summary function
selected_links = [{"url": link} for link in condition_links]


✅ Found 680 condition pages.


In [19]:
import json

def load_existing_summaries(filepath="brochure_cache.json"):
    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}

def save_summaries_to_cache(summaries, filepath="brochure_cache.json"):
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2, ensure_ascii=False)


In [20]:
selected_links = [{"url": link} for link in get_condition_links_from_topics_page()][:10]


In [21]:
summary_cache = {}

def summarize_for_brochure(url):
    if url in summary_cache:
        summary = summary_cache[url]
        print(f"✅ [Cached] {url}")
        print(f"📄 Summary:\n{summary}\n")  # 👈 this prints the cached summary too
        return summary

    page = Website(url)

    example = """
Example:

Title: Keelpijn  
Summary: Sore throat is a common symptom, often caused by a virus. It usually goes away on its own within a few days. Drink warm fluids, rest your voice, and take paracetamol if needed. See a doctor if the pain lasts more than a week or gets worse.

Title: Hoofdpijn  
Summary: Headaches can have many causes like stress, fatigue, or dehydration. Most are harmless and go away with rest and fluids. Painkillers like paracetamol can help. If headaches are severe, frequent, or different than usual, contact your GP.
"""

    prompt = f"""
You are a health writer. Based on the Dutch content below, write a clear, short, brochure-style summary in **English** for patients.

Use the format:  
Title: {page.title}  
Summary: <your summary>

Keep it under 100 words, easy to read, friendly, and medically accurate.

{example}

Now use this for:
Title: {page.title}
Content:
{page.text[:3000]}
"""

    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4
    )

    summary = response.choices[0].message.content.strip()
    summary_cache[url] = summary
    return summary


In [22]:
def build_symptom_brochure(links, cache_file="brochure_cache.json"):
    brochure = []
    cached = load_existing_summaries(cache_file)
    print("📄 Building summaries for brochure:\n")

    for i, item in enumerate(links, 1):
        url = item["url"]
        if url in cached:
            print(f"✅ [Cached] {url}")
            brochure.append({"url": url, "summary": cached[url]})
            continue
        
        print(f"🔄 [{i}/{len(links)}] Summarizing: {url}")
        try:
            summary = summarize_for_brochure(url)
            print(f"✅ Summary:\n{summary}\n")
            brochure.append({"url": url, "summary": summary})
            cached[url] = summary  # Save new summary
            save_summaries_to_cache(cached, cache_file)
        except Exception as e:
            print(f"❌ Error summarizing {url}: {e}\n")
            brochure.append({"url": url, "summary": "Error generating summary."})

    return brochure


In [24]:
brochure = build_symptom_brochure(selected_links)

📄 Building summaries for brochure:

🔄 [1/10] Summarizing: https://www.thuisarts.nl/sociale-angststoornis
✅ [New] https://www.thuisarts.nl/sociale-angststoornis
📄 Summary:
Title: Social Anxiety Disorder
Summary: Social anxiety disorder, or social phobia, is a fear of what others think of you, often leading to panic attacks. Writing down what happens, your thoughts, and feelings can help manage this fear. Positive thinking can also be beneficial when you're feeling anxious. Discussing your concerns with your GP or practice nurse can be helpful. If there's no improvement or symptoms are severe, treatments such as therapy with a psychologist or anxiety medication may be considered.

✅ Summary:
Title: Social Anxiety Disorder
Summary: Social anxiety disorder, or social phobia, is a fear of what others think of you, often leading to panic attacks. Writing down what happens, your thoughts, and feelings can help manage this fear. Positive thinking can also be beneficial when you're feeling anxi

In [23]:
def export_brochure_to_txt(brochure, filepath="brochure_summaries.txt"):
    if not brochure:
        print("⚠️ No summaries to export.")
        return

    with open(filepath, "w", encoding="utf-8") as f:
        for item in brochure:
            url = item.get("url", "Unknown URL")
            summary = item.get("summary", "No summary available.")
            f.write(f"URL: {url}\n")
            f.write(f"{summary}\n\n")

    print(f"📁 Exported {len(brochure)} summaries to {filepath}")


In [26]:
export_brochure_to_txt(brochure)


📁 Exported 10 summaries to brochure_summaries.txt


In [None]:
###---it works---