# Relevant News For Netherlands Expats

A solution for keeping up to date with news relevant to expats in the Netherlands. It scrapes the main news portals and produces a summary of the relevant information.

## List of Sources
https://www.dutchnews.nl/

## Setup

In [None]:
#imports

import os
import json
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from scraper import fetch_website_links, fetch_website_contents
from openai import OpenAI

In [None]:
# Environment, limits, and the shared OpenAI client

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Cheap sanity check only: the key must exist, carry the expected prefix,
# and be longer than a handful of characters.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Site that the pipeline runs against by default
news_source = "https://www.dutchnews.nl/"

# Upper bounds used as defaults by the selection/fetch functions
DEFAULT_MAX_CATEGORIES = 30
DEFAULT_MAX_ARTICLES_PER_CATEGORY = 100
DEFAULT_MAX_ARTICLES_TOTAL = 300

# Link selection runs on the small/cheap model, the final digest on the better one
MODEL_LINKS = 'gpt-5-nano'
MODEL_DIGEST = 'gpt-4.1-mini'
openai = OpenAI()

In [None]:
# Exploratory cell: look at what fetch_website_links gives back for the homepage.
links = fetch_website_links("https://www.dutchnews.nl")
links  # bare expression so the notebook shows the value

## Prompts

In [None]:
# System prompt for the category-selection call. It asks the model to keep only
# section/category pages from the homepage links and to answer as a JSON object
# with a top-level "categories" list of {"name", "url"} entries.
category_link_system_prompt = """
You are provided with a list of links found on the homepage of a Netherlands news website.
Your task is to pick links that are *news category/section* pages (e.g. politics, economy, housing, health, migration, transport, culture).
Exclude: privacy/terms, about, contact, subscribe, login, donate, ads, newsletters, social media, PDFs, mailto links.

Return JSON in this exact shape:

{
  "categories": [
    {"name": "politics", "url": "https://full.url/to/category"},
    {"name": "economy", "url": "https://full.url/to/category"}
  ]
}

Rules:
- URLs must be full absolute https URLs.
- Prefer a diverse set of categories (not 6 variations of the same thing).
- If the site is in Dutch, still return category names in English.
"""

# System prompt for the per-category article-selection call. The expected reply
# is a JSON object with a top-level "articles" list of {"title", "url"} entries.
# NOTE: this is a plain string (not an f-string) containing a literal
# `{max_articles}` token. Because the JSON example below also uses braces,
# the text cannot be passed through str.format() as-is.
article_link_system_prompt = """
You are provided with a list of links found on a *category page* of a Netherlands news website.
Pick links that are *individual news articles* from that category.

Return UP TO {max_articles} articles. If there are enough, return as close to {max_articles} as possible.
Prefer the most recent-looking ones. Always include housing market news.
Exclude: category index pages, tag pages, author pages, pagination, subscribe/login, privacy/terms, newsletters, social media.

Return JSON in this exact shape:

{
  "articles": [
    {"title": "short title", "url": "https://full.url/to/article"},
    {"title": "short title", "url": "https://full.url/to/article"}
  ]
}

Rules:
- URLs must be full absolute https URLs.
- Titles should be short; if the title isn't obvious from the URL, make a best guess.
"""

# System prompt for the final digest call: sets the tone, the required section
# structure, and the per-group "Sources" rule.
# NOTE(review): the wording contains a few typos ("ton of voice", "gourp").
# They are left untouched here because this text is sent to the model verbatim.
digest_system_prompt = """
You are an assistant creating a practical news digest for expats living in the Netherlands. Your ton of voice a bit sarcastic and straightforward and can remind Mark Manson style.
You will be given scraped text from several recent news articles.

Write in English, in markdown WITHOUT code blocks.

Requirements:
- Start with a short "What were important events this week" section (3-7 bullets).
- Then group items by theme 5-7 for each gourp (e.g. migration/IND, taxes/benefits, housing, transport, healthcare, safety, education, politics/economy).
- For each item: 1-2 sentence summary + "why it's important for expats" + (optional) "what you can do" if there is a practical action.
- At the end of each group include "Sources" as a bullet list of URLs used.
- If the scraped text is incomplete (paywall/JS), say so and avoid overconfident claims.
"""

## Step 1: category selection

In [None]:
def get_category_links_user_prompt(url):
    """Build the user prompt for category selection.

    Fetches the links found at ``url`` with ``fetch_website_links`` and
    appends them, one per line, below a short instruction header.
    """
    scraped_links = fetch_website_links(url)
    header = f"""
Here is the list of links on the website {url} -
Pick the category/section pages.

Links (some might be relative links):
"""
    return header + "\n".join(scraped_links)

In [None]:
# Preview the user prompt that is built from the homepage links.
print(get_category_links_user_prompt("https://www.dutchnews.nl"))

In [None]:
def selec_category_links(url, max_categories=DEFAULT_MAX_CATEGORIES):
    """Ask the link model which homepage links are news category pages.

    The (misspelled) function name is kept as-is because other cells call it.

    Args:
        url: Homepage URL whose links are offered to the model.
        max_categories: Upper bound on the number of categories returned.

    Returns:
        The JSON object decoded from the model reply, with ``"categories"``
        replaced by a list of at most ``max_categories`` dict entries, each
        carrying a non-empty ``"url"``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    print(f"Selecting relevant category links for {url} by calling {MODEL_LINKS}")
    response = openai.chat.completions.create(
        model=MODEL_LINKS,
        messages=[
            {"role": "system", "content": category_link_system_prompt},
            {"role": "user", "content": get_category_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )

    # The message content may be None; decode that as an empty object
    # instead of letting json.loads raise a TypeError.
    data = json.loads(response.choices[0].message.content or "{}")
    if not isinstance(data, dict):
        data = {}

    raw_categories = data.get("categories", [])
    if not isinstance(raw_categories, list):
        raw_categories = []

    # Callers use .get("url") / .get("name") on each entry, so keep only
    # dict entries that actually carry a URL.
    categories = [
        entry for entry in raw_categories
        if isinstance(entry, dict) and entry.get("url")
    ]

    print(f"Found {len(categories)} relevant category links")

    data["categories"] = categories[:max_categories]
    return data

In [None]:
# Smoke-test category selection against the homepage (this makes a live model call).
selec_category_links("https://www.dutchnews.nl")

## Step 2: article selection per category

In [None]:

def get_article_links_user_prompt(category_url, max_links=300):
    """Build the user prompt for article selection on one category page.

    Args:
        category_url: URL of the category page to scrape links from.
        max_links: Maximum number of distinct links to put in the prompt.

    Returns:
        The instruction header followed by the cleaned links, one per line.
    """
    scraped = fetch_website_links(category_url) or []

    # Normalise to stripped strings and drop entries that are empty either
    # before or after stripping.
    cleaned = (str(link).strip() for link in scraped if link)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_links = list(dict.fromkeys(link for link in cleaned if link))
    unique_links = unique_links[:max_links]

    # Written line by line so the trailing space after "links." stays visible
    # and is not lost to editors that trim trailing whitespace.
    header = (
        f"\nHere is the list of links found on the category page {category_url}\n"
        "\n"
        "Pick recent article links. \n"
        "Links (some might be relative links):\n"
    )
    return header + "\n".join(unique_links)

In [None]:
# Preview the article-selection prompt for the economy category page.
print(get_article_links_user_prompt("https://www.dutchnews.nl/category/economy/"))

In [None]:
def select_recent_articles(category_url, max_articles=DEFAULT_MAX_ARTICLES_PER_CATEGORY):
    """Ask the link model for individual article links on a category page.

    Args:
        category_url: URL of the category page whose links are offered.
        max_articles: Maximum number of articles requested and returned.

    Returns:
        A list of at most ``max_articles`` dicts with ``"title"`` and
        ``"url"`` keys; entries without a URL are dropped.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    # The prompt contains literal JSON braces, so str.format() would fail on
    # it; substitute only the one placeholder token.
    system_prompt = article_link_system_prompt.replace(
        "{max_articles}", str(max_articles)
    )

    response = openai.chat.completions.create(
        model=MODEL_LINKS,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_article_links_user_prompt(category_url)},
        ],
        response_format={"type": "json_object"},
    )

    # The message content may be None; decode that as an empty object.
    data = json.loads(response.choices[0].message.content or "{}")
    articles = data.get("articles", []) if isinstance(data, dict) else []
    if not isinstance(articles, list):
        articles = []

    picked = [
        {"title": article.get("title"), "url": article["url"]}
        for article in articles
        if isinstance(article, dict) and article.get("url")
    ]
    return picked[:max_articles]

In [None]:
# Smoke-test article selection on the economy category (this makes a live model call).
print(select_recent_articles("https://www.dutchnews.nl/category/economy/"))

## Step 3: fetch + aggregate text

In [None]:
def fetch_homepage_categories_and_articles(
    home_url,
    max_categories=DEFAULT_MAX_CATEGORIES,
    max_articles_per_category=DEFAULT_MAX_ARTICLES_PER_CATEGORY,
    max_articles_total=DEFAULT_MAX_ARTICLES_TOTAL,
):
    """Collect homepage text plus article text into one bounded blob.

    Args:
        home_url: Homepage of the news site.
        max_categories: Maximum number of categories to consider.
        max_articles_per_category: Maximum articles picked per category.
        max_articles_total: Maximum articles picked over all categories.

    Returns:
        A ``(text, urls)`` tuple: ``text`` is the aggregated markdown-ish
        blob cut to 38,000 characters, ``urls`` lists the URL of every
        selected article (including ones whose text did not fit).
    """
    max_chars = 38_000  # keep the model input reasonably bounded

    homepage = fetch_website_contents(home_url)
    categories_data = selec_category_links(home_url, max_categories=max_categories)
    categories = categories_data.get("categories", [])

    # Pick articles category by category until the overall cap is reached.
    all_articles = []
    for cat in categories:
        if not isinstance(cat, dict):
            continue
        cat_url = cat.get("url")
        if not cat_url:
            # Nothing to scrape for a category without a URL.
            continue
        cat_name = cat.get("name", "unknown")

        picked = select_recent_articles(cat_url, max_articles=max_articles_per_category)
        all_articles.extend({"category": cat_name, **a} for a in picked)
        if len(all_articles) >= max_articles_total:
            break
    all_articles = all_articles[:max_articles_total]

    # Fetch article contents. Everything past max_chars is cut off below, so
    # stop downloading as soon as the blob is already that long; the returned
    # text is the same prefix either way.
    parts = [f"## Homepage ({home_url})\n\n{homepage}\n\n## Articles\n"]
    total_chars = len(parts[0])
    for a in all_articles:
        if total_chars >= max_chars:
            break
        section = (
            f"\n\n###Category: {a['category']}\n"
            f"### Title (from link selection): {a['title']}\n"
            f"### URL: {a['url']}\n\n"
        ) + fetch_website_contents(a["url"])
        parts.append(section)
        total_chars += len(section)

    return "".join(parts)[:max_chars], [a["url"] for a in all_articles]

def get_digest_user_prompt(home_url, **kwargs):
    """Build the user prompt for the digest call.

    Keyword arguments are forwarded unchanged to
    ``fetch_homepage_categories_and_articles``. The scraped text and the
    article URLs it returns are embedded in the prompt.
    """
    scraped_text, article_urls = fetch_homepage_categories_and_articles(home_url, **kwargs)
    url_listing = "\n".join(article_urls)
    return f"""
You are creating a digest for expats based on these scraped pages.
The data below may contain navigation text; focus on actual news content.

Scraped data:
{scraped_text}

All article URLs (for reference):
{url_listing}

Remember to include a "Sources" at the end of each section.
"""



## Step 4: final digest

In [None]:
def create_expat_digest(home_url, **kwargs):
    """Generate the digest in one call and render it as Markdown.

    Keyword arguments are forwarded to ``get_digest_user_prompt``. Returns
    whatever ``display`` returns for the rendered Markdown.
    """
    user_prompt = get_digest_user_prompt(home_url, **kwargs)
    conversation = [
        {"role": "system", "content": digest_system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = openai.chat.completions.create(
        model=MODEL_DIGEST,
        messages=conversation,
    )
    digest_markdown = completion.choices[0].message.content
    return display(Markdown(digest_markdown))

def stream_expat_digest(home_url, **kwargs):
    """Generate the digest with streaming and render it live as Markdown.

    Keyword arguments are forwarded to ``get_digest_user_prompt``. The
    accumulated text is re-rendered in place each time new content arrives.
    Returns None.
    """
    prompt = get_digest_user_prompt(home_url, **kwargs)
    stream = openai.chat.completions.create(
        model=MODEL_DIGEST,
        messages=[
            {"role": "system", "content": digest_system_prompt},
            {"role": "user", "content": prompt},
        ],
        stream=True
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A chunk can arrive with no choices; indexing [0] would then raise.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if not delta:
            # Nothing new to show, so skip the redundant re-render.
            continue
        response += delta
        update_display(Markdown(response), display_id=display_handle.display_id)


    

## Try it out

In [None]:
# Run the whole pipeline for the configured source and stream the digest into the cell output.
stream_expat_digest(news_source)