In [1]:
"""
nl_expat_news_digest.py

A Day5-style mini app:
1) scrape homepage links
2) use an LLM to pick *category* pages (politics/economy/etc.)
3) scrape each category page and use an LLM to pick *recent article* links
4) fetch each article page text
5) ask an LLM to produce a digest focused on what's useful for expats in NL

Designed to mirror the techniques in day5.ipynb:
- "get_*_user_prompt" helpers
- "select_*_links" functions using response_format={"type":"json_object"}
- "fetch_page_and_all_relevant_links"-style aggregation
- optional streaming output
"""

import os
import re
import json
from urllib.parse import urljoin, urlparse

from dotenv import load_dotenv
from openai import OpenAI

from scraper import fetch_website_links, fetch_website_contents


# -----------------------------
# Setup
# -----------------------------

load_dotenv(override=True)

MODEL_LINKS = os.getenv("MODEL_LINKS", "gpt-5-nano")        # small/cheap model for link selection
MODEL_DIGEST = os.getenv("MODEL_DIGEST", "gpt-4.1-mini")    # better model for final digest
openai = OpenAI()

DEFAULT_MAX_CATEGORIES = 9
DEFAULT_MAX_ARTICLES_PER_CATEGORY = 9
DEFAULT_MAX_ARTICLES_TOTAL = 100


In [2]:
# -----------------------------
# Prompts (Day5-style)
# -----------------------------

category_link_system_prompt = """
You are provided with a list of links found on the homepage of a Netherlands news website.
Your task is to pick links that are *news category/section* pages (e.g. politics, economy, housing, health, migration, transport, culture).
Exclude: privacy/terms, about, contact, subscribe, login, donate, ads, newsletters, social media, PDFs, mailto links.

Return JSON in this exact shape:

{
  "categories": [
    {"name": "politics", "url": "https://full.url/to/category"},
    {"name": "economy", "url": "https://full.url/to/category"}
  ]
}

Rules:
- URLs must be full absolute https URLs.
- Prefer a diverse set of categories (not 6 variations of the same thing).
- If the site is in Dutch, still return category names in English.
"""

article_link_system_prompt = """
You are provided with a list of links found on a *category page* of a Netherlands news website.
Pick links that are *individual news articles* from that category, prioritizing the most recent items.

Exclude: category index pages, tag pages, author pages, pagination, subscribe/login, privacy/terms, newsletters, social media.

Return JSON in this exact shape:

{
  "articles": [
    {"title": "short title", "url": "https://full.url/to/article"},
    {"title": "short title", "url": "https://full.url/to/article"}
  ]
}

Rules:
- Choose only a small number of articles (the caller will decide the limit).
- URLs must be full absolute https URLs.
- Titles should be short; if the title isn't obvious from the URL, make a best guess.
"""

digest_system_prompt = """
You are an assistant creating a practical news digest for expats living in the Netherlands.
You will be given scraped text from several recent news articles.

Write in English, in markdown WITHOUT code blocks.

Requirements:
- Start with a short "что важно сегодня/на этой неделе" section (3-7 bullets).
- Then group items by theme (e.g. migration/IND, taxes/benefits, housing, transport, healthcare, safety, education, politics/economy).
- For each item: 1-2 sentence summary + "why it's important for expats" + (optional) "what you can do" if there is a practical action.
- At the end include "Sources" as a bullet list of URLs used.
- If the scraped text is incomplete (paywall/JS), say so and avoid overconfident claims.
"""


In [None]:
# -----------------------------
# Helpers
# -----------------------------

from typing import Any


def _dedupe_keep_order(items):
    seen = set[Any]()
    out = []
    for x in items:
        if x not in seen:
            out.append(x)
            seen.add(x)
    return out


def _normalize_links(base_url, links, same_domain_only=True):
    """
    Convert hrefs to absolute URLs, filter junk, optionally keep only same-domain.
    """
    abs_links = []
    base_domain = urlparse(base_url).netloc.lower()

    for href in links:
        if not href:
            continue
        href = href.strip()
        if href.startswith("#"):
            continue
        if href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        full = urljoin(base_url, href)

        # Filter out non-http(s)
        if not full.startswith("http://") and not full.startswith("https://"):
            continue

        if same_domain_only:
            if urlparse(full).netloc.lower() != base_domain:
                continue

        abs_links.append(full)

    return _dedupe_keep_order(abs_links)


def _safe_json_loads(text):
    """
    response_format={"type":"json_object"} should give valid JSON, but guard anyway.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        m = re.search(r"\{.*\}", text, flags=re.DOTALL)
        if not m:
            raise
        return json.loads(m.group(0))


In [4]:
# -----------------------------
# Step 1: category selection
# -----------------------------

def get_category_links_user_prompt(home_url, max_links=250):
    links = fetch_website_links(home_url)
    links = _normalize_links(home_url, links, same_domain_only=True)[:max_links]

    user_prompt = f"""
Here is the list of links on the homepage: {home_url}

Pick the category/section pages.

Links:
"""
    user_prompt += "\n".join(links)
    return user_prompt


def select_category_links(home_url, max_categories=DEFAULT_MAX_CATEGORIES):
    response = openai.chat.completions.create(
        model=MODEL_LINKS,
        messages=[
            {"role": "system", "content": category_link_system_prompt},
            {"role": "user", "content": get_category_links_user_prompt(home_url)},
        ],
        response_format={"type": "json_object"},
    )
    data = _safe_json_loads(response.choices[0].message.content)

    categories = data.get("categories", [])
    # Basic cleanup + limit
    cleaned = []
    for c in categories:
        url = c.get("url")
        name = (c.get("name") or "").strip() or "category"
        if not url:
            continue
        cleaned.append({"name": name, "url": url})
    return cleaned[:max_categories]

In [5]:
print(select_category_links("https://www.dutchnews.nl"),DEFAULT_MAX_CATEGORIES)

[{'name': 'politics', 'url': 'https://www.dutchnews.nl/category/politics/'}, {'name': 'economy', 'url': 'https://www.dutchnews.nl/category/economy/'}, {'name': 'housing', 'url': 'https://www.dutchnews.nl/category/housing/'}, {'name': 'health', 'url': 'https://www.dutchnews.nl/category/health/'}, {'name': 'education', 'url': 'https://www.dutchnews.nl/category/education/'}, {'name': 'art-and-culture', 'url': 'https://www.dutchnews.nl/category/art-and-culture/'}, {'name': 'expats', 'url': 'https://www.dutchnews.nl/category/expats/'}, {'name': 'amsterdam', 'url': 'https://www.dutchnews.nl/category/amsterdam/'}, {'name': 'books', 'url': 'https://www.dutchnews.nl/category/books/'}] 9


In [7]:
# -----------------------------
# Step 2: article selection per category
# -----------------------------

def get_article_links_user_prompt(category_url, max_links=300):
    links = fetch_website_links(category_url)
    links = _normalize_links(category_url, links, same_domain_only=True)[:max_links]

    user_prompt = f"""
Here is the list of links found on the category page: {category_url}

Pick recent article links.

Links:
"""
    user_prompt += "\n".join(links)
    return user_prompt


def select_recent_articles(category_url, max_articles=DEFAULT_MAX_ARTICLES_PER_CATEGORY):
    response = openai.chat.completions.create(
        model=MODEL_LINKS,
        messages=[
            {"role": "system", "content": article_link_system_prompt},
            {"role": "user", "content": get_article_links_user_prompt(category_url)},
        ],
        response_format={"type": "json_object"},
    )
    data = _safe_json_loads(response.choices[0].message.content)

    articles = data.get("articles", [])
    cleaned = []
    for a in articles:
        url = a.get("url")
        title = (a.get("title") or "").strip() or "article"
        if not url:
            continue
        cleaned.append({"title": title, "url": url})
    return cleaned[:max_articles]


In [8]:
# -----------------------------
# Step 3: fetch + aggregate text (Day5-style)
# -----------------------------

def fetch_homepage_categories_and_articles(
    home_url,
    max_categories=DEFAULT_MAX_CATEGORIES,
    max_articles_per_category=DEFAULT_MAX_ARTICLES_PER_CATEGORY,
    max_articles_total=DEFAULT_MAX_ARTICLES_TOTAL,
):
    """
    Similar to day5.ipynb's `fetch_page_and_all_relevant_links`,
    but for a news digest: homepage -> categories -> recent articles.
    """
    homepage = fetch_website_contents(home_url)

    categories = select_category_links(home_url, max_categories=max_categories)

    # Pick articles
    all_articles = []
    for cat in categories:
        cat_url = cat["url"]
        picked = select_recent_articles(cat_url, max_articles=max_articles_per_category)
        for a in picked:
            all_articles.append({"category": cat["name"], **a})
        if len(all_articles) >= max_articles_total:
            break
    all_articles = all_articles[:max_articles_total]

    # Fetch article contents
    result = f"## Homepage ({home_url})\n\n{homepage}\n\n## Articles\n"
    for a in all_articles:
        result += f"\n\n### Category: {a['category']}\n"
        result += f"### Title (from link selection): {a['title']}\n"
        result += f"### URL: {a['url']}\n\n"
        result += fetch_website_contents(a["url"])

    # Keep input reasonably bounded
    return result[:18_000], [a["url"] for a in all_articles]


def get_digest_user_prompt(home_url, **kwargs):
    blob, urls = fetch_homepage_categories_and_articles(home_url, **kwargs)
    user_prompt = f"""
You are creating a digest for expats based on these scraped pages.
The data below may contain navigation text; focus on actual news content.

Scraped data:
{blob}

Remember to include an "Источники" section at the end.
"""
    # Keep it bounded (Day5 truncates too)
    return user_prompt[:22_000], urls

In [10]:
# -----------------------------
# Step 4: final digest (like create_brochure/stream_brochure)
# -----------------------------

def create_expat_digest(home_url, **kwargs):
    prompt, _urls = get_digest_user_prompt(home_url, **kwargs)
    response = openai.chat.completions.create(
        model=MODEL_DIGEST,
        messages=[
            {"role": "system", "content": digest_system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content


def stream_expat_digest(home_url, **kwargs):
    """
    Notebook-friendly streaming version (similar to day5.ipynb).
    Prints to stdout if IPython display isn't available.
    """
    prompt, _urls = get_digest_user_prompt(home_url, **kwargs)

    stream = openai.chat.completions.create(
        model=MODEL_DIGEST,
        messages=[
            {"role": "system", "content": digest_system_prompt},
            {"role": "user", "content": prompt},
        ],
        stream=True,
    )

    try:
        from IPython.display import Markdown, display, update_display
        response_text = ""
        display_handle = display(Markdown(""), display_id=True)
        for chunk in stream:
            response_text += chunk.choices[0].delta.content or ""
            update_display(Markdown(response_text), display_id=display_handle.display_id)
        return response_text
    except Exception:
        # Fallback to console streaming
        response_text = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content or ""
            response_text += delta
            print(delta, end="", flush=True)
        print()
        return response_text


"""if __name__ == "__main__":
    # Example:
    # python nl_expat_news_digest.py "https://www.dutchnews.nl/"
    import sys
    if len(sys.argv) < 2:
        print("Usage: python nl_expat_news_digest.py <news_home_url>")
        raise SystemExit(2)

    url = sys.argv[1]
    print(create_expat_digest(url))
"""

'if __name__ == "__main__":\n    # Example:\n    # python nl_expat_news_digest.py "https://www.dutchnews.nl/"\n    import sys\n    if len(sys.argv) < 2:\n        print("Usage: python nl_expat_news_digest.py <news_home_url>")\n        raise SystemExit(2)\n\n    url = sys.argv[1]\n    print(create_expat_digest(url))\n'

In [11]:
stream_expat_digest("https://www.dutchnews.nl/")

# What’s Important Today / This Week

- The new Dutch government will be a rare minority cabinet formed by D66, CDA, and VVD.
- Young first-time homebuyers are benefiting from landlords selling more properties, despite rising prices.
- Inflation slightly eases to 2.8% but remains above the eurozone average.
- Financial struggles persist in the café and restaurant sector despite increased spending.
- Northern Netherlands faces road closures due to heavy snow and very cold weather.
- Explosive attacks on property remain at a high and concerning level.
- Flight cancellations continue at Schiphol Airport due to severe winter conditions and operational challenges.

---

# Politics and Government

**D66, CDA, and VVD Agree to Form Minority Cabinet**  
The three parties will form a minority government holding 66 out of 76 seats in the lower house, relying on support from other parties per issue. This new coalition marks a shift in Dutch politics, emphasizing cooperation despite political fragmentation.  
*Why important:* This minority setup may result in more negotiations and slower policymaking, impacting legislative developments relevant to expats, such as immigration, housing, and social policies.

**Road Closures and Severe Weather in the North**  
Due to heavy snow and drifting, several roads including N33 and N46 in Groningen are closed, with KNMI issuing a code amber warning. Extreme cold and more snow are anticipated over the weekend.  
*Why important:* If you’re in or planning to travel through northern provinces, expect disruptions and take extra caution; plan travels accordingly.

**Little Change in Explosive Property Attacks**  
Police reported 1,525 incidents of explosive attacks on private and commercial properties in 2025, many involving illegal fireworks and arson tactics. The attackers are often vulnerable youths recruited by brokers.  
*Why important:* Safety remains a concern in some neighborhoods; expats should stay informed about local community safety initiatives and take basic precautions where advised.

**Podcast Insight: Cancelled Transport and New Government Urgency**  
The ongoing winter disruption has severely affected flights, trains, and roads, with Schiphol airport struggling with snow and lack of de-icing fluid. Political talks have gained urgency amid these events.  
*Why important:* Transport disruptions can affect travel plans; staying updated can help mitigate inconvenience.

**Bad News for Elfstedentocht Ice Skating Event**  
Heavy snow is unfortunately detrimental to good ice conditions for this iconic event, reducing chances for a successful race.  
*Why important:* Expats interested in Dutch cultural events may want to track updates if planning to attend or follow the Elfstedentocht.

---

# Economy and Housing

**Spending in Cafes Rises, but Financial Challenges Persist**  
Turnover in cafés and restaurants rose by 5% in early 2025, helped by rising incomes and purchasing power. However, nearly a quarter of owners struggle with serious debt, and closures hit a ten-year high.  
*Why important:* Economic pressures on hospitality may affect service availability and job opportunities; consider this if looking for work or starting a business.

**Inflation Falls Slightly to 2.8% but Stays Above Eurozone Average**  
Dutch inflation eased from 2.9% to 2.8% in December 2025, averaging 3.3% for the year—higher than the eurozone average of 2.1%. Energy costs dropped slightly, but food inflation hovered above 3%.  
*Why important:* Rising living costs affect budgeting; expats should prepare for somewhat higher expenses on groceries and utilities compared to neighboring countries.

**More Houses Sold to Young First-Time Buyers as Landlords Sell Up**  
Over half of all homes sold in 2025 were to first-time buyers, with 11% under 25. Landlords are exiting the market due to new rent controls, creating more opportunities for young buyers, often with parental financial support.  
*Why important:* First-time buyers, including young expats, may find more housing options despite price rises; those renting may face changes in the rental market.  
*What you can do:* Explore first-time buyer programs and consult financial advisors for mortgage readiness; keep an eye on rental market shifts.

---

# Transport and Weather

**Winter Disruptions Continue: Snow and Cold Affect Travel**  
Significant snowfall in the north is causing road closures; snow is spreading south with expected accumulation in Limburg and other regions. Extreme cold temperatures are forecast.  
*Why important:* Weather can disrupt commuting and travel plans; prepare warm clothing and consider flexible work or travel arrangements.

**Schiphol Airport Faces Flight Cancellations due to Weather**  
Severe winter conditions and operational challenges (e.g., de-icing fluid shortages) have led to ongoing cancellations and delays. Airport authorities pledge improvements.  
*Why important:* If traveling through Schiphol, monitor your flight status actively, and factor in potential delays or cancellations.

---

# Summary for Expats

- The forming of a minority cabinet means potentially slower policy changes but also more political dialogue.
- Housing market shifts favor young buyers but may create uncertainty in the rental sector.
- Inflation and living costs remain elevated, so budgeting is advised.
- Expect winter weather to impact transport and daily life—stay prepared.
- Hospitality businesses face financial strain despite increased patronage.
- Safety concerns with explosive attacks remain; stay alert to local news.
- Flight disruptions at Schiphol require careful travel planning.

---

# Sources

- https://www.dutchnews.nl/2026/01/spending-in-cafes-goes-up-but-financial-problems-remain/  
- https://www.dutchnews.nl/2026/01/inflation-falls-slightly-to-2-8-still-above-eurozone-average/  
- https://www.dutchnews.nl/2026/01/more-houses-sold-to-young-first-time-buyers-as-landlords-sell-up/  
- https://www.dutchnews.nl/d66-cda-and-vvd-agree-to-form-a-minority-dutch-cabinet/  
- https://www.dutchnews.nl/2026/01/road-closures-in-north-more-snow-and-a-very-cold-weekend-ahead/  
- https://www.dutchnews.nl/2026/01/little-change-in-total-explosive-attacks-on-property-last-year/  
- https://www.dutchnews.nl/2026/01/podcast-the-cancelled-planes-trains-and-automobiles-editions/  
- https://www.dutchnews.nl/2026/01/bad-news-for-the-elfstedentocht-snow-is-the-enemy-of-ice/

