# Baby Ground News

## Brave Search API

In [1]:
import os

# Brave Search API subscription token. Read from the environment so the real
# key never has to be saved in the notebook; the placeholder is kept as the
# fallback so the cell behaves as before when the variable is unset.
BRAVE_TOKEN = os.environ.get("BRAVE_TOKEN", "USE_YOUR_KEY")

In [2]:
import os
import time
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone

# --------------------
# Search Parameters
# --------------------

# Free-text query terms; both are interpolated into every per-domain query
# string built in the search loop.
geography = "Greenland"
topic = "NATO and sovereignty"

# Outlets to search. Each entry is appended to the query as `site:<entry>`,
# so some entries carry a path (e.g. an English-language section) in addition
# to the hostname.
allowed_domains = [
    "www.democracynow.org",
    "jacobin.com",
    "theintercept.com",
    "www.haaretz.com",
    "www.theguardian.com",
    "www.france24.com/en",
    "www.aljazeera.com",
    "www.npr.org",
    "www.sueddeutsche.de",
    "english.elpais.com",
    "www.abc.net.au/news",
    "www.asahi.com/ajw/",
    "www.blick.ch",
    "www.dailymaverick.co.za",
    "www.dawn.com",
    "www.spiegel.de/international",
    "www.repubblica.it",
    "www.lemonde.fr/en",
    "www.bbc.com/news",
    "www.cbc.ca/news",
    "www.reuters.com",
    "apnews.com",
    "africtelegraph.com",
    "www.ansa.it/english",
    "balkaninsight.com",
    "www.correiobraziliense.com.br",
    "www.euronews.com",
    "www.hindustantimes.com",
    "www.straitstimes.com",
    "www.scmp.com",
    "www.channelnewsasia.com",
    "dnevnik.mk",
    "www.thetimes.co.uk",
    "www.wsj.com",
    "www.jpost.com",
    "www.theglobeandmail.com",
    "www.afr.com",
    "www.businesslive.co.za",
    "www.faz.net",
    "gulfnews.com",
    "www.indiatoday.in",
    "www.telegraph.co.uk",
    "www.dailymail.co.uk",
    "www.trtworld.com",
    "www.lefigaro.fr",
    "www.lanacion.com.ar",
    "www.foxnews.com",
    "tass.com",
    "www.rt.com",
    "www.globaltimes.cn",
    "www.xinhuanet.com/english"
]

# Endpoint every request is sent to.
BASE_URL = "https://api.search.brave.com/res/v1/news/search"

# Query-string defaults shared by all requests. The search loop copies this
# dict per request and sets "q" and "offset" on the copy.
# NOTE(review): "freshness": "py" presumably restricts results to the past
# year -- confirm against the Brave Search API documentation.
base_params = {
    "search_lang": "en",
    "ui_lang": "en-US",
    "country": "US",
    "safesearch": "strict",
    "spellcheck": True,
    "freshness": "py",
    "operators": True,
    "count": 50,
    "offset": 0,
}

# HTTP headers installed on the shared requests session; the subscription
# token is taken from BRAVE_TOKEN defined in the first cell.
headers = {
    "Accept": "application/json",
    "Accept-Encoding": "gzip",
    "X-Subscription-Token": BRAVE_TOKEN,
}

# --------------------
# Rate limit + retry/backoff
# --------------------

MIN_SECONDS_BETWEEN_CALLS = 1.35  # minimum spacing enforced between requests
TIMEOUT_SECONDS = 15              # per-request timeout passed to requests
MAX_RETRIES_429 = 3               # extra attempts after an HTTP 429

# time.monotonic() reading taken right after the most recent request.
_last_call_time = 0.0

def brave_get(session: requests.Session, params: dict) -> requests.Response:
    """GET ``BASE_URL`` with client-side throttling and HTTP 429 retries.

    Before each attempt the call sleeps until at least
    ``MIN_SECONDS_BETWEEN_CALLS`` have passed since the previous request.
    A 429 response is retried up to ``MAX_RETRIES_429`` times, waiting for
    the ``Retry-After`` value when it is numeric, 2 s when it is present but
    not numeric, or ``2 + 2 * attempt`` seconds when the header is missing.

    Args:
        session: Session used to send the request (carries the API headers).
        params: Query-string parameters for this request.

    Returns:
        The first non-429 response, or the last 429 response once the
        retries are exhausted. Callers must check ``status_code`` themselves.

    Raises:
        requests.RequestException: propagated unchanged from ``session.get``.
    """
    global _last_call_time

    for attempt in range(MAX_RETRIES_429 + 1):
        wait = MIN_SECONDS_BETWEEN_CALLS - (time.monotonic() - _last_call_time)
        if wait > 0:
            time.sleep(wait)

        resp = session.get(BASE_URL, params=params, timeout=TIMEOUT_SECONDS)
        _last_call_time = time.monotonic()

        if resp.status_code != 429:
            return resp

        # No further attempt follows the last one, so backing off here would
        # only delay handing the 429 back to the caller.
        if attempt == MAX_RETRIES_429:
            break

        retry_after = resp.headers.get("Retry-After")
        if retry_after:
            try:
                backoff = float(retry_after)
            except ValueError:
                # Header present but not a plain number of seconds.
                backoff = 2.0
        else:
            backoff = 2.0 + 2.0 * attempt

        # time.sleep() rejects negative values; clamp a bogus header to zero.
        time.sleep(max(0.0, backoff))

    return resp

# --------------------
# Search and Collect
# --------------------

rows = []        # one dict per article returned by the API
error_log = []   # one dict per failed or empty request

MAX_PAGES = 10

session = requests.Session()
session.headers.update(headers)


def _row_from_item(item):
    """Flatten one API result into the column layout used downstream."""
    return {
        "Brave page_age": item.get("page_age") or item.get("age") or "",
        "Description": item.get("title") or "",
        "Snippet": item.get("description") or "",
        "Hyperlink": item.get("url") or "",
        "Source Domain": (item.get("meta_url") or {}).get("hostname") or "",
    }


for domain in allowed_domains:
    domain_query = f"{geography} {topic} site:{domain}"

    for page in range(MAX_PAGES):
        # Per-request parameters: shared defaults plus this query and page.
        params = {**base_params, "q": domain_query, "offset": page}

        try:
            resp = brave_get(session, params=params)
            status = resp.status_code
            if status == 200:
                data = resp.json()
            else:
                error_log.append({
                    "domain": domain,
                    "page": page,
                    "error": f"HTTP {status}: {resp.text[:200]}"
                })
        except Exception as exc:
            # Network or decoding failure: record it and try the next page.
            error_log.append({"domain": domain, "page": page, "error": str(exc)})
            continue

        if status == 429:
            # Still rate-limited after the retries: give up on this domain.
            break
        if status != 200:
            continue

        results = data.get("results", [])
        if not results:
            # An empty page ends pagination for this domain.
            error_log.append({"domain": domain, "page": page, "error": "No results"})
            break

        rows.extend(_row_from_item(item) for item in results)

# --------------------
# Build DataFrame
# --------------------

# Declare the columns explicitly: with zero collected rows a bare
# pd.DataFrame([]) has no columns at all, and later cells that index
# df["Brave page_age"] / df["Source Domain"] would fail with KeyError.
RESULT_COLUMNS = ["Brave page_age", "Description", "Snippet", "Hyperlink", "Source Domain"]

df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
if not df.empty:
    # The same article can be returned on several pages; keep the first hit.
    df.drop_duplicates(subset=["Hyperlink"], inplace=True)

# --------------------
# CSV export
# --------------------

# today_str = datetime.today().strftime("%Y-%m-%d")
# query_slug = f"{topic}_{geography}".replace(" ", "_")
# filename = f"news_{query_slug}_{today_str}.csv"
# df.to_csv(filename, index=False)

# if error_log:
#     err_df = pd.DataFrame(error_log)
#     err_filename = f"errors_{query_slug}_{today_str}.csv"
#     err_df.to_csv(err_filename, index=False)

## Sentiment classification

In [3]:
import re
import math
import nltk
nltk.download("vader_lexicon")

# ----------------------------
# 2) Parse Brave page_age into a datetime
# ----------------------------

REL_RE = re.compile(r"^\s*(\d+)\s+(minute|hour|day|week|month|year)s?\s+ago\s*$", re.I)

def parse_brave_page_age(val):
    """Convert a raw ``page_age`` value into a UTC ``pd.Timestamp``.

    Three shapes are tried in order: an ISO-8601 string (naive values are
    taken as UTC), a relative phrase such as "3 weeks ago" (months and years
    are approximated as 30 and 365 days), and a few short date formats.

    Returns ``pd.NaT`` for missing, blank, or unrecognised input.
    """
    if pd.isna(val):
        return pd.NaT

    text = str(val).strip()
    if text == "":
        return pd.NaT

    # --- ISO-like strings; "Z" is rewritten as an explicit UTC offset.
    try:
        iso_text = text.replace("Z", "+00:00") if text.endswith("Z") else text
        moment = datetime.fromisoformat(iso_text)
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        return pd.Timestamp(moment.astimezone(timezone.utc))
    except Exception:
        pass

    # --- Relative phrases, measured back from the current UTC time.
    hit = REL_RE.match(text.lower())
    if hit:
        count = int(hit.group(1))
        unit_name = hit.group(2).lower()
        spans = {
            "minute": timedelta(minutes=count),
            "hour": timedelta(hours=count),
            "day": timedelta(days=count),
            "week": timedelta(weeks=count),
            "month": timedelta(days=30 * count),   # approx
            "year": timedelta(days=365 * count),   # approx
        }
        return pd.Timestamp(datetime.now(timezone.utc) - spans[unit_name])

    # --- Short calendar dates, interpreted as midnight UTC.
    for pattern in ("%d-%b-%y", "%d-%b-%Y", "%Y-%m-%d"):
        try:
            return pd.Timestamp(datetime.strptime(text, pattern).replace(tzinfo=timezone.utc))
        except Exception:
            continue

    return pd.NaT

# Coerce explicitly to a tz-aware datetime column. apply() on an empty frame
# yields an object-dtype Series, on which the `.dt` accessor below fails.
df["ArticleDateUTC"] = pd.to_datetime(
    df["Brave page_age"].apply(parse_brave_page_age), utc=True
)

# Sort most recent first
df = df.sort_values("ArticleDateUTC", ascending=False).reset_index(drop=True)

# ----------------------------
# 3) Add a recency weight (newer = higher)
# ----------------------------
HALF_LIFE_DAYS = 14  # an article's weight halves every this many days

now_utc = pd.Timestamp(datetime.now(timezone.utc))
age_days = (now_utc - df["ArticleDateUTC"]).dt.total_seconds() / (24 * 3600)

df["AgeDays"] = age_days
# Exponential decay: weight 1.0 at age 0, 0.5 at HALF_LIFE_DAYS. Rows with an
# unknown date have AgeDays = NaN, and NaN propagates into the weight.
df["RecencyWeight"] = 0.5 ** (df["AgeDays"] / HALF_LIFE_DAYS)

# ----------------------------
# 4) Outlet classification table
# ----------------------------
# Editorial classification of each searched outlet.
# Tuple layout: (Outlet, Country/Region, URL, Political Leaning, Score, Notes).
# Score runs from -5 (Far Left) to +5 (Far Right); URL mirrors the entry in
# allowed_domains and is reduced to its hostname before matching.
outlets = [
    ("Democracy Now!", "US", "www.democracynow.org", "Far Left", -5, "Activist framing, anti-capitalist, anti-imperialist"),
    ("Jacobin", "US", "jacobin.com", "Far Left", -5, "Openly socialist, labor-focused, anti-neoliberal"),
    ("The Intercept", "US", "theintercept.com", "Far Left", -5, "Investigative, adversarial to intelligence/military"),
    ("Haaretz", "Israel", "www.haaretz.com", "Left", -3, "Pro-peace, civil rights framing, critical of Israeli right-wing government"),
    ("The Guardian", "UK", "www.theguardian.com", "Left", -3, "Social justice, progressive framing"),
    ("France 24", "France", "www.france24.com/en", "Center Left", -2, "Slight progressive lean, EU integrationist"),
    ("Al Jazeera", "Qatar", "www.aljazeera.com", "Center Left", -2, "Pro-Global South, critical of U.S. foreign policy"),
    ("NPR", "US", "www.npr.org", "Center Left", -2, "Balanced tone, liberal cultural framing"),
    ("Süddeutsche Zeitung", "Germany", "www.sueddeutsche.de", "Center Left", -2, "Liberal democratic, pro-EU, socially progressive"),
    ("El Pais", "Spain", "english.elpais.com", "Center Left", -2, "Pro-EU, social democracy framing"),
    ("ABC News", "Australia", "www.abc.net.au/news", "Center Left", -2, "Public broadcaster; slight progressive tone"),
    ("Asahi Shimbun", "Japan", "www.asahi.com/ajw/", "Center Left", -2, "Liberal, pro-democracy, anti-militarism framing"),
    ("Blick", "Switzerland", "www.blick.ch", "Center Left", -2, "Tabloid with progressive social framing"),
    ("Daily Maverick", "South Africa", "www.dailymaverick.co.za", "Center Left", -2, "Investigative, anti-corruption, liberal democratic lean"),
    ("Dawn", "Pakistan", "www.dawn.com", "Center Left", -2, "Independent, secular framing"),
    ("Der Spiegel", "Germany", "www.spiegel.de/international", "Center Left", -2, "Investigative, pro-EU, liberal framing"),
    ("La Repubblica", "Italy", "www.repubblica.it", "Center Left", -2, "Progressive editorial line, pro-EU"),
    ("Le Monde", "France", "www.lemonde.fr/en", "Center Left", -2, "Mainstream liberal paper, intellectual tone"),
    ("BBC News", "UK", "www.bbc.com/news", "Center", 0, "Traditional neutrality, Western liberal framing"),
    ("CBC News", "Canada", "www.cbc.ca/news", "Center", 0, "Public broadcaster with balanced coverage"),
    ("Reuters", "Global", "www.reuters.com", "Center", 0, "Fact-based, highly neutral tone"),
    ("Associated Press (AP)", "US", "apnews.com", "Center", 0, "Minimal framing, trusted wire service"),
    ("AfricTelegraph", "Pan-African", "africtelegraph.com", "Center", 0, "Covers African perspectives, moderate framing"),
    ("ANSA", "Italy", "www.ansa.it/english", "Center", 0, "Italy's main wire service; neutral tone"),
    ("Balkan Insight", "Southeast Europe", "balkaninsight.com", "Center", 0, "Regional watchdog reporting"),
    ("Correio Braziliense", "Brazil", "www.correiobraziliense.com.br", "Center", 0, "One of Brazil's oldest papers, centrist framing"),
    ("Euronews", "Europe", "www.euronews.com", "Center", 0, "Multilingual outlet; attempts neutral framing"),
    ("Hindustan Times", "India", "www.hindustantimes.com", "Center", 0, "Balanced framing, mainstream outlet"),
    ("Straits Times", "Singapore", "www.straitstimes.com", "Center", 0, "Government-aligned, factual, cautious framing"),
    ("South China Morning Post", "Hong Kong", "www.scmp.com", "Center Right", 2, "Pro-business, increasingly aligned with Beijing"),
    ("Channel News Asia", "Singapore", "www.channelnewsasia.com", "Center Right", 2, "Neutral framing with state alignment"),
    ("Dnevnik", "North Macedonia", "dnevnik.mk", "Center Right", 2, "National political coverage with conservative tone"),
    ("The Times", "UK", "www.thetimes.co.uk", "Center Right", 2, "Conservative but traditional framing"),
    ("Wall Street Journal", "US", "www.wsj.com", "Center Right", 2, "Pro-market economic framing"),
    ("The Jerusalem Post", "Israel", "www.jpost.com", "Center Right", 2, "Pro-Israel security framing"),
    ("The Globe and Mail", "Canada", "www.theglobeandmail.com", "Center Right", 2, "Fiscal conservative editorial tone"),
    ("Australian Financial Review", "Australia", "www.afr.com", "Center Right", 2, "Business-first framing"),
    ("Business Day", "South Africa", "www.businesslive.co.za", "Center Right", 2, "Fiscally conservative, business-focused"),
    ("Frankfurter Allgemeine (FAZ)", "Germany", "www.faz.net", "Center Right", 2, "Conservative-liberal, pro-business"),
    ("Gulf News", "UAE", "gulfnews.com", "Center Right", 2, "Pro-establishment, business-heavy coverage"),
    ("India Today", "India", "www.indiatoday.in", "Right", 3, "Nationalistic editorial tone"),
    ("The Telegraph", "UK", "www.telegraph.co.uk", "Right", 3, "Conservative Party aligned, pro-Brexit"),
    ("Daily Mail", "UK", "www.dailymail.co.uk", "Right", 3, "Populist conservative framing"),
    ("TRT World", "Turkey", "www.trtworld.com", "Far Right", 5, "State-aligned, Islamist conservative lean"),
    ("Le Figaro", "France", "www.lefigaro.fr", "Right", 3, "Traditional conservative, pro-business"),
    ("La Nacion", "Argentina", "www.lanacion.com.ar", "Right", 3, "Pro-business, critical of leftist governments"),
    ("Fox News", "US", "www.foxnews.com", "Far Right", 5, "Right-wing framing, especially in opinion"),
    ("TASS", "Russia", "tass.com", "Far Right", 5, "Pro-Kremlin, nationalist framing"),
    ("RT (Russia Today)", "Russia", "www.rt.com", "Far Right", 5, "Anti-Western framing, conspiratorial"),
    ("Global Times", "China", "www.globaltimes.cn", "Far Right", 5, "Nationalist, anti-Western, CCP-aligned"),
    ("Xinhua", "China", "www.xinhuanet.com/english", "Far Right", 5, "Pro-CCP propaganda framing"),
]

outlet_df = pd.DataFrame(outlets, columns=["Outlet", "Country/Region", "URL", "Political Leaning", "Score", "Notes"])

# ----------------------------
# 5) Match Source Domain -> outlet table (exact + fuzzy)
# ----------------------------
def normalize_domain(s):
    """Reduce a URL-ish value to a bare lowercase hostname.

    Missing values become "". Otherwise the text is trimmed and lowercased,
    any "https://" / "http://" is removed, and everything from the first "/"
    onward is discarded.
    """
    if pd.isna(s):
        return ""

    host = str(s).strip().lower()
    for scheme in ("https://", "http://"):
        host = host.replace(scheme, "")

    # Keep only what precedes the first slash (drops any path component).
    return host.partition("/")[0]

# Compare hostnames only, on both sides of the join.
df["SourceDomainNorm"] = df["Source Domain"].apply(normalize_domain)
outlet_df["URLNorm"] = outlet_df["URL"].apply(normalize_domain)

# Pass 1: exact hostname match. Rows without a match keep NaN outlet columns.
merged = df.merge(
    outlet_df,
    how="left",
    left_on="SourceDomainNorm",
    right_on="URLNorm",
    suffixes=("", "_outlet")
)

# Pass 2 (optional): fuzzy match only where the exact match found nothing.
# rapidfuzz is an optional dependency; without it the exact matches stand.
OUTLET_COLUMNS = ["Outlet", "Country/Region", "URL", "Political Leaning", "Score", "Notes"]

try:
    from rapidfuzz import process, fuzz
except ImportError:
    process = fuzz = None

if process is not None:
    url_choices = outlet_df["URLNorm"].tolist()

    def fuzzy_pick(domain):
        """Return the outlet hostname closest to *domain* (score >= 85), else None."""
        if not domain:
            return None
        match = process.extractOne(domain, url_choices, scorer=fuzz.ratio)
        if match and match[1] >= 85:
            return match[0]
        return None

    missing = merged["Political Leaning"].isna()

    try:
        # Work on a copy: the outlet columns are dropped and re-attached, so a
        # failure halfway through must not leave `merged` without them.
        rematched = merged.copy()
        rematched.loc[missing, "URLNorm"] = rematched.loc[missing, "SourceDomainNorm"].apply(fuzzy_pick)
        rematched = rematched.drop(columns=OUTLET_COLUMNS, errors="ignore")
        rematched = rematched.merge(
            outlet_df[["URLNorm"] + OUTLET_COLUMNS],
            how="left",
            on="URLNorm"
        )
    except Exception as exc:
        # Best effort: keep the exact-match result, but say why fuzzy failed.
        import warnings
        warnings.warn(f"Fuzzy outlet matching failed; keeping exact matches only: {exc!r}")
    else:
        merged = rematched

# ----------------------------
# 6) Sentiment / vibes on Snippet: -1..1
# ----------------------------
# Prefer NLTK's VADER analyzer; any failure to import or construct it flips
# USE_VADER off so the keyword-based fallback scorer is used instead.
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    vader_analyzer = SentimentIntensityAnalyzer()
except Exception:
    USE_VADER = False
else:
    USE_VADER = True

    def snippet_sentiment(snippet: str) -> float:
        """Return VADER's compound score for *snippet*; 0.0 for blank or non-string input."""
        has_text = isinstance(snippet, str) and snippet.strip()
        if not has_text:
            return 0.0
        return vader_analyzer.polarity_scores(snippet)["compound"]

# Tiny keyword lexicons used only when VADER is unavailable.
POS_WORDS = {"good","great","positive","benefit","improve","success","win","peace","deal","growth","safe"}
NEG_WORDS = {"bad","worse","negative","risk","crisis","war","threat","fail","loss","danger","probe","leak","tensions","cancel"}

def sentiment_fallback(snippet: str) -> float:
    """Score *snippet* in [-1, 1] from keyword hits.

    The score is (positive hits - negative hits) / (total hits); blank or
    non-string input, or text without any lexicon word, scores 0.0.
    """
    if not isinstance(snippet, str) or not snippet.strip():
        return 0.0

    tokens = re.findall(r"[a-z']+", snippet.lower())
    if not tokens:
        return 0.0

    positive_hits = sum(token in POS_WORDS for token in tokens)
    negative_hits = sum(token in NEG_WORDS for token in tokens)

    # max(1, ...) avoids dividing by zero when no keyword matched.
    balance = (positive_hits - negative_hits) / max(1, positive_hits + negative_hits)
    return max(-1.0, min(1.0, float(balance)))

# Pick the scorer once; only the selected name is evaluated, so the VADER
# function does not need to exist when USE_VADER is False.
scorer = snippet_sentiment if USE_VADER else sentiment_fallback
snippets = merged["Snippet"].fillna("").astype(str)
merged["SnippetSentiment"] = snippets.apply(scorer)

# Scale each snippet's sentiment by how recent the article is.
merged["WeightedSentiment"] = merged["SnippetSentiment"] * merged["RecencyWeight"]

# ----------------------------
# 7) Keep / output what you want
# ----------------------------
# OUTPUT_CSV = "classified_weighted_news.csv"
COLUMNS_TO_DROP = ["SourceDomainNorm", "URLNorm", "Outlet", "URL"]

# Matching helper columns are not needed downstream; absent ones are ignored.
final_df = merged.drop(columns=COLUMNS_TO_DROP, errors="ignore")

# ----------------------------
# CSV export
# ----------------------------
# final_df.to_csv(OUTPUT_CSV, index=False)
# print(f" Saved classified + weighted output to: {OUTPUT_CSV}")

# print(merged[[
#     "ArticleDateUTC", "Brave page_age", "Description", "Snippet", "Hyperlink", "Source Domain",
#     "Outlet", "Political Leaning", "Score",
#     "SnippetSentiment", "RecencyWeight", "WeightedSentiment"
# ]].head(10))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\famor\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## LLM Summary Creation

In [4]:
# -----------------------------
# 1. Azure OpenAI configuration
# -----------------------------
import os

# Azure OpenAI settings. Each value is read from the environment so that
# credentials need not be stored in the notebook; the original placeholders
# remain as fallbacks when a variable is unset.
openai_api_version = os.environ.get("OPENAI_API_VERSION", "API_VERSION")
openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY", "USE_YOUR_KEY")
openai_api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://xtz.openai.azure.com/")

In [5]:
import json
from openai import AzureOpenAI

# ------------------------------------------
# 0) Client
# ------------------------------------------
client = AzureOpenAI(
    azure_endpoint=openai_api_base,
    api_version=openai_api_version,
    api_key=openai_api_key,
)

# ------------------------------------------
# 1) Uses DataFrame from previous cell
# ------------------------------------------
# NOTE(review): this rebinds the notebook-level name `df`, which the earlier
# cells use for the raw search results.
df = final_df.copy()

# ------------------------------------------
# 2) Filter last 7 days using AgeDays
# ------------------------------------------
for numeric_col in ("AgeDays", "RecencyWeight"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

is_recent = df["AgeDays"].notna() & (df["AgeDays"] < 7)
last7 = df.loc[is_recent].copy()

if last7.empty:
    # Nothing to summarise: publish a placeholder row and stop the cell.
    empty_row = {
        "run_ts_utc": datetime.now(timezone.utc).isoformat(),
        "time_window": "last 7 days (AgeDays < 7)",
        "rows_used": 0,
        "topic": None,
        "summary": None,
        "key_developments": "[]",
        "timeline": "[]",
        "open_questions": "[]",
        "elapsed_time": 0.0
    }
    weekly_df = pd.DataFrame([empty_row])

    # print(weekly_df)
    # weekly_df.to_csv("weekly_news_summary.csv", index=False)
    raise SystemExit()

# ------------------------------------------
# 3) Clean text + sort + dedupe
# ------------------------------------------
# (garbled sequence, intended character) pairs, applied in order.
# Order matters: the bare "â€" is a prefix of the longer sequences, so it
# must run LAST. Previously it ran before "â€”", which turned em-dash
# mojibake into "””" and left the em-dash rule unreachable.
MOJIBAKE_FIXES = [
    ("\u00a0", " "),
    ("Â·", "·"),
    ("â€™", "’"),
    ("â€˜", "‘"),
    ("â€œ", "“"),
    ("â€”", "—"),
    ("â†’", "→"),
    ("â€", "”"),
]

for c in ["Description", "Snippet"]:
    if c in last7.columns:
        cleaned = last7[c].astype(str)
        for garbled, intended in MOJIBAKE_FIXES:
            cleaned = cleaned.str.replace(garbled, intended, regex=False)
        last7[c] = cleaned

# Prefer high weight + newest (smallest AgeDays)
last7 = last7.sort_values(["RecencyWeight", "AgeDays"], ascending=[False, True])

# Drop rows that repeat both the link and the headline, keeping the
# best-ranked occurrence from the sort above.
dedupe_cols = [c for c in ["Hyperlink", "Description"] if c in last7.columns]
if dedupe_cols:
    last7 = last7.drop_duplicates(subset=dedupe_cols, keep="first")

# ------------------------------------------
# 4) Build JSON "jobs"
# ------------------------------------------
def safe_str(x):
    """Return ``str(x)``, or an empty string when *x* is a missing value."""
    if pd.isna(x):
        return ""
    return str(x)

def _float_or_none(value):
    """Return *value* as a float, or None when it is missing."""
    return float(value) if pd.notna(value) else None


# One JSON-serialisable record per article; these are sent to the model.
jobs = [
    {
        "AgeDays": _float_or_none(r["AgeDays"]),
        "RecencyWeight": _float_or_none(r["RecencyWeight"]),
        "ArticleDateUTC": safe_str(r.get("ArticleDateUTC")),
        "SourceDomain": safe_str(r.get("Source Domain")),
        "CountryRegion": safe_str(r.get("Country/Region")),
        "PoliticalLeaning": safe_str(r.get("Political Leaning")),
        "Description": safe_str(r.get("Description")),
        "Snippet": safe_str(r.get("Snippet")),
        "Hyperlink": safe_str(r.get("Hyperlink")),
    }
    for _, r in last7.iterrows()
]

# print(f" Loaded {len(df)} rows; using {len(jobs)} rows where AgeDays < 7.")

# ------------------------------------------
# 5) Chunk + call model (MAP)
# ------------------------------------------
chunk_size = 30      # articles sent to the model per MAP request
map_chunk_dfs = []   # one single-row DataFrame per successfully parsed chunk
map_errors = []      # why each failed chunk was skipped (reported if all fail)

system_msg = (
    "You are a news analyst producing a weekly brief for ONE main topic in this dataset. "
    "Use ONLY the provided context JSON. "
    "Merge duplicates. Avoid speculation. "
    "Output must be STRICT JSON only."
)

query = (
    "Summarize the last 7 days of news represented by these rows into a concise weekly brief. "
    "Identify the main topic, key developments, a simple timeline, and open questions."
)

output_rules = (
    "Return STRICT JSON ONLY with EXACT keys:\n"
    "  - topic (string)\n"
    "  - key_developments (array of strings)\n"
    "  - timeline (array of objects with keys: age_days (number), event (string))\n"
    "  - open_questions (array of strings)\n"
    "Do not add extra keys."
)

total = len(jobs)
start_all = time.time()

for i, start_idx in enumerate(range(0, total, chunk_size)):
    end_idx = min(start_idx + chunk_size, total)
    chunk = jobs[start_idx:end_idx]
    context = json.dumps(chunk, ensure_ascii=False)

    # print(f"\n MAP chunk {i + 1} ({start_idx}–{end_idx})...")

    try:
        t0 = time.time()
        resp = client.chat.completions.create(
            model="gpt-4o",
            temperature=0.2,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": f"Context JSON:\n{context}\n\nTask:\n{query}\n\n{output_rules}"},
            ],
        )

        raw = resp.choices[0].message.content or ""
        if not raw.strip():
            map_errors.append(f"chunk {i + 1}: empty response")
            continue

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            # The model may wrap the JSON in prose or code fences: retry on
            # the outermost {...} span.
            brace_start = raw.find("{")
            brace_end = raw.rfind("}")
            if brace_start == -1 or brace_end <= brace_start:
                map_errors.append(f"chunk {i + 1}: invalid JSON")
                continue
            parsed = json.loads(raw[brace_start:brace_end + 1])

        if not isinstance(parsed, dict):
            raise TypeError(f"expected a JSON object, got {type(parsed).__name__}")

        df_chunk = pd.DataFrame([{
            "chunk_index": i + 1,
            "topic": parsed.get("topic"),
            "key_developments": json.dumps(parsed.get("key_developments", []), ensure_ascii=False),
            "timeline": json.dumps(parsed.get("timeline", []), ensure_ascii=False),
            "open_questions": json.dumps(parsed.get("open_questions", []), ensure_ascii=False),
            "elapsed_time": time.time() - t0
        }])

        map_chunk_dfs.append(df_chunk)
        # print(f" MAP chunk {i + 1} done in {df_chunk.loc[0,'elapsed_time']:.2f}s")

    except Exception as exc:
        # Best effort per chunk: one failure must not abort the others, but
        # keep the reason so a total failure can be diagnosed.
        map_errors.append(f"chunk {i + 1}: {exc!r}")
        continue

if not map_chunk_dfs:
    detail = "; ".join(map_errors) if map_errors else "no chunks were submitted"
    raise RuntimeError(f"No MAP chunks produced valid output. ({detail})")

map_df = pd.concat(map_chunk_dfs, ignore_index=True)

# ------------------------------------------
# 6) REDUCE: combine all chunk summaries
# ------------------------------------------
# Serialise every chunk-level summary row as the context for the final call.
reduce_context = map_df.to_dict(orient="records")
reduce_context_json = json.dumps(reduce_context, ensure_ascii=False)

reduce_system = (
    "You combine multiple partial weekly summaries into ONE final weekly brief. "
    "Deduplicate, resolve overlaps, avoid speculation. "
    "Output must be STRICT JSON only."
)

reduce_query = (
    "Combine these chunk-level summaries into one final weekly summary for the topic. "
    "Keep it concise, but cover the key developments and what remains uncertain."
)

reduce_rules = (
    "Return STRICT JSON ONLY with EXACT keys:\n"
    "  - topic (string)\n"
    "  - summary (string)\n"
    "  - key_developments (array of strings)\n"
    "  - timeline (array of strings)\n"
    "  - open_questions (array of strings)\n"
    "Do not add extra keys."
)

reduce_messages = [
    {"role": "system", "content": reduce_system},
    {"role": "user", "content": f"Chunk summaries JSON:\n{reduce_context_json}\n\nTask:\n{reduce_query}\n\n{reduce_rules}"},
]

t_reduce = time.time()
final_resp = client.chat.completions.create(
    model="gpt-4o",
    temperature=0.2,
    messages=reduce_messages,
)

raw_final = final_resp.choices[0].message.content or ""
try:
    final = json.loads(raw_final)
except json.JSONDecodeError:
    # Retry on the outermost {...} span in case the reply is wrapped in text.
    first_brace = raw_final.find("{")
    last_brace = raw_final.rfind("}")
    if first_brace == -1 or last_brace <= first_brace:
        raise ValueError("Final reduce output was not valid JSON.")
    final = json.loads(raw_final[first_brace:last_brace + 1])

# Wall-clock time for the whole MAP + REDUCE run.
elapsed_total = time.time() - start_all

# ------------------------------------------
# 7) Build pandas "table row"
# ------------------------------------------
def _json_list(key):
    """JSON-encode the list stored under *key* in the final brief ([] if absent)."""
    return json.dumps(final.get(key, []), ensure_ascii=False)


# Single-row record of this run; list fields are stored as JSON strings.
weekly_row = dict(
    run_ts_utc=datetime.now(timezone.utc).isoformat(),
    time_window="last 7 days (AgeDays < 7)",
    rows_used=int(len(last7)),
    topic=final.get("topic"),
    summary=final.get("summary"),
    key_developments=_json_list("key_developments"),
    timeline=_json_list("timeline"),
    open_questions=_json_list("open_questions"),
    elapsed_time=round(elapsed_total, 3),
)

weekly_df = pd.DataFrame([weekly_row])

# OUT_CSV = "weekly_news_summary.csv"
# try:
#     existing = pd.read_csv(OUT_CSV)
#     weekly_df = pd.concat([existing, weekly_df], ignore_index=True)
# except FileNotFoundError:
#     pass
# weekly_df.to_csv(OUT_CSV, index=False)

# print("\n Weekly summary table (latest run is last row):")
# print(weekly_df.tail(1))

weekly_df

Unnamed: 0,run_ts_utc,time_window,rows_used,topic,summary,key_developments,timeline,open_questions,elapsed_time
0,2026-02-06T20:56:41.644906+00:00,last 7 days (AgeDays < 7),75,NATO and Arctic Strategy Amid US-Greenland Ten...,Recent developments have highlighted tensions ...,"[""Canada and France opened consulates in Green...","[""Canada and France open consulates in Greenla...","[""How will NATO's Arctic strategy evolve in re...",68.539


## News Card

In [6]:
from IPython.display import display, HTML

# -------------------------
# 1) Use in-memory DataFrames
# -------------------------
# Work on a copy so the card never mutates the classified frame.
news_df = final_df.copy()

news_df["AgeDays"] = pd.to_numeric(news_df["AgeDays"], errors="coerce")
news_df["WeightedSentiment"] = pd.to_numeric(news_df.get("WeightedSentiment"), errors="coerce")

# Same window as the weekly summary: articles with a known age under 7 days.
in_window = news_df["AgeDays"].notna() & (news_df["AgeDays"] < 7)
last7 = news_df.loc[in_window].copy()

# -------------------------
# 2) Normalize Political Leaning -> 7 buckets
# -------------------------
# Display order of the seven buckets, left to right.
LEAN_ORDER = [
    "Far Left",
    "Left",
    "Center Left",
    "Center",
    "Center Right",
    "Right",
    "Far Right",
]

# Colours
LEAN_COLORS = {
    "Far Left":     "#7f1d1d",  # deep red
    "Left":         "#ef4444",
    "Center Left":  "#fca5a5",
    "Center":       "#9ca3af",  # gray
    "Center Right": "#60a5fa",
    "Right":        "#2563eb",
    "Far Right":    "#1e3a8a",  # deep blue
}

# Lowercased spellings accepted for each bucket (US and UK "center/centre").
_LEANING_ALIASES = {
    "far left": "Far Left",
    "left": "Left",
    "center left": "Center Left",
    "centre left": "Center Left",
    "center": "Center",
    "center right": "Center Right",
    "centre right": "Center Right",
    "right": "Right",
    "far right": "Far Right",
}

def normalize_leaning(x: str) -> str:
    """Map a raw leaning label onto one of the seven ``LEAN_ORDER`` buckets.

    Matching ignores case and surrounding whitespace. Missing or
    unrecognised labels fall into "Center".
    """
    if pd.isna(x):
        return "Center"
    return _LEANING_ALIASES.get(str(x).strip().lower(), "Center")

last7["LeanBucket"] = last7["Political Leaning"].apply(normalize_leaning)

# -------------------------
# 3) Political leaning distribution (counts + %)
# -------------------------
counts = last7["LeanBucket"].value_counts().reindex(LEAN_ORDER, fill_value=0)
total_n = int(counts.sum())

if total_n != 0:
    pct = {k: round(int(counts[k]) / total_n * 100) for k in LEAN_ORDER}
else:
    pct = {k: 0 for k in LEAN_ORDER}

# Fix rounding drift so total == 100: the largest bucket absorbs the leftover.
drift = 100 - sum(pct.values())
if total_n != 0 and drift != 0:
    biggest = max(pct, key=pct.get)
    pct[biggest] += drift

# -------------------------
# 4) Blindspot = least covered leaning
# -------------------------
if total_n != 0:
    blindspot = min(pct, key=pct.get)
else:
    blindspot = "Center"
blindspot_color = LEAN_COLORS.get(blindspot, "#9ca3af")

# -------------------------
# 5) Sentiment by leaning (avg WeightedSentiment)
# -------------------------
# Buckets with no articles come out of reindex() as NaN.
bucket_sentiment = last7.groupby("LeanBucket")["WeightedSentiment"]
sent_means = bucket_sentiment.mean().reindex(LEAN_ORDER)

def sentiment_label(x):
    """Translate a mean weighted sentiment into a display label.

    Thresholds: <= -0.4 strongly negative, <= -0.15 moderately negative,
    < 0.15 neutral, < 0.4 moderately positive, otherwise strongly positive.
    A missing value returns "(No Data)".
    """
    if pd.isna(x):
        return "(No Data)"
    if x <= -0.4:
        return "Strongly Negative"
    if x <= -0.15:
        return "Moderately Negative"
    if x < 0.15:
        return "Neutral"
    if x < 0.4:
        return "Moderately Positive"
    return "Strongly Positive"

# HTML icon for every label sentiment_label() can return. The missing-data
# key must be spelled exactly like that label ("(No Data)"); it used to be
# "No Data", so the lookup always fell through to its unstyled default.
# NOTE(review): "Strongly Positive" uses the same red as "Strongly Negative";
# possibly a copy/paste slip -- confirm the intended colour.
tone_icon = {
    "Strongly Negative": '<span style="color:#b91c1c; font-weight:700; font-size:15px; line-height:1;">▼</span>',
    "Moderately Negative": '<span style="color:#f97316; font-weight:700;">▼</span>',
    "Neutral": '<span style="color:#9ca3af; font-weight:700;font-size:8px; line-height:1;">▬</span>',
    "Moderately Positive": '<span style="color:#eab308; font-weight:700;">▲</span>',
    "Strongly Positive": '<span style="color:#b91c1c; font-weight:700; font-size:15px; line-height:1;">▲</span>',
    "(No Data)": '<span style="color:#9ca3af;">---</span>',
}

# One-sentence takeaway comparing the extremes, if any bucket has data.
valid_sent = sent_means.dropna()
if valid_sent.empty:
    sentiment_insight = "Tone by leaning: not enough sentiment data to summarize."
else:
    most_negative = valid_sent.idxmin()
    most_positive = valid_sent.idxmax()
    sentiment_insight = (
        f"Tone varies by leaning: <b>{most_negative}</b> is most negative, "
        f"while <b>{most_positive}</b> is most positive."
    )


def _tone_row(k):
    """Render the tone-table <tr> for leaning bucket *k*."""
    label = sentiment_label(sent_means.loc[k])
    icon = tone_icon.get(label, "---")
    color = LEAN_COLORS[k]
    return f"""
    <tr>
      <td style="
          padding:6px 10px;
          white-space:nowrap;
          text-align:left;
      ">
        <span style="
          display:inline-block;
          width:10px;
          height:10px;
          border-radius:50%;
          background:{color};
          margin-right:8px;
          vertical-align:middle;
        "></span>
        <span style="vertical-align:middle; font-weight:600;">
          {k}
        </span>
      </td>

      <td style="
          padding:6px 10px;
          text-align:left;
          white-space:nowrap;
          color:#2d3748;
      ">
        {icon} {label}
      </td>
    </tr>
    """


# Rows are emitted in the fixed left-to-right bucket order.
tone_rows_html = "".join(_tone_row(k) for k in LEAN_ORDER)

# -------------------------
# 6) Get latest weekly row (topic/summary/rows_used)
# -------------------------
from html import escape as _html_escape

row = weekly_df.iloc[-1]


def _card_text(value):
    """Return *value* as HTML-safe text; missing values become ""."""
    if isinstance(value, str):
        return _html_escape(value)
    return "" if pd.isna(value) else _html_escape(str(value))


# topic/summary are model-generated from scraped snippets and are inserted
# into raw HTML below, so escape them. Missing values (None in the empty-week
# placeholder row) render as blank instead of the literal text "None".
topic = _card_text(row.get("topic", ""))
summary = _card_text(row.get("summary", ""))
rows_used = int(row.get("rows_used", 0))

# -------------------------
# 7) Build bar + labels HTML (LEFT column)
# -------------------------
# Stacked bar: one <div> per leaning bucket, as wide as that bucket's
# percentage share and filled with the bucket's colour.
segments_html = "".join(
    f"""<div title="{k} {pct[k]}%" style="width:{pct[k]}%; background:{LEAN_COLORS[k]};"></div>"""
    for k in LEAN_ORDER
)

# Caption under the bar: bucket name and percentage, in the bucket's colour.
labels_html = "".join(
    f"""
    <div style="color:{LEAN_COLORS[k]}; font-weight:700; font-size:12px; text-align:center;">
      {k} {pct[k]}%
    </div>
    """
    for k in LEAN_ORDER
)

# Left column of the card: blindspot badge, source count, topic headline,
# summary paragraph, distribution bar with captions, and a blindspot note.
left_card_html = f"""
  <div style="display:flex; align-items:center; gap:10px; margin-bottom:8px;">
    <span style="background:{blindspot_color}; color:white; font-weight:700; font-size:12px;
                 padding:4px 8px; border-radius:6px;">
      Blindspot
    </span>
    <span style="font-size:12px; color:#4a5568; border:1px solid #e2e8f0; padding:4px 8px; border-radius:999px;">
      {rows_used} Sources
    </span>
  </div>

  <div style="font-size:32px; font-weight:800; line-height:1.1; margin:6px 0 10px;">
    {topic}
  </div>

  <div style="font-size:16px; color:#2d3748; line-height:1.5; margin-bottom:14px;">
    {summary}
  </div>

  <div style="height:14px; border-radius:999px; overflow:hidden; display:flex; border:1px solid #e2e8f0;">
    {segments_html}
  </div>

  <div style="display:grid; grid-template-columns: repeat(7, 1fr); gap:6px; margin-top:10px;">
    {labels_html}
  </div>

  <div style="margin-top:10px; font-size:12px; color:#4a5568;">
    Blindspot indicates the least-represented leaning in the last 7 days:
    <b style="color:{blindspot_color};">{blindspot}</b>.
  </div>
"""

# -------------------------
# 8) Tone box HTML
# -------------------------
# Right column: a boxed table with one row per leaning bucket (from
# tone_rows_html), followed by the one-sentence sentiment insight.
right_tone_html = f"""
<div style="
    border:1px solid #e2e8f0;
    border-radius:12px;
    padding:14px;
    background:#fafafa;
    position:sticky; top:12px;
">
  <div style="font-weight:900; font-size:14px; margin-bottom:8px;">
    Tone by Political Leaning
  </div>

  <table style="width:100%; font-size:12px; border-collapse:collapse;">
    <thead>
      <tr style="color:#4a5568;">
        <th style="text-align:left; padding:6px 10px;">Leaning</th>
        <th style="text-align:left; padding:6px 10px;">Tone</th>
      </tr>
    </thead>
    <tbody>
      {tone_rows_html}
    </tbody>
  </table>

  <div style="margin-top:10px; font-size:12px; color:#4a5568; line-height:1.35;">
    {sentiment_insight}
  </div>
</div>
"""

# -------------------------
# 9) Two-column page layout
# -------------------------
# Grid with a flexible left column (the card) and a fixed 340px right column
# (the tone box).
html = f"""
<div style="font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial; max-width: 1200px;">
  <div style="display:grid; grid-template-columns: 1fr 340px; gap:28px; align-items:start;">
    <div style="max-width: 780px;">
      {left_card_html}
    </div>
    <div>
      {right_tone_html}
    </div>
  </div>
</div>
"""

# Render the assembled markup in the notebook output area.
display(HTML(html))

Leaning,Tone
Far Left,--- (No Data)
Left,▲ Moderately Positive
Center Left,▬ Neutral
Center,▬ Neutral
Center Right,▬ Neutral
Right,▬ Neutral
Far Right,▬ Neutral
