# Short News Lab
Use `AdvancedWebsite` to collect links from a source page, ask Ollama to keep likely news articles, and render 5-line summaries in a Gradio feed.


In [None]:
from pathlib import Path
import json
import textwrap
import io
import contextlib
import time
from typing import List, Dict, Any

import gradio as gr
import sys

# Add path to access AdvancedWebsite, even when the notebook is run from a nested directory

def _find_repo_root() -> Path:
    """Locate the repository root by walking up from the working directory.

    The root is the first directory (the resolved cwd, then each of its
    parents in order) that contains the ``advanced_website_scraper.py`` module
    under ``week1/community-contributions/aryaman``. When no directory
    qualifies, the resolved cwd itself is returned.
    """
    marker = Path("week1") / "community-contributions" / "aryaman" / "advanced_website_scraper.py"
    start = Path.cwd().resolve()
    return next(
        (folder for folder in (start, *start.parents) if (folder / marker).exists()),
        start,
    )

# Make the scraper module importable. Each path is appended to sys.path only
# when it is not already present, so re-running this cell adds no duplicates.
ROOT = _find_repo_root()
SCRAPER_PATH = ROOT / "week1" / "community-contributions" / "aryaman"
if str(SCRAPER_PATH) not in sys.path:
    sys.path.append(str(SCRAPER_PATH))
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Must run after the sys.path setup above: the module is expected to live in
# SCRAPER_PATH rather than in an installed package.
from advanced_website_scraper import AdvancedWebsite

# Ollama / OpenAI-compatible settings.
# These are passed to the OpenAI client and to AdvancedWebsite.summarize_with_ollama.
OLLAMA_BASE_URL = "http://localhost:11434/v1"
OLLAMA_MODEL = "llama3.1"
OLLAMA_API_KEY = "ollama"  # default key for local Ollama

# Generation knobs used when summarizing a single article.
SUMMARY_TEMPERATURE = 0
SUMMARY_MAX_TOKENS = 256  # tight for 5 lines

# System prompt for the link-filtering call: asks the model for a JSON array
# containing only the URLs that look like news articles.
FILTER_PROMPT = """
You are a news assistant. Given a list of URLs, return only those that look like real news articles.
Keep URLs that likely contain news, announcements, or blog posts. Remove home pages, category pages,
PDFs, images, videos, or obvious non-news links. Return JSON with a top-level array of URLs only.
""".strip()

# System prompt for the per-article summary: exactly 5 bullet lines, no framing text.
SUMMARY_SYSTEM = """
You are a concise news summarizer. Produce exactly 5 bullet lines capturing headline facts,
who/what/when/where/why, and key numbers. No preamble, no trailing notes. If content is empty,
reply with: 'No content available.'
""".strip()



In [5]:
import re
from openai import OpenAI


def get_client() -> OpenAI:
    """Build an OpenAI-compatible client configured from the OLLAMA_* settings."""
    client = OpenAI(
        base_url=OLLAMA_BASE_URL,
        api_key=OLLAMA_API_KEY,
    )
    return client


def _strip_fences(text: str) -> str:
    """Return the payload of the first Markdown code fence found in *text*.

    Models often wrap JSON in a fenced block, sometimes after a sentence of
    preamble and sometimes with a language tag (```json). The fence is located
    anywhere in the text, an optional language tag on the opening line is
    dropped, and the content up to the closing fence is returned. If the
    closing fence is missing (e.g. a truncated reply), everything after the
    opening fence is returned.

    Args:
        text: Raw model output.

    Returns:
        The fenced content with surrounding whitespace removed, or *text*
        unchanged when it contains no fence.
    """
    fence = re.search(
        r"```(?:[A-Za-z0-9_+-]*[ \t]*\n)?(.*?)(?:```|\Z)",
        text,
        re.DOTALL,
    )
    if fence is None:
        return text
    return fence.group(1).strip()


def _extract_links_from_text(raw: str) -> List[str]:
    """Pull a list of URLs out of a model reply.

    The reply (with any code fences removed) is first parsed as JSON; when that
    yields a list, its string items are returned. For any other outcome the raw
    reply is scanned with a URL regex and trailing '.' / ',' are trimmed.
    """
    try:
        parsed = json.loads(_strip_fences(raw))
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, str)]

    # Not a JSON array: fall back to scanning the untouched reply for URLs.
    url_pattern = r"https?://[^\s\]\"'`]+"
    return [hit.rstrip('.,') for hit in re.findall(url_pattern, raw)]


def filter_links_with_ollama(links: List[str]) -> List[str]:
    if not links:
        return []
    print(f"[ollama] filtering {len(links)} links ...")
    client = get_client()
    messages = [
        {"role": "system", "content": FILTER_PROMPT},
        {"role": "user", "content": "URLs:\n" + "\n".join(links)},
    ]
    try:
        resp = client.chat.completions.create(
            model=OLLAMA_MODEL,
            messages=messages,
            temperature=0,
            max_tokens=256,
        )
        raw = resp.choices[0].message.content.strip()
        print(f"[ollama] raw filter response:\n{raw}")
        cleaned = _extract_links_from_text(raw)
        print(f"[ollama] kept {len(cleaned)} links after parsing")
        return cleaned
    except Exception as e:
        print(f"Link filtering failed: {e}")
        return []


def summarize_link(url: str) -> Dict[str, Any]:
    """Fetch one article and return a dict with its url, title and summary.

    The page is loaded with AdvancedWebsite (JS rendering off). Pages with no
    text get the summary 'No content available.'; any exception is reported as
    a '(error)' entry instead of being raised.
    """
    print(f"[ollama] summarizing: {url}")
    try:
        page = AdvancedWebsite(url, use_js=False)
        title = page.title or "(no title)"

        if page.text:
            summary = page.summarize_with_ollama(
                model=OLLAMA_MODEL,
                base_url=OLLAMA_BASE_URL,
                api_key=OLLAMA_API_KEY,
                temperature=SUMMARY_TEMPERATURE,
                max_tokens=SUMMARY_MAX_TOKENS,
                system_prompt=SUMMARY_SYSTEM,
            )
            return {"url": url, "title": title, "summary": summary}

        # Nothing to summarize: skip the model call entirely.
        print(f"[ollama] no content at {url}")
        return {"url": url, "title": title, "summary": "No content available."}
    except Exception as exc:
        print(f"[ollama] summarize failed for {url}: {exc}")
        return {"url": url, "title": "(error)", "summary": f"Failed: {exc}"}


def fetch_loading_tips(count: int = 10, retries: int = 3, delay: float = 3.0) -> List[str]:
    """Get short one-liners about how the pipeline works using Ollama, with retries.

    Args:
        count: Maximum number of tips to return.
        retries: Number of attempts against the model. With ``retries <= 0`` no
            request is made and the built-in fallback tips are returned.
        delay: Seconds to sleep between failed attempts.

    Returns:
        Up to *count* tip strings. When every attempt fails (or none is made),
        the first *count* built-in fallback tips are returned; this function
        never returns ``None``.
    """
    fallback_tips = [
        "Scrapes links, filters via Ollama, then summarizes to 5 lines.",
        "AdvancedWebsite handles fetch + parse; Ollama filters URLs.",
        "Each kept link is summarized with Ollama chat into 5 bullets.",
    ]
    # The prompt does not depend on the attempt, so build it once.
    messages = [
        {
            "role": "system",
            "content":
                "You concisely describe how this short_news pipeline works. One line per fact, "
                "under 80 chars, no bullets or numbering. Facts are as follows:\n\n"
                "Detailed flow of the short_news lab:\n\n"
                "Inputs/controls: You pick a preset source (TechCrunch or Hacker News). Optional sliders "
                "bound the total links considered and the number of articles summarized. A checkbox can "
                "show run-time logs. After choosing a source, the sliders/log checkbox and the 'Fetch & Summarize' "
                "button appear; controls hide during execution.\n\n"
                "Fetch & parse: AdvancedWebsite fetches the source page (requests first; Playwright optional if needed) "
                "and parses links.\n\n"
                "Link filtering via Ollama: The scraped links are sent to Ollama (OpenAI-compatible API) with a system "
                "prompt to keep only news-like URLs. The model response is parsed (JSON or regex) to a filtered list, "
                "capped by the max-links slider.\n\n"
                "Summarization: For each filtered link (capped by max-articles slider), AdvancedWebsite fetches and extracts "
                "text, then summarize_with_ollama produces a concise 5-line summary. If a page is empty or fails, a fallback "
                "message is set per article.\n\n"
                "Loading UX: While the background thread runs, the UI cycles through short Ollama-generated hints about how "
                "the pipeline works. If 'Show logs' is enabled, stdout (fetch/filter/summarize logs) is captured and displayed "
                "above the feed after completion. Loading and tips are yielded via Gradio queue/generator.\n\n"
                "Output/feed: Results render as social-style cards (avatar initial, source domain, title, bullet list, and 'Read more' "
                "link). Dark gradient styling for cards and panels; logs (if enabled) appear in a monospace block above the summaries.\n\n"
                "Defaults/config: Uses Ollama at http://localhost:11434/v1 with model llama3.1, temperature 0 for summaries, max_tokens "
                "256 for summaries, and a 5-line bullet format."
        },
        {
            "role": "user",
            "content": f"Give {count} distinct one-line facts about: scraping with AdvancedWebsite, filtering links with Ollama, summarizing news. Return plain lines separated by newlines."
        },
    ]

    for attempt in range(retries):
        try:
            client = get_client()
            resp = client.chat.completions.create(
                model=OLLAMA_MODEL,
                messages=messages,
                temperature=0.3,
                # Up to 80 characters per line: leave room for all requested lines.
                max_tokens=max(128, 32 * count),
            )
            # content can be None on an empty completion.
            raw = (resp.choices[0].message.content or "").strip()
            stripped = (ln.strip("-• \t") for ln in raw.splitlines())
            # Drop lines that were only bullet characters or whitespace.
            lines = [ln for ln in stripped if ln]
            return lines[:count] or ["Fetching links, filtering with Ollama, summarizing to 5 lines."]
        except Exception as e:
            print(f"[ollama] tip fetch failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                time.sleep(delay)

    # Reached when all attempts failed or when retries <= 0.
    return fallback_tips[:count]


def build_news_feed(source_url: str, max_links: int = 15, max_articles: int = 8) -> List[Dict[str, Any]]:
    """Run the pipeline for one source page: scrape links, filter, summarize.

    Returns a list of article dicts (url, title, summary). An empty
    *source_url* yields an empty list; a failure to fetch the source yields a
    single '(error)' entry describing the failure.
    """
    if not source_url:
        return []

    try:
        print(f"[pipeline] fetching source: {source_url}")
        index_page = AdvancedWebsite(source_url, use_js=False)
        links = index_page.get_links()
        print(f"[pipeline] found {len(links)} links")
    except Exception as exc:
        print(f"[pipeline] failed to fetch source: {exc}")
        return [{"url": source_url, "title": "(error)", "summary": f"Failed to fetch source: {exc}"}]

    candidates = filter_links_with_ollama(links)
    if max_links:
        # A falsy cap (0 / None) means "no cap on filtered links".
        candidates = candidates[:max_links]
    print(f"[pipeline] using {len(candidates)} filtered links (cap {max_links})")

    articles = [summarize_link(link) for link in candidates[:max_articles]]
    print(f"[pipeline] summarized {len(articles)} articles")
    return articles



In [22]:
import threading
def _clean_summary(summary: str, max_lines: int = 5) -> List[str]:
    """Turn a raw model summary into at most *max_lines* plain bullet texts.

    Blank lines are ignored, a "here are ... bullet ..." lead-in line is
    skipped, and leading '-', '•' and spaces are removed from each kept line.
    Returns ['No summary available.'] when nothing is kept.
    """
    kept: List[str] = []
    for raw_line in summary.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        lowered = line.lower()
        if "here are" in lowered and "bullet" in lowered:
            # Model preamble such as "Here are the 5 bullet points:".
            continue
        kept.append(line.lstrip("-• "))
        if len(kept) >= max_lines:
            break
    return kept if kept else ["No summary available."]


def _profile_badge(text: str) -> str:
    """Return the avatar ``<div>`` showing the upper-cased first character of *text*.

    Falls back to "N" when *text* is empty, whitespace-only or ``None``. The
    character is HTML-escaped because *text* is a scraped page title and may
    start with markup characters such as ``<`` or ``&``.
    """
    from html import escape  # local import keeps this block self-contained

    letter = ((text or "").strip()[:1] or "N").upper()
    return f"<div class='avatar'>{escape(letter)}</div>"


def format_articles_html(articles: List[Dict[str, Any]]) -> str:
    """Render article dicts as the HTML cards shown in the feed.

    Args:
        articles: Dicts with optional "title", "summary" and "url" keys.

    Returns:
        One ``<div class="card">`` per article, concatenated, or an "empty"
        placeholder div when *articles* is empty.

    Titles, summaries and URLs come from scraped pages and model output, so
    every interpolated value is HTML-escaped, and only http(s) URLs are used
    as link targets.
    """
    from html import escape  # local import keeps this block self-contained

    if not articles:
        return "<div class='empty'>No articles found.</div>"
    blocks = []
    for art in articles:
        title = art.get("title") or "(no title)"
        summary = art.get("summary") or "No summary."
        url = art.get("url") or ""
        lines = _clean_summary(summary)
        bullets = "".join(f"<li>• {escape(str(ln))}</li>" for ln in lines)
        # Pass the raw title: the badge helper builds its own markup from it.
        badge = _profile_badge(title)
        domain = url.split("//")[-1].split("/")[0] if url else "source"
        # Refuse non-web schemes (e.g. javascript:) as link targets.
        is_web_url = str(url).lower().startswith(("http://", "https://"))
        href = escape(str(url), quote=True) if is_web_url else "#"
        block = f"""
        <div class="card">
          <div class="profile">
            {badge}
            <div class="meta">
              <div class="name">{escape(str(domain))}</div>
              <div class="title">{escape(str(title))}</div>
            </div>
          </div>
          <div class="post-content">
            <ul class="bullets">{bullets}</ul>
          </div>
          <div class="interactions">
            <button>Like</button>
            <button>Comment</button>
            <a class="link" href="{href}" target="_blank">Read more ↗</a>
          </div>
        </div>
        """
        blocks.append(block)
    return "".join(blocks)


# Display name -> index page URL for the preset source buttons.
# set_source() looks names up here; unknown names are reported as invalid.
PRESET_SOURCES = {
    "TechCrunch": "https://techcrunch.com",
    "Hacker News": "https://news.ycombinator.com/",
}

def run_pipeline(source_url, max_links=15, max_articles=8):
    """Gradio generator: run the news pipeline in the background and stream UI updates.

    Args:
        source_url: Index page to scrape.
        max_links: Cap on filtered links, forwarded to build_news_feed.
        max_articles: Cap on summarized articles, forwarded to build_news_feed.

    Yields:
        5-tuples of (status text, feed HTML, and three visibility updates that
        hide the source, slider and fetch rows). A loading panel with a
        rotating tip is yielded while the worker runs, then the rendered feed.
    """
    from html import escape  # local import keeps this block self-contained

    hide = gr.update(visible=False)

    loading_html = "<div class='loading'>Loading summaries, please wait...</div>"
    fallback_tips = ["Fetching links, filtering with Ollama, summarizing to 5 lines."]
    tips_ready: List[str] = []

    def load_tips():
        # Fetch tips in the background so the first yield is immediate.
        tips_ready.extend(fetch_loading_tips(10) or [])

    threading.Thread(target=load_tips, daemon=True).start()

    done = threading.Event()
    result = {}

    def worker():
        try:
            result["articles"] = build_news_feed(
                source_url, max_links=max_links, max_articles=max_articles
            )
        except Exception as e:
            # Surface the failure as a card instead of losing it in the thread.
            print(f"[pipeline] worker failed: {e}")
            result["articles"] = [
                {"url": source_url or "", "title": "(error)", "summary": f"Failed: {e}"}
            ]
        finally:
            # Always signal completion; otherwise the loop below never exits.
            done.set()

    threading.Thread(target=worker, daemon=True).start()

    tip_idx = 0

    def next_tip():
        nonlocal tip_idx
        pool = tips_ready if tips_ready else fallback_tips
        tip = pool[tip_idx % len(pool)]
        tip_idx += 1
        return tip

    def loading_update():
        # Tips are model-generated text, so escape them before embedding in HTML.
        tip_html = loading_html + f"<div class='tip-line'>ℹ️ {escape(str(next_tip()))}</div>"
        return "⏳ Fetching & summarizing...", tip_html, hide, hide, hide

    # immediate first yield without waiting for Ollama tip generation
    yield loading_update()

    # wait() returns as soon as the worker finishes, so the final render is not
    # held back by a full sleep interval.
    while not done.wait(1.5):
        yield loading_update()

    feed_html = format_articles_html(result.get("articles", []))
    yield "✅ Done", feed_html, hide, hide, hide

def set_source(name: str):
    """Resolve a preset name to ``(status message, url)``.

    Unknown names produce an "invalid source" status and an empty URL.
    """
    url = PRESET_SOURCES.get(name, "")
    if url:
        return f"Source: {name} — {url}", url
    return "❗ Invalid source", ""

# Top-level Gradio app. The CSS styles the feed cards, the loading panel and the
# input rows; selectors refer to the elem_id / elem_classes values set in the
# layout below and to the class names emitted by format_articles_html().
demo = gr.Blocks(css="""
#feed {max-height: 640px; overflow-y: auto;}
.card {border: 1px solid #1f2937; border-radius: 14px; padding: 16px; margin-bottom: 14px; background: linear-gradient(135deg, #0b1220, #0f172a); color: #e6edf3; box-shadow: 0 6px 16px rgba(0,0,0,0.3);}
.profile {display: flex; align-items: center; margin-bottom: 10px; gap: 10px;}
.avatar {width: 42px; height: 42px; border-radius: 50%; background: #111827; border: 1px solid #1f2937; display: flex; align-items: center; justify-content: center; font-weight: 700; color: #9fcef8;}
.meta {display: flex; flex-direction: column;}
.title {font-weight: 700; font-size: 1.05rem; margin: 2px 0;}
.name {font-weight: 600; font-size: 0.9rem; color: #9ca3af;}
.post-content {margin-bottom: 10px; font-size: 0.95rem; color: #e6edf3;}
.bullets {list-style: none; padding-left: 0; margin: 0 0 10px 0;}
.bullets li {margin: 4px 0;}
.interactions {display: flex; gap: 12px; align-items: center; font-size: 0.9rem; color: #9ca3af;}
.interactions button {background: none; border: none; color: #58a6ff; cursor: pointer; padding: 0;}
.interactions button:hover {text-decoration: underline;}
.link {color: #58a6ff; text-decoration: none; font-weight: 600;}
.link:hover {text-decoration: underline;}
.empty {padding: 12px; color: #9ca3af;}
#status {color: #9ca3af; font-style: italic;}
.loading {padding: 16px; border-radius: 14px; background: linear-gradient(135deg, #0b1628, #0f172a); color: #bcd7ff; border: 1px solid #1f2a44; box-shadow: 0 6px 16px rgba(0,0,0,0.35); text-align: center;}
.tip-line {margin-top: 8px; color: #cdd9e5; font-size: 0.95rem;}

/* Input panel styling */
#source_row, #sliders_row, #fetch_row {border: 1px solid #1f2937; border-radius: 14px; padding: 14px 16px; background: linear-gradient(135deg, #0b1220, #0f172a); box-shadow: 0 6px 16px rgba(0,0,0,0.25);}
#sliders_row label {color: #e6edf3; font-weight: 600;}
#fetch_row .gr-button {width: 100%; font-weight: 700; background: linear-gradient(135deg, #2563eb, #1d4ed8); color: white; border: none;}
#fetch_row .gr-button:hover {filter: brightness(1.05);}
#next_btn, .source-btn {width: 100%; background: #111827; color: #e6edf3; border: 1px solid #1f2937;}
#next_btn:hover, .source-btn:hover {filter: brightness(1.05);}

/* Status area */
#status {padding: 8px 0;}
.pre {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; font-size: 0.9rem; color: #cdd9e5; background: #0b1220; padding: 12px; border-radius: 10px; border: 1px solid #1f2937; max-height: 300px; overflow-y: auto;}
""")

# Layout and event wiring for the app.
with demo:
    gr.Markdown("## Short News Feed")
    gr.Markdown("Choose a source and summarize its latest links.")
    # Step 1: preset source buttons (visible from the start).
    with gr.Column(visible=True, elem_id="source_row") as source_row:
        gr.Markdown("Select a preset source:")
        with gr.Row():
            tc_btn = gr.Button("TechCrunch", elem_classes=["source-btn"])
            hn_btn = gr.Button("Hacker News", elem_classes=["source-btn"])
    # Step 2: limits and the run button, hidden until a source is picked.
    with gr.Row(visible=False, elem_id="sliders_row") as sliders_row:
        max_links_in = gr.Slider(5, 50, value=15, step=1, label="Max links to consider")
        max_articles_in = gr.Slider(1, 20, value=8, step=1, label="Max articles to summarize")
    with gr.Row(visible=False, elem_id="fetch_row") as fetch_row:
        run_btn = gr.Button("Fetch & Summarize", variant="primary")
    status = gr.Markdown("", elem_id="status")
    feed = gr.HTML(label="News Feed", elem_id="feed")
    # Holds the URL of the chosen preset between the button click and the run.
    source_url = gr.State("")

    # Picking a source writes the status line and stores the URL; the preset
    # name is passed to set_source through a constant State input.
    tc_btn.click(set_source, inputs=gr.State("TechCrunch"), outputs=[status, source_url])
    hn_btn.click(set_source, inputs=gr.State("Hacker News"), outputs=[status, source_url])

    # Either source button also reveals the sliders and the run button.
    tc_btn.click(lambda: gr.update(visible=True), None, sliders_row)
    tc_btn.click(lambda: gr.update(visible=True), None, fetch_row)
    hn_btn.click(lambda: gr.update(visible=True), None, sliders_row)
    hn_btn.click(lambda: gr.update(visible=True), None, fetch_row)

    # run_pipeline is a generator; each yielded 5-tuple maps onto these outputs
    # in order (status, feed, then the three rows it hides).
    run_btn.click(
        run_pipeline,
        inputs=[source_url, max_links_in, max_articles_in],
        outputs=[status, feed, source_row, sliders_row, fetch_row],
        queue=True,
    )


def launch_demo(**kwargs):
    """Start the Gradio app and return whatever ``launch`` returns.

    Any server left over from a previous run of this cell is closed first on a
    best-effort basis (errors from ``close`` are ignored) to avoid duplicates.
    Keyword arguments are forwarded to ``launch``.
    """
    with contextlib.suppress(Exception):
        demo.close()
    queued_app = demo.queue()
    return queued_app.launch(**kwargs)

# Launches the app when this cell runs; comment this call out to define the UI
# without starting a server.
launch_demo()

* Running on local URL:  http://127.0.0.1:7870
* To create a public link, set `share=True` in `launch()`.




[pipeline] fetching source: https://techcrunch.com
[pipeline] found 88 links
[ollama] filtering 88 links ...
[ollama] raw filter response:
Here is the JSON output with only the URLs that likely contain news, announcements, or blog posts:

```
[
  "https://techcrunch.com/2025/12/20/it-felt-so-wrong-colin-angle-on-irobot-the-ftc-and-the-amazon-deal-that-never-was/",
  "https://techcrunch.com/2025/12/20/new-york-governor-kathy-hochul-signs-raise-act-to-regulate-ai-safety/",
  "https://techcrunch.com/2025/12/19/sequoia-partner-spreads-debunked-brown-shooting-theory-testing-new-leadership/",
  "https://techcrunch.com/2025/12/20/openai-allows-users-to-directly-adjust-chatgpts-warmth-and-enthusiasm/",
  "https://techcrunch.com/2025/12/19/ex-splunk-execs-startup-resolve-ai-hits-1-billion-valuation-with-series-a/",
  "https://techcrunch.com/2025/12/19/elon-musks-56b-tesla-pay-package-restored-by-delaware-supreme
[ollama] kept 6 links after parsing
[pipeline] using 6 filtered links (cap 15)
[oll

### Usage
- Ensure Ollama is running: `ollama serve` and model `llama3.1` is pulled.
- Optional: install Playwright browsers if sites need JS: `python -m playwright install`.
- In this notebook, run the cells in order; the last code cell calls `launch_demo()` to start Gradio.
- Pick a preset source (TechCrunch or Hacker News), adjust the sliders if needed, and click "Fetch & Summarize". The app fetches links, filters to likely articles via Ollama, and shows 5-line summaries.
- If summarization fails for an article, the feed notes the failure on that article's card. If link filtering fails, no links are kept and the feed shows "No articles found."
