# NewsData.io API Test
Testing the NewsData.io API as a fallback source for WSJ articles that could not be crawled.

**Goal**: Search by WSJ title keywords → get description, content, image

In [23]:
import requests
import json
import os
from datetime import datetime

# Read the NewsData.io key from the environment only: a real key must never be
# saved inside the notebook. Export NEWSDATA_API_KEY before starting Jupyter.
API_KEY = os.environ.get("NEWSDATA_API_KEY", "")
if not API_KEY:
    print("WARNING: NEWSDATA_API_KEY is not set; NewsData.io requests will not be authenticated.")
BASE_URL = "https://newsdata.io/api/1/latest"

## 1. Basic API Test — Single Query

In [11]:
# Smoke test: one small query on a well-known topic, to confirm the key works
# and to look at the response envelope.
params = {
    "apikey": API_KEY,
    "q": "Walmart sales revenue",
    "language": "en",
    "size": 1  # max 10 on free tier
}

# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
resp = requests.get(BASE_URL, params=params, timeout=30)
data = resp.json()

print(f"Status: {data.get('status')}")
print(f"Total results: {data.get('totalResults')}")
# NOTE(review): the label below mentions credits, but the value printed is the
# response's nextPage field; no credit figure is read from the response here.
print(f"Credits used (check remaining): nextPage={data.get('nextPage')}")
print(f"\nArticles returned: {len(data.get('results', []))}")

Status: success
Total results: 35
Credits used (check remaining): nextPage=1771477261524099844

Articles returned: 1


In [12]:
# Dump every field of the first returned article to see what the response exposes.
if data.get('results'):
    article = data['results'][0]
    print("=== Available Fields ===")
    for field, value in article.items():
        is_long_text = isinstance(value, str) and len(value) > 200
        if is_long_text:
            # Long strings are cut to 200 characters, preceded by their full length.
            print(f"  {field}: ({len(value)} chars) {value[:200]}...")
        else:
            # Also covers None: formatting None yields the text "None".
            print(f"  {field}: {value}")

=== Available Fields ===
  article_id: 41c7728257d50976861cbe5011a10092
  link: https://www.cnbc.com/2026/02/19/walmart-wmt-q4-2026-earnings.html
  title: Walmart will report earnings before the bell. Here's what to expect
  description: It marks the first earnings report since new Walmart CEO John Furner took over in early February.
  content: ONLY AVAILABLE IN PAID PLANS
  keywords: ['source:tagname:cnbc us source', 'john furner', 'breaking news: earnings', 'donald trump', 'business news', 'e-commerce', 'business', 'united states', 'donald j. trump', 'costco wholesale corp', 'walmart inc', 'articles', 'amazon.com inc', 'home depot inc', 'target corp', 'retail industry', 'retail', 'earnings', 'cnbc', 'doug mcmillon', 'discounters', 'breaking news: business', 'goldman sachs group inc']
  creator: ['melissa repko']
  language: english
  country: ['united states of america']
  category: ['business']
  datatype: news
  pubDate: 2026-02-19 05:01:01
  pubDateTZ: UTC
  fetched_at: 2026-02-19

## 2. Search Strategy Comparison
Test different ways to search for the same WSJ article.
**Key question**: full title vs keywords vs entity-only — which finds the best matches?

In [24]:
def search_newsdata(query: str = "", **kwargs) -> dict:
    """Query the NewsData.io endpoint at BASE_URL and return the decoded JSON.

    Only ONE of q/qInTitle/qInMeta is allowed per request.

    Args:
        query: Free-text search sent as ``q``. Ignored when empty, or when the
            caller already supplied ``q``, ``qInTitle`` or ``qInMeta`` through
            ``kwargs``.
        **kwargs: Extra request parameters, forwarded unchanged. They override
            the defaults ``language="en"`` and ``size=5``.

    Returns:
        The response body as a dict. If the request fails at the transport
        level, or the body is not a JSON object, a synthetic
        ``{"status": "error", "results": {"message": ...}}`` dict is returned
        instead, so callers can keep using their ``status == 'error'`` check.
    """
    params = {
        "apikey": API_KEY,
        "language": "en",
        "size": 5,
        **kwargs,
    }
    # Only set q if no other query filter was passed
    has_query = any(k in params for k in ("q", "qInTitle", "qInMeta"))
    if not has_query and query:
        params["q"] = query

    try:
        # The timeout keeps a stalled connection from hanging a batch loop.
        resp = requests.get(BASE_URL, params=params, timeout=30)
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON (e.g. an HTML error page).
        data = {"status": "error", "results": {"message": f"{type(exc).__name__}: {exc}"}}

    if not isinstance(data, dict):
        data = {"status": "error", "results": {"message": f"unexpected response body: {data!r}"}}

    if data.get('status') == 'error':
        msg = data.get('results', {})
        if isinstance(msg, dict):
            print(f"  API ERROR: {msg.get('message', msg)}")
        else:
            print(f"  API ERROR: {msg}")
    return data

def get_count(data):
    """Return how many results a response holds, or 'ERR' for an error response.

    A missing, empty or non-list ``results`` value counts as 0.
    """
    if data.get('status') == 'error':
        return 'ERR'
    results = data.get('results')
    if results and isinstance(results, list):
        return len(results)
    return 0

def get_best(data):
    """Return a one-line summary of the first result, or None if there is none.

    The summary has the form ``"[IMG] <title, max 75 chars> (<source_name>)"``,
    where the ``IMG`` marker is replaced by three spaces when the article has
    no ``image_url``. None is returned when ``results`` is missing, empty, not
    a list, or its first element is not a dict.
    """
    arts = data.get('results') or []
    if not (isinstance(arts, list) and arts and isinstance(arts[0], dict)):
        return None
    a = arts[0]
    img = "IMG" if a.get('image_url') else "   "
    # dict.get defaults only cover missing keys; a key present with None would
    # make the slice raise TypeError, so fall back with `or` instead.
    title = a.get('title') or ''
    source = a.get('source_name') or '?'
    return f"[{img}] {title[:75]} ({source})"

def show_results(label, data):
    """Print a compact listing of a NewsData.io response and return its articles.

    Args:
        label: Heading printed before the listing.
        data: Response dict as returned by ``search_newsdata``.

    Returns:
        The ``results`` list, or ``[]`` when the response is an error, has no
        results, or carries a non-list ``results`` value.
    """
    if data.get('status') == 'error':
        print(f"\n{label}: ERROR")
        return []
    articles = data.get('results') or []
    if not isinstance(articles, list):
        print(f"\n{label}: unexpected response")
        return []
    print(f"\n{label}: {len(articles)} results")
    for i, a in enumerate(articles):
        if not isinstance(a, dict):
            continue
        img = "IMG" if a.get('image_url') else "   "
        desc_len = len(a.get('description') or '')
        # `or` fallbacks: a key present with None would otherwise break the
        # slices below (dict.get defaults only apply to missing keys).
        title = a.get('title') or '?'
        source = a.get('source_name') or '?'
        pub_day = (a.get('pubDate') or '')[:10]
        print(f"  [{i+1}] [{img}] {title[:85]}")
        print(f"       {source} | desc: {desc_len} chars | {pub_day}")
    if not articles:
        print("  (no results)")
    return articles

In [21]:
# === Strategy comparison for "Walmart Sales Climb, Driven by Grocery and Online Gains" ===
# One request per strategy, ~8 credits total.

title = "Walmart Sales Climb, Driven by Grocery and Online Gains"
print(f"WSJ Title: {title}\n")

# (label, positional query, extra request parameters), run from top to bottom.
strategy_runs = [
    # A: q = full title (default AND between words)
    ("A) q = full title", title, {}),
    # B: qInTitle = key words (search in TITLE only, more precise)
    ("B) qInTitle = Walmart sales", "", {"qInTitle": "Walmart sales"}),
    # C: qInTitle + image filter
    ("C) qInTitle + image=1", "", {"qInTitle": "Walmart sales", "image": 1}),
    # D: q = company + topic, with top domains + dedup
    ("D) q + prioritydomain=top + dedup", "Walmart sales grocery",
     {"prioritydomain": "top", "removeduplicate": 1}),
    # E: q with timeframe (last 48h)
    ("E) q + timeframe=48", "Walmart sales", {"timeframe": 48}),
    # F: q + category filter
    ("F) q + category=business", "Walmart sales", {"category": "business"}),
    # G: qInTitle with AND operator
    ("G) qInTitle = Walmart AND earnings", "", {"qInTitle": "Walmart AND earnings"}),
    # H: q = full title + relevancy sort + dedup + image
    ("H) full title + relevancy + dedup + image", title,
     {"sort": "relevancy", "removeduplicate": 1, "image": 1}),
]

for run_label, run_query, run_options in strategy_runs:
    shown = show_results(run_label, search_newsdata(run_query, **run_options))

# Final expression: echoes the article list of the last strategy.
shown

WSJ Title: Walmart Sales Climb, Driven by Grocery and Online Gains


A) q = full title: 0 results
  (no results)

B) qInTitle = Walmart sales: 2 results
  [1] [   ] Amazon dethrones Walmart as the world's biggest company by sales
       Stlpr | desc: 110 chars | 2026-02-19
  [2] [IMG] Amazon dethrones Walmart as the world's biggest company by sales
       Wuga | University Of Georgia | Your Oasis For Idea | desc: 110 chars | 2026-02-19

C) qInTitle + image=1: 1 results
  [1] [IMG] Amazon dethrones Walmart as the world's biggest company by sales
       Wuga | University Of Georgia | Your Oasis For Idea | desc: 110 chars | 2026-02-19

D) q + prioritydomain=top + dedup: 2 results
  [1] [IMG] Walmart earnings: CEO John Furner set for first earnings report with company market c
       Yahoo! News | desc: 111 chars | 2026-02-18
  [2] [IMG] As Walmart and Target head in different directions, all eyes are on their new CEOs
       Cnbc | desc: 127 chars | 2026-02-18
  API ERROR: {'message': 'Ac

[]

In [25]:
import time

# WSJ headlines used as the full-title test set.
titles = [
    "Walmart Sales Climb, Driven by Grocery and Online Gains",
    "Klarna Swings to Loss Despite Jump in Revenue",
    "U.S. Trade Deficit Grew in December",
    "Etsy Posts Lower Profit Despite Higher Revenue",
    "Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal",
    "Former Prince Andrew Arrested Amid Epstein Probe",
    "Wayfair Loss Narrows as Revenue Rises on U.S. Growth",
    "Rio Tinto Spotlights Growth Plans After Failed Glencore Megadeal Talks",
    "Elizabeth Warren Has Questions About the Shake-Up Inside the Fed\u2019s Banking Regulator",
    "Nestle to Shed Ice-Cream Business in Shakeup",
]

# Send each FULL TITLE through the three search modes, one request at a time.
print("=== Full title: qInMeta vs q vs qInTitle (~30 credits) ===")
print("(1 second delay between calls to avoid rate limit)\n")

search_modes = ("qInMeta", "q", "qInTitle")

for title in titles:
    print(f"\nWSJ: {title}")

    responses = {}
    for mode in search_modes:
        if mode == "q":
            # Plain q goes through the positional query argument.
            responses[mode] = search_newsdata(title, removeduplicate=1, size=2)
        else:
            responses[mode] = search_newsdata(**{mode: title}, removeduplicate=1, size=2)
        time.sleep(1)

    counts = {mode: get_count(response) for mode, response in responses.items()}
    print(f"  qInMeta: {counts['qInMeta']} | q: {counts['q']} | qInTitle: {counts['qInTitle']}")

    # Report the first mode (in request order) that produced a usable top hit.
    best_hit = None
    for mode in search_modes:
        summary = get_best(responses[mode])
        if summary:
            best_hit = (mode, summary)
            break

    if best_hit is None:
        print(f"  → ALL ZERO")
    else:
        print(f"  Best ({best_hit[0]}): {best_hit[1]}")

=== Full title: qInMeta vs q vs qInTitle (~30 credits) ===
(1 second delay between calls to avoid rate limit)


WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  qInMeta: 0 | q: 0 | qInTitle: 0
  → ALL ZERO

WSJ: Klarna Swings to Loss Despite Jump in Revenue
  qInMeta: 0 | q: 0 | qInTitle: 0
  → ALL ZERO

WSJ: U.S. Trade Deficit Grew in December
  qInMeta: 0 | q: 2 | qInTitle: 0
  Best (q): [   ] Treasuries Move Lower Following Upbeat Data, Fed Minutes (Rttnews)

WSJ: Etsy Posts Lower Profit Despite Higher Revenue
  qInMeta: 0 | q: 0 | qInTitle: 0
  → ALL ZERO

WSJ: Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal
  qInMeta: 0 | q: 0 | qInTitle: 0
  → ALL ZERO

WSJ: Former Prince Andrew Arrested Amid Epstein Probe
  qInMeta: 1 | q: 1 | qInTitle: 1
  Best (qInMeta): [IMG] Former British Prince Andrew arrested on suspicion of misconduct in public  (Ani (asian News International))

WSJ: Wayfair Loss Narrows as Revenue Rises on U.S. Growth
  API ERROR: Rate limit exceed

KeyboardInterrupt: 

In [None]:
# === Strategy 2: qInTitle with AND operator ===
# Each WSJ title is mapped to a short hand-written "X AND Y" query that is sent
# through qInTitle.
# NOTE(review): the dict is still named or_queries although every query below
# uses AND; presumably a leftover from an OR-based draft. The name is kept so
# other cells that may reference it keep working.

or_queries = {
    "Walmart Sales Climb, Driven by Grocery and Online Gains": "Walmart AND earnings",
    "Klarna Swings to Loss Despite Jump in Revenue": "Klarna AND revenue",
    "U.S. Trade Deficit Grew in December": "trade AND deficit",
    "Etsy Posts Lower Profit Despite Higher Revenue": "Etsy AND earnings",
    "Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal": "Hims AND Eucalyptus",
    "Former Prince Andrew Arrested Amid Epstein Probe": "Andrew AND Epstein",
    "Wayfair Loss Narrows as Revenue Rises on U.S. Growth": "Wayfair AND revenue",
    "Rio Tinto Spotlights Growth Plans After Failed Glencore Megadeal Talks": "Rio Tinto AND Glencore",
    "Elizabeth Warren Has Questions About the Shake-Up Inside the Fed\u2019s Banking Regulator": "Warren AND Fed AND regulator",
    "Nestle to Shed Ice-Cream Business in Shakeup": "Nestle AND ice cream",
}

print("=== Strategy 2: qInTitle with AND operator (~10 credits) ===\n")
# WSJ title -> list of matching articles (empty list when nothing came back).
results_qintitle = {}
for title, query in or_queries.items():
    print(f"\nWSJ: {title}")
    print(f"qInTitle: {query}")
    data = search_newsdata("", qInTitle=query, removeduplicate=1, size=3)
    articles = data.get('results') or []
    if not isinstance(articles, list):
        # An error response carries a non-list payload under 'results'.
        articles = []
    results_qintitle[title] = articles

    if articles:
        for i, a in enumerate(articles):
            if not isinstance(a, dict):
                continue
            img = "IMG" if a.get('image_url') else "   "
            # `or` fallbacks: dict.get defaults do not cover keys present with
            # None, which would make the title slice raise TypeError.
            shown_title = (a.get('title') or '?')[:80]
            source = a.get('source_name') or '?'
            print(f"  [{i+1}] [{img}] {shown_title}")
            print(f"       {source} | desc: {len(a.get('description') or '')} chars")
    else:
        print("  (no results)")
    time.sleep(0.3)

In [None]:
# === Compare both strategies ===
# Prints one YES/NO row per WSJ title for strategy 1 (entity) and strategy 2
# (qInTitle), then the match rate and image rate of each strategy.
# NOTE(review): `entity_queries` and `results_entity` are not assigned in any
# cell above this one, so this cell raises NameError on a fresh kernel. Confirm
# where the strategy 1 cell went before relying on this comparison.
print(f"{'WSJ Title':<55} {'S1 (entity)':>11} {'S2 (qInTitle)':>13}")
print("-" * 82)

# Counters: titles with at least one article / with at least one article image.
s1_found = s2_found = s1_img = s2_img = 0
for title in entity_queries:
    e_arts = results_entity.get(title, [])
    q_arts = results_qintitle.get(title, [])
    e_ok = "YES" if e_arts else "NO"
    q_ok = "YES" if q_arts else "NO"
    e_has_img = any(isinstance(a, dict) and a.get('image_url') for a in e_arts)
    q_has_img = any(isinstance(a, dict) and a.get('image_url') for a in q_arts)
    if e_arts: s1_found += 1
    if q_arts: s2_found += 1
    if e_has_img: s1_img += 1
    if q_has_img: s2_img += 1
    
    # Titles longer than the 55-character column are cut to 52 characters plus "...".
    short = title[:52] + "..." if len(title) > 55 else title
    print(f"{short:<55} {e_ok:>11} {q_ok:>13}")

# NOTE(review): an empty entity_queries makes total 0 and the divisions below fail.
total = len(entity_queries)
print(f"\nMatch rate:  {s1_found}/{total} ({s1_found/total*100:.0f}%)     {s2_found}/{total} ({s2_found/total*100:.0f}%)")
print(f"Image rate:  {s1_img}/{total} ({s1_img/total*100:.0f}%)     {s2_img}/{total} ({s2_img/total*100:.0f}%)")

In [None]:
# Show full details for the top article of every title that matched.
# NOTE(review): `all_results` is not assigned in any cell above this one.
# Confirm which result mapping (e.g. results_qintitle) it is meant to be.
for title, articles in all_results.items():
    if not articles:
        continue
    best = articles[0]
    # dict.get defaults only apply to missing keys; a key present with None
    # would make the slices below raise TypeError, so fall back with `or`.
    pub_day = (best.get('pubDate') or '')[:10]
    top_keywords = (best.get('keywords') or [])[:5]
    print(f"\n{'='*70}")
    print(f"WSJ:  {title}")
    print(f"API:  {best.get('title')}")
    print(f"Source: {best.get('source_name')} | Date: {pub_day}")
    print(f"\nDescription ({len(best.get('description') or '')} chars):")
    print(f"  {best.get('description', 'None')}")
    print(f"\nImage: {best.get('image_url', 'None')}")
    print(f"Link:  {best.get('link', 'None')}")
    print(f"Keywords: {top_keywords}")

## 4. Credit Usage
Free tier: 200 credits/day, 10 articles/request

In [None]:
# Back-of-envelope credit budget: one search per uncovered item, checked
# against the 200 credits/day of the free tier.
daily_gap_avg = 35

if daily_gap_avg <= 200:
    verdict = 'Sufficient'
else:
    verdict = 'NOT enough'

print(f"Daily uncovered items (avg): {daily_gap_avg}")
print(f"Credits needed: {daily_gap_avg} (1 per search)")
print(f"Credits available: 200/day")
print(f"Remaining: {200 - daily_gap_avg}")
print(f"\nVerdict: {verdict}")

---
# GNews API Test
Testing GNews as alternative fallback API.

**Free tier**: 100 req/day, content (truncated), image, description
**Key advantage over NewsData.io**: `in` param searches title+description+content simultaneously, date filter free

In [26]:
import time

# Read the GNews key from the environment only: a real key must never be saved
# inside the notebook. Export GNEWS_API_KEY before starting Jupyter.
GNEWS_KEY = os.environ.get("GNEWS_API_KEY", "")
if not GNEWS_KEY:
    print("WARNING: GNEWS_API_KEY is not set; GNews requests will not be authenticated.")
GNEWS_URL = "https://gnews.io/api/v4/search"

def search_gnews(query: str, **kwargs) -> dict:
    """Search GNews API. Supports AND/OR/NOT operators, phrase search with quotes.

    Args:
        query: Search expression sent as ``q``.
        **kwargs: Extra request parameters, forwarded unchanged. They override
            the defaults ``lang="en"`` and ``max=5``.

    Returns:
        The response body as a dict. If the request fails at the transport
        level, or the body is not a JSON object, a synthetic
        ``{"errors": [...]}`` dict is returned instead, so callers can keep
        using their ``"errors" in data`` check.
    """
    params = {
        "apikey": GNEWS_KEY,
        "q": query,
        "lang": "en",
        "max": 5,
        **kwargs,
    }
    try:
        # The timeout keeps a stalled connection from hanging a batch loop.
        resp = requests.get(GNEWS_URL, params=params, timeout=30)
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON (e.g. an HTML error page).
        data = {"errors": [f"{type(exc).__name__}: {exc}"]}

    if not isinstance(data, dict):
        data = {"errors": [f"unexpected response body: {data!r}"]}

    if "errors" in data:
        print(f"  API ERROR: {data['errors']}")
    return data

def show_gnews(label, data):
    """Print a compact listing of a GNews response and return its articles.

    Args:
        label: Heading printed before the listing.
        data: Response dict as returned by ``search_gnews``.

    Returns:
        The ``articles`` list, or ``[]`` when the response carries ``errors``
        or has no articles.
    """
    if "errors" in data:
        print(f"\n{label}: ERROR — {data['errors']}")
        return []
    # `or` fallback: an explicit None under "articles" must not reach len().
    articles = data.get("articles") or []
    total = data.get("totalArticles", 0)
    print(f"\n{label}: {len(articles)} results (total: {total})")
    for i, a in enumerate(articles):
        if not isinstance(a, dict):
            continue
        img = "IMG" if a.get("image") else "   "
        desc_len = len(a.get("description") or "")
        content_len = len(a.get("content") or "")
        # dict.get defaults only cover missing keys; fields present with None
        # would break the nested lookup and the slices below.
        src = (a.get("source") or {}).get("name") or "?"
        title = a.get("title") or "?"
        pub_day = (a.get("publishedAt") or "")[:10]
        print(f"  [{i+1}] [{img}] {title[:85]}")
        print(f"       {src} | desc: {desc_len} | content: {content_len} chars | {pub_day}")
    if not articles:
        print("  (no results)")
    return articles

# Quick test: a single request to confirm the key and the response shape.
print("=== GNews Quick Test ===")
quick_query = "Walmart earnings"
data = search_gnews(quick_query)
# Final expression: the returned article list is echoed as the cell output.
show_gnews(f"GNews: {quick_query}", data)

=== GNews Quick Test ===

GNews: Walmart earnings: 5 results (total: 661)
  [1] [IMG] Walmart (WMT) Q4 2026 earnings
       CNBC | desc: 97 | content: 266 chars | 2026-02-19
  [2] [IMG] U.S. Stock Futures Hold Steady as Investors Await Walmart Earnings
       Markets Insider | desc: 147 | content: 265 chars | 2026-02-18
  [3] [IMG] Walmart, Target earnings put focus on new CEOs Furner, Fiddelke
       CNBC | desc: 127 | content: 266 chars | 2026-02-18
  [4] [IMG] Here's How Much Walmart Stock Is Expected to Move After Earnings Thursday
       Investopedia | desc: 163 | content: 266 chars | 2026-02-18
  [5] [IMG] Walmart Earnings Preview: 45x Multiple Puts Pressure on Q4 Numbers
       Investing.com | desc: 82 | content: 266 chars | 2026-02-18


[{'id': '41c7728257d50976861cbe5011a10092',
  'title': 'Walmart (WMT) Q4 2026 earnings',
  'description': 'It marks the first earnings report since new Walmart CEO John Furner took over in early February.',
  'content': "Walmart is expected to report its fiscal fourth-quarter earnings on Thursday morning as the retail giant and its new CEO chase a future fueled more by digital businesses and artificial intelligence.\nHere's what Wall Street analysts surveyed by LSEG a... [2678 chars]",
  'url': 'https://www.cnbc.com/2026/02/19/walmart-wmt-q4-2026-earnings.html',
  'image': 'https://image.cnbcfm.com/api/v1/image/108229118-17636478422025-11-20t135644z_2028602658_rc2gieaot198_rtrmadp_0_walmart-results-nasdaq.jpeg?v=1763647862&w=1920&h=1080',
  'publishedAt': '2026-02-19T05:01:01Z',
  'lang': 'en',
  'source': {'id': '29e94cdd2d4f6a3ca7108bb0293087c6',
   'name': 'CNBC',
   'url': 'https://www.cnbc.com',
   'country': 'us'}},
 {'id': '6824b87c4887a8078084f8722f2360cb',
  'title': 'U.S. Sto

In [27]:
# Dump every field of the first GNews article, then a few summary metrics.
articles = data.get("articles", [])
if articles:
    a = articles[0]
    print("=== GNews Article Fields ===")
    for field, value in a.items():
        if isinstance(value, dict):
            # Nested objects (such as the source block) are shown as JSON.
            rendered = json.dumps(value)
        elif isinstance(value, str) and len(value) > 300:
            # Long strings are cut to 300 characters, preceded by their full length.
            rendered = f"({len(value)} chars) {value[:300]}..."
        else:
            rendered = f"{value}"
        print(f"  {field}: {rendered}")

    content_text = a.get('content') or ''
    description_text = a.get('description') or ''
    print(f"\n=== Key Metrics ===")
    print(f"Content length: {len(content_text)} chars")
    print(f"Description length: {len(description_text)} chars")
    print(f"Has image: {bool(a.get('image'))}")
    print(f"Source: {a.get('source', {}).get('name', '?')}")

=== GNews Article Fields ===
  id: 41c7728257d50976861cbe5011a10092
  title: Walmart (WMT) Q4 2026 earnings
  description: It marks the first earnings report since new Walmart CEO John Furner took over in early February.
  content: Walmart is expected to report its fiscal fourth-quarter earnings on Thursday morning as the retail giant and its new CEO chase a future fueled more by digital businesses and artificial intelligence.
Here's what Wall Street analysts surveyed by LSEG a... [2678 chars]
  url: https://www.cnbc.com/2026/02/19/walmart-wmt-q4-2026-earnings.html
  image: https://image.cnbcfm.com/api/v1/image/108229118-17636478422025-11-20t135644z_2028602658_rc2gieaot198_rtrmadp_0_walmart-results-nasdaq.jpeg?v=1763647862&w=1920&h=1080
  publishedAt: 2026-02-19T05:01:01Z
  lang: en
  source: {"id": "29e94cdd2d4f6a3ca7108bb0293087c6", "name": "CNBC", "url": "https://www.cnbc.com", "country": "us"}

=== Key Metrics ===
Content length: 266 chars
Description length: 97 chars
Has image: Tr

## GNews Strategy Comparison
Test the same 10 WSJ titles with different search strategies.

**GNews advantages over NewsData.io:**
- `in` param: search in `title`, `description`, `content` (combinable!)
- `from`/`to` date filter on free tier
- `sortby=relevance` on free tier
- Content included (truncated on the free tier; the sample responses in this notebook carry 265–266 chars of content)

In [33]:
# === GNews Batch Test (v3): Entity + Keyword ===
# Full title search fails (AND logic = too many words)
# Instead: manually extract entity + keyword (a few words) per title
# One request per title, 3 second delay between requests

import re
from datetime import timedelta, timezone

titles_and_queries = [
    ("Walmart Sales Climb, Driven by Grocery and Online Gains",        "Walmart sales grocery"),
    ("Klarna Swings to Loss Despite Jump in Revenue",                  "Klarna loss revenue"),
    ("U.S. Trade Deficit Grew in December",                            "US trade deficit"),
    ("Etsy Posts Lower Profit Despite Higher Revenue",                 "Etsy profit revenue"),
    ("Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal",       "Hims Hers Eucalyptus"),
    ("Former Prince Andrew Arrested Amid Epstein Probe",               "Prince Andrew arrested Epstein"),
    ("Wayfair Loss Narrows as Revenue Rises on U.S. Growth",          "Wayfair loss revenue"),
    ("Rio Tinto Spotlights Growth Plans After Failed Glencore Megadeal Talks", "Rio Tinto Glencore"),
    ("Elizabeth Warren Has Questions About the Shake-Up Inside the Fed's Banking Regulator", "Elizabeth Warren Fed regulator"),
    ("Nestle to Shed Ice-Cream Business in Shakeup",                   "Nestle ice cream business"),
]

# 2-day window — these WSJ articles are from ~Feb 17-19
# The window is relative to the time the cell runs. The timestamps end in "Z"
# (UTC), so they are derived from the current UTC time, not the local clock.
now_utc = datetime.now(timezone.utc)
from_date = (now_utc - timedelta(days=2)).strftime("%Y-%m-%dT00:00:00Z")
to_date = now_utc.strftime("%Y-%m-%dT23:59:59Z")

# WSJ title -> {"articles": [...], "query": "..."}; articles is empty on no match or API error.
gnews_results = {}
total = len(titles_and_queries)

print("=== GNews: Entity + Keyword (2-day window, 3s delay) ===")
print(f"from={from_date}  to={to_date}\n")

for i, (title, query) in enumerate(titles_and_queries):
    print(f"\n[{i+1}/{total}] WSJ: {title}")
    print(f"  Query: \"{query}\"")

    # "in" and "from" are Python keywords, so these parameters go through a dict.
    data = search_gnews(query, sortby="relevance", **{
        "in": "title,description,content",
        "from": from_date,
        "to": to_date,
        "max": 3,
    })

    # Error responses are treated as "no articles" for this title.
    arts = (data.get("articles") or []) if "errors" not in data else []

    if arts:
        print(f"  -> {len(arts)} results (total: {data.get('totalArticles', '?')})")
        for j, a in enumerate(arts):
            img = "IMG" if a.get("image") else "   "
            # `or` fallbacks keep a missing or None field from raising here.
            src = (a.get("source") or {}).get("name") or "?"
            clen = len(a.get("content") or "")
            shown_title = (a.get("title") or "?")[:80]
            pub_day = (a.get("publishedAt") or "")[:10]
            print(f"  [{j+1}] [{img}] {shown_title}")
            print(f"       {src} | content:{clen}ch | {pub_day}")
    else:
        print(f"  -> 0 results")
    gnews_results[title] = {"articles": arts, "query": query}

    # No need to wait after the last request.
    if i < total - 1:
        time.sleep(3)

# Summary
found = sum(1 for v in gnews_results.values() if v["articles"])
has_img = sum(1 for v in gnews_results.values()
              if v["articles"] and v["articles"][0].get("image"))
has_content = sum(1 for v in gnews_results.values()
                  if v["articles"] and len(v["articles"][0].get("content") or "") > 50)
print(f"\n{'='*60}")
print(f"GNews entity+keyword (2-day): {found}/{total} matched ({found/total*100:.0f}%)")
print(f"With image:   {has_img}/{total}")
print(f"With content: {has_content}/{total}")

=== GNews: Entity + Keyword (2-day window, 3s delay) ===
from=2026-02-17T00:00:00Z  to=2026-02-19T23:59:59Z


[1/10] WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  Query: "Walmart sales grocery"
  -> 3 results (total: 10)
  [1] [IMG] Walmart, Dollar Stores Beat Traditional Grocers' Reach As Affordability Crunch I
       Benzinga | content:266ch | 2026-02-18
  [2] [IMG] Walmart, Target earnings put focus on new CEOs Furner, Fiddelke
       CNBC | content:266ch | 2026-02-18
  [3] [IMG] Mega Millions Tuesday, Feb. 17, Mardi Gras jackpot at $395 million
       Gainesville Sun | content:266ch | 2026-02-17

[2/10] WSJ: Klarna Swings to Loss Despite Jump in Revenue
  Query: "Klarna loss revenue"
  -> 0 results

[3/10] WSJ: U.S. Trade Deficit Grew in December
  Query: "US trade deficit"
  -> 3 results (total: 126)
  [1] [IMG] Trump claims 78 per cent drop in US trade deficit; credits tariffs for economic 
       The Tribune | content:266ch | 2026-02-19
  [2] [IMG] Trump says ta

In [42]:
# === Show detailed results for GNews matches ===
# Focus on content quality — this is what we'll use for AI summarization

RULE = '=' * 70

for title, result in gnews_results.items():
    if not result["articles"]:
        continue
    a = result["articles"][0]  # top-ranked article only

    desc = a.get("description") or ""
    content = a.get("content") or ""
    ellipsis = '...' if len(content) > 2000 else ''

    print(f"\n{RULE}")
    print(f"WSJ:     {title}")
    print(f"Query:   {result.get('query', '?')}")
    print(f"API:     {a.get('title')}")
    print(f"Source:  {a.get('source', {}).get('name', '?')}")
    print(f"Date:    {a.get('publishedAt', '')}")

    print(f"\nDescription ({len(desc)} chars):")
    print(f"  {desc[:500]}")
    print(f"\nContent ({len(content)} chars):")
    print(f"  {content[:2000]}{ellipsis}")
    print(f"\nImage:  {a.get('image', 'None')}")
    print(f"URL:    {a.get('url', 'None')}")

# Also show which titles got NO results
no_match = []
for t, r in gnews_results.items():
    if not r["articles"]:
        no_match.append(t)

if no_match:
    print(f"\n{RULE}")
    print(f"NO RESULTS ({len(no_match)}/{len(gnews_results)}):")
    for t in no_match:
        print(f"  - {t}")
        print(f"    Query: \"{gnews_results[t].get('query', '?')}\"")



WSJ:     Walmart Sales Climb, Driven by Grocery and Online Gains
Query:   Walmart sales grocery
API:     Walmart, Dollar Stores Beat Traditional Grocers' Reach As Affordability Crunch Intensifies, Report Finds
Source:  Benzinga
Date:    2026-02-18T13:28:47Z

Description (219 chars):
  As financial insecurity among Americans aged 18-54 continues to rise, Walmart (NYSE: WMT) has seen its grocery penetration in the U.S. reach a record 72%, according to the latest dunnhumby Consumer Trends Tracker (CTT).

Content (266 chars):
  A new Consumer Trends Tracker from data analytics firm dunnhumby released on Wednesday shows that more U.S. households are reorganizing where they shop in response to tightening budgets.
Mass-market retailers and dollar stores are drawing more custom... [2582 chars]

Image:  https://cdn.benzinga.com/files/images/story/2026/02/18/Nov-23--2019-Mountain-View-----Ca----Usa.jpeg?width=1200&height=800&fit=crop
URL:    https://www.benzinga.com/news/retail-sales/26/02/5068

In [43]:
# Echo the raw gnews_results mapping for manual inspection.
# NOTE(review): this displays every stored article payload in full; consider a
# trimmed view before sharing the notebook.
gnews_results

{'Walmart Sales Climb, Driven by Grocery and Online Gains': {'articles': [{'id': '781a9c5ba3d4e817b030cb7d9353dda1',
    'title': "Walmart, Dollar Stores Beat Traditional Grocers' Reach As Affordability Crunch Intensifies, Report Finds",
    'description': 'As financial insecurity among Americans aged 18-54 continues to rise, Walmart (NYSE: WMT) has seen its grocery penetration in the U.S. reach a record 72%, according to the latest dunnhumby Consumer Trends Tracker (CTT).',
    'content': 'A new Consumer Trends Tracker from data analytics firm dunnhumby released on Wednesday shows that more U.S. households are reorganizing where they shop in response to tightening budgets.\nMass-market retailers and dollar stores are drawing more custom... [2582 chars]',
    'url': 'https://www.benzinga.com/news/retail-sales/26/02/50683948/walmart-dollar-stores-beat-traditional-grocers-reach-as-affordability-crunch-intensifies-report-finds',
    'image': 'https://cdn.benzinga.com/files/images/story/20

In [45]:
# === NewsData.io: Entity + Keyword (same queries as GNews) ===
# One request per title, 3 second delay (rate limit: 30/15min)

titles_and_queries_nd = [
    ("Walmart Sales Climb, Driven by Grocery and Online Gains",        "Walmart sales grocery"),
    ("Klarna Swings to Loss Despite Jump in Revenue",                  "Klarna loss revenue"),
    ("U.S. Trade Deficit Grew in December",                            "US trade deficit"),
    ("Etsy Posts Lower Profit Despite Higher Revenue",                 "Etsy profit revenue"),
    ("Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal",       "Hims Hers Eucalyptus"),
    ("Former Prince Andrew Arrested Amid Epstein Probe",               "Prince Andrew arrested Epstein"),
    ("Wayfair Loss Narrows as Revenue Rises on U.S. Growth",          "Wayfair loss revenue"),
    ("Rio Tinto Spotlights Growth Plans After Failed Glencore Megadeal Talks", "Rio Tinto Glencore"),
    ("Elizabeth Warren Has Questions About the Shake-Up Inside the Fed's Banking Regulator", "Elizabeth Warren Fed regulator"),
    ("Nestle to Shed Ice-Cream Business in Shakeup",                   "Nestle ice cream business"),
]

# WSJ title -> {"articles": [...], "query": "..."}; articles is empty on no match or API error.
nd_results = {}
total = len(titles_and_queries_nd)

print("=== NewsData.io: Entity + Keyword (3s delay) ===\n")

for i, (title, query) in enumerate(titles_and_queries_nd):
    print(f"\n[{i+1}/{total}] WSJ: {title}")
    print(f"  Query: \"{query}\"")

    data = search_newsdata(query, removeduplicate=1, size=3)

    arts = data.get('results') or []
    if not isinstance(arts, list):
        # An error response carries a non-list payload under 'results'.
        arts = []

    if arts:
        print(f"  -> {len(arts)} results (total: {data.get('totalResults', '?')})")
        for j, a in enumerate(arts):
            if not isinstance(a, dict):
                continue
            img = "IMG" if a.get("image_url") else "   "
            # `or` fallbacks: dict.get defaults do not cover keys present with
            # None, which would make the slices below raise TypeError.
            src = a.get("source_name") or "?"
            dlen = len(a.get("description") or "")
            shown_title = (a.get('title') or '?')[:80]
            pub_day = (a.get('pubDate') or '')[:10]
            print(f"  [{j+1}] [{img}] {shown_title}")
            print(f"       {src} | desc:{dlen}ch | {pub_day}")
    else:
        print(f"  -> 0 results")
    nd_results[title] = {"articles": arts, "query": query}

    # No need to wait after the last request.
    if i < total - 1:
        time.sleep(3)

# Summary
found = sum(1 for v in nd_results.values() if v["articles"])
has_img = sum(1 for v in nd_results.values()
              if v["articles"] and isinstance(v["articles"][0], dict) and v["articles"][0].get("image_url"))
print(f"\n{'='*60}")
print(f"NewsData.io entity+keyword: {found}/{total} matched ({found/total*100:.0f}%)")
print(f"With image: {has_img}/{total}")
print(f"\n--- Compare with GNews ---")
# Relies on gnews_results from the GNews batch cell having been run first.
gfound = sum(1 for v in gnews_results.values() if v["articles"])
print(f"GNews:      {gfound}/{total} matched ({gfound/total*100:.0f}%)")
print(f"NewsData:   {found}/{total} matched ({found/total*100:.0f}%)")

=== NewsData.io: Entity + Keyword (3s delay) ===


[1/10] WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  Query: "Walmart sales grocery"
  -> 3 results (total: 6)
  [1] [IMG] 11 Best Natural and Organic Food Stocks to Buy Now
       Insider Trading | desc:307ch | 2026-02-18
  [2] [IMG] Target, Walmart traffic numbers trending in different directions: Placer.ai
       Supermarket News | desc:81ch | 2026-02-18
  [3] [IMG] XRT Is Up 11% But the Real Story Is Which Retailers Are Winning
       24/7 Wall St | desc:565ch | 2026-02-18

[2/10] WSJ: Klarna Swings to Loss Despite Jump in Revenue
  Query: "Klarna loss revenue"
  -> 2 results (total: 2)
  [1] [IMG] The Vogue Business Funding Tracker
       Vogue | desc:98ch | 2026-02-19
  [2] [   ] Buy Now Pay Later Market Expected to Reach $3.98 Trillion By 2030 | Trends, Sect
       Openpr.com | desc:255ch | 2026-02-18

[3/10] WSJ: U.S. Trade Deficit Grew in December
  Query: "US trade deficit"
  -> 3 results (total: 95)
  [1] [IM

## Enriched Test: ALL WSJ RSS Metadata → API Parameters

Using `feed_name`, `subcategory`, `published_at`, `description` from `wsj_items` table to maximize API filter precision.

**Strategy**: entity+keyword query + category filter + tight date window + description keywords

In [48]:
# === Enriched Test Setup: WSJ RSS Metadata ===
from datetime import datetime, timedelta

# WSJ items with FULL metadata (from DB query)
def _wsj_item(title, feed_name, subcategory, description, published_at, query_base):
    """Build one WSJ item dict from its six metadata fields (key order is fixed)."""
    return {
        "title": title,
        "feed_name": feed_name,
        "subcategory": subcategory,
        "description": description,
        "published_at": published_at,
        "query_base": query_base,
    }


# Argument order per row: title, feed_name, subcategory, description
# (truncated in the source data), published_at, query_base.
wsj_items_enriched = [
    _wsj_item(
        "Walmart Sales Climb, Driven by Grocery and Online Gains",
        "BUSINESS_MARKETS",
        "retail",
        "The retailer continued to attract more shoppers of all incom...",
        "2026-02-19T13:53:00Z",
        "Walmart sales grocery",
    ),
    _wsj_item(
        "Klarna Swings to Loss Despite Jump in Revenue",
        "BUSINESS_MARKETS",
        "earnings",
        "Klarna swung to a loss in the fourth quarter despite reporti...",
        "2026-02-19T13:41:00Z",
        "Klarna loss revenue",
    ),
    _wsj_item(
        "U.S. Trade Deficit Grew in December",
        "ECONOMY",
        "trade",
        "The U.S. trade deficit jumped in December, the latest leg of...",
        "2026-02-19T13:56:00Z",
        "US trade deficit",
    ),
    _wsj_item(
        "Etsy Posts Lower Profit Despite Higher Revenue",
        "BUSINESS_MARKETS",
        "retail",
        "Etsy logged lower profit in the fourth quarter, despite an u...",
        "2026-02-19T12:53:00Z",
        "Etsy profit revenue",
    ),
    _wsj_item(
        "Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal",
        "BUSINESS_MARKETS",
        "deals",
        "Hims & Hers Health has agreed to acquire digital health comp...",
        "2026-02-19T12:31:00Z",
        "Hims Hers Eucalyptus",
    ),
    _wsj_item(
        "Former Prince Andrew Arrested Amid Epstein Probe",
        "WORLD",
        "uk",
        "King Charles said the British royal family supports a full i...",
        "2026-02-19T13:04:00Z",
        "Prince Andrew arrested Epstein",
    ),
    _wsj_item(
        "Wayfair Loss Narrows as Revenue Rises on U.S. Growth",
        "BUSINESS_MARKETS",
        "earnings",
        "Wayfair narrowed its fourth-quarter loss as revenue rose, dr...",
        "2026-02-19T12:11:00Z",
        "Wayfair loss revenue",
    ),
    _wsj_item(
        "Rio Tinto Spotlights Growth Plans After Failed Glencore Megadeal Talks",
        "BUSINESS_MARKETS",
        "earnings",
        "Rio Tinto on Thursday reported a 14% drop in annual net prof...",
        "2026-02-19T12:11:00Z",
        "Rio Tinto Glencore",
    ),
    _wsj_item(
        "Elizabeth Warren Has Questions About the Shake-Up Inside the Fed's Banking Regulator",
        "BUSINESS_MARKETS",
        "regulation",
        "Letters to the Fed's Michelle Bowman ask for information abo...",
        "2026-02-19T13:57:00Z",
        "Elizabeth Warren Fed regulator",
    ),
    _wsj_item(
        "Nestle to Shed Ice-Cream Business in Shakeup",
        "BUSINESS_MARKETS",
        "earnings",
        "Nestle reported better than expected fourth-quarter sales gr...",
        "2026-02-19T12:11:00Z",
        "Nestle ice cream business",
    ),
]

# === Feed name → API category mapping ===
# GNews topic name for each WSJ feed.
FEED_TO_GNEWS_TOPIC = dict(
    ECONOMY="nation",
    WORLD="world",
    BUSINESS_MARKETS="business",
    TECH="technology",
)

# NewsData.io category for each WSJ feed: same table as above except that
# ECONOMY is filed under "business".
FEED_TO_NEWSDATA_CATEGORY = {**FEED_TO_GNEWS_TOPIC, "ECONOMY": "business"}

# Report how many items were loaded and how each metadata field is meant to
# feed the API parameters.
print(f"Loaded {len(wsj_items_enriched)} WSJ items with full metadata")
mapping_notes = (
    "feed_name → GNews: country filter / NewsData: category filter",
    "published_at → GNews: ±24h from/to window",
    "subcategory → query enrichment (not used yet)",
    "description → additional keywords (not used yet)",
)
print("\nMetadata → API parameter mapping:")
for note in mapping_notes:
    print("  " + note)

Loaded 10 WSJ items with full metadata

Metadata → API parameter mapping:
  feed_name → GNews: country filter / NewsData: category filter
  published_at → GNews: ±24h from/to window
  subcategory → query enrichment (not used yet)
  description → additional keywords (not used yet)


In [49]:
# === GNews ENRICHED: entity+keyword + tight date + country ===
# Uses: query_base + published_at→±24h + country=us
# The feed_name→topic lookup is shown for reference only and is never sent:
# GNews takes a topic on /top-headlines, not on /search, so country=us is used
# to narrow results instead.
# Compare with baseline (cell-18): entity+keyword only, 2-day window
# One request per WSJ item, 3 second delay between requests

gnews_enriched = {}
n_items = len(wsj_items_enriched)

print("=== GNews ENRICHED (all WSJ metadata) ===")
print("Params: query + from/to(±24h) + country=us + in=title,desc,content + sortby=relevance\n")

for i, item in enumerate(wsj_items_enriched):
    title = item["title"]
    query = item["query_base"]
    topic = FEED_TO_GNEWS_TOPIC.get(item["feed_name"])

    # Tight date window: published_at - 24h to published_at + 24h.
    # The "Z" suffix is rewritten because fromisoformat() only accepts it on
    # newer Python versions.
    pub_dt = datetime.fromisoformat(item["published_at"].replace("Z", "+00:00"))
    from_dt = (pub_dt - timedelta(hours=24)).strftime("%Y-%m-%dT%H:%M:%SZ")
    to_dt = (pub_dt + timedelta(hours=24)).strftime("%Y-%m-%dT%H:%M:%SZ")

    print(f"\n[{i+1}/{n_items}] WSJ: {title}")
    print(f"  Query: \"{query}\" | topic={topic} (not sent) | window: ±24h from {item['published_at'][:16]}")

    params = {
        "sortby": "relevance",
        "in": "title,description,content",
        "from": from_dt,
        "to": to_dt,
        "max": 3,
        "country": "us",  # NEW: limit to US sources
    }

    data = search_gnews(query, **params)

    # A response carrying an "errors" key, or a missing/null article list,
    # counts as no match.
    arts = (data.get("articles") or []) if "errors" not in data else []

    if arts:
        print(f"  -> {len(arts)} results (total: {data.get('totalArticles', '?')})")
        for j, a in enumerate(arts):
            img = "IMG" if a.get("image") else "   "
            # `or` fallbacks cover keys that are present but null.
            src = (a.get("source") or {}).get("name", "?")
            clen = len(a.get("content") or "")
            headline = a.get("title") or "?"
            published = a.get("publishedAt") or ""
            print(f"  [{j+1}] [{img}] {headline[:80]}")
            print(f"       {src} | content:{clen}ch | {published[:16]}")
    else:
        print("  -> 0 results")
    gnews_enriched[title] = {"articles": arts, "query": query, "params": "enriched"}

    if i < n_items - 1:
        time.sleep(3)

# Summary + comparison with baseline. Both counts run over the same WSJ titles
# so that unrelated entries in gnews_results cannot inflate the baseline rate.
enriched_titles = [item["title"] for item in wsj_items_enriched]
found_e = sum(1 for t in enriched_titles if gnews_enriched.get(t, {}).get("articles"))
found_b = sum(1 for t in enriched_titles if gnews_results.get(t, {}).get("articles"))
total = len(enriched_titles)
rate_e = found_e / total * 100 if total else 0.0
rate_b = found_b / total * 100 if total else 0.0

print(f"\n{'='*70}")
print(f"GNews ENRICHED (all metadata):  {found_e}/{total} matched ({rate_e:.0f}%)")
print(f"GNews BASELINE (entity only):   {found_b}/{total} matched ({rate_b:.0f}%)")
print("\nPer-title comparison:")
print(f"{'Title':<55} {'Base':>5} {'Enr':>5}")
print("-" * 67)
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:52] + "..." if len(t) > 55 else t
    b = "YES" if gnews_results.get(t, {}).get("articles") else "NO"
    e = "YES" if gnews_enriched.get(t, {}).get("articles") else "NO"
    flag = " ★" if e != b else ""  # star marks titles where enrichment changed the outcome
    print(f"{short:<55} {b:>5} {e:>5}{flag}")

=== GNews ENRICHED (all WSJ metadata) ===
Params: query + topic + from/to(±24h) + country=us + in=title,desc,content + sortby=relevance


[1/10] WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  Query: "Walmart sales grocery" | topic=business | window: ±24h from 2026-02-19T13:53
  -> 1 results (total: 1)
  [1] [IMG] The Stroller, Feb. 19, 2026: Events in the Alle-Kiski Valley
       Pittsburgh Tribune-Review | content:267ch | 2026-02-19T06:00

[2/10] WSJ: Klarna Swings to Loss Despite Jump in Revenue
  Query: "Klarna loss revenue" | topic=business | window: ±24h from 2026-02-19T13:41
  -> 0 results

[3/10] WSJ: U.S. Trade Deficit Grew in December
  Query: "US trade deficit" | topic=nation | window: ±24h from 2026-02-19T13:56
  -> 3 results (total: 8)
  [1] [IMG] US trade deficit in goods widens to new record in 2025
       Breitbart News Network | content:266ch | 2026-02-19T06:56
  [2] [IMG] Markets unreactive following hawkish FOMC minutes; Crude extends on geopolitics 
 

In [50]:
# === NewsData.io ENRICHED: entity+keyword + category + prioritydomain + image ===
# Uses: query_base + feed_name→category + prioritydomain=top + image=1 + removeduplicate=1
# Note: NewsData.io timeframe param is PAID ONLY — can't filter by date on free tier
# One request per WSJ item, 3 second delay between requests

nd_enriched = {}
n_items = len(wsj_items_enriched)

print("=== NewsData.io ENRICHED (all WSJ metadata) ===")
print("Params: query + category + prioritydomain=top + image=1 + removeduplicate=1\n")

for i, item in enumerate(wsj_items_enriched):
    title = item["title"]
    query = item["query_base"]
    # Feeds without a mapping fall back to the "business" category.
    category = FEED_TO_NEWSDATA_CATEGORY.get(item["feed_name"], "business")

    print(f"\n[{i+1}/{n_items}] WSJ: {title}")
    print(f"  Query: \"{query}\" | category={category} | prioritydomain=top")

    data = search_newsdata(
        query,
        category=category,
        prioritydomain="top",
        image=1,
        removeduplicate=1,
        size=3,
    )

    # Anything other than a non-empty list under "results" counts as no match.
    arts = data.get("results") or []
    if not isinstance(arts, list):
        arts = []

    if arts:
        print(f"  -> {len(arts)} results (total: {data.get('totalResults', '?')})")
        for j, a in enumerate(arts):
            if not isinstance(a, dict):
                continue
            img = "IMG" if a.get("image_url") else "   "
            # `or` fallbacks cover keys that are present but null, which
            # `.get(key, default)` alone does not.
            src = a.get("source_name") or "?"
            dlen = len(a.get("description") or "")
            prio = a.get("source_priority", "?")
            headline = a.get("title") or "?"
            published = a.get("pubDate") or ""
            print(f"  [{j+1}] [{img}] {headline[:80]}")
            print(f"       {src} (prio:{prio}) | desc:{dlen}ch | {published[:16]}")
    else:
        print("  -> 0 results")
    nd_enriched[title] = {"articles": arts, "query": query, "params": "enriched"}

    if i < n_items - 1:
        time.sleep(3)

# Summary + comparison with baseline. Both counts run over the same WSJ titles
# so that unrelated entries in nd_results cannot inflate the baseline rate.
enriched_titles = [item["title"] for item in wsj_items_enriched]
found_e = sum(1 for t in enriched_titles if nd_enriched.get(t, {}).get("articles"))
found_b = sum(1 for t in enriched_titles if nd_results.get(t, {}).get("articles"))
total = len(enriched_titles)
rate_e = found_e / total * 100 if total else 0.0
rate_b = found_b / total * 100 if total else 0.0

print(f"\n{'='*70}")
print(f"NewsData ENRICHED (all metadata):  {found_e}/{total} matched ({rate_e:.0f}%)")
print(f"NewsData BASELINE (entity only):   {found_b}/{total} matched ({rate_b:.0f}%)")
print("\nPer-title comparison:")
print(f"{'Title':<55} {'Base':>5} {'Enr':>5}")
print("-" * 67)
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:52] + "..." if len(t) > 55 else t
    b = "YES" if nd_results.get(t, {}).get("articles") else "NO"
    e = "YES" if nd_enriched.get(t, {}).get("articles") else "NO"
    flag = " ★" if e != b else ""  # star marks titles where enrichment changed the outcome
    print(f"{short:<55} {b:>5} {e:>5}{flag}")

=== NewsData.io ENRICHED (all WSJ metadata) ===
Params: query + category + prioritydomain=top + image=1 + removeduplicate=1


[1/10] WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  Query: "Walmart sales grocery" | category=business | prioritydomain=top
  -> 1 results (total: 1)
  [1] [IMG] Walmart earnings: CEO John Furner set for first earnings report with company mar
       Yahoo! News (prio:17) | desc:111ch | 2026-02-18 17:16

[2/10] WSJ: Klarna Swings to Loss Despite Jump in Revenue
  Query: "Klarna loss revenue" | category=business | prioritydomain=top
  -> 0 results

[3/10] WSJ: U.S. Trade Deficit Grew in December
  Query: "US trade deficit" | category=business | prioritydomain=top
  -> 3 results (total: 12)
  [1] [IMG] Indian Rupee to stay near 89–90 per dollar by FY27-end, says CareEdge Ratings
       Firstpost (prio:5906) | desc:165ch | 2026-02-19 06:19
  [2] [IMG] Did Trump Tariffs Really Slash US Trade Deficit By 78%? Here's What's Actually H
       Asianet Ne

In [51]:
# === GRAND COMPARISON: All 4 Strategies ===
print("=" * 80)
print("GRAND COMPARISON: Baseline vs Enriched × GNews vs NewsData.io")
print("=" * 80)

# Column label → per-title result dict, in display order.
strategy_results = {
    "G-Base": gnews_results,
    "G-Enr": gnews_enriched,
    "N-Base": nd_results,
    "N-Enr": nd_enriched,
}
total = len(wsj_items_enriched)

print(f"\n{'Title':<45} " + " ".join(f"{label:>7}" for label in strategy_results))
print("-" * 75)

# Totals are tallied from the same per-title lookups that fill the table, so
# the TOTAL/RATE rows always agree with the rows above them and scale with the
# number of WSJ items instead of assuming exactly 10.
match_counts = dict.fromkeys(strategy_results, 0)
hits_by_title = {}
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:42] + "..." if len(t) > 45 else t
    hits = [
        label
        for label, results in strategy_results.items()
        if results.get(t, {}).get("articles")
    ]
    hits_by_title[t] = hits
    for label in hits:
        match_counts[label] += 1
    cells = " ".join(
        ("YES" if label in hits else "NO").rjust(7) for label in strategy_results
    )
    print(f"{short:<45} {cells}")

# Kept under their original names for any later cell that reads them.
gb_n, ge_n, nb_n, ne_n = (match_counts[label] for label in strategy_results)

count_cells = " ".join(f"{n}/{total}".rjust(8) for n in match_counts.values())
rate_cells = " ".join(
    f"{(n / total * 100 if total else 0.0):.0f}%".rjust(7)
    for n in match_counts.values()
)
print(f"\n{'TOTAL MATCHED':<45} {count_cells}")
print(f"{'MATCH RATE':<45} {rate_cells}")

print("\n--- Key Metadata Used ---")
print("GNews Baseline:   entity+keyword, 2-day window, in=title+desc+content")
print("GNews Enriched:   + country=us, ±24h from published_at")
print("NewsData Baseline: entity+keyword, removeduplicate=1")
print("NewsData Enriched: + category=business/world, prioritydomain=top, image=1")

# Best strategy per title
print("\n--- Best Strategy Per Title ---")
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:60] + "..." if len(t) > 63 else t
    strategies = hits_by_title[t]
    if strategies:
        print(f"  ✓ {short}: {', '.join(strategies)}")
    else:
        print(f"  ✗ {short}: NO MATCH anywhere")

GRAND COMPARISON: Baseline vs Enriched × GNews vs NewsData.io

Title                                          G-Base   G-Enr  N-Base   N-Enr
---------------------------------------------------------------------------
Walmart Sales Climb, Driven by Grocery and...     YES     YES     YES     YES
Klarna Swings to Loss Despite Jump in Revenue      NO      NO     YES      NO
U.S. Trade Deficit Grew in December               YES     YES     YES     YES
Etsy Posts Lower Profit Despite Higher Rev...      NO      NO     YES      NO
Hims & Hers to Acquire Eucalyptus in $1.15...      NO      NO      NO      NO
Former Prince Andrew Arrested Amid Epstein...     YES     YES     YES      NO
Wayfair Loss Narrows as Revenue Rises on U...      NO      NO      NO      NO
Rio Tinto Spotlights Growth Plans After Fa...     YES     YES     YES     YES
Elizabeth Warren Has Questions About the S...      NO      NO      NO      NO
Nestle to Shed Ice-Cream Business in Shakeup      YES     YES     YES     YES

TO

---
# NewsAPI.org Test
Testing NewsAPI as third fallback API option.

- **Free tier**: 100 req/day, content truncated to 200 chars
- **Key advantages**: `domains` filter (precise source control), `searchIn` param, `from`/`to` date filters available on the free tier, `sortBy=relevancy`
- **From old project**: `/Users/youngmincho/Project/finance/ai_briefing/scrapers/get_news.py`

In [52]:
# === NewsAPI.org Setup ===
import requests, time, os
from datetime import datetime, timedelta

# NewsAPI key: read from the NEWS_API_KEY environment variable. No real key is
# embedded here; the placeholder default is the value the "API Key loaded"
# check in this cell compares against, so a missing key is reported instead of
# silently falling back to a committed secret.
# NOTE: the key that used to be hard-coded on this line should be rotated.
NEWSAPI_KEY = os.environ.get("NEWS_API_KEY", "YOUR_KEY_HERE")
NEWSAPI_URL = "https://newsapi.org/v2/everything"

# Quality domains (from old get_news.py + additions).
# The WITH_WSJ variant is derived from the base list so the two cannot drift.
DOMAINS_NO_WSJ = "reuters.com,cnbc.com,bloomberg.com,bbc.com,nytimes.com,finance.yahoo.com,investing.com,marketwatch.com,foxbusiness.com,ft.com,economist.com,apnews.com"
DOMAINS_WITH_WSJ = "wsj.com," + DOMAINS_NO_WSJ

def search_newsapi(query: str, **kwargs) -> dict:
    """Search NewsAPI.org /v2/everything.

    Args:
        query: Search expression sent as the `q` parameter.
        **kwargs: Extra query parameters (e.g. `from`, `to`, `domains`,
            `searchIn`). They override the defaults set below.

    Returns:
        The decoded JSON response. If the request fails or the body is not
        valid JSON, an error-shaped dict (`status == "error"`, empty
        `articles`, `totalResults == 0`) is returned instead, so callers that
        check `status` keep working and one bad request does not abort a
        multi-query run.
    """
    params = {
        "apiKey": NEWSAPI_KEY,
        "q": query,
        "language": "en",
        "pageSize": 3,
        "sortBy": "relevancy",
        **kwargs,
    }
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        resp = requests.get(NEWSAPI_URL, params=params, timeout=15)
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is reported: the exception text can contain
        # the request URL, which carries the API key as a query parameter.
        data = {
            "status": "error",
            "code": "requestFailed",
            "message": f"request failed: {type(exc).__name__}",
            "articles": [],
            "totalResults": 0,
        }
    if data.get("status") == "error":
        print(f"  API ERROR: {data.get('code')}: {(data.get('message') or '')[:100]}")
    return data

def show_newsapi(label, data):
    """Display NewsAPI results.

    Args:
        label: Heading printed before the result list.
        data: Response dict as returned by the search helper.

    Returns:
        The list of articles, or an empty list when the response is an error
        or carries no article list.

    Fields that are missing or null (`articles`, `title`, `source`,
    `content`, `description`, `publishedAt`, `message`) are replaced by
    placeholders rather than raising.
    """
    if data.get("status") == "error":
        print(f"\n{label}: ERROR — {(data.get('message') or '')[:80]}")
        return []
    articles = data.get("articles") or []
    total = data.get("totalResults", 0)
    print(f"\n{label}: {len(articles)} results (total: {total})")
    for i, a in enumerate(articles):
        img = "IMG" if a.get("urlToImage") else "   "
        # `or` fallbacks cover keys that are present but null, which
        # `.get(key, default)` alone does not.
        src = (a.get("source") or {}).get("name", "?")
        clen = len(a.get("content") or "")
        dlen = len(a.get("description") or "")
        headline = a.get("title") or "?"
        published = a.get("publishedAt") or ""
        print(f"  [{i+1}] [{img}] {headline[:80]}")
        print(f"       {src} | desc:{dlen}ch | content:{clen}ch | {published[:16]}")
    if not articles:
        print("  (no results)")
    return articles

# Quick test: one query over the last two days to confirm the key and
# endpoint respond.
print("=== NewsAPI.org Quick Test ===")
quick_window = {
    "from": (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d"),
    "to": datetime.now().strftime("%Y-%m-%d"),
}
data = search_newsapi("Walmart earnings", **quick_window)
show_newsapi("NewsAPI: Walmart earnings", data)

# A key equal to the placeholder means none was configured.
if NEWSAPI_KEY == 'YOUR_KEY_HERE':
    key_status = 'NO — set NEWS_API_KEY'
else:
    key_status = 'YES'
print(f"\nAPI Key loaded: {key_status}")

=== NewsAPI.org Quick Test ===

NewsAPI: Walmart earnings: 3 results (total: 18)
  [1] [IMG] Is Walmart Stock a Buy Ahead of Q4 Earnings on Feb. 19?
       Barchart.com | desc:148ch | content:214ch | 2026-02-17T19:38
  [2] [IMG] Walmart earnings: CEO John Furner set for first earnings report with company mar
       Yahoo Entertainment | desc:111ch | content:214ch | 2026-02-18T17:16
  [3] [IMG] Walmart (WMT) Is King, Says Jim Cramer
       Yahoo Entertainment | desc:158ch | content:214ch | 2026-02-18T17:50

API Key loaded: YES


In [53]:
# === NewsAPI BASELINE: entity+keyword, 2-day window, NO domain filter ===
# Same queries as GNews/NewsData tests — apples-to-apples comparison
# One request per WSJ item, 3 second delay between requests

# Window covers the two days up to "now" (local clock), at date granularity.
from_date = (datetime.now() - timedelta(days=2)).strftime("%Y-%m-%d")
to_date = datetime.now().strftime("%Y-%m-%d")

newsapi_base = {}
n_items = len(wsj_items_enriched)

print("=== NewsAPI BASELINE (entity+keyword, 2-day, no domain filter) ===")
print(f"from={from_date}  to={to_date}\n")

for i, item in enumerate(wsj_items_enriched):
    title = item["title"]
    query = item["query_base"]

    print(f"\n[{i+1}/{n_items}] WSJ: {title}")
    print(f"  Query: \"{query}\"")

    data = search_newsapi(query, **{
        "from": from_date,
        "to": to_date,
        "sortBy": "relevancy",
        "pageSize": 3,
    })

    # Error responses and missing/null article lists both count as no match.
    arts = (data.get("articles") or []) if data.get("status") != "error" else []

    if arts:
        print(f"  -> {len(arts)} results (total: {data.get('totalResults', '?')})")
        for j, a in enumerate(arts):
            img = "IMG" if a.get("urlToImage") else "   "
            # `or` fallbacks cover keys that are present but null.
            src = (a.get("source") or {}).get("name", "?")
            clen = len(a.get("content") or "")
            headline = a.get("title") or "?"
            published = a.get("publishedAt") or ""
            print(f"  [{j+1}] [{img}] {headline[:80]}")
            print(f"       {src} | content:{clen}ch | {published[:16]}")
    else:
        print("  -> 0 results")
    newsapi_base[title] = {"articles": arts, "query": query}

    if i < n_items - 1:
        time.sleep(3)

# Summary
found = sum(1 for v in newsapi_base.values() if v["articles"])
total = n_items
rate = found / total * 100 if total else 0.0
print(f"\n{'='*60}")
print(f"NewsAPI BASELINE: {found}/{total} matched ({rate:.0f}%)")

=== NewsAPI BASELINE (entity+keyword, 2-day, no domain filter) ===
from=2026-02-17  to=2026-02-19


[1/10] WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  Query: "Walmart sales grocery"
  -> 3 results (total: 4)
  [1] [IMG] Walmart earnings: CEO John Furner set for first earnings report with company mar
       Yahoo Entertainment | content:214ch | 2026-02-18T17:16
  [2] [IMG] What Happens to Stores When AI Agents Do the Shopping?
       pymnts.com | content:215ch | 2026-02-18T12:00
  [3] [IMG] As Walmart and Target head in different directions, all eyes are on their new CE
       CNBC | content:214ch | 2026-02-18T12:00

[2/10] WSJ: Klarna Swings to Loss Despite Jump in Revenue
  Query: "Klarna loss revenue"
  -> 2 results (total: 2)
  [1] [IMG] Portnoy Law Firm Announces Class Action on Behalf of Klarna Group plc Investors
       GlobeNewswire | content:214ch | 2026-02-17T14:35
  [2] [   ] INVESTOR ALERT: Pomerantz Law Firm Reminds Investors with Losses on their Invest
 

In [56]:
# Display the WSJ-free domain list that the enriched NewsAPI run passes as `domains`.
DOMAINS_NO_WSJ

'reuters.com,cnbc.com,bloomberg.com,bbc.com,nytimes.com,finance.yahoo.com,investing.com,marketwatch.com,foxbusiness.com,ft.com,economist.com,apnews.com'

In [55]:
# === NewsAPI ENRICHED: entity+keyword + domains + searchIn + ±24h ===
# Two variants: WITH WSJ domain and WITHOUT WSJ domain
# WSJ is notorious for blocking API access — test both to confirm
# Two requests per WSJ item (one per variant), 3 second delay between requests

newsapi_enriched_wsj = {}
newsapi_enriched_no_wsj = {}
n_items = len(wsj_items_enriched)

# (label, `domains` filter, result store) for each variant, in request order.
domain_variants = [
    ("WITH WSJ", DOMAINS_WITH_WSJ, newsapi_enriched_wsj),
    ("WITHOUT WSJ", DOMAINS_NO_WSJ, newsapi_enriched_no_wsj),
]

print("=== NewsAPI ENRICHED (domains + searchIn + tight date) ===\n")

for i, item in enumerate(wsj_items_enriched):
    title = item["title"]
    query = item["query_base"]

    # Tight date window: published_at ± 24h
    pub_dt = datetime.fromisoformat(item["published_at"].replace("Z", "+00:00"))
    from_dt = (pub_dt - timedelta(hours=24)).strftime("%Y-%m-%dT%H:%M:%S")
    to_dt = (pub_dt + timedelta(hours=24)).strftime("%Y-%m-%dT%H:%M:%S")

    print(f"\n[{i+1}/{n_items}] WSJ: {title}")
    print(f"  Query: \"{query}\" | window: ±24h | searchIn=title,description")

    report_lines = []
    for k, (label, domains, store) in enumerate(domain_variants):
        if k:
            time.sleep(3)  # pause between the variant requests for one item

        data = search_newsapi(query, **{
            "from": from_dt,
            "to": to_dt,
            "domains": domains,
            "searchIn": "title,description",
            "sortBy": "relevancy",
            "pageSize": 3,
        })
        # Error responses and missing/null article lists both count as no match.
        arts = (data.get("articles") or []) if data.get("status") != "error" else []
        store[title] = {"articles": arts, "query": query}

        if arts:
            top = arts[0]
            # `or` fallbacks cover keys that are present but null.
            top_title = (top.get("title") or "?")[:60]
            top_src = (top.get("source") or {}).get("name", "?")
        else:
            top_title, top_src = "(none)", ""
        matched = "YES" if arts else "NO"
        tag = label + ":"
        report_lines.append(
            f"  {tag:<12} {matched} ({data.get('totalResults', 0)} total) → {top_title} [{top_src}]"
        )

    # Print results side by side
    for line in report_lines:
        print(line)

    if i < n_items - 1:
        time.sleep(3)

# Summary. The baseline is counted over the same WSJ titles as the enriched
# runs, and rates are derived from the real item count rather than assuming 10.
total = n_items
enriched_titles = [item["title"] for item in wsj_items_enriched]
f_wsj = sum(1 for t in enriched_titles if newsapi_enriched_wsj.get(t, {}).get("articles"))
f_no = sum(1 for t in enriched_titles if newsapi_enriched_no_wsj.get(t, {}).get("articles"))
f_base = sum(1 for t in enriched_titles if newsapi_base.get(t, {}).get("articles"))

print(f"\n{'='*70}")
for row_label, count in (
    ("NewsAPI BASELINE (no domains):", f_base),
    ("NewsAPI ENRICHED (with WSJ):", f_wsj),
    ("NewsAPI ENRICHED (without WSJ):", f_no),
):
    rate = count / total * 100 if total else 0.0
    print(f"{row_label:<36}{count}/{total} ({rate:.0f}%)")

print("\nPer-title comparison:")
print(f"{'Title':<45} {'Base':>5} {'+WSJ':>5} {'-WSJ':>5}")
print("-" * 62)
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:42] + "..." if len(t) > 45 else t
    b = "YES" if newsapi_base.get(t, {}).get("articles") else "NO"
    w = "YES" if newsapi_enriched_wsj.get(t, {}).get("articles") else "NO"
    n = "YES" if newsapi_enriched_no_wsj.get(t, {}).get("articles") else "NO"
    flag = " ★" if w != n else ""  # star marks titles where including wsj.com changed the outcome
    print(f"{short:<45} {b:>5} {w:>5} {n:>5}{flag}")

=== NewsAPI ENRICHED (domains + searchIn + tight date) ===


[1/10] WSJ: Walmart Sales Climb, Driven by Grocery and Online Gains
  Query: "Walmart sales grocery" | window: ±24h | searchIn=title,description
  WITH WSJ:    NO (0 total) → (none) []
  WITHOUT WSJ: NO (0 total) → (none) []

[2/10] WSJ: Klarna Swings to Loss Despite Jump in Revenue
  Query: "Klarna loss revenue" | window: ±24h | searchIn=title,description
  WITH WSJ:    NO (0 total) → (none) []
  WITHOUT WSJ: NO (0 total) → (none) []

[3/10] WSJ: U.S. Trade Deficit Grew in December
  Query: "US trade deficit" | window: ±24h | searchIn=title,description
  WITH WSJ:    NO (0 total) → (none) []
  WITHOUT WSJ: NO (0 total) → (none) []

[4/10] WSJ: Etsy Posts Lower Profit Despite Higher Revenue
  Query: "Etsy profit revenue" | window: ±24h | searchIn=title,description
  WITH WSJ:    NO (0 total) → (none) []
  WITHOUT WSJ: NO (0 total) → (none) []

[5/10] WSJ: Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal
  Query: "Hims 

In [54]:
# === FINAL COMPARISON: 3 APIs × Baseline Only ===
# GNews, NewsData.io, NewsAPI.org — all using entity+keyword, 2-day window

print("=" * 75)
print("FINAL COMPARISON: GNews vs NewsData vs NewsAPI (Baseline)")
print("=" * 75)

print(f"\n{'Title':<45} {'GNews':>6} {'NData':>6} {'NApi':>6}")
print("-" * 65)

# Match counts are tallied from the same per-title lookups that fill the
# table, and the denominator is the real item count rather than a literal 10.
total = len(wsj_items_enriched)
g_n = n_n = a_n = 0
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:42] + "..." if len(t) > 45 else t
    g_hit = bool(gnews_results.get(t, {}).get("articles"))
    n_hit = bool(nd_results.get(t, {}).get("articles"))
    a_hit = bool(newsapi_base.get(t, {}).get("articles"))
    g_n += g_hit
    n_n += n_hit
    a_n += a_hit
    g = "YES" if g_hit else "NO"
    n = "YES" if n_hit else "NO"
    a = "YES" if a_hit else "NO"
    print(f"{short:<45} {g:>6} {n:>6} {a:>6}")

count_cells = " ".join(f"{c}/{total}".rjust(7) for c in (g_n, n_n, a_n))
rate_cells = " ".join(
    f"{(c / total * 100 if total else 0.0):.0f}%".rjust(6) for c in (g_n, n_n, a_n)
)
print(f"\n{'MATCHED':<45} {count_cells}")
print(f"{'RATE':<45} {rate_cells}")

# Same-event accuracy (manual annotation from earlier analysis)
print("\n--- Same-Event Quality (Top-1 result) ---")

# Shared entry for an API that returned nothing for a title.
_NO_MATCH = ("-", "no match")

# One row per WSJ title: (title, gnews, newsdata, newsapi).
# Each API entry is a (grade, note) pair; grades are O=same event,
# ~=same topic, X=wrong, -=no match.
_annotation_rows = [
    (
        "Walmart Sales Climb, Driven by Grocery and Online Gains",
        ("~", "Walmart grocery penetration (same topic)"),
        ("X", "11 Best Organic Food Stocks (different)"),
        ("O", "Walmart earnings CEO Furner (same event)"),
    ),
    (
        "Klarna Swings to Loss Despite Jump in Revenue",
        _NO_MATCH,
        ("X", "Vogue Funding Tracker (unrelated)"),
        ("X", "Class action lawsuit (different event)"),
    ),
    (
        "U.S. Trade Deficit Grew in December",
        ("~", "Trump claims 78% drop (same topic, diff angle)"),
        ("X", "USD/INR exchange rate (different)"),
        ("X", "Pensions Can't Beat Vanguard (unrelated)"),
    ),
    (
        "Etsy Posts Lower Profit Despite Higher Revenue",
        _NO_MATCH,
        ("X", "eBay Depop acquisition (different)"),
        ("X", "Can you make money raising earthworms (garbage)"),
    ),
    (
        "Hims & Hers to Acquire Eucalyptus in $1.15 Billion Deal",
        _NO_MATCH,
        _NO_MATCH,
        _NO_MATCH,
    ),
    (
        "Former Prince Andrew Arrested Amid Epstein Probe",
        ("O", "Prince Andrew arrested (same event)"),
        ("O", "Andrew Mountbatten-Windsor Arrested (same event)"),
        ("X", "William and Kate gossip (different)"),
    ),
    (
        "Wayfair Loss Narrows as Revenue Rises on U.S. Growth",
        _NO_MATCH,
        _NO_MATCH,
        ("~", "retailers vulnerable to bankruptcy (same topic)"),
    ),
    (
        "Rio Tinto Spotlights Growth Plans After Failed Glencore Megadeal Talks",
        ("~", "Rio Tinto profit down 14% (same company, diff event)"),
        ("O", "FT: Rio Tinto copper after Glencore talks fail (SAME EVENT)"),
        ("~", "Glencore Goes Shopping Again (related)"),
    ),
    (
        "Elizabeth Warren Has Questions About the Shake-Up Inside the Fed's Banking Regulator",
        _NO_MATCH,
        _NO_MATCH,
        _NO_MATCH,
    ),
    (
        "Nestle to Shed Ice-Cream Business in Shakeup",
        ("O", "Nestle Q4 + plans ice cream sale (same event)"),
        ("~", "Mixed Earnings Impact European Markets (tangential)"),
        ("X", "FMCG sector growth FY27 (different)"),
    ),
]

annotations = {
    row_title: {"gnews": gnews_note, "newsdata": newsdata_note, "newsapi": newsapi_note}
    for row_title, gnews_note, newsdata_note, newsapi_note in _annotation_rows
}

# Per-title grade table plus per-API tallies of same-event (O) and
# same-topic-or-better (O or ~) results.
print(f"\n{'Title':<45} {'GNews':>6} {'NData':>6} {'NApi':>6}")
print("-" * 65)

n_items = len(wsj_items_enriched)  # denominator for the tallies (was a literal 10)
api_keys = ("gnews", "newsdata", "newsapi")
same_event = dict.fromkeys(api_keys, 0)
same_topic = dict.fromkeys(api_keys, 0)
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:42] + "..." if len(t) > 45 else t
    ann = annotations.get(t, {})
    # Titles or APIs without an annotation are graded "-" (no match).
    grades = [ann.get(api, ("-", ""))[0] for api in api_keys]
    for api, grade in zip(api_keys, grades):
        same_event[api] += grade == "O"
        same_topic[api] += grade in ("O", "~")
    print(f"{short:<45} " + " ".join(f"{grade:>6}" for grade in grades))

# Kept under their original names: the summary table further down reads them.
g_same, n_same, a_same = (same_event[api] for api in api_keys)
g_topic, n_topic, a_topic = (same_topic[api] for api in api_keys)

event_cells = " ".join(f"{c}/{n_items}".rjust(7) for c in (g_same, n_same, a_same))
topic_cells = " ".join(f"{c}/{n_items}".rjust(7) for c in (g_topic, n_topic, a_topic))
print("\nLegend: O=same event, ~=same topic, X=wrong, -=no match")
print(f"\n{'SAME EVENT (O)':<45} {event_cells}")
print(f"{'SAME TOPIC+ (O+~)':<45} {topic_cells}")

# Union coverage: for each WSJ title, which baseline APIs returned anything
# and the best annotated grade across them.
print("\n--- UNION (any API) ---")
quality_rank = {"O": 3, "~": 2, "X": 1, "-": 0}
any_match = 0
any_same = 0
for item in wsj_items_enriched:
    t = item["title"]
    short = t[:60] + "..." if len(t) > 63 else t
    matched = []
    if gnews_results.get(t, {}).get("articles"):
        matched.append("G")
    if nd_results.get(t, {}).get("articles"):
        matched.append("N")
    if newsapi_base.get(t, {}).get("articles"):
        matched.append("A")
    ann = annotations.get(t, {})
    # Highest-ranked grade across the three APIs; unknown grades rank lowest.
    best_quality = max(
        (ann.get(api, ("-", ""))[0] for api in ("gnews", "newsdata", "newsapi")),
        key=lambda grade: quality_rank.get(grade, 0),
    )
    if matched:
        any_match += 1
        if best_quality == "O":
            any_same += 1
        # The separator uses single quotes: a backslash-escaped quote inside
        # an f-string replacement field is a SyntaxError.
        print(f"  {best_quality} {short}: {', '.join(matched)}")
    else:
        print(f"  - {short}: NO MATCH")

n_items = len(wsj_items_enriched)  # denominator (was a literal 10)
match_rate = any_match / n_items * 100 if n_items else 0.0
same_rate = any_same / n_items * 100 if n_items else 0.0
print(f"\nUNION match: {any_match}/{n_items} ({match_rate:.0f}%)")
print(f"UNION same-event: {any_same}/{n_items} ({same_rate:.0f}%)")

# Final side-by-side summary of the three baseline APIs. Rates and "n/N" cells
# use the real item count, and every cell is right-aligned to the header's
# column widths so the table lines up.
print("\n--- API Comparison Summary ---")
print(f"{'Metric':<25} {'GNews':>8} {'NewsData':>10} {'NewsAPI':>10}")
print("-" * 55)

n_items = len(wsj_items_enriched)
col_widths = (8, 10, 10)
summary_rows = [
    ("Match rate", [f"{(c / n_items * 100 if n_items else 0.0):.0f}%" for c in (g_n, n_n, a_n)]),
    ("Same-event", [f"{c}/{n_items}" for c in (g_same, n_same, a_same)]),
    ("Same-topic+", [f"{c}/{n_items}" for c in (g_topic, n_topic, a_topic)]),
    # The rows below are fixed notes, not computed values.
    ("Content", ["266ch", "NO free", "200ch"]),
    ("Free tier", ["100/day", "200/day", "100/day"]),
    ("Rate limit", ["?", "30/15m", "?"]),
]
for metric, cells in summary_rows:
    print(f"{metric:<25} " + " ".join(cell.rjust(width) for cell, width in zip(cells, col_widths)))

