In [1]:
import json
import os
import time
import re
import requests
import torch
import random
from datetime import datetime, timedelta

# Using Selenium for web scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup
from io import BytesIO
from urllib.parse import urljoin

The Reuters site requires human verification when scraping beyond the US markets home page.
- Plan: train the model on 30 CNBC articles, then scrape the daily news source, feed it into the LLM, cross-reference with FOMC, and generate sentiment + analysis.

In [16]:
# === Setup headless browser ===
options = Options()
# The `options.headless = True` property was deprecated in Selenium 4.8 and
# removed in later 4.x releases, where the assignment no longer has any effect
# and Chrome opens a visible window. Passing the Chrome flag directly works on
# both old and new Selenium versions.
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
driver.get("https://www.reuters.com/markets/us/")
time.sleep(5)  # fixed pause after navigation before the page source is read

# Shared state used by extract_articles() and the processing steps below.
base_url = "https://www.reuters.com"  # prefix for site-relative hrefs
articles = []                         # accumulated {"title", "url"} records
seen_urls = set()                     # relative hrefs already collected

def extract_articles():
    """Collect headline links from the page currently loaded in ``driver``.

    Parses ``driver.page_source`` and keeps every ``<a>`` whose ``data-testid``
    is "Title", "Heading" or "Link" and that:

    * sits inside an ``<h3>`` or ``<header>`` element,
    * has a site-relative ``href`` (starts with a single "/"),
    * has not been collected before (tracked in the module-level ``seen_urls``),
    * has link text longer than three words.

    Side effects:
        Adds each accepted relative href to the module-level ``seen_urls`` set.

    Returns:
        list[dict]: one ``{"title": str, "url": str}`` entry per new link, where
        ``url`` is ``base_url`` followed by the relative href.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    new_articles = []

    # Find all article links from multiple relevant sections
    for a_tag in soup.find_all("a", attrs={"data-testid": ["Title", "Heading", "Link"]}):
        title_block = a_tag.find_parent(["h3", "header"])
        if not title_block:
            continue  # skip if not inside a recognizable title block

        relative_url = a_tag.get("href")
        title = a_tag.get_text(strip=True)

        if not relative_url or not relative_url.startswith("/"):
            continue  # missing href, or an absolute / non-path link
        if relative_url.startswith("//"):
            # Protocol-relative link ("//host/path") points at another host;
            # appending it to base_url would produce a malformed URL.
            continue
        if relative_url in seen_urls:
            continue  # already collected
        if len(title.split()) <= 3:
            continue  # filter out short / irrelevant links

        seen_urls.add(relative_url)
        new_articles.append({
            "title": title,
            "url": base_url + relative_url
        })

    return new_articles

# Add the links found on the landing page to the running list.
landing_page_links = extract_articles()
articles.extend(landing_page_links)

# === Output results ===
# Report the total, then list at most the first 100 entries.
print(f"\n✅ Final count: {len(articles)} unique articles.")
for entry in articles[:100]:
    headline, link = entry['title'], entry['url']
    print(f"- {headline} ({link})")

# === Process articles ===
# Visit every collected article URL and build one record per article:
# {"title", "url", "date", "content"}. A failure on a single article is
# reported and skipped so the remaining articles are still processed.
recent_articles = []

try:
    for article in articles:
        try:
            driver.get(article["url"])
            time.sleep(2)  # fixed pause after navigation before reading the page

            page_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract date: first 10 characters of the <time datetime="..."> value
            # (the YYYY-MM-DD part when the value is an ISO timestamp).
            # Falls back to today's date when no such tag is present.
            time_tag = page_soup.find("time")
            date_str = (
                time_tag["datetime"][:10]
                if time_tag and time_tag.has_attr("datetime")
                else datetime.today().strftime("%Y-%m-%d")
            )

            # Extract summary (list items)
            summary_items = page_soup.find_all("li", attrs={"data-testid": "Body"})

            # Extract body paragraphs (data-testid values beginning with "paragraph-")
            paragraphs = page_soup.find_all("div", attrs={"data-testid": lambda v: v and v.startswith("paragraph-")})

            # Construct content: title first, then optional labelled sections
            content_parts = [article["title"]]

            if summary_items:
                content_parts.append("Summary:")
                content_parts.extend(li.get_text(strip=True) for li in summary_items if li.get_text(strip=True))

            if paragraphs:
                content_parts.append("Body:")
                content_parts.extend(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

            content = "\n".join(content_parts)

            article_data = {
                "title": article["title"],
                "url": article["url"],
                "date": date_str,
                "content": content
            }

            recent_articles.append(article_data)
            print(f"✅ Processed: {article['title']}")

        except Exception as e:
            # Best effort: report the failure and continue with the next article.
            print(f"❌ Failed to process {article['url']}: {e}")
finally:
    # Always shut the browser down, including when the loop is aborted by
    # something `except Exception` does not catch (e.g. interrupting the cell),
    # so no Chrome process is left running.
    driver.quit()

# === Save to JSON file ===
# Resolve the destination under ../data/news, creating the folder if needed.
output_dir = os.path.join("..", "data", "news")
output_path = os.path.join(output_dir, "reuters_us_markets.json")
os.makedirs(output_dir, exist_ok=True)

# Write the records as indented UTF-8 JSON, keeping non-ASCII characters as-is.
with open(output_path, mode="w", encoding="utf-8") as json_file:
    json.dump(recent_articles, json_file, indent=2, ensure_ascii=False)

saved_count = len(recent_articles)
print(f"\n✅ Saved {saved_count} articles to {output_path}")


✅ Final count: 14 unique articles.
- Trump reports more than $600 million in income from crypto, golf, licensing fees (https://www.reuters.com/world/us/trump-reports-tens-millions-income-crypto-ventures-2025-06-14/)
- Exclusive: Crypto giants set for EU green light amid growing regulatory rift, sources say (https://www.reuters.com/sustainability/boards-policy-regulation/crypto-giants-set-eu-green-light-amid-growing-regulatory-rift-sources-say-2025-06-13/)
- Brazil's services activity enjoys boost from Lady Gaga's free concert, holidays (https://www.reuters.com/world/americas/brazils-services-activity-enjoys-boost-lady-gagas-massive-free-concert-2025-06-13/)
- Big disruption to oil supply unlikely after Israel's attack on Iran, say analysts (https://www.reuters.com/business/energy/big-disruption-oil-supply-unlikely-after-israels-attack-iran-say-analysts-2025-06-13/)
- US Securities and Exchange Commission names new division chiefs (https://www.reuters.com/business/us-securities-exchang