In [15]:
import json
import os
import time
import re
import requests
import time
import torch
import random

# Using Selenium for web scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
from urllib.parse import urljoin
from datetime import datetime, timedelta

# Using transformers to load models
from transformers import AutoTokenizer, AutoModelForCausalLM

## Loading most recent statements/transcripts/minutes/articles from websites

FOMC meeting minutes/press conference transcripts/statements

In [14]:
# === Setup headless Selenium
# Purpose of this cell: find the most recent past FOMC meeting on the Fed
# calendar page, download its statement / minutes / press-conference transcript
# (whichever are linked), and save them to ./fomc_YYMM.json.
#
# The `Options.headless` setter was deprecated and then removed in later
# Selenium 4 releases, so assigning it may have no effect; pass the Chrome
# command-line flag instead.
FED_BASE_URL = "https://www.federalreserve.gov"
REQUEST_TIMEOUT_S = 30  # seconds; prevents a stalled download from hanging the kernel

options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

today = datetime.today()

meeting_blocks = []
latest_meeting = None
latest_meeting_html = None
latest_date = None

# The browser is only needed to render the calendar page. Everything the later
# steps need (the meeting block's HTML) is copied out inside this `try`, and
# `finally` guarantees the Chrome process is closed even if scraping fails.
try:
    driver.get(f"{FED_BASE_URL}/monetarypolicy/fomccalendars.htm")
    time.sleep(2)  # fixed pause to let the page finish rendering

    # === Grab all meeting blocks
    meeting_blocks = driver.find_elements(By.CSS_SELECTOR, ".fomc-meeting")

    # === Step 1: Find most recent past meeting block
    for block in meeting_blocks:
        try:
            month_text = block.find_element(By.CLASS_NAME, "fomc-meeting__month").text.strip()
            day_text = block.find_element(By.CLASS_NAME, "fomc-meeting__date").text.strip()
            block_html = block.get_attribute("innerHTML")

            first_day = int(re.findall(r"\d+", day_text)[0])

            # Heuristic: take the first 20xx found in the block's HTML (e.g. from
            # a document link); blocks with no such text fall back to this year.
            year_match = re.search(r"20\d{2}", block_html)
            year = int(year_match.group()) if year_match else today.year

            # A month label may contain a slash (e.g. "Apr/May"); the first day
            # belongs to the first month. Accept full and abbreviated names.
            first_month = month_text.split("/")[0].strip()
            try:
                month_num = time.strptime(first_month, "%B").tm_mon
            except ValueError:
                month_num = time.strptime(first_month, "%b").tm_mon

            date_obj = datetime(year, month_num, first_day)
        except Exception as e:
            # Best effort: one malformed block must not abort the scan, but the
            # reason is reported instead of being silently discarded.
            print(f"⚠️ Skipping unparseable meeting block: {e}")
            continue

        if date_obj <= today and (latest_date is None or date_obj > latest_date):
            latest_meeting = block
            latest_meeting_html = block_html
            latest_date = date_obj
finally:
    driver.quit()

if latest_date is None:
    raise RuntimeError("No past FOMC meeting could be parsed from the calendar page.")

print(f"✅ Latest meeting selected: {latest_date.strftime('%Y-%m-%d')}")

date_str = latest_date.strftime("%Y-%m-%d")
file_suffix = latest_date.strftime("%y%m")
combined_data = []

# === Step 2: Parse the meeting block directly
# Each document is fetched independently so that one failed download does not
# prevent the others from being collected.
soup = BeautifulSoup(latest_meeting_html, "html.parser")

# --- Statement ---
statement_link = soup.find("a", href=re.compile(r"monetary20\d{6}a\.htm"))
if statement_link:
    statement_url = urljoin(FED_BASE_URL, statement_link['href'])
    try:
        response = requests.get(statement_url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()  # do not store an HTTP error page as content
        statement_soup = BeautifulSoup(response.text, "html.parser")
        statement_text = statement_soup.get_text(separator="\n", strip=True)
        combined_data.append({
            "date": date_str,
            "type": "statement",
            "url": statement_url,
            "source_type": "HTML",
            "content": statement_text
        })
    except requests.RequestException as e:
        print(f"⚠️ Statement download failed: {e}")

# --- Minutes ---
minutes_link = soup.find("a", href=re.compile(r"fomcminutes20\d{6}\.htm"))
if minutes_link:
    minutes_url = urljoin(FED_BASE_URL, minutes_link['href'])
    try:
        response = requests.get(minutes_url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        minutes_soup = BeautifulSoup(response.text, "html.parser")
        minutes_text = minutes_soup.get_text(separator="\n", strip=True)
        combined_data.append({
            "date": date_str,
            "type": "minutes",
            "url": minutes_url,
            "source_type": "HTML",
            "content": minutes_text
        })
    except requests.RequestException as e:
        print(f"⚠️ Minutes download failed: {e}")

# --- Press Conference Transcript PDF ---
pdf_link = soup.find("a", href=re.compile(r"/mediacenter/files/FOMCpresconf20\d{6}\.pdf"))
if pdf_link:
    pdf_url = urljoin(FED_BASE_URL, pdf_link['href'])
    try:
        response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        reader = PdfReader(BytesIO(response.content))
        # extract_text() may return None for a page; treat that as empty text.
        pdf_text = "\n".join(page.extract_text() or "" for page in reader.pages)
        combined_data.append({
            "date": date_str,
            "type": "press_conference",
            "url": pdf_url,
            "source_type": "PDF",
            "content": pdf_text
        })
    except Exception as e:
        # Covers both download errors and PDF parsing errors.
        print(f"⚠️ PDF extract failed: {e}")

if not combined_data:
    print("⚠️ No documents were collected for this meeting; the saved file will contain an empty list.")

# === Step 3: Save as ./fomc_YYMM.json
output_path = f"./fomc_{file_suffix}.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(combined_data, f, ensure_ascii=False, indent=2)

print(f"✅ Saved all documents to {output_path}")

✅ Latest meeting selected: 2025-06-17
✅ Saved all documents to ./fomc_2506.json


CNBC News Articles

In [None]:
# === Setup ===
# Purpose of this cell: list CNBC Federal Reserve articles whose card date falls
# within the last 7 days, as dicts with "title", "url" and "date" keys.
#
# The `Options.headless` setter was deprecated and then removed in later
# Selenium 4 releases, so assigning it may have no effect; pass the Chrome
# command-line flag instead.
CNBC_BASE_URL = "https://www.cnbc.com"

options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

# Only the rendered page source is needed, so the browser is closed as soon as
# it has been captured; `finally` ensures this also happens on errors.
try:
    driver.get(f"{CNBC_BASE_URL}/federal-reserve/")
    time.sleep(5)  # fixed pause to let the page finish rendering
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()

# === Time filter ===
today = datetime.today()
one_week_ago = today - timedelta(days=7)
# Card dates carry no time of day, so compare calendar dates; otherwise articles
# from exactly 7 days ago would be excluded depending on when the cell is run.
cutoff_date = one_week_ago.date()

# === Article extraction ===
articles = []
seen_urls = set()
unparsed_dates = []

# Find all article blocks
for card in soup.find_all("div", class_="Card-card"):
    title_tag = card.find("a", class_="Card-title")
    date_tag = card.find("span", class_="Card-time")

    if not title_tag or not date_tag:
        continue

    href = title_tag.get("href")
    if not href:
        continue

    date_text = date_tag.get_text(strip=True)

    # Strip an ordinal suffix only when it directly follows a digit ("18th" ->
    # "18"), rather than deleting "st"/"nd"/"rd"/"th" anywhere in the string.
    clean_date = re.sub(r"(?<=\d)(?:st|nd|rd|th)", "", date_text)
    try:
        article_date = datetime.strptime(clean_date, "%a, %b %d %Y")
    except ValueError:
        # Keep the raw text so dropped cards are visible in the output below.
        unparsed_dates.append(date_text)
        continue

    if article_date.date() < cutoff_date:
        continue

    # Resolve relative links and avoid listing the same article twice.
    url = urljoin(CNBC_BASE_URL, href)
    if url in seen_urls:
        continue
    seen_urls.add(url)

    articles.append({
        "title": title_tag.text.strip(),
        "url": url,
        "date": article_date.strftime("%Y-%m-%d")
    })

print(f"✅ Found {len(articles)} articles within 7 days.")
for a in articles:
    print(f"- {a['date']}: {a['title']} ({a['url']})")

if unparsed_dates:
    print(f"⚠️ Skipped {len(unparsed_dates)} cards with unrecognized dates, e.g. {unparsed_dates[0]!r}")

✅ Found 13 articles within 7 days.
- 2025-06-18: Stagflation on the Fed’s mind (https://www.cnbc.com/2025/06/19/cnbc-daily-open-stagflation-on-the-feds-mind.html)
- 2025-06-18: Here’s how Wall Street is reacting to the Fed’s updated rate cut outlook (https://www.cnbc.com/2025/06/18/heres-how-wall-street-is-reacting-to-the-feds-updated-rate-cut-outlook.html)
- 2025-06-18: Fed sees preferred inflation gauge topping 3%,  higher than previous forecast (https://www.cnbc.com/2025/06/18/federal-reserve-dot-plot-and-economic-projection-june-2025.html)
- 2025-06-18: Here’s what changed in the new Fed statement (https://www.cnbc.com/2025/06/18/fed-meeting-heres-what-changed-in-the-new-statement.html)
- 2025-06-18: Fed holds interest rates steady: Here’s what that means for your wallet (https://www.cnbc.com/2025/06/18/fed-holds-interest-rates-steady-what-that-means-for-your-money.html)
- 2025-06-18: Fed holds key rate steady, still sees two more cuts this year (https://www.cnbc.com/2025/06/18/fed