In [7]:
import json
import os
import time
import re
import requests
import time
import torch
import random
from tqdm import tqdm

# Using Selenium for web scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
from urllib.parse import urljoin
from datetime import datetime, timedelta

# Using transformers to load models
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification

## Loading most recent statements/transcripts/minutes/articles from websites

FOMC meeting minutes/press conference transcripts/statements

In [51]:
# === Setup headless Selenium
# The `Options.headless = True` setter was deprecated in Selenium 4.8 and dropped in
# later 4.x releases, where assigning it silently has no effect. Passing the Chrome
# command-line flag works across Selenium 4.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

FED_BASE_URL = "https://www.federalreserve.gov"
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled download from hanging the notebook


def _fetch(url):
    """GET ``url`` and return the response.

    Raises ``requests.RequestException`` on timeout or on a 4xx/5xx status, so an
    error page is never stored as if it were document content.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _month_number(month_text):
    """Convert a calendar month label to its month number (1-12).

    Full names ("June") are tried first, then abbreviations. Only the first
    alphabetic token is used, so a label like "Apr/May" resolves to April.
    NOTE(review): the multi-month label format is an assumption about the page,
    confirm against the live calendar.

    Raises ``ValueError`` when the label is not a recognisable month.
    """
    first_token = re.split(r"[^A-Za-z]+", month_text.strip())[0]
    for fmt in ("%B", "%b"):
        try:
            return time.strptime(first_token, fmt).tm_mon
        except ValueError:
            continue
    raise ValueError(f"Unrecognised month label: {month_text!r}")


today = datetime.today()

latest_meeting = None
latest_date = None
meeting_html = None

# The driver is always closed, even if the page load or the DOM queries fail.
try:
    driver.get("https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm")
    time.sleep(2)

    # === Grab all meeting blocks
    meeting_blocks = driver.find_elements(By.CSS_SELECTOR, ".fomc-meeting")

    # === Step 1: Find most recent past meeting block
    for block in meeting_blocks:
        try:
            month_text = block.find_element(By.CLASS_NAME, "fomc-meeting__month").text.strip()
            day_text = block.find_element(By.CLASS_NAME, "fomc-meeting__date").text.strip()
            # Meetings span several days ("17-18"); the first day is used as the meeting date.
            first_day = int(re.findall(r"\d+", day_text)[0])

            # The first 20xx number found in the block's HTML is taken as the year;
            # when none is present the current year is assumed.
            year_match = re.search(r"20\d{2}", block.get_attribute("innerHTML"))
            year = int(year_match.group()) if year_match else today.year
            month_num = _month_number(month_text)
            date_obj = datetime(year, month_num, first_day)
        except Exception:
            # Best effort: skip blocks that cannot be parsed. `Exception` (not a bare
            # `except`) so Ctrl-C / kernel interrupts still stop the cell.
            continue

        if date_obj <= today and (latest_date is None or date_obj > latest_date):
            latest_meeting = block
            latest_date = date_obj

    # Capture the HTML while the browser session is still alive; the WebElement
    # becomes unusable once the driver quits.
    if latest_meeting is not None:
        meeting_html = latest_meeting.get_attribute("innerHTML")
finally:
    driver.quit()

if latest_date is None or meeting_html is None:
    raise RuntimeError("No past FOMC meeting could be identified on the calendar page.")

print(f"✅ Latest meeting selected: {latest_date.strftime('%Y-%m-%d')}")

date_str = latest_date.strftime("%Y-%m-%d")
file_suffix = latest_date.strftime("%y%m")  # not used for the output name below
combined_data = []

# === Step 2: Parse the meeting block directly
try:
    soup = BeautifulSoup(meeting_html, "html.parser")
    statement_link = soup.find("a", href=re.compile(r"monetary20\d{6}a\.htm"))
    minutes_link = soup.find("a", href=re.compile(r"fomcminutes20\d{6}\.htm"))
    pdf_link = soup.find("a", href=re.compile(r"/mediacenter/files/FOMCpresconf20\d{6}\.pdf"))
except Exception as e:
    print(f"❌ Failed to parse links from latest meeting: {e}")
    statement_link = minutes_link = pdf_link = None

# Each document is fetched independently so one failed download does not
# prevent the others from being saved.

# --- Statement ---
if statement_link:
    statement_url = urljoin(FED_BASE_URL, statement_link['href'])
    try:
        statement_soup = BeautifulSoup(_fetch(statement_url).text, "html.parser")
        statement_text = statement_soup.get_text(separator="\n", strip=True)
        combined_data.append({
            "date": date_str,
            "type": "statement",
            "url": statement_url,
            "source_type": "HTML",
            "content": statement_text
        })
    except Exception as e:
        print(f"⚠️ Statement download failed: {e}")

# --- Minutes ---
if minutes_link:
    minutes_url = urljoin(FED_BASE_URL, minutes_link['href'])
    try:
        minutes_soup = BeautifulSoup(_fetch(minutes_url).text, "html.parser")
        minutes_text = minutes_soup.get_text(separator="\n", strip=True)
        combined_data.append({
            "date": date_str,
            "type": "minutes",
            "url": minutes_url,
            "source_type": "HTML",
            "content": minutes_text
        })
    except Exception as e:
        print(f"⚠️ Minutes download failed: {e}")

# --- Press Conference Transcript PDF ---
if pdf_link:
    pdf_url = urljoin(FED_BASE_URL, pdf_link['href'])
    try:
        reader = PdfReader(BytesIO(_fetch(pdf_url).content))
        # extract_text() may return None for a page; treat that as empty text.
        pdf_text = "\n".join(page.extract_text() or "" for page in reader.pages)
        combined_data.append({
            "date": date_str,
            "type": "press_conference",
            "url": pdf_url,
            "source_type": "PDF",
            "content": pdf_text
        })
    except Exception as e:
        print(f"⚠️ PDF extract failed: {e}")

# === Step 3: Save as ./fomc.json (fixed name; the cleaning stage reads this path)
output_path = "./fomc.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(combined_data, f, ensure_ascii=False, indent=2)

print(f"✅ Saved all documents to {output_path}")

✅ Latest meeting selected: 2025-06-17
✅ Saved all documents to ./fomc.json


CNBC News Articles

In [20]:
# === Setup ===
# The `Options.headless = True` setter was deprecated in Selenium 4.8 and dropped in
# later 4.x releases; the Chrome command-line flag works across Selenium 4.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

# The driver is intentionally left open on success: the article-download cell
# reuses it and is responsible for quitting it. It is closed here only on failure.
try:
    driver.get("https://www.cnbc.com/federal-reserve/")
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
except Exception:
    driver.quit()
    raise

# === Time filter ===
today = datetime.today()
one_week_ago = today - timedelta(days=7)

# Strips an ordinal suffix only when it directly follows a digit ("18th" -> "18"),
# so letters inside day or month names are never touched.
ordinal_suffix = re.compile(r"(?<=\d)(?:st|nd|rd|th)\b")

# === Article extraction ===
articles = []

# Find all article blocks
for card in soup.find_all("div", class_="Card-card"):
    title_tag = card.find("a", class_="Card-title")
    date_tag = card.find("span", class_="Card-time")

    if not title_tag or not date_tag:
        continue

    # Skip anchors without a link target instead of raising KeyError.
    href = title_tag.get("href")
    if not href:
        continue

    date_text = date_tag.get_text(strip=True)

    try:
        clean_date = ordinal_suffix.sub("", date_text)
        article_date = datetime.strptime(clean_date, "%a, %b %d %Y")
    except ValueError:
        # Cards whose timestamp is not in the "Wed, Jun 18 2025" form are skipped.
        continue

    if article_date < one_week_ago:
        continue

    articles.append({
        "title": title_tag.text.strip(),
        # urljoin leaves absolute URLs unchanged and resolves relative ones.
        "url": urljoin("https://www.cnbc.com", href),
        "date": article_date.strftime("%Y-%m-%d")
    })

print(f"✅ Found {len(articles)} articles within 7 days.")
for a in articles:
    print(f"- {a['date']}: {a['title']} ({a['url']})")

✅ Found 13 articles within 7 days.
- 2025-06-18: Stagflation on the Fed’s mind (https://www.cnbc.com/2025/06/19/cnbc-daily-open-stagflation-on-the-feds-mind.html)
- 2025-06-18: Here’s how Wall Street is reacting to the Fed’s updated rate cut outlook (https://www.cnbc.com/2025/06/18/heres-how-wall-street-is-reacting-to-the-feds-updated-rate-cut-outlook.html)
- 2025-06-18: Fed sees preferred inflation gauge topping 3%,  higher than previous forecast (https://www.cnbc.com/2025/06/18/federal-reserve-dot-plot-and-economic-projection-june-2025.html)
- 2025-06-18: Here’s what changed in the new Fed statement (https://www.cnbc.com/2025/06/18/fed-meeting-heres-what-changed-in-the-new-statement.html)
- 2025-06-18: Fed holds interest rates steady: Here’s what that means for your wallet (https://www.cnbc.com/2025/06/18/fed-holds-interest-rates-steady-what-that-means-for-your-money.html)
- 2025-06-18: Fed holds key rate steady, still sees two more cuts this year (https://www.cnbc.com/2025/06/18/fed

In [None]:
# Visit every shortlisted article with the already-open Selenium driver and
# collect its title, list items and paragraphs into one text blob per article.
recent_articles = []

for article in articles:
    try:
        driver.get(article['url'])
        time.sleep(2)  # give the page time to render before reading its source

        page_soup = BeautifulSoup(driver.page_source, 'html.parser')

        list_items = page_soup.find_all('li')
        paragraph_tags = page_soup.find_all('p')

        # The title always comes first, followed by the optional sections.
        content_parts = [article['title']]

        if list_items:
            content_parts.append("Summary:")
            for node in list_items:
                text = node.get_text(strip=True)
                if text:
                    content_parts.append(text)

        if paragraph_tags:
            content_parts.append("Body:")
            for node in paragraph_tags:
                text = node.get_text(strip=True)
                if text:
                    content_parts.append(text)

        recent_articles.append({
            "title": article["title"],
            "url": article["url"],
            "date": article["date"],
            "content": '\n'.join(content_parts)
        })

    except Exception as e:
        # A failure on one article is reported and the loop moves on to the next.
        print(f"Failed to process {article['url']}: {e}")

# The browser session opened by the listing cell is closed here.
driver.quit()

In [23]:
# === Write the downloaded CNBC articles to ./cnbc.json
output_path = "./cnbc.json"
with open(output_path, "w", encoding="utf-8") as out_file:
    # ensure_ascii=False keeps typographic quotes etc. readable in the file.
    json.dump(recent_articles, out_file, ensure_ascii=False, indent=2)

print(f"✅ Saved all documents to {output_path}")

✅ Saved all documents to ./cnbc.json


## Cleaning and validating sentences

In [13]:
# === Sentence splitter (basic) ===
# Splits after '.', '!' or '?' when followed by one or more spaces (newlines do not split).
sentence_pattern = re.compile(r'(?<=[.!?]) +')

# === Define keyword and splitting logic ===
# Contrast markers on which a sentence is further divided into clauses.
# The pattern is case-sensitive, so a capitalised "However" is not a split point.
split_tokens = ["but", "however", "even though", "although", "while", ";"]
split_pattern = re.compile(r"\b(" + "|".join(map(re.escape, split_tokens)) + r")\b|;")

# A fragment is kept only if it contains at least one of these terms (lower-cased).
keywords = set(map(str.lower, [
    # Panel A1
    "inflation expectation", "interest rate", "bank rate", "fund rate", "price", 
    "economic activity", "inflation", "employment",
    # Panel A2
    "anchor", "cut", "subdue", "decline", "decrease", "reduce", "low", "drop", "fall",
    "fell", "decelerate", "slow", "pause", "pausing", "stable", "non-accelerating", 
    "downward", "tighten",
    # Panel B1
    "unemployment", "growth", "exchange rate", "productivity", "deficit", "demand",
    "job market", "monetary policy",
    # Panel B2
    "ease", "easing", "rise", "rising", "increase", "expand", "improve", "strong", 
    "upward", "raise", "high", "rapid"
]))

# Boilerplate markers: any fragment containing one of these is discarded.
# They are matched against lower-cased text, so every entry is normalised to
# lower case here; otherwise "CNBC" and "NBCUniversal" could never match.
junk_phrases = [phrase.lower() for phrase in [
        "cookie", "cookies", "terms of use", "privacy policy", "ads and content", 
        "by using this site", "subscribe", "sign up", "CNBC", "NBCUniversal", "copyright",
        "click", "browser", "advertise with us"
    ]]

In [14]:
# === Load the scraped FOMC and CNBC documents and merge them ===
def _read_json(path):
    """Return the parsed contents of the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


fomc_data = _read_json("./fomc.json")
cnbc_data = _read_json("./cnbc.json")

# FOMC documents come first, followed by the CNBC articles.
all_data = fomc_data + cnbc_data

In [15]:
# === Result: sentences grouped by URL
filtered_sentences_by_url = {}

# Compiled once instead of rebuilding a regex for every keyword on every fragment.
keyword_patterns = [re.compile(rf"\b{re.escape(keyword)}\b") for keyword in keywords]

# Fragments are compared in lower case, so the junk markers must be too;
# mixed-case entries such as "CNBC" would otherwise never match.
junk_phrases_lower = [junk_phrase.lower() for junk_phrase in junk_phrases]


def _select_fragments(content):
    """Return the keyword-bearing fragments of ``content``.

    The text is split into sentences, each sentence is split again on the
    contrast tokens, and a fragment is kept when it
    - has at least 3 words, at most 3 newlines and at least one '.', '!' or '?',
    - contains no junk phrase, and
    - contains at least one keyword as a whole word.
    """
    selected = []
    for sentence in sentence_pattern.split(content):
        sentence = sentence.strip()
        if not sentence:
            continue

        # split() also returns the captured tokens (and None for a bare ';');
        # both are dropped here so only the text between the tokens is kept.
        parts = [
            part.strip()
            for part in split_pattern.split(sentence)
            if part and not split_pattern.match(part)
        ]

        for part in parts:
            if len(part.split()) < 3 or part.count('\n') > 3 or len(re.findall(r'[.!?]', part)) < 1:
                continue

            part_lower = part.lower()

            if any(junk_phrase in part_lower for junk_phrase in junk_phrases_lower):
                continue

            if any(pattern.search(part_lower) for pattern in keyword_patterns):
                selected.append(part)
    return selected


# === Process each item ===
for item in all_data:
    # `or ""` also covers an explicit null content value in the JSON.
    content = item.get("content") or ""
    url = item.get("url", "unknown_source")
    source_type = item.get("type", "unknown_type")

    if not content.strip():
        continue

    valid_sentences = _select_fragments(content)

    # === If this item had any valid sentences, save them by URL
    if valid_sentences:
        # Items sharing a URL (or the "unknown_source" fallback) are merged rather
        # than overwriting each other; the first item's type is kept.
        entry = filtered_sentences_by_url.setdefault(
            url, {"type": source_type, "sentences": []}
        )
        entry["sentences"].extend(valid_sentences)

# === Save
output_path = "filtered_sentences_by_url.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(filtered_sentences_by_url, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(filtered_sentences_by_url)} documents with filtered sentences to {output_path}")

✅ Saved 10 documents with filtered sentences to filtered_sentences_by_url.json


In [19]:
# === Flatten the per-URL groups into one list, in document order ===
filtered_sentences = [
    sentence
    for data in filtered_sentences_by_url.values()
    for sentence in data["sentences"]
]

print(f"✅ Found {len(filtered_sentences)} valid sentences.")
preview = filtered_sentences[:20]  # preview first 20
for s in preview:
    print("-", s)

# === Save to json ===
with open("processed_sentences.json", "w", encoding="utf-8") as out_file:
    json.dump(filtered_sentences, out_file, ensure_ascii=False, indent=2)

✅ Found 66 valid sentences.
- EDT
Share
Although swings in net exports have affected the data, recent indicators suggest that economic activity has continued to expand at a solid pace.
- The unemployment rate remains low, and labor market conditions remain solid.
- Inflation remains somewhat elevated.
The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run.
- The Committee is strongly committed to supporting maximum employment and returning inflation to its 2 percent objective.
In assessing the appropriate stance of monetary policy, the Committee will continue to monitor the implications of incoming information for the economic outlook.
- The Committee would be prepared to adjust the stance of monetary policy as appropriate if risks emerge that could impede the attainment of the Committee's goals.
- The Committee's assessments will take into account a wide range of information, including readings on labor market conditions, inflation

## Loading the fine-tuned RoBERTa classifier and the Gemma 2 (2B, instruction-tuned) model for classification and summarisation

In [2]:
# === Pick the device first so both models are loaded consistently
# Previously device_map="cuda" was hard-coded, so the CPU fallback could never be reached.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load Gemma 2 2B IT (causal LM)
save_dir = "../models/saved_gemma_2_2b_it_model"
# Half precision only on GPU; full precision is used on CPU.
gemma_dtype = torch.float16 if device.type == "cuda" else torch.float32
gemma_tokenizer = AutoTokenizer.from_pretrained(save_dir)
# device_map places the weights on the chosen device while loading, so no
# separate .to(device) call is needed afterwards.
gemma_model = AutoModelForCausalLM.from_pretrained(
    save_dir, torch_dtype=gemma_dtype, device_map=device.type
)

print(f"✅ Loaded Gemma on device: {device}")

# === Load finetuned RoBERTa (sequence classification)
roberta_dir = "../models/finetuned_roberta_model_pre_overfit_epoch_8"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_dir)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_dir)

# Move to CUDA if available
roberta_model = roberta_model.to(device)

print(f"✅ Loaded finetuned RoBERTa on device: {device}")

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.45s/it]


✅ Loaded Gemma on device: cuda
✅ Loaded finetuned RoBERTa on device: cuda


In [3]:
# Reload the flattened sentence list written by the cleaning stage.
with open("./processed_sentences.json", "r", encoding="utf-8") as sentences_file:
    processed_sentences = json.loads(sentences_file.read())

In [8]:
# === Process sentences ===
# Class-id -> stance name, built once rather than on every iteration.
# NOTE(review): assumes the fine-tuning run used 0=dovish, 1=neutral, 2=hawkish;
# confirm against the training labels (e.g. roberta_model.config.id2label).
label_map = {
    0: "dovish",
    1: "neutral",
    2: "hawkish"
}

# Make sure dropout is disabled even if an earlier cell left the model in train mode.
roberta_model.eval()

results = []

for sentence in tqdm(processed_sentences):
    # Sentences are classified one at a time; inputs longer than 512 tokens are truncated.
    inputs = roberta_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = roberta_model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()

    # Ids outside the map fall back to their string form.
    label_text = label_map.get(predicted_class_id, str(predicted_class_id))

    results.append({
        "sentence": sentence,
        "label_id": predicted_class_id,
        "label": label_text
    })

# === Save to JSON ===
output_path = "roberta_sentence_labels.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(results)} sentence labels to {output_path}")

100%|██████████| 66/66 [00:14<00:00,  4.43it/s]

✅ Saved 66 sentence labels to roberta_sentence_labels.json



