In [1]:
import json
import os
import time
import re
import requests
import time
import torch
import random
from tqdm import tqdm

# Using Selenium for web scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
from urllib.parse import urljoin
from datetime import datetime, timedelta

# Using transformers to load models
from transformers import AutoTokenizer, AutoModelForCausalLM, RobertaForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## Loading most recent statements/transcripts/minutes/articles from websites

FOMC meeting minutes/press conference transcripts/statements

In [2]:
# === Setup headless Selenium
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm")
time.sleep(2)

# === Grab all meeting blocks
meeting_blocks = driver.find_elements(By.CSS_SELECTOR, ".fomc-meeting")
today = datetime.today()

latest_meeting = None
latest_date = None

# === Step 1: Find most recent past meeting block
for block in meeting_blocks:
    try:
        month_text = block.find_element(By.CLASS_NAME, "fomc-meeting__month").text.strip()
        day_text = block.find_element(By.CLASS_NAME, "fomc-meeting__date").text.strip()
        first_day = int(re.findall(r"\d+", day_text)[0])

        year_match = re.search(r"20\d{2}", block.get_attribute("innerHTML"))
        year = int(year_match.group()) if year_match else today.year
        month_num = time.strptime(month_text, '%B').tm_mon
        date_obj = datetime(year, month_num, first_day)

        if date_obj <= today and (latest_date is None or date_obj > latest_date):
            latest_meeting = block
            latest_date = date_obj
    except:
        continue
    
print(f"✅ Latest meeting selected: {latest_date.strftime('%Y-%m-%d')}")

date_str = latest_date.strftime("%Y-%m-%d")
file_suffix = latest_date.strftime("%y%m")
combined_data = []

# === Step 2: Parse the meeting block directly
try:
    soup = BeautifulSoup(latest_meeting.get_attribute("innerHTML"), "html.parser")

    # --- Statement ---
    statement_link = soup.find("a", href=re.compile(r"monetary20\d{6}a\.htm"))
    if statement_link:
        statement_url = urljoin("https://www.federalreserve.gov", statement_link['href'])
        response = requests.get(statement_url)
        statement_soup = BeautifulSoup(response.text, "html.parser")
        statement_text = statement_soup.get_text(separator="\n", strip=True)
        combined_data.append({
            "date": date_str,
            "type": "statement",
            "url": statement_url,
            "source_type": "HTML",
            "content": statement_text
        })

    # --- Minutes ---
    minutes_link = soup.find("a", href=re.compile(r"fomcminutes20\d{6}\.htm"))
    if minutes_link:
        minutes_url = urljoin("https://www.federalreserve.gov", minutes_link['href'])
        response = requests.get(minutes_url)
        minutes_soup = BeautifulSoup(response.text, "html.parser")
        minutes_text = minutes_soup.get_text(separator="\n", strip=True)
        combined_data.append({
            "date": date_str,
            "type": "minutes",
            "url": minutes_url,
            "source_type": "HTML",
            "content": minutes_text
        })

    # --- Press Conference Transcript PDF ---
    pdf_link = soup.find("a", href=re.compile(r"/mediacenter/files/FOMCpresconf20\d{6}\.pdf"))
    if pdf_link:
        pdf_url = urljoin("https://www.federalreserve.gov", pdf_link['href'])
        try:
            response = requests.get(pdf_url)
            reader = PdfReader(BytesIO(response.content))
            pdf_text = "\n".join(page.extract_text() or "" for page in reader.pages)
            combined_data.append({
                "date": date_str,
                "type": "press_conference",
                "url": pdf_url,
                "source_type": "PDF",
                "content": pdf_text
            })
        except Exception as e:
            print(f"⚠️ PDF extract failed: {e}")
except Exception as e:
    print(f"❌ Failed to parse links from latest meeting: {e}")

driver.quit()

# === Step 3: Save as ./fomc_YYMM.json
output_path = f"./fomc.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(combined_data, f, ensure_ascii=False, indent=2)

print(f"✅ Saved all documents to {output_path}")

✅ Latest meeting selected: 2025-06-17
✅ Saved all documents to ./fomc.json


CNBC News Articles

In [20]:
# === Setup ===
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://www.cnbc.com/federal-reserve/")
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'html.parser')

# === Time filter ===
today = datetime.today()
one_week_ago = today - timedelta(days=7)

# === Article extraction ===
articles = []

# Find all article blocks
for card in soup.find_all("div", class_="Card-card"):
    title_tag = card.find("a", class_="Card-title")
    date_tag = card.find("span", class_="Card-time")

    if not title_tag or not date_tag:
        continue

    date_text = date_tag.get_text(strip=True)

    try:
        clean_date = date_text.replace('st', '').replace('nd', '').replace('rd', '').replace('th', '')
        article_date = datetime.strptime(clean_date, "%a, %b %d %Y")
    except ValueError:
        continue

    if article_date < one_week_ago:
        continue

    articles.append({
        "title": title_tag.text.strip(),
        "url": title_tag["href"],
        "date": article_date.strftime("%Y-%m-%d")
    })

print(f"✅ Found {len(articles)} articles within 7 days.")
for a in articles:
    print(f"- {a['date']}: {a['title']} ({a['url']})")

✅ Found 13 articles within 7 days.
- 2025-06-18: Stagflation on the Fed’s mind (https://www.cnbc.com/2025/06/19/cnbc-daily-open-stagflation-on-the-feds-mind.html)
- 2025-06-18: Here’s how Wall Street is reacting to the Fed’s updated rate cut outlook (https://www.cnbc.com/2025/06/18/heres-how-wall-street-is-reacting-to-the-feds-updated-rate-cut-outlook.html)
- 2025-06-18: Fed sees preferred inflation gauge topping 3%,  higher than previous forecast (https://www.cnbc.com/2025/06/18/federal-reserve-dot-plot-and-economic-projection-june-2025.html)
- 2025-06-18: Here’s what changed in the new Fed statement (https://www.cnbc.com/2025/06/18/fed-meeting-heres-what-changed-in-the-new-statement.html)
- 2025-06-18: Fed holds interest rates steady: Here’s what that means for your wallet (https://www.cnbc.com/2025/06/18/fed-holds-interest-rates-steady-what-that-means-for-your-money.html)
- 2025-06-18: Fed holds key rate steady, still sees two more cuts this year (https://www.cnbc.com/2025/06/18/fed

In [None]:
recent_articles = []

for article in articles:
    try:
        driver.get(article['url'])
        time.sleep(2)
        
        page_soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        summary = page_soup.find_all('li')
        paragraphs = page_soup.find_all('p')
        
        content_parts = []

        content_parts.append(article['title'])
        
        if summary:
            content_parts.append("Summary:")
            content_parts.extend(line.get_text(strip=True) for line in summary if line.get_text(strip=True))

        if paragraphs:
            content_parts.append("Body:")
            content_parts.extend(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

        content = '\n'.join(content_parts)
                
        article_data = {
            "title": article["title"],
            "url": article["url"],
            "date": article["date"],
            "content": content
        }
        
        recent_articles.append(article_data)
        
    except Exception as e:
        print(f"Failed to process {article['url']}: {e}")
        
driver.quit()

In [23]:
# === Step 3: Save as ./fomc_YYMM.json
output_path = f"./cnbc.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(recent_articles, f, ensure_ascii=False, indent=2)

print(f"✅ Saved all documents to {output_path}")

✅ Saved all documents to ./cnbc.json


## Cleaning and validating sentences

In [13]:
# === Sentence splitter (basic) ===
sentence_pattern = re.compile(r'(?<=[.!?]) +')

# === Define keyword and splitting logic ===
split_tokens = ["but", "however", "even though", "although", "while", ";"]
split_pattern = re.compile(r"\b(" + "|".join(map(re.escape, split_tokens)) + r")\b|;")

keywords = set(map(str.lower, [
    # Panel A1
    "inflation expectation", "interest rate", "bank rate", "fund rate", "price", 
    "economic activity", "inflation", "employment",
    # Panel A2
    "anchor", "cut", "subdue", "decline", "decrease", "reduce", "low", "drop", "fall",
    "fell", "decelerate", "slow", "pause", "pausing", "stable", "non-accelerating", 
    "downward", "tighten",
    # Panel B1
    "unemployment", "growth", "exchange rate", "productivity", "deficit", "demand",
    "job market", "monetary policy",
    # Panel B2
    "ease", "easing", "rise", "rising", "increase", "expand", "improve", "strong", 
    "upward", "raise", "high", "rapid"
]))

junk_phrases = [
        "cookie", "cookies", "terms of use", "privacy policy", "ads and content", 
        "by using this site", "subscribe", "sign up", "CNBC", "NBCUniversal", "copyright",
        "click", "browser", "advertise with us"
    ]

In [14]:
# === Read two json files ===
with open("./fomc.json", "r", encoding="utf-8") as f:
    fomc_data = json.load(f)

with open("./cnbc.json", "r", encoding="utf-8") as f:
    cnbc_data = json.load(f)
    
all_data = fomc_data + cnbc_data

In [15]:
# === Result: sentences grouped by URL
filtered_sentences_by_url = {}

# === Process each item ===
for item in all_data:
    content = item.get("content", "")
    url = item.get("url", "unknown_source")
    source_type = item.get("type", "unknown_type")

    if not content.strip():
        continue

    # --- Split into sentences ---
    sentences = sentence_pattern.split(content)

    valid_sentences = []

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # --- Split further on split tokens ---
        parts = split_pattern.split(sentence)
        parts = [part.strip() for part in parts if part and not re.match(split_pattern, part)]

        for part in parts:
            if len(part.split()) < 3 or part.count('\n') > 3 or len(re.findall(r'[.!?]', part)) < 1:
                continue

            part_lower = part.lower()

            if any(junk_phrase in part_lower for junk_phrase in junk_phrases):
                continue

            if any(re.search(rf"\b{re.escape(keyword)}\b", part_lower) for keyword in keywords):
                valid_sentences.append(part)

    # === If this item had any valid sentences, save them by URL
    if valid_sentences:
        filtered_sentences_by_url[url] = {
            "type": source_type,
            "sentences": valid_sentences
        }

# === Save
output_path = "filtered_sentences_by_url.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(filtered_sentences_by_url, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(filtered_sentences_by_url)} documents with filtered sentences to {output_path}")

✅ Saved 10 documents with filtered sentences to filtered_sentences_by_url.json


In [19]:
# === Done ===
filtered_sentences = []

for url, data in filtered_sentences_by_url.items():
    for sentence in data["sentences"]:
        filtered_sentences.append(sentence)
    
print(f"✅ Found {len(filtered_sentences)} valid sentences.")
for s in filtered_sentences[:20]:  # preview first 20
    print("-", s)

# === Save to json ===
with open("processed_sentences.json", "w", encoding="utf-8") as f:
    json.dump(filtered_sentences, f, ensure_ascii=False, indent=2)

✅ Found 66 valid sentences.
- EDT
Share
Although swings in net exports have affected the data, recent indicators suggest that economic activity has continued to expand at a solid pace.
- The unemployment rate remains low, and labor market conditions remain solid.
- Inflation remains somewhat elevated.
The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run.
- The Committee is strongly committed to supporting maximum employment and returning inflation to its 2 percent objective.
In assessing the appropriate stance of monetary policy, the Committee will continue to monitor the implications of incoming information for the economic outlook.
- The Committee would be prepared to adjust the stance of monetary policy as appropriate if risks emerge that could impede the attainment of the Committee's goals.
- The Committee's assessments will take into account a wide range of information, including readings on labor market conditions, inflation

## Loading roBERTa base model and Mistral model for classification and summarisation

In [4]:
# === Load Mistral model (causal LM)
save_dir = "../models/saved_gemma_2_2b_it_model"

gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
gemma_model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it")

# Move to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gemma_model = gemma_model.to(device)

print(f"✅ Loaded Gemma on device: {device}")

# === Load finetuned RoBERTa (sequence classification)
roberta_tokenizer = AutoTokenizer.from_pretrained("../models/finetuned_roberta_model_best_val_loss")
roberta_model = RobertaForSequenceClassification.from_pretrained("../models/finetuned_roberta_model_best_val_loss", num_labels=3)

# Move to CUDA if available
roberta_model = roberta_model.to(torch.float32).to(device) 
roberta_model.eval()

print(f"✅ Loaded finetuned RoBERTa on device: {device}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.09s/it]


✅ Loaded Gemma on device: cpu
✅ Loaded finetuned RoBERTa on device: cpu


In [5]:
# Loading second RoBERTa model, pre-overfit
roberta_tokenizer_pre_overfit = AutoTokenizer.from_pretrained("../models/finetuned_roberta_model_pre_overfit_epoch_8")
roberta_model_pre_overfit = RobertaForSequenceClassification.from_pretrained("../models/finetuned_roberta_model_pre_overfit_epoch_8", num_labels=3)

# Move to CUDA if available
roberta_model_pre_overfit = roberta_model_pre_overfit.to(torch.float32).to(device) 
roberta_model_pre_overfit.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [6]:
with open("./processed_sentences.json", "r", encoding="utf-8") as f:
    processed_sentences = json.load(f)

Generating labels for each sentence using roBERTa base

In [7]:
# Using a test sentence to check model's accuracy
sentence = "The Fed is going to lower interest rates"

In [8]:
inputs = roberta_tokenizer_pre_overfit(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = roberta_model_pre_overfit(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=-1).item()

In [9]:
predicted_class_id

1

In [85]:
# === Process sentences ===
results = []

for sentence in tqdm(processed_sentences):
    inputs = roberta_tokenizer_pre_overfit(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = roberta_model_pre_overfit(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()

    # OPTIONAL: map label to text (depends on your training labels!)
    label_map = {
        0: "hawkish",
        1: "dovish",
        2: "neutral"
    }
    label_text = label_map.get(predicted_class_id, str(predicted_class_id))

    results.append({
        "sentence": sentence,
        "label_id": predicted_class_id,
        "label": label_text
    })

# === Save to JSON ===
output_path = "roberta_sentence_labels.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(results)} sentence labels to {output_path}")

100%|██████████| 66/66 [00:01<00:00, 49.11it/s]

✅ Saved 66 sentence labels to roberta_sentence_labels.json





In [6]:
with open("./roberta_sentence_labels.json", "r", encoding="utf-8") as f:
    roberta_sentence_labels = json.load(f)

In [9]:
num_hawkish, num_dovish = 0, 0 
num_total_sentences = len(processed_sentences)

for sentence in roberta_sentence_labels:
    if sentence['label_id'] == 0:
        num_hawkish += 1
    elif sentence['label_id'] == 1:
        num_dovish += 1
    
index = (num_hawkish - num_dovish) / num_total_sentences

In [10]:
print(f"Number of hawkish sentences: {num_hawkish}, number of dovish sentences: {num_dovish}, number of neutral sentences: {num_total_sentences - num_hawkish - num_dovish}")

Number of hawkish sentences: 14, number of dovish sentences: 21, number of neutral sentences: 31


In [11]:
# An index < 0 implies that there are more dovish sentences than hawkish sentences
index

-0.10606060606060606

Generating summary for CNBC articles and FOMC sentences using Gemma 2

Formatting sentences in roberta_sentence_labels.json into LLM-friendly format

In [12]:
sentence_lines = []
for i, entry in enumerate(roberta_sentence_labels):
    sentence_lines.append(f"[{i+1}] \"{entry['sentence']}\" → Label: {entry['label'].capitalize()}")

sentences_text_block = "\n".join(sentence_lines)

In [13]:
sentences_text_block

'[1] "EDT\nShare\nAlthough swings in net exports have affected the data, recent indicators suggest that economic activity has continued to expand at a solid pace." → Label: Neutral\n[2] "The unemployment rate remains low, and labor market conditions remain solid." → Label: Neutral\n[3] "Inflation remains somewhat elevated.\nThe Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run." → Label: Neutral\n[4] "The Committee is strongly committed to supporting maximum employment and returning inflation to its 2 percent objective.\nIn assessing the appropriate stance of monetary policy, the Committee will continue to monitor the implications of incoming information for the economic outlook." → Label: Neutral\n[5] "The Committee would be prepared to adjust the stance of monetary policy as appropriate if risks emerge that could impede the attainment of the Committee\'s goals." → Label: Neutral\n[6] "The Committee\'s assessments will take into a

In [None]:
# Load model directly
# Gemma-2's context window is too small, mistral-7b is too large to run on GPU, TinyLlama is too slow

from transformers import AutoTokenizer, AutoModelForCausalLM

tiny_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tiny_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Move to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tiny_model = tiny_model.to(device)

In [None]:
import openai

# Set your OpenAI API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Build input text
user_message = f"""
Given the following sentences and their sentiment classifications, summarise the overall monetary policy stance of the Fed.

The index is calculated as: (number of hawkish sentences - number of dovish sentences) / (total number of sentences).
A positive index (> 0) indicates an overall hawkish stance.
A negative index (< 0) indicates an overall dovish stance.
An index close to 0 indicates a neutral stance.

Please consider both the index and the provided sentences in your reasoning.

Index value: {index:.2f}

Sentences:
{sentences_text_block}

Summary:
"""

# Call OpenAI API (GPT-3.5 Turbo)
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a monetary policy expert."},
        {"role": "user", "content": user_message}
    ],
    max_tokens=500,  # Limit for summary length — adjust if needed
    temperature=0  # Adjust for creativity (lower = more factual)
)

In [18]:
summary = response.choices[0].message.content

print(summary)

The overall monetary policy stance of the Fed, based on the provided sentences and sentiment classifications, is slightly dovish. The index value of -0.11 indicates a leaning towards a dovish stance. While there are some hawkish sentiments expressed in the sentences related to inflation concerns and the potential impact of tariffs on prices, there are also dovish sentiments regarding the potential need for rate cuts due to economic slowdown, uncertainties related to tariffs, and concerns about slower growth. The neutral sentiments in many sentences also contribute to the overall slightly dovish stance.


In [14]:
max_context_length = 10000

def summarise_all(sentences_text_block, index):
    formatted_input = (
        "<bos><start_of_turn>user\n"
        "You are a monetary policy expert.\n"
        "Given the following sentences and their sentiment classifications, summarise the overall monetary policy stance of the Fed.\n\n"
        f"The index is calculated as: (number of hawkish sentences - number of dovish sentences) / (total number of sentences).\n"
        "A positive index (> 0) indicates an overall hawkish stance.\n"
        "A negative index (< 0) indicates an overall dovish stance.\n"
        "An index close to 0 indicates a neutral stance.\n"
        "Please consider both the index and the provided sentences in your reasoning.\n\n"
        f"Index value: {index:.2f}\n\n"
        f"Sentences:\n{sentences_text_block}\n\n"
        "Summary:<end_of_turn>\n<start_of_turn>model\n"
    )

    # Tokenise
    inputs = tiny_tokenizer(
        formatted_input, 
        return_tensors="pt", 
        truncation=True, 
        max_length=max_context_length
    ).to(device)

    with torch.no_grad():
        outputs = tiny_model.generate(**inputs, max_new_tokens=512)

    response = tiny_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract generated summary
    summary = response.split("Summary:")[-1].strip()

    print("=== Summary ===")
    print(summary)

    return summary

In [15]:
summarise_all(sentences_text_block, index)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


KeyboardInterrupt: 