In [1]:
import json
import os
import time
import re
import requests
import torch
import random
from datetime import datetime, timedelta

# Using Selenium for web scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from bs4 import BeautifulSoup
from io import BytesIO
from urllib.parse import urljoin

# Using transformers to load models
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


## Functions for live extraction of articles for generation of WEEKLY sentiment

In [23]:
# === Setup ===
# Listing page that is scraped; also used as the base for resolving relative links.
LISTING_URL = "https://www.cnbc.com/federal-reserve/"

options = Options()
# The `options.headless = True` setter was deprecated in Selenium 4.8 and later
# removed; on releases without it the assignment only creates an unused
# attribute and Chrome opens a visible window. The command-line flag works on
# every Selenium 4 release.
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
driver.get(LISTING_URL)
time.sleep(5)  # fixed pause, presumably to let the page finish rendering

soup = BeautifulSoup(driver.page_source, 'html.parser')

# === Time filter ===
today = datetime.today()
one_week_ago = today - timedelta(days=7)

# === Article extraction ===
articles = []

# Find all article blocks
for card in soup.find_all("div", class_="Card-card"):
    title_tag = card.find("a", class_="Card-title")
    date_tag = card.find("span", class_="Card-time")

    if not title_tag or not date_tag:
        continue

    # Skip anchors without a target instead of raising KeyError.
    href = title_tag.get("href")
    if not href:
        continue

    date_text = date_tag.get_text(strip=True)

    try:
        # Remove only an ordinal suffix that directly follows a digit ("12th" -> "12"),
        # rather than deleting every "st"/"nd"/"rd"/"th" anywhere in the text.
        clean_date = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", date_text)
        article_date = datetime.strptime(clean_date, "%a, %b %d %Y")
    except ValueError:
        # Cards whose timestamp is not in the "Thu, Jun 12 2025" form are skipped.
        continue

    if article_date < one_week_ago:
        continue

    articles.append({
        "title": title_tag.text.strip(),
        # Absolute hrefs pass through unchanged; relative ones are resolved
        # against the listing page so the next cell can always open them.
        "url": urljoin(LISTING_URL, href),
        "date": article_date.strftime("%Y-%m-%d")
    })

print(f"✅ Found {len(articles)} articles within 7 days.")
for a in articles:
    print(f"- {a['date']}: {a['title']} ({a['url']})")

✅ Found 7 articles within 7 days.
- 2025-06-12: What a Trump, Powell showdown means for your money (https://www.cnbc.com/2025/06/12/what-a-trump-powell-fed-showdown-means-for-your-money.html)
- 2025-06-12: Yellen expects Trump’s tariffs will hike inflation to 3% year over year (https://www.cnbc.com/2025/06/12/yellen-trump-tariffs-inflation.html)
- 2025-06-12: Trump calls Fed chief Powell ‘numbskull’ as he urges interest rate cut (https://www.cnbc.com/2025/06/12/trump-powell-numbskull-fed-rates.html)
- 2025-06-12: Good news on U.S. trade and inflation isn’t lifting markets (https://www.cnbc.com/2025/06/12/cnbc-daily-open-good-news-on-us-trade-and-inflation-not-lifting-markets.html)
- 2025-06-12: There’s progress on trade and U.S. inflation — but it’s harder to rely on such news (https://www.cnbc.com/2025/06/12/cnbc-daily-open-hard-to-count-on-us-trade-and-inflation-news.html)
- 2025-06-11: A ‘shadow’ Fed chair could be coming. Who it could be and how markets might react (https://www.cnb

In [24]:
# Visit every collected article and store its title plus page text.
recent_articles = []

try:
    for article in articles:
        try:
            driver.get(article['url'])
            time.sleep(2)

            page_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # NOTE: these selectors match every <li> and <p> on the page, not
            # only those inside the article body.
            summary = page_soup.find_all('li')
            paragraphs = page_soup.find_all('p')

            # Extract each element's text once and drop the empty ones.
            summary_lines = [text for text in (li.get_text(strip=True) for li in summary) if text]
            body_lines = [text for text in (p.get_text(strip=True) for p in paragraphs) if text]

            content_parts = [article['title']]

            # The section label is added whenever matching tags exist, even if
            # all of them turned out to be empty.
            if summary:
                content_parts.append("Summary:")
                content_parts.extend(summary_lines)

            if paragraphs:
                content_parts.append("Body:")
                content_parts.extend(body_lines)

            content = '\n'.join(content_parts)

            article_data = {
                "title": article["title"],
                "url": article["url"],
                "date": article["date"],
                "content": content
            }

            recent_articles.append(article_data)

        except Exception as e:
            # Best effort: report the failing article and continue with the rest.
            print(f"Failed to process {article['url']}: {e}")
finally:
    # Always shut the browser down, even when the loop is interrupted
    # (e.g. KeyboardInterrupt), so no Chrome process is left running.
    driver.quit()

## Functions for extraction of CNBC Markets Federal Reserve articles (2025)

For the proof of concept, 35 news articles were extracted; attempting to extract more than 35 causes Selenium to time out. A future enhancement is to process articles from earlier years to build a richer training dataset.

In [3]:
# === Setup ===
options = Options()
# The `options.headless = True` setter was deprecated in Selenium 4.8 and later
# removed; on releases without it the assignment only creates an unused
# attribute and Chrome opens a visible window. The command-line flag works on
# every Selenium 4 release.
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
driver.get("https://www.cnbc.com/federal-reserve/")
time.sleep(5)  # fixed pause, presumably to let the page finish rendering

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Shared state for the extraction code below.
base_url = "https://www.cnbc.com"  # prefix for relative article links
seen_urls = set()                  # URLs already collected
articles = []                      # accumulated article records

# === Time filter ===
today = datetime.today()
one_year_ago = today - timedelta(days=365)

# === Article extraction ===
# === Article extraction ===
def extract_articles():
    """Collect not-yet-seen article cards from the page currently loaded in ``driver``.

    Parses ``driver.page_source`` and inspects every ``div.Card-card``. A card
    is turned into a record when it has a title link with an ``href``, a date
    span whose text parses as ``"%a, %b %d %Y"`` (after removing an ordinal
    suffix such as "12th"), a date not older than ``one_year_ago``, and a URL
    that is not already in ``seen_urls``.

    Side effects:
        Adds the URL of every returned record to the module-level ``seen_urls``.

    Returns:
        list[dict]: New records with keys ``"title"``, ``"url"`` (absolute)
        and ``"date"`` (``YYYY-MM-DD``).
    """
    new_articles = []
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for card in soup.find_all("div", class_="Card-card"):
        title_tag = card.find("a", class_="Card-title")
        date_tag = card.find("span", class_="Card-time")

        if not title_tag or not date_tag:
            continue

        # Skip anchors without a target instead of raising KeyError.
        href = title_tag.get("href")
        if not href:
            continue

        # urljoin leaves absolute URLs untouched and also resolves
        # protocol-relative ("//host/...") and slash-less relative links,
        # which plain string concatenation would turn into invalid URLs.
        full_url = urljoin(base_url, href)

        if full_url in seen_urls:
            continue

        date_text = date_tag.get_text(strip=True)
        try:
            # Remove only an ordinal suffix that directly follows a digit,
            # rather than deleting every "st"/"nd"/"rd"/"th" in the text.
            clean_date = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", date_text)
            article_date = datetime.strptime(clean_date, "%a, %b %d %Y")
        except ValueError:
            # Timestamps in any other form are skipped.
            continue

        if article_date < one_year_ago:
            continue

        seen_urls.add(full_url)
        new_articles.append({
            "title": title_tag.get_text(strip=True),
            "url": full_url,
            "date": article_date.strftime("%Y-%m-%d")
        })

    return new_articles

# === Load articles until the target count is reached ===
TARGET_ARTICLE_COUNT = 30  # stop paging once this many articles are collected
MAX_EMPTY_LOADS = 3        # give up after this many consecutive loads with nothing new

empty_loads = 0
while len(articles) < TARGET_ARTICLE_COUNT:
    newly_loaded = extract_articles()
    articles.extend(newly_loaded)
    print(f"🔁 Loaded {len(newly_loaded)} new, total: {len(articles)} articles")

    # Check the same threshold as the loop condition before clicking again,
    # so no extra "Load More" round-trip is made once enough articles exist.
    if len(articles) >= TARGET_ARTICLE_COUNT:
        break

    # Guard against looping forever when clicks succeed but yield no new cards.
    empty_loads = 0 if newly_loaded else empty_loads + 1
    if empty_loads >= MAX_EMPTY_LOADS:
        print("❌ 'Load More' stopped producing new articles; stopping.")
        break

    try:
        load_more = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "LoadMoreButton-loadMore"))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
        time.sleep(1)
        # Click through JavaScript rather than a native WebDriver click.
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(3)
    except Exception as e:
        print("❌ No more 'Load More' button or click failed:", e)
        break

print(f"✅ Final count: {len(articles)} articles collected.")
for a in articles:
    print(f"- {a['date']}: {a['title']} ({a['url']})")

# Visit every collected article and store its title plus page text.
recent_articles = []

try:
    for article in articles:
        try:
            driver.get(article['url'])
            time.sleep(2)

            page_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # NOTE: these selectors match every <li> and <p> on the page, not
            # only those inside the article body.
            summary = page_soup.find_all('li')
            paragraphs = page_soup.find_all('p')

            # Extract each element's text once and drop the empty ones.
            summary_lines = [text for text in (li.get_text(strip=True) for li in summary) if text]
            body_lines = [text for text in (p.get_text(strip=True) for p in paragraphs) if text]

            content_parts = [article['title']]

            # The section label is added whenever matching tags exist, even if
            # all of them turned out to be empty.
            if summary:
                content_parts.append("Summary:")
                content_parts.extend(summary_lines)

            if paragraphs:
                content_parts.append("Body:")
                content_parts.extend(body_lines)

            content = '\n'.join(content_parts)

            article_data = {
                "title": article["title"],
                "url": article["url"],
                "date": article["date"],
                "content": content
            }

            recent_articles.append(article_data)

        except Exception as e:
            # Best effort: report the failing article and continue with the rest.
            print(f"Failed to process {article['url']}: {e}")
finally:
    # Always shut the browser down, even when the loop is interrupted
    # (e.g. KeyboardInterrupt), so no Chrome process is left running.
    driver.quit()

🔁 Loaded 35 new, total: 35 articles
✅ Final count: 35 articles collected.
- 2025-06-15: Here are the 4 big things we’re watching in the stock market in the week ahead (https://www.cnbc.com/2025/06/15/the-4-big-things-were-watching-in-the-stock-market-in-the-week-ahead.html)
- 2025-06-12: What a Trump, Powell showdown means for your money (https://www.cnbc.com/2025/06/12/what-a-trump-powell-fed-showdown-means-for-your-money.html)
- 2025-06-12: Yellen expects Trump’s tariffs will hike inflation to 3% year over year (https://www.cnbc.com/2025/06/12/yellen-trump-tariffs-inflation.html)
- 2025-06-12: Trump calls Fed chief Powell ‘numbskull’ as he urges interest rate cut (https://www.cnbc.com/2025/06/12/trump-powell-numbskull-fed-rates.html)
- 2025-06-12: Good news on U.S. trade and inflation isn’t lifting markets (https://www.cnbc.com/2025/06/12/cnbc-daily-open-good-news-on-us-trade-and-inflation-not-lifting-markets.html)
- 2025-06-12: There’s progress on trade and U.S. inflation — but it’s

In [4]:
# Scraped news lives under ../data/news relative to the notebook's working directory.
output_dir = os.path.join("..", "data", "news")
output_path = os.path.join(output_dir, "cnbc_fed_markets_2025.json")

# Create the folder on first use; nothing happens when it already exists.
os.makedirs(output_dir, exist_ok=True)

# === Save to JSON file ===
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(recent_articles, json_file, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(recent_articles)} articles to {output_path}")

✅ Saved 35 articles to ..\data\news\cnbc_fed_markets_2025.json


## Processing sentences from CNBC articles for labelling

In [15]:
# === Define keyword and splitting logic ===
# Connectives (plus the semicolon) compiled into one alternation pattern.
split_tokens = ["but", "however", "even though", "although", "while", ";"]
split_pattern = re.compile(
    r"\b(" + "|".join(re.escape(token) for token in split_tokens) + r")\b|;"
)

# All keywords excluding Panel C
keywords = {
    term.lower()
    for term in (
        # Panel A1
        "inflation expectation", "interest rate", "bank rate", "fund rate", "price",
        "economic activity", "inflation", "employment",
        # Panel A2
        "anchor", "cut", "subdue", "decline", "decrease", "reduce", "low", "drop", "fall",
        "fell", "decelerate", "slow", "pause", "pausing", "stable", "non-accelerating",
        "downward", "tighten",
        # Panel B1
        "unemployment", "growth", "exchange rate", "productivity", "deficit", "demand",
        "job market", "monetary policy",
        # Panel B2
        "ease", "easing", "rise", "rising", "increase", "expand", "improve", "strong",
        "upward", "raise", "high", "rapid",
    )
}

def split_into_sentences(text):
    """Split article text into sentences.

    The text is first broken on newlines (list items, summaries and paragraphs
    arrive one per line); each non-empty line is then split after ``.``, ``!``
    or ``?`` followed by whitespace. A split is made only when the next
    character is not a lower-case letter, so abbreviations in mid-sentence
    ("puts the U.S. at a disadvantage") stay in one piece.

    Args:
        text: Raw article text; ``None`` or an empty string is accepted.

    Returns:
        list[str]: Sentences in document order, without empty entries.
    """
    if not text:
        return []

    all_sentences = []
    # First split on newlines (to handle lists, summaries, etc.)
    for line in text.strip().split('\n'):
        line = line.strip()
        if line:  # skip empty lines
            # The lookahead excludes whitespace too, so the whole run of
            # spaces is consumed or no split happens at all.
            all_sentences.extend(re.split(r'(?<=[.!?])\s+(?=[^a-z\s])', line))
    return all_sentences

def is_relevant_sentence(sentence, keywords):
    """Decide whether a sentence is worth keeping for labelling.

    A sentence is kept when it is 15 to 300 characters long, contains none of
    the boilerplate phrases below, and contains at least one keyword. All
    matching is case-insensitive on the sentence side and is plain substring
    matching, so a keyword such as "low" also matches inside "slow" or "allow".

    Args:
        sentence: Candidate sentence.
        keywords: Iterable of lower-case keyword strings.

    Returns:
        bool: True when the sentence passes every check.
    """
    if len(sentence) < 15 or len(sentence) > 300:
        return False

    sentence_lower = sentence.lower()

    # Reject junk regardless of keywords
    junk_phrases = [
        "cookie", "cookies", "terms of use", "privacy policy", "ads and content",
        "by using this site", "subscribe", "sign up", "CNBC", "NBCUniversal", "copyright",
        "click", "browser", "advertise with us"
    ]
    # Lower-case each phrase before comparing: the sentence has been lower-cased,
    # so mixed-case entries such as "CNBC" would otherwise never match.
    if any(junk.lower() in sentence_lower for junk in junk_phrases):
        return False

    # Keep only if it contains relevant economic keywords
    return any(k in sentence_lower for k in keywords)

# === Read input ===
# Sampling parameters for building the labelling set.
MIN_RELEVANT_SENTENCES = 15  # articles with fewer relevant sentences are dropped
MAX_SAMPLED_SENTENCES = 25   # upper bound on sentences kept per article
SAMPLE_SEED = 42             # fixed seed so a fresh run reproduces the same sample

input_path = os.path.join("..", "data", "news", "cnbc_fed_markets_2025.json")
with open(input_path, "r", encoding="utf-8") as f:
    raw_articles = json.load(f)

# === Load and filter ===
# The file is parsed once. `articles` is bound to the same list so that the
# name still holds the scraped articles after this cell runs.
articles = raw_articles

# A dedicated generator keeps the sampling reproducible without touching the
# global `random` state.
rng = random.Random(SAMPLE_SEED)

sampled_data = []

for article in raw_articles:
    content = article.get("content", "")
    sentences = split_into_sentences(content)

    # Filter relevant sentences
    relevant = [s for s in sentences if is_relevant_sentence(s, keywords)]

    if len(relevant) >= MIN_RELEVANT_SENTENCES:
        sampled = rng.sample(relevant, min(MAX_SAMPLED_SENTENCES, len(relevant)))
        sampled_data.append({
            "title": article["title"],
            "url": article["url"],
            "date": article["date"],
            "sentences": sampled
        })

# === Save cleaned and filtered output ===
output_path = "../data/news/sampled_cnbc_fed_markets_2025.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(sampled_data)} cleaned articles to {output_path}")

✅ Saved 26 cleaned articles to ../data/news/sampled_cnbc_fed_markets_2025.json


## Loading gemma-2-2b-it model for sentence labelling

In [2]:
# Checking CUDA availability
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# Device details can only be queried when a CUDA device is present.
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    print(f"CUDA version: {torch.version.cuda}")

CUDA available: True
CUDA device: NVIDIA GeForce RTX 3050 Ti Laptop GPU
CUDA version: 12.1


In [3]:
# Local directory holding the saved tokenizer and model weights.
save_dir = "../models/saved_gemma_2_2b_it_model"

tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Load the weights in half precision and place the model on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    save_dir,
    torch_dtype=torch.float16,
    device_map="cuda",
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:15<00:00,  5.22s/it]


In [4]:
def classify(sentence):
    """Label one sentence as Hawkish, Dovish or Neutral with the loaded model.

    Wraps ``sentence`` in a single-turn chat prompt, generates up to 10 new
    tokens with the module-level ``model`` and ``tokenizer``, and takes the
    first word of the reply as the label. The sentence and the label are
    printed.

    Args:
        sentence: Text to classify.

    Returns:
        str: The first alphabetic word of the model's reply, capitalized. The
        prompt asks for "Hawkish", "Dovish" or "Neutral", but the value is not
        validated against that set. "Unknown" is returned when the reply
        contains no word at all.
    """
    # Formatting sentence
    formatted_input = (
        "<bos><start_of_turn>user\n"
        "You are a monetary policy expert. Classify the following sentence as either 'Hawkish', 'Dovish', or 'Neutral'. "
        "Only respond with one word.\n"
        f"Sentence: {sentence}\nClassification:<end_of_turn>\n<start_of_turn>model\n"
    )

    # Tokenising formatted sentence:
    # The prompt already carries <bos>, so the tokenizer must not add its own
    # special tokens; this yields exactly one <bos> whatever the tokenizer's
    # default is.
    inputs = tokenizer(
        formatted_input, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=10)

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them so the label is read from the reply, not from prompt text.
    prompt_length = inputs["input_ids"].shape[1]
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Take the first run of letters, ignoring surrounding whitespace,
    # punctuation or markdown; an empty reply no longer raises IndexError.
    first_word = re.search(r"[A-Za-z]+", reply)
    classification = first_word.group(0).capitalize() if first_word else "Unknown"

    print("Sentence:", sentence)
    print("Classification:", classification)

    return classification

In [5]:
# === Load refined sentences ===
with open("../data/news/refined_sampled_cnbc_fed_markets.json", "r", encoding="utf-8") as f:
    refined_data = json.load(f)

In [6]:
# Print every top-level entry of the loaded refined data, one per line, for a
# quick visual check (presumably each entry is a sentence string; confirm
# against the file's structure).
for sentence in refined_data:
    print(sentence)

Connected Devices:For connected devices, such as smart TVs or streaming devices, you should review the device’s settings and select the option that allows you to disable automatic content recognition or ad tracking.
The risk is that reducing rates too soon could halt or reverse progress on tamping down inflation, according to Mark Higgins, senior vice president at Index Fund Advisors and author of “Investing in U.S.
Once the fed funds rate comes down, consumers could see their borrowing costsstart to fall as well, which some may consider a welcome change.
Trump has previously said the central bank should cut interest rates by a full percentage point.
The president has argued that maintaining a fed funds rate that is too high makes it harder for businesses and consumers to borrow and puts the U.S.
at an economic disadvantage to countries with lower rates.
Subscribe to CNBC PRO
Subscribe to Investing Club
Licensing & Reprints
CNBC Councils
Supply Chain Values
CNBC on Peacock
Join the CNB