In [1]:
import os
from dotenv import load_dotenv
import pandas_datareader.data as web
from datetime import datetime
import pandas as pd

load_dotenv()
fred_key = os.getenv("FRED_API_KEY")

# Fetch the Federal Funds Rate data from FRED 
start_date = datetime(2023, 12, 6) 
fedfunds = web.DataReader("FEDFUNDS", "fred", start=start_date, api_key=fred_key)

# Rename and clean up
fedfunds = fedfunds.rename(columns={"FEDFUNDS": "fed_funds_rate"})
fedfunds.index.name = "timestamp"

# Resample to hourly frequency (forward-fill from rate announcement dates)
fedfunds_hourly = fedfunds.resample("H").ffill()

# Save to CSV
output_path = "data/raw/macro/fedfunds_hourly.csv"
fedfunds_hourly.to_csv(output_path)

print(f"FEDFUNDS data saved to {output_path}")


FEDFUNDS data saved to data/raw/macro/fedfunds_hourly.csv


  fedfunds_hourly = fedfunds.resample("H").ffill()


In [7]:
import os
from transformers import pipeline
import pandas as pd
from transformers import AutoTokenizer

# Load FinBERT
sentiment_pipeline = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Prepare path
statements_dir = "data/raw/macro/statements"
results = []

for filename in os.listdir(statements_dir):
    if filename.endswith(".txt"):
        date_str = filename[:10]  # e.g., "2023-12-13"
        path = os.path.join(statements_dir, filename)
        
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        
        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

        # Tokenize and truncate to 512 tokens
        inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

        # Manually pass through model pipeline
        result = sentiment_pipeline.tokenizer.decode(inputs["input_ids"][0])
        result = sentiment_pipeline(text, truncation=True, max_length=512)[0]

        label_map = {
            "positive": 1,
            "neutral": 0,
            "negative": -1
        }

        label = result["label"].lower()
        sentiment_score = label_map[label] * result["score"]


        results.append({
            "timestamp": f"{date_str} 14:00",  # FOMC statements release at 2:00 PM ET
            "label": result["label"],
            "confidence": result["score"],
            "sentiment_score": sentiment_score
        })

# Save to interim CSV
output_df = pd.DataFrame(results)
os.makedirs("data/interim/macro", exist_ok=True)
output_df.to_csv("data/interim/macro/fomc_sentiment.csv", index=False)

print("✅ FOMC sentiment scores saved.")


Device set to use cpu


✅ FOMC sentiment scores saved.


In [10]:
import requests
from bs4 import BeautifulSoup
import os
from datetime import datetime

BASE_URL = "https://www.federalreserve.gov"
SPEECHES_URL = f"{BASE_URL}/newsevents/speeches.htm"
SAVE_DIR = "data/raw/macro/speeches"
os.makedirs(SAVE_DIR, exist_ok=True)

headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(SPEECHES_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

speech_items = soup.select(".item")
print(f"Found {len(speech_items)} total speech items")

for item in speech_items:
    # Filter for speeches by Jerome Powell
    speaker_tag = item.select_one(".speaker")
    if not speaker_tag:
        continue
    speaker = speaker_tag.text.strip().lower()
    if "Powell" not in speaker:
        continue

    print(f"Speaker found: {speaker}")

    # Get speech link and date
    link = item.select_one("a")["href"]
    full_url = BASE_URL + link

    raw_date = item.select_one(".date").text.strip()  # e.g. "October 19, 2023"
    try:
        dt = datetime.strptime(raw_date, "%B %d, %Y")
    except:
        continue
    date_str = dt.strftime("%Y-%m-%d")

    # Scrape the full speech content
    speech_html = requests.get(full_url, headers=headers).text
    speech_soup = BeautifulSoup(speech_html, "html.parser")
    content_div = speech_soup.select_one(".col-xs-12.col-sm-8.col-md-8")

    if not content_div:
        print(f"⚠️ Could not extract text for {date_str}")
        continue

    paragraphs = content_div.find_all("p")
    speech_text = "\n".join(p.get_text() for p in paragraphs if p.text.strip())

    if not speech_text.strip():
        print(f"⚠️ Empty speech: {date_str}")
        continue

    # Save to .txt file
    filename = os.path.join(SAVE_DIR, f"{date_str}_powell.txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(speech_text)

    print(f"✅ Saved {filename}")


Found 0 total speech items


In [None]:
import requests
import pandas as pd
import os
from dotenv import load_dotenv
from datetime import datetime

# Load FRED API key
load_dotenv()
api_key = os.getenv("FRED_API_KEY")

# GDP series: Real GDP, QoQ % change, annualized
series_id = "A191RL1Q225SBEA"
url = "https://api.stlouisfed.org/fred/series/observations"

params = {
    "series_id": series_id,
    "api_key": api_key,
    "file_type": "json",
    "observation_start": "2015-01-01",  # adjust as needed
}

response = requests.get(url, params=params)
print(response.status_code)
print(response.text)
data = response.json()["observations"]

# Convert to DataFrame
gdp_df = pd.DataFrame([{
    "date": obs["date"],
    "gdp_growth": float(obs["value"]) if obs["value"] != "." else None
} for obs in data])

# Convert to timestamp format used in intraday (e.g., set to 8:30 AM ET)
gdp_df["timestamp"] = pd.to_datetime(gdp_df["date"]) + pd.Timedelta(hours=8, minutes=30)
gdp_df.drop(columns=["date"], inplace=True)

# Save to file
gdp_df.to_csv("data/interim/macro/gdp_growth_quarterly.csv", index=False)
print("✅ GDP data saved.")


200
{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","observation_start":"2015-01-01","observation_end":"9999-12-31","units":"lin","output_type":1,"file_type":"json","order_by":"observation_date","sort_order":"asc","count":42,"offset":0,"limit":100000,"observations":[{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2015-01-01","value":"3.6"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2015-04-01","value":"2.5"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2015-07-01","value":"1.6"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2015-10-01","value":"0.7"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2016-01-01","value":"2.3"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2016-04-01","value":"1.3"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date":"2016-07-01","value":"2.9"},{"realtime_start":"2025-11-21","realtime_end":"2025-11-21","date

 For each hour in your stock data, forward-fill GDP value until the next release=>
intraday = pd.read_csv("data/raw/intraday/AAPL.csv", parse_dates=["timestamp"])
gdp = pd.read_csv("data/interim/macro/gdp_growth_quarterly.csv", parse_dates=["timestamp"])

intraday = intraday.sort_values("timestamp")
gdp = gdp.sort_values("timestamp")

 Merge and forward-fill GDP values into intraday=>
intraday = pd.merge_asof(intraday, gdp, on="timestamp", direction="backward")
