In [33]:
# graph_build_config.py

import json
from pathlib import Path
from datetime import datetime, date, timedelta
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from torch_geometric.data import HeteroData

DATA_DIR = Path("data")
SPEECH_FOLDER = DATA_DIR / "text_data/"
TOPIC_SCORE_FOLDER = DATA_DIR / "topic_scores/"
RATES_FILE = DATA_DIR / "price_data/2025-10-26 Fed Funds 12M 6M Historical Swap Rates.xlsx"

LOOKBACK_DAYS = 30   # rolling window length
TARGET_COLUMN = "fed_funds"   # or "asw"
PREDICT_DELTA = True         # True = predict Î”y, False = level
START_DATE = datetime(2018, 1, 1)

In [34]:
# date_utils.py

from datetime import datetime, date

def parse_date(dstr: str) -> date:
    """
    Try several common date formats and return a datetime.date.
    Adjust/add formats if your data differs.
    """
    dstr = dstr.strip()
    formats = [
        "%Y-%m-%d",       # 2023-08-25
        "%Y/%m/%d",       # 2023/08/25
        "%Y-%m-%dT%H:%M:%S",  # 2023-08-25T00:00:00
        "%B %d, %Y",      # August 25, 2023
        "%b %d, %Y",      # Aug 25, 2023
    ]
    for fmt in formats:
        try:
            return datetime.strptime(dstr, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognized date format: {dstr}")


In [79]:
import glob

def load_speeches(path=SPEECH_FOLDER):
    
    
    json_files = glob.glob(str(path) + "/*.json")

    speeches = {}
    
    for json_file in json_files:
        with open(json_file, "r", encoding="utf-8") as f:
            raw = json.load(f)

        for row in raw:
        
            sid = row["id"]
            date = parse_date(row["date"])
            if date < START_DATE:
                continue 
            
            speeches[sid] = {
                "author": json_file.split("/")[-1].split(".")[0],
                "text": row["text"],
                "date": parse_date(row["date"]),
            }
    return speeches

def load_topic_scores(path=TOPIC_SCORE_FOLDER):
    
    json_files = glob.glob(str(path) + "/*.json")
    scores = {}
    
    for json_file in json_files: 
        with open(json_file, "r", encoding="utf-8") as f:
            raw = json.load(f)
            
        for row in raw:
            sid = row["id"]
        # adjust key "gpt-5" if your JSON differs
            scores[sid] = row["gpt-5"]
    return scores

def load_rates(path=RATES_FILE):
    df = pd.read_excel(path)
    df["Date"] = df["Date"].apply(lambda x: str(x).split(" ")[0])
    df["Date"] = df["Date"].apply(parse_date)
    df = df.set_index("Date").sort_index()
    df = df[["Rate"]]
    return df  # index: date, columns: fed_funds, asw, ...

In [80]:
rates = load_rates()
print(rates)
speeches = load_speeches()
print(len(speeches))
scores = load_topic_scores()
print(len(scores))

                Rate
Date                
2018-06-04  2.141069
2018-06-05  2.156929
2018-06-06  2.150165
2018-06-07  2.169461
2018-06-08  2.159822
...              ...
2025-10-21  3.407672
2025-10-22  3.395943
2025-10-23  3.438809
2025-10-24  3.427455
2025-10-27  3.431500

[1841 rows x 1 columns]
1038
1176


In [81]:

def build_global_indices(speeches, topic_scores, rates_df):
    
    # 1) Authors
    author_names = sorted({v["author"] for v in speeches.values()})
    author2idx = {name: i for i, name in enumerate(author_names)}

    # 2) Topics
    topic_names = set()
    for sid, topics in topic_scores.items():
        for tname in topics.keys():
            topic_names.add(tname)
    topic_names = sorted(topic_names)
    topic2idx = {name: i for i, name in enumerate(topic_names)}

    # 3) Speech ids
    speech_ids = sorted(speeches.keys())
    speech2idx = {sid: i for i, sid in enumerate(speech_ids)}

    # 4) Dates
    all_dates = sorted(set(rates_df.index))  # dates where we have rates
    date2idx = {d: i for i, d in enumerate(all_dates)}

    return {
        "author2idx": author2idx,
        "topic2idx": topic2idx,
        "speech2idx": speech2idx,
        "date2idx": date2idx,
        "dates": all_dates,
    }

In [82]:
global_indices = build_global_indices(speeches, scores, rates)
print(global_indices)

{'author2idx': {'barr': 0, 'bowman': 1, 'brainard': 2, 'bullard': 3, 'clarida': 4, 'collins': 5, 'daly': 6, 'dudley': 7, 'george': 8, 'harker': 9, 'jefferson': 10, 'kaplan': 11, 'kugler': 12, 'logan': 13, 'mester': 14, 'miran': 15, 'powell': 16, 'quarles': 17, 'rosengren': 18, 'waller': 19, 'williams': 20}, 'topic2idx': {'Balance Sheet': 0, 'Fed Funds Rate': 1, 'Financial Stability': 2, 'Inflation': 3, 'Labor Market': 4, 'Real Activity': 5}, 'speech2idx': {'c00035:91336': 0, 'c00035:91558': 1, 'c00035:92844': 2, 'c00035:93002': 3, 'c00035:93939': 4, 'c00035:95290': 5, 'd00001:101893': 6, 'd00001:89293': 7, 'd00001:89325': 8, 'd00001:89337': 9, 'd00001:89348': 10, 'd00001:89356': 11, 'd00001:92114': 12, 'fedbsp:101198': 13, 'fedbsp:101802': 14, 'fedbsp:101950': 15, 'fedbsp:101951': 16, 'fedbsp:11783': 17, 'fedbsp:11784': 18, 'fedbsp:11796': 19, 'fedbsp:11797': 20, 'fedbsp:11804': 21, 'fedbsp:11805': 22, 'fedbsp:11809': 23, 'fedbsp:11820': 24, 'fedbsp:11822': 25, 'fedbsp:11835': 26, 'fed

In [68]:
print(global_indices["speech2idx"])

{'c00035:91336': 0, 'c00035:91558': 1, 'c00035:92844': 2, 'c00035:93002': 3, 'c00035:93939': 4, 'c00035:95290': 5, 'd00001:101893': 6, 'd00001:89293': 7, 'd00001:89325': 8, 'd00001:89337': 9, 'd00001:89348': 10, 'd00001:89356': 11, 'd00001:92114': 12, 'fedbsp:101198': 13, 'fedbsp:101802': 14, 'fedbsp:101950': 15, 'fedbsp:101951': 16, 'fedbsp:11783': 17, 'fedbsp:11784': 18, 'fedbsp:11796': 19, 'fedbsp:11797': 20, 'fedbsp:11804': 21, 'fedbsp:11805': 22, 'fedbsp:11809': 23, 'fedbsp:11820': 24, 'fedbsp:11822': 25, 'fedbsp:11835': 26, 'fedbsp:11836': 27, 'fedbsp:11841': 28, 'fedbsp:11842': 29, 'fedbsp:11849': 30, 'fedbsp:11850': 31, 'fedbsp:11855': 32, 'fedbsp:11856': 33, 'fedbsp:11863': 34, 'fedbsp:11873': 35, 'fedbsp:11878': 36, 'fedbsp:11885': 37, 'fedbsp:11896': 38, 'fedbsp:11920': 39, 'fedbsp:11925': 40, 'fedbsp:87429': 41, 'fedbsp:87430': 42, 'fedbsp:87431': 43, 'fedbsp:87432': 44, 'fedbsp:87433': 45, 'fedbsp:87434': 46, 'fedbsp:88245': 47, 'fedbsp:88246': 48, 'fedbsp:88247': 49, 'fed

In [72]:
# build_graphs.py (part 3)

def group_speeches_by_date(speeches):
    speeches_by_date = defaultdict(list)
    for sid, info in speeches.items():
        d = info["date"]
        speeches_by_date[d].append(sid)
    return speeches_by_date

speeches_by_date = group_speeches_by_date(speeches)

defaultdict(<class 'list'>, {datetime.datetime(2023, 9, 1, 0, 0): ['fedcsp:96858'], datetime.datetime(2024, 2, 6, 0, 0): ['fedcsp:97730', 'fedbsp:97736'], datetime.datetime(2024, 4, 2, 0, 0): ['fedcsp:98005', 'fedgsq:98004'], datetime.datetime(2024, 5, 16, 0, 0): ['fedcsp:98251', 'fedpsp:98246'], datetime.datetime(2024, 2, 29, 0, 0): ['fedcsp:97883'], datetime.datetime(2023, 10, 2, 0, 0): ['fedcsp:97001', 'fedgsq:97026', 'fedgsq:97020'], datetime.datetime(2023, 10, 20, 0, 0): ['fedcsp:97194', 'fedpsp:97189'], datetime.datetime(2018, 2, 13, 0, 0): ['fedcsp:21979', 'fedgsq:22296'], datetime.datetime(2021, 9, 10, 0, 0): ['fedcsp:93044'], datetime.datetime(2021, 1, 4, 0, 0): ['fedcsp:89391', 'fedcsp:89390'], datetime.datetime(2022, 3, 22, 0, 0): ['fedcsp:93864'], datetime.datetime(2020, 3, 17, 0, 0): ['fedcsp:87682'], datetime.datetime(2020, 9, 2, 0, 0): ['fedcsp:88683', 'fednsp:88675'], datetime.datetime(2018, 1, 6, 0, 0): ['fedcsp:21968'], datetime.datetime(2018, 1, 18, 0, 0): ['fedcsp:2

In [83]:

def get_speeches_in_window(target_date, lookback_days, speeches_by_date):
    """
    Returns a list of speech IDs whose date is in [target_date - lookback_days + 1, target_date].
    """
    start_date = target_date - timedelta(days=lookback_days - 1)
    cur_date = start_date
    selected = []

    while cur_date <= target_date:
        if cur_date in speeches_by_date:
            selected.extend(speeches_by_date[cur_date])
        cur_date += timedelta(days=1)

    return selected


target_date = datetime(2025, 10, 23)
lookback_days = 30
print(get_speeches_in_window(target_date, lookback_days, speeches_by_date))


['fedfsp:101820', 'feddsp:101892', 'd00001:101893', 'fedgsq:101842', 'fedgsq:101843', 'fedgsq:101844', 'fedbsp:101950', 'feddsp:101898', 'fedgsq:101846', 'fednsp:101985', 'fedgsq:101888', 'fedgsq:101890', 'feddsp:101918', 'fedgsq:101904', 'fedgsq:101913', 'fedgsq:101934', 'fedgsq:101914', 'fedgsq:101915', 'fedbsp:101951', 'fedgsq:101935', 'fedgsq:101938', 'fedgsq:101964', 'fedgsq:101952', 'fedgsq:101946', 'fedgsq:101979', 'fedgsq:101989', 'fedgsq:101990', 'fedgsq:101991']


In [85]:
# build_graphs.py (part 6)

def build_all_graphs(
    speeches,
    topic_scores,
    rates_df,
    out_dir="graphs",
    lookback_days=LOOKBACK_DAYS,
    target_column=TARGET_COLUMN,
    predict_delta=PREDICT_DELTA,
):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    global_idx = build_global_indices(speeches, topic_scores, rates_df)
    speeches_by_date = group_speeches_by_date(speeches)

    graphs = []
    dates = global_idx["dates"]

    for d in dates:
        g = build_graph_for_date(
            d,
            speeches,
            topic_scores,
            rates_df,
            speeches_by_date,
            global_idx,
            lookback_days=lookback_days,
            target_column=target_column,
            predict_delta=predict_delta,
        )
        if g is None:
            continue
        graphs.append(g)

    # Option 1: save individually
    for i, g in enumerate(graphs):
        torch.save(g, out_dir / f"graph_{i:04d}.pt")

    # Option 2: save as a list (one big file)
    torch.save(graphs, out_dir / "graphs_all.pt")

    print(f"Built {len(graphs)} graphs and saved to {out_dir}")
    return graphs


In [94]:
# build_graphs.py (part 5)

def build_graph_for_date(
    d,
    speeches,
    topic_scores,
    rates_df,
    speeches_by_date,
    global_idx,
    lookback_days=LOOKBACK_DAYS,
    target_column=TARGET_COLUMN,
    predict_delta=PREDICT_DELTA,
):
    """
    Build HeteroData graph snapshot for date d.
    The target is the next day's rate (or change).
    """

    dates = global_idx["dates"]
    date2idx = global_idx["date2idx"]
    author2idx = global_idx["author2idx"]
    topic2idx = global_idx["topic2idx"]

    # --- Check that we can define a target (need d and next_date) ---
    if d not in date2idx:
        return None  # no market data for this date
    t_idx = date2idx[d]
    if t_idx + 1 >= len(dates):
        return None  # no next day to predict

    next_date = dates[t_idx + 1]
    if next_date not in rates_df.index:
        return None

    # --- Gather speeches in window ---
    speech_ids_window = get_speeches_in_window(d, lookback_days, speeches_by_date)
    if len(speech_ids_window) == 0:
        return None  # no information in window

    # --- Build local indices for this snapshot ---
    # Local speech indices
    local_speech_ids = sorted(set(speech_ids_window))
    speech_local2global = {i: sid for i, sid in enumerate(local_speech_ids)}
    speech_global2local = {sid: i for i, sid in enumerate(local_speech_ids)}

    # Local authors (only those that appear)
    author_names_window = sorted(
        {speeches[sid]["author"] for sid in local_speech_ids}
    )
    author_local2name = {i: name for i, name in enumerate(author_names_window)}
    author_name2local = {name: i for i, name in author_local2name.items()}

    # Local topics (only those that appear)
    topic_names_window = set()
    for sid in local_speech_ids:
        if sid not in topic_scores:
            continue
        for tname in topic_scores[sid].keys():
            topic_names_window.add(tname)
    topic_names_window = sorted(topic_names_window)
    topic_local2name = {i: name for i, name in enumerate(topic_names_window)}
    topic_name2local = {name: i for i, name in topic_local2name.items()}

    num_authors = len(author_local2name)
    num_speeches = len(speech_local2global)
    num_topics = len(topic_local2name)
    num_days = 1  # single day node

    data = HeteroData()

    # ============================
    # 1) NODE FEATURES
    # ============================

    # --- author nodes ---
    # simple feature: global author index (can be embedded later)
    author_global_idx = []
    for i in range(num_authors):
        name = author_local2name[i]
        author_global_idx.append(author2idx.get(name, -1))
    author_global_idx = torch.tensor(author_global_idx, dtype=torch.long).unsqueeze(-1)
    data["author"].x = author_global_idx  # shape [num_authors, 1]

    # --- speech nodes ---
    # feature: days since start + maybe current rate
    speech_features = []
    start_date = dates[0]
    for i in range(num_speeches):
        sid = speech_local2global[i]
        sdate = speeches[sid]["date"]
        days_since_start = (sdate - start_date).days
        speech_features.append([days_since_start])
    data["speech"].x = torch.tensor(speech_features, dtype=torch.float32)  # [S, 1]

    # --- topic nodes ---
    # simple zero or global topic index
    topic_global_idx = []
    for i in range(num_topics):
        name = topic_local2name[i]
        topic_global_idx.append(topic2idx.get(name, -1))
    topic_global_idx = torch.tensor(topic_global_idx, dtype=torch.long).unsqueeze(-1)
    data["topic"].x = topic_global_idx  # [T, 1]

    # --- day node ---
    # You can encode today's rate and maybe date index
    today_rate = rates_df.loc[d, target_column]
    today_idx = date2idx[d]
    day_x = torch.tensor([[today_idx, today_rate]], dtype=torch.float32)  # [1, 2]
    data["day"].x = day_x

    # ============================
    # 2) EDGES
    # ============================

    # --- author -> speech edges (author "gives" speech) ---
    author_src = []
    speech_dst = []
    for sid in local_speech_ids:
        author_name = speeches[sid]["author"]
        if author_name not in author_name2local:
            continue
        a_local = author_name2local[author_name]
        s_local = speech_global2local[sid]
        author_src.append(a_local)
        speech_dst.append(s_local)

    if len(author_src) == 0:
        return None

    data["author", "gives", "speech"].edge_index = torch.tensor(
        [author_src, speech_dst], dtype=torch.long
    )

    # --- speech -> topic edges (with hawk/dove features) ---
    st_src = []
    st_dst = []
    st_attr = []
    for sid in local_speech_ids:
        if sid not in topic_scores:
            continue
        s_local = speech_global2local[sid]
        topics = topic_scores[sid]
        for tname, vals in topics.items():
            if tname not in topic_name2local:
                continue
            t_local = topic_name2local[tname]
            
            net = vals 
            st_src.append(s_local)
            st_dst.append(t_local)
            st_attr.append([net])

    if len(st_src) > 0:
        data["speech", "mentions", "topic"].edge_index = torch.tensor(
            [st_src, st_dst], dtype=torch.long
        )
        data["speech", "mentions", "topic"].edge_attr = torch.tensor(
            st_attr, dtype=torch.float32
        )
    else:
        # still set empty tensors to avoid errors
        data["speech", "mentions", "topic"].edge_index = torch.empty(
            (2, 0), dtype=torch.long
        )
        data["speech", "mentions", "topic"].edge_attr = torch.empty(
            (0, 3), dtype=torch.float32
        )

    # --- day -> speech edges (connect day node to all speeches in window) ---
    day_src = []
    day_dst = []
    day_attr = []

    for sid in local_speech_ids:
        s_local = speech_global2local[sid]
        sdate = speeches[sid]["date"]
        lag_days = (d - sdate).days  # how many days ago
        # simple recency feature: linear lag + exponential decay
        decay = np.exp(-lag_days / 10.0)
        day_src.append(0)        # only one day node, index 0
        day_dst.append(s_local)
        day_attr.append([lag_days, decay])

    data["day", "references", "speech"].edge_index = torch.tensor(
        [day_src, day_dst], dtype=torch.long
    )
    data["day", "references", "speech"].edge_attr = torch.tensor(
        day_attr, dtype=torch.float32
    )

    # ============================
    # 3) TARGET y (next-day rate or Î”rate)
    # ============================
    today_val = float(rates_df.loc[d, target_column])
    next_val = float(rates_df.loc[next_date, target_column])

    if predict_delta:
        y = next_val - today_val
    else:
        y = next_val

    data.y = torch.tensor([y], dtype=torch.float32)
    data.date = torch.tensor([date2idx[d]], dtype=torch.long)  # optional

    return data

In [95]:
# run_build.py
if __name__ == "__main__":
    speeches = load_speeches()
    topic_scores = load_topic_scores()
    rates_df = load_rates()

    graphs = build_all_graphs(
        speeches,
        topic_scores,
        rates_df,
        out_dir="graphs_ffr_delta",   # change as you like
        lookback_days=30,
        target_column="Rate",
        predict_delta=True,
    )


0.7
0.8
0.1
0.8
0.4
0.9
0.4
0.6
-0.1
0.7
0.5
0.0
0.8
0.8
0.2
0.6
0.4
0.1
0.0
0.0
0.0
0.0
0.3
0.0
0.5
0.0
0.0
0.3
0.2
0.7
0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.3
0.0
0.4
0.0
0.0
0.0
0.0
-0.3
0.1
0.6
0.0
0.6
0.7
-0.3
0.6
0.2
0.7
0.1
0.0
0.0
0.0
0.2
0.7
0.0
0.0
0.0
0.0
-0.7
0.0
0.0
0.6
0.0
0.1
0.0
0.0
0.7
0.8
0.1
0.8
0.4
0.9
0.4
0.6
-0.1
0.7
0.5
0.0
0.8
0.8
0.2
0.6
0.4
0.1
0.0
0.0
0.0
0.0
0.3
0.0
0.5
0.0
0.0
0.3
0.2
0.7
0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.3
0.0
0.4
0.0
0.0
0.0
0.0
-0.3
0.1
0.6
0.0
0.6
0.7
-0.3
0.6
0.2
0.7
0.1
0.0
0.0
0.0
0.2
0.7
0.0
0.0
0.0
0.0
-0.7
0.0
0.0
0.6
0.0
0.1
0.0
0.0
0.7
0.8
0.1
0.8
0.4
0.9
0.4
0.6
-0.1
0.7
0.5
0.0
0.8
0.8
0.2
0.6
0.4
0.1
0.0
0.0
0.0
0.0
0.3
0.0
0.5
0.0
0.0
0.3
0.2
0.7
0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.3
0.0
0.4
0.0
0.0
0.0
0.0
-0.3
0.1
0.6
0.0
0.6
0.7
-0.3
0.6
0.2
0.7
0.1
0.0
0.0
0.0
0.2
0.7
0.0
0.0
0.0
0.0
-0.7
0.0
0.0
0.6
0.0
0.1
0.0
0.0
0.7
0.8
0.1
0.8
0.4
0.9
0.4
0.6
-0.1
0.7
0.5
0.0
0.8
0.8
0.2
0.6
0.4
0.1
0.0
0.0
0.0
0.0
0.3
0.0
0.0
0.0
0.0
0.0
0.4
0.0
0.0