Text–Time Series Fusion Model (TTSF)

In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Build sentiment features with an LLM encoder (FinBERT)

In [2]:
# Load your data
prices = pd.read_csv("data/prices.csv", parse_dates=["date"]).sort_values("date")
news   = pd.read_csv("data/news.csv",   parse_dates=["date"]).sort_values("date")

# Choose a finance-sentiment model (both are widely used)
MODEL_NAME = "ProsusAI/finbert"  # alt: "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()
model.to("cuda:0")

def batch_sentiment(texts, batch_size=16, max_length=128):
    """Return list of dicts with probs: {'neg': p0, 'neu': p1, 'pos': p2} (FinBERT label order)."""
    out = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Scoring headlines"):
            chunk = texts[i:i+batch_size]
            tok = tokenizer(chunk, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
            logits = model(**{k: v.to("cuda:0") for k, v in tok.items()}).logits
            probs = softmax(logits, dim=-1).cpu().numpy()
            for p in probs:
                out.append({"neg": float(p[0]), "neu": float(p[1]), "pos": float(p[2])})
    return out

# Compute sentiment per headline
news = news.dropna(subset=["headline"]).copy()
scores = batch_sentiment(news["headline"].astype(str).tolist(), batch_size=32)
news = pd.concat([news.reset_index(drop=True), pd.DataFrame(scores)], axis=1)

# Aggregate to daily features (you can try other aggregations)
daily_sent = (
    news.groupby("date")
        .agg(pos_mean=("pos", "mean"), neg_mean=("neg", "mean"), neu_mean=("neu", "mean"),
             pos_max=("pos", "max"),  neg_max=("neg", "max"),
             n_headlines=("headline", "count"))
        .reset_index()
)

2025-10-22 15:34:53.064404: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Scoring headlines: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 19.01it/s]


# Merge with price features and make supervised sequences

In [3]:
df = prices.merge(daily_sent, on="date", how="left").fillna(0.0)

# Simple price features (extend with TA if you like)
df["ret1"] = df["close"].pct_change()
df["hl_spread"] = (df["high"] - df["low"]) / df["close"].shift(1)
df["vol_norm"]  = (df["volume"] / df["volume"].rolling(20).mean()).fillna(1.0)

# Target: next-day return (regression) or sign (classification)
df["y_reg"] = df["ret1"].shift(-1)
df["y_cls"] = (df["y_reg"] > 0).astype(int)

feature_cols = [
    "ret1", "hl_spread", "vol_norm",
    "pos_mean", "neg_mean", "neu_mean", "pos_max", "neg_max", "n_headlines",
]
df = df.dropna().reset_index(drop=True)

# Train/val/test split by time (no leakage!)
split1 = int(len(df) * 0.7)
split2 = int(len(df) * 0.85)
train, val, test = df.iloc[:split1], df.iloc[split1:split2], df.iloc[split2:]

# Standardize using *train* statistics only
scaler = StandardScaler().fit(train[feature_cols])
def scale_block(block):
    X = scaler.transform(block[feature_cols])
    y_reg = block["y_reg"].values
    y_cls = block["y_cls"].values
    return X, y_reg, y_cls

Xtr, ytr_reg, ytr_cls = scale_block(train)
Xva, yva_reg, yva_cls = scale_block(val)
Xte, yte_reg, yte_cls = scale_block(test)

# Build sliding windows for LSTM
def make_windows(X, y_reg, y_cls, window=60, horizon=1, task="reg"):
    Xw, Y = [], []
    for i in range(len(X) - window - horizon + 1):
        Xw.append(X[i:i + window])
        if task == "reg":
            Y.append(y_reg[i+window+horizon-1])
        else:
            Y.append(y_cls[i+window+horizon-1])
    return np.array(Xw, dtype=np.float32), np.array(Y)

WINDOW, H = 10, 1
Xtr_w, Ytr = make_windows(Xtr, ytr_reg, ytr_cls, WINDOW, H, task="reg")
Xva_w, Yva = make_windows(Xva, yva_reg, yva_cls, WINDOW, H, task="reg")
Xte_w, Yte = make_windows(Xte, yte_reg, yte_cls, WINDOW, H, task="reg")

Xtr_w.shape, Xva_w.shape, Xte_w.shape  # (N, window, n_features)

((90, 10, 9), (12, 10, 9), (12, 10, 9))

# LSTM regressor over fused features

In [4]:
assert torch.cuda.is_available()
device = torch.device("cuda:0")

train_ds = TensorDataset(torch.tensor(Xtr_w, dtype=torch.float32), torch.tensor(Ytr, dtype=torch.float32).unsqueeze(-1))
val_ds   = TensorDataset(torch.tensor(Xva_w, dtype=torch.float32), torch.tensor(Yva, dtype=torch.float32).unsqueeze(-1))
test_ds  = TensorDataset(torch.tensor(Xte_w, dtype=torch.float32), torch.tensor(Yte, dtype=torch.float32).unsqueeze(-1))

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=256)
test_loader  = DataLoader(test_ds, batch_size=256)

class LSTMReg(nn.Module):
    def __init__(self, n_features, hidden=64, layers=2, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=n_features, hidden_size=hidden, num_layers=layers,
            dropout=dropout if layers > 1 else 0.0, batch_first=True,
        )
        self.head = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Linear(hidden, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        # x: (B, T, F)
        out, _ = self.lstm(x)
        # use last timestep
        return self.head(out[:, -1, :])

model = LSTMReg(n_features=Xtr_w.shape[-1]).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
loss_fn = nn.MSELoss()

best_val = float("inf")
patience = 8
bad = 0
best_state = None

for epoch in range(100):
    model.train()
    tr_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        tr_loss += loss.item() * len(xb)
    tr_loss /= len(train_ds)

    model.eval()
    with torch.no_grad():
        va_loss = 0.0
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            va_loss += loss_fn(pred, yb).item() * len(xb)
        va_loss /= len(val_ds)

    print(f"Epoch {epoch:03d} | train {tr_loss:.6f} | val {va_loss:.6f}")
    if va_loss < best_val - 1e-5:
        best_val = va_loss
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        bad = 0
    else:
        bad += 1
        if bad >= patience:
            break

# Restore best model
model.load_state_dict({k:v.to(device) for k, v in best_state.items()})

# Evaluate (MSE and directional accuracy)
model.eval()
with torch.no_grad():
    preds, ys = [], []
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds.append(model(xb).cpu().numpy())
        ys.append(yb.numpy())

preds = np.concatenate(preds).ravel()
ys    = np.concatenate(ys).ravel()
mse = np.mean((preds - ys) ** 2)
direction_acc = np.mean((preds > 0) == (ys > 0))
print("Test MSE:", mse, " | Directional accuracy:", direction_acc)

Epoch 000 | train 0.021497 | val 0.007390
Epoch 001 | train 0.004815 | val 0.000195
Epoch 002 | train 0.001796 | val 0.002263
Epoch 003 | train 0.006059 | val 0.002409
Epoch 004 | train 0.006727 | val 0.000714
Epoch 005 | train 0.003787 | val 0.000117
Epoch 006 | train 0.001646 | val 0.002009
Epoch 007 | train 0.000593 | val 0.005616
Epoch 008 | train 0.001526 | val 0.008409
Epoch 009 | train 0.002610 | val 0.008909
Epoch 010 | train 0.003042 | val 0.007072
Epoch 011 | train 0.002307 | val 0.004243
Epoch 012 | train 0.001661 | val 0.001720
Epoch 013 | train 0.000748 | val 0.000331
Test MSE: 0.0002947589  | Directional accuracy: 0.9166666666666666
