In [1]:
# Cell 1: Imports + config

import os
import numpy as np
import pandas as pd

pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 50)

TICKERS = ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA"]

# Market files: try your original path first; fallback to /mnt/data (project environment)
MARKET_DIR_PRIMARY = "data/raw/market"
MARKET_DIR_FALLBACK = "/mnt/data"

# Macro / sentiment files (adjust if needed)
PPI_FILE  = "PPI_hourly.csv"
CPI_FILE  = "CPI_hourly.csv"
FOMC_FILE = "FOMC_rate_hourly.csv"
NFP_FILE  = "NFP_hourly.csv"
GDP_FILE  = "GDP_hourly.csv"
SENT_FILE = "news_sentiment_hourly.csv"

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [2]:
# Cell 2: Helper functions

def _find_market_file(ticker: str) -> str:
    p1 = os.path.join(MARKET_DIR_PRIMARY, f"{ticker}_1h.csv")
    p2 = os.path.join(MARKET_DIR_FALLBACK, f"{ticker}_1h.csv")
    if os.path.exists(p1):
        return p1
    if os.path.exists(p2):
        return p2
    raise FileNotFoundError(f"Could not find market file for {ticker} in {p1} or {p2}")

def _read_market_csv(path: str) -> pd.DataFrame:
    """
    Tries to read yfinance-style CSVs that sometimes include 3 metadata lines.
    Falls back to a normal read if parsing fails.
    """
    # Attempt 1: your known format (skip first 3 lines)
    df = pd.read_csv(
        path,
        skiprows=3,
        names=["Datetime", "Close", "High", "Low", "Open", "Volume"]
    )
    df["Datetime"] = pd.to_datetime(df["Datetime"], utc=True, errors="coerce")
    if df["Datetime"].isna().mean() > 0.20:
        # Attempt 2: fallback (maybe file already has headers)
        df = pd.read_csv(path)
        # Try common column name variants
        dt_col = "Datetime" if "Datetime" in df.columns else ("Date" if "Date" in df.columns else None)
        if dt_col is None:
            raise ValueError(f"Could not find Datetime/Date column in {path}. Columns: {df.columns.tolist()}")
        df = df.rename(columns={dt_col: "Datetime"})
        df["Datetime"] = pd.to_datetime(df["Datetime"], utc=True, errors="coerce")

        # Ensure required columns exist
        needed = ["Close", "High", "Low", "Open", "Volume"]
        missing = [c for c in needed if c not in df.columns]
        if missing:
            raise ValueError(f"Missing columns {missing} in {path}. Columns: {df.columns.tolist()}")

        df = df[["Datetime"] + needed]

    # Clean
    df = df.dropna(subset=["Datetime"]).copy()
    df = df.sort_values("Datetime")
    # Drop duplicate timestamps (keep last)
    df = df.drop_duplicates(subset=["Datetime"], keep="last")

    # Force numeric
    for c in ["Close", "High", "Low", "Open", "Volume"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna(subset=["Close", "Volume"]).copy()

    return df

def _minute_profile(dt_series: pd.Series) -> pd.Series:
    return dt_series.dt.minute.value_counts(dropna=False).sort_index()


In [3]:
# Cell 3: Load all tickers (NO reindex / NO ffill)

market_dfs = []
for t in TICKERS:
    fpath = _find_market_file(t)
    dft = _read_market_csv(fpath)
    dft["ticker"] = t
    market_dfs.append(dft)

prices_all = pd.concat(market_dfs, ignore_index=True)
prices_all = prices_all.sort_values(["ticker", "Datetime"]).reset_index(drop=True)

print("Loaded rows:", len(prices_all))
print("Per-ticker row counts:")
print(prices_all["ticker"].value_counts())
print("\nMarket timestamp minute profile (should be stable, often all 30):")
print(_minute_profile(prices_all["Datetime"]))
prices_all.head()


Loaded rows: 24458
Per-ticker row counts:
ticker
AAPL     3494
AMZN     3494
GOOGL    3494
META     3494
MSFT     3494
NVDA     3494
TSLA     3494
Name: count, dtype: int64

Market timestamp minute profile (should be stable, often all 30):
Datetime
30    24458
Name: count, dtype: int64


Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker
0,2023-12-06 14:30:00+00:00,192.419998,194.759995,192.205002,194.449997,11260893,AAPL
1,2023-12-06 15:30:00+00:00,193.095001,193.339996,192.360107,192.419998,4374474,AAPL
2,2023-12-06 16:30:00+00:00,192.830002,193.130005,192.470001,193.100006,3252326,AAPL
3,2023-12-06 17:30:00+00:00,192.905899,192.979996,192.369995,192.839996,3389634,AAPL
4,2023-12-06 18:30:00+00:00,192.779999,193.235001,192.740005,192.910004,2713794,AAPL


In [4]:
# Cell 4: Gap diagnostics (expect >1h gaps because we removed synthetic bars)

tmp = prices_all.sort_values(["ticker", "Datetime"]).copy()
tmp["dt_diff_hours"] = tmp.groupby("ticker")["Datetime"].diff().dt.total_seconds() / 3600.0

print("Gap distribution (hours) - per ticker (top few values):")
for t in TICKERS:
    s = tmp.loc[tmp["ticker"] == t, "dt_diff_hours"].dropna()
    print(f"\n{t}:")
    print(s.value_counts().head(10))

# If you see a massive number of exactly 1.0 gaps AND many 18h gaps, that's normal for intraday-only data.


Gap distribution (hours) - per ticker (top few values):

AAPL:
dt_diff_hours
1.0     2992
18.0     389
66.0      85
90.0      12
42.0       6
65.0       2
46.0       2
67.0       2
70.0       2
94.0       1
Name: count, dtype: int64

MSFT:
dt_diff_hours
1.0     2992
18.0     389
66.0      85
90.0      12
42.0       6
65.0       2
46.0       2
67.0       2
70.0       2
94.0       1
Name: count, dtype: int64

GOOGL:
dt_diff_hours
1.0     2992
18.0     389
66.0      85
90.0      12
42.0       6
65.0       2
46.0       2
67.0       2
70.0       2
94.0       1
Name: count, dtype: int64

AMZN:
dt_diff_hours
1.0     2992
18.0     389
66.0      85
90.0      12
42.0       6
65.0       2
46.0       2
67.0       2
70.0       2
94.0       1
Name: count, dtype: int64

NVDA:
dt_diff_hours
1.0     2992
18.0     389
66.0      85
90.0      12
42.0       6
65.0       2
46.0       2
67.0       2
70.0       2
94.0       1
Name: count, dtype: int64

META:
dt_diff_hours
1.0     2992
18.0     389
66.0      8

In [5]:
# Cell 5: Macro loader + merge helper

def load_macro_csv(path: str, cols: list[str]) -> pd.DataFrame:
    df = pd.read_csv(path)
    if "Datetime" not in df.columns:
        raise ValueError(f"{path}: no Datetime column. Columns={df.columns.tolist()}")
    df["Datetime"] = pd.to_datetime(df["Datetime"], utc=True, errors="coerce")
    df = df.dropna(subset=["Datetime"]).sort_values("Datetime")
    keep = ["Datetime"] + [c for c in cols if c in df.columns]
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"{path}: missing columns {missing}. Columns={df.columns.tolist()}")
    return df[keep].copy()

def align_macro_to_market_minutes(macro_df: pd.DataFrame, market_minutes: int) -> pd.DataFrame:
    """
    If macro timestamps are HH:00 but market is HH:30 (or similar),
    shift macro timestamps so they match market minute.
    """
    macro_minutes = int(macro_df["Datetime"].dt.minute.mode().iloc[0])
    if macro_minutes == market_minutes:
        return macro_df
    # Example: macro at :00, market at :30 => shift +30 minutes
    delta = (market_minutes - macro_minutes) % 60
    macro_df = macro_df.copy()
    macro_df["Datetime"] = macro_df["Datetime"] + pd.Timedelta(minutes=delta)
    return macro_df

market_minute_mode = int(prices_all["Datetime"].dt.minute.mode().iloc[0])
print("Market minute mode:", market_minute_mode)

# Load macros
ppi  = load_macro_csv(PPI_FILE,  ["PPI_YoY"])
cpi  = load_macro_csv(CPI_FILE,  ["CPI_YoY", "CPI_MoM"])
fomc = load_macro_csv(FOMC_FILE, ["Fed_Funds_Rate"])
nfp  = load_macro_csv(NFP_FILE,  ["NonFarm_Payrolls_Change"])
gdp  = load_macro_csv(GDP_FILE,  ["GDP_Growth_QoQ"])

# Align macros to market minute if needed
ppi  = align_macro_to_market_minutes(ppi,  market_minute_mode)
cpi  = align_macro_to_market_minutes(cpi,  market_minute_mode)
fomc = align_macro_to_market_minutes(fomc, market_minute_mode)
nfp  = align_macro_to_market_minutes(nfp,  market_minute_mode)
gdp  = align_macro_to_market_minutes(gdp,  market_minute_mode)

# Merge (left join on Datetime)
prices_all = prices_all.merge(ppi,  on="Datetime", how="left")
prices_all = prices_all.merge(cpi,  on="Datetime", how="left")
prices_all = prices_all.merge(fomc, on="Datetime", how="left")
prices_all = prices_all.merge(nfp,  on="Datetime", how="left")
prices_all = prices_all.merge(gdp,  on="Datetime", how="left")

# Forward-fill macros by time per ticker (no bfill)
macro_cols = ["PPI_YoY", "CPI_YoY", "CPI_MoM", "Fed_Funds_Rate", "NonFarm_Payrolls_Change", "GDP_Growth_QoQ"]
prices_all = prices_all.sort_values(["ticker", "Datetime"])
prices_all[macro_cols] = prices_all.groupby("ticker")[macro_cols].ffill()

# Drop any remaining NaNs (early prefix) for macro columns
before = len(prices_all)
prices_all = prices_all.dropna(subset=macro_cols).reset_index(drop=True)
print(f"Dropped {before - len(prices_all)} rows due to macro NaNs (early prefix).")
prices_all.head()


Market minute mode: 30
Dropped 0 rows due to macro NaNs (early prefix).


Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker,PPI_YoY,CPI_YoY,CPI_MoM,Fed_Funds_Rate,NonFarm_Payrolls_Change,GDP_Growth_QoQ
0,2023-12-06 14:30:00+00:00,192.419998,194.759995,192.205002,194.449997,11260893,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9
1,2023-12-06 15:30:00+00:00,193.095001,193.339996,192.360107,192.419998,4374474,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9
2,2023-12-06 16:30:00+00:00,192.830002,193.130005,192.470001,193.100006,3252326,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9
3,2023-12-06 17:30:00+00:00,192.905899,192.979996,192.369995,192.839996,3389634,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9
4,2023-12-06 18:30:00+00:00,192.779999,193.235001,192.740005,192.910004,2713794,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9


In [6]:
# Cell 6: Sentiment load + alignment that will not double-shift

sent = pd.read_csv(SENT_FILE)
if "Datetime" not in sent.columns or "ticker" not in sent.columns or "news_count" not in sent.columns:
    raise ValueError(f"{SENT_FILE} must include at least Datetime, ticker, news_count. Columns={sent.columns.tolist()}")

sent["Datetime"] = pd.to_datetime(sent["Datetime"], utc=True, errors="coerce")
sent = sent.dropna(subset=["Datetime"]).sort_values(["ticker", "Datetime"]).copy()

print("Sentiment minute profile (before align):")
print(_minute_profile(sent["Datetime"]))

def align_sentiment_to_market(sent_df: pd.DataFrame, market_minute: int) -> pd.DataFrame:
    sent_df = sent_df.copy()
    sent_minute_mode = int(sent_df["Datetime"].dt.minute.mode().iloc[0])

    # If already matches market minute, do nothing
    if sent_minute_mode == market_minute:
        return sent_df

    # If sentiment is HH:00 and market is HH:30, shift +30; otherwise shift to match
    delta = (market_minute - sent_minute_mode) % 60
    sent_df["Datetime"] = sent_df["Datetime"] + pd.Timedelta(minutes=delta)
    return sent_df

sent = align_sentiment_to_market(sent, market_minute_mode)

print("\nSentiment minute profile (after align):")
print(_minute_profile(sent["Datetime"]))

# Create sentiment features (same as your logic)
sent["has_news"] = (sent["news_count"] > 0).astype(int)

# If these columns exist in your file, weâ€™ll use them; else we create safe zeros
for base_col in ["overall_sentiment_mean", "ticker_sentiment_mean", "ticker_relevance_mean"]:
    if base_col not in sent.columns:
        sent[base_col] = np.nan

sent["overall_sentiment_ffill"] = sent.groupby("ticker")["overall_sentiment_mean"].ffill().fillna(0.0)
sent["ticker_sentiment_ffill"]  = sent.groupby("ticker")["ticker_sentiment_mean"].ffill().fillna(0.0)
sent["ticker_relevance_ffill"]  = sent.groupby("ticker")["ticker_relevance_mean"].ffill().fillna(0.0)

sent_merge = sent[[
    "Datetime", "ticker",
    "news_count", "has_news",
    "overall_sentiment_ffill", "ticker_sentiment_ffill", "ticker_relevance_ffill"
]].copy()

prices_all = prices_all.merge(sent_merge, on=["Datetime", "ticker"], how="left")

# Fill missing sentiment with zeros
fill0 = ["news_count", "has_news", "overall_sentiment_ffill", "ticker_sentiment_ffill", "ticker_relevance_ffill"]
prices_all[fill0] = prices_all[fill0].fillna(0)

# Proof that join works (should be >0 if your sentiment file actually contains news)
probe = prices_all.loc[prices_all["news_count"] > 0, ["Datetime", "ticker", "news_count"]]
print("\nRows with news_count>0 after merge:", len(probe))
print(probe.head(10))


Sentiment minute profile (before align):
Datetime
0    180607
Name: count, dtype: int64

Sentiment minute profile (after align):
Datetime
30    180607
Name: count, dtype: int64

Rows with news_count>0 after merge: 1865
                     Datetime ticker  news_count
0   2023-12-06 14:30:00+00:00   AAPL           1
7   2023-12-07 14:30:00+00:00   AAPL           1
64  2023-12-19 15:30:00+00:00   AAPL           1
239 2024-01-26 15:30:00+00:00   AAPL           1
254 2024-01-30 16:30:00+00:00   AAPL           1
272 2024-02-01 20:30:00+00:00   AAPL           1
292 2024-02-06 19:30:00+00:00   AAPL           1
330 2024-02-14 15:30:00+00:00   AAPL           1
400 2024-02-29 15:30:00+00:00   AAPL           1
448 2024-03-11 13:30:00+00:00   AAPL           1


In [7]:
# Cell 7: Create 4-step-ahead target (4 bars ahead)

HORIZON_BARS = 4

prices_all = prices_all.sort_values(["ticker", "Datetime"]).reset_index(drop=True)

prices_all["Close_t_plus_4"] = prices_all.groupby("ticker")["Close"].shift(-HORIZON_BARS)
prices_all["target_log_return_4h"] = np.log(prices_all["Close_t_plus_4"] / prices_all["Close"])

before = len(prices_all)
prices_all = prices_all.dropna(subset=["target_log_return_4h"]).reset_index(drop=True)
print(f"Dropped {before - len(prices_all)} rows without target (last {HORIZON_BARS} per ticker).")

print(prices_all["target_log_return_4h"].describe())
prices_all.head()


Dropped 28 rows without target (last 4 per ticker).
count    24430.000000
mean         0.000774
std          0.018615
min         -0.190422
25%         -0.006593
50%          0.000822
75%          0.007955
max          0.194717
Name: target_log_return_4h, dtype: float64


Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker,PPI_YoY,CPI_YoY,CPI_MoM,Fed_Funds_Rate,NonFarm_Payrolls_Change,GDP_Growth_QoQ,news_count,has_news,overall_sentiment_ffill,ticker_sentiment_ffill,ticker_relevance_ffill,Close_t_plus_4,target_log_return_4h
0,2023-12-06 14:30:00+00:00,192.419998,194.759995,192.205002,194.449997,11260893,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,1,1,0.001,0.001,0.624819,192.779999,0.001869
1,2023-12-06 15:30:00+00:00,193.095001,193.339996,192.360107,192.419998,4374474,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,192.425995,-0.003471
2,2023-12-06 16:30:00+00:00,192.830002,193.130005,192.470001,193.100006,3252326,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,192.309998,-0.0027
3,2023-12-06 17:30:00+00:00,192.905899,192.979996,192.369995,192.839996,3389634,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,194.725006,0.009386
4,2023-12-06 18:30:00+00:00,192.779999,193.235001,192.740005,192.910004,2713794,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,194.300003,0.007854


In [8]:
# Cell 8: 1-bar log return (per ticker)

prices_all["log_return_1h"] = prices_all.groupby("ticker")["Close"].transform(lambda x: np.log(x / x.shift(1)))

before = len(prices_all)
prices_all = prices_all.dropna(subset=["log_return_1h"]).reset_index(drop=True)
print(f"Dropped {before - len(prices_all)} rows due to first return per ticker.")

print(prices_all["log_return_1h"].describe())
prices_all.head()


Dropped 7 rows due to first return per ticker.
count    24423.000000
mean         0.000194
std          0.009454
min         -0.139523
25%         -0.002701
50%          0.000163
75%          0.003077
max          0.182174
Name: log_return_1h, dtype: float64


Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker,PPI_YoY,CPI_YoY,CPI_MoM,Fed_Funds_Rate,NonFarm_Payrolls_Change,GDP_Growth_QoQ,news_count,has_news,overall_sentiment_ffill,ticker_sentiment_ffill,ticker_relevance_ffill,Close_t_plus_4,target_log_return_4h,log_return_1h
0,2023-12-06 15:30:00+00:00,193.095001,193.339996,192.360107,192.419998,4374474,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,192.425995,-0.003471,0.003502
1,2023-12-06 16:30:00+00:00,192.830002,193.130005,192.470001,193.100006,3252326,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,192.309998,-0.0027,-0.001373
2,2023-12-06 17:30:00+00:00,192.905899,192.979996,192.369995,192.839996,3389634,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,194.725006,0.009386,0.000394
3,2023-12-06 18:30:00+00:00,192.779999,193.235001,192.740005,192.910004,2713794,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,194.300003,0.007854,-0.000653
4,2023-12-06 19:30:00+00:00,192.425995,193.050003,192.339996,192.779999,3147189,AAPL,1.105605,3.246538,0.244648,5.5,141.0,4.9,0,0,0.001,0.001,0.624819,194.940002,0.01298,-0.001838


In [9]:
# Cell 9: Rolling volatility on log returns (bars, not calendar hours)

prices_all["vol_12h"] = prices_all.groupby("ticker")["log_return_1h"].transform(
    lambda x: x.rolling(window=12, min_periods=12).std()
)
prices_all["vol_24h"] = prices_all.groupby("ticker")["log_return_1h"].transform(
    lambda x: x.rolling(window=24, min_periods=24).std()
)

before = len(prices_all)
prices_all = prices_all.dropna(subset=["vol_24h"]).reset_index(drop=True)
print(f"Dropped {before - len(prices_all)} rows due to vol warmup (~24 bars per ticker).")

print(prices_all[["vol_12h", "vol_24h"]].describe())
prices_all.head()


Dropped 161 rows due to vol warmup (~24 bars per ticker).
            vol_12h       vol_24h
count  24262.000000  24262.000000
mean       0.007654      0.008000
std        0.005620      0.005089
min        0.000545      0.001366
25%        0.004022      0.004588
50%        0.006042      0.006510
75%        0.009325      0.009677
max        0.054228      0.038745


Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker,PPI_YoY,CPI_YoY,CPI_MoM,Fed_Funds_Rate,NonFarm_Payrolls_Change,GDP_Growth_QoQ,news_count,has_news,overall_sentiment_ffill,ticker_sentiment_ffill,ticker_relevance_ffill,Close_t_plus_4,target_log_return_4h,log_return_1h,vol_12h,vol_24h
0,2023-12-11 17:30:00+00:00,192.455002,192.589996,192.059998,192.109894,3798928,AAPL,1.105605,3.139856,0.033478,5.5,141.0,4.9,0,0,0.255194,0.322052,0.928465,192.835007,0.001973,0.001846,0.005741,0.00497
1,2023-12-11 18:30:00+00:00,192.791794,192.970001,192.470001,192.470001,3897170,AAPL,1.105605,3.139856,0.033478,5.5,141.0,4.9,0,0,0.255194,0.322052,0.928465,193.240005,0.002322,0.001748,0.005779,0.004929
2,2023-12-11 19:30:00+00:00,192.869995,193.125,192.75,192.794998,4731090,AAPL,1.105605,3.139856,0.033478,5.5,141.0,4.9,0,0,0.255194,0.322052,0.928465,193.465805,0.003084,0.000406,0.005727,0.004922
3,2023-12-11 20:30:00+00:00,193.179993,193.309998,192.830002,192.869995,5215874,AAPL,1.105605,3.139856,0.033478,5.5,141.0,4.9,0,0,0.255194,0.322052,0.928465,193.660004,0.002482,0.001606,0.005756,0.004933
4,2023-12-12 14:30:00+00:00,192.835007,193.389999,191.720993,193.080002,11344243,AAPL,1.105605,3.139856,0.033478,5.5,141.0,4.9,0,0,0.255194,0.322052,0.928465,194.220001,0.007157,-0.001787,0.005711,0.004945


In [10]:
# Cell 10: Volume features (no forward-fill of Volume anywhere)

prices_all["log_volume"] = np.log(prices_all["Volume"].clip(lower=0) + 1)

def rolling_zscore(x: pd.Series, window: int = 24) -> pd.Series:
    mu = x.rolling(window, min_periods=window).mean()
    sd = x.rolling(window, min_periods=window).std()
    return (x - mu) / sd

prices_all["vol_z_24h"] = prices_all.groupby("ticker")["log_volume"].transform(lambda x: rolling_zscore(x, 24))

before = len(prices_all)
prices_all = prices_all.dropna(subset=["vol_z_24h"]).reset_index(drop=True)
print(f"Dropped {before - len(prices_all)} rows due to volume z-score warmup (~24 bars per ticker).")

prices_all[["log_volume", "vol_z_24h"]].describe()


Dropped 161 rows due to volume z-score warmup (~24 bars per ticker).


Unnamed: 0,log_volume,vol_z_24h
count,24101.0,24101.0
mean,15.269252,-0.027153
std,1.130981,0.986118
min,0.0,-4.653561
25%,14.509745,-0.746569
50%,15.159441,-0.188598
75%,15.964994,0.524187
max,19.311836,3.92126


In [11]:
# Cell 11: Feature sets

RETURN_COLS = ["log_return_1h"]

VOL_COLS = ["vol_12h", "vol_24h"]

VOLUME_COLS = ["log_volume", "vol_z_24h"]

SENTIMENT_COLS = [
    "news_count",
    "has_news",
    "ticker_sentiment_ffill",
    "overall_sentiment_ffill",
    "ticker_relevance_ffill"
]

MACRO_COLS = [
    "CPI_YoY", "CPI_MoM",
    "PPI_YoY",
    "GDP_Growth_QoQ",
    "Fed_Funds_Rate",
    "NonFarm_Payrolls_Change"
]

TARGET_COL = "target_log_return_4h"

ALL_FEATURES = RETURN_COLS + VOL_COLS + VOLUME_COLS + SENTIMENT_COLS + MACRO_COLS

print("Total features:", len(ALL_FEATURES))
print(ALL_FEATURES)

# Final hard check: no NaNs in features/target
need = ALL_FEATURES + [TARGET_COL]
nan_counts = prices_all[need].isna().sum().sort_values(ascending=False)
print("\nTop NaN counts (should all be 0):")
print(nan_counts.head(10))


Total features: 16
['log_return_1h', 'vol_12h', 'vol_24h', 'log_volume', 'vol_z_24h', 'news_count', 'has_news', 'ticker_sentiment_ffill', 'overall_sentiment_ffill', 'ticker_relevance_ffill', 'CPI_YoY', 'CPI_MoM', 'PPI_YoY', 'GDP_Growth_QoQ', 'Fed_Funds_Rate', 'NonFarm_Payrolls_Change']

Top NaN counts (should all be 0):
log_return_1h              0
ticker_relevance_ffill     0
NonFarm_Payrolls_Change    0
Fed_Funds_Rate             0
GDP_Growth_QoQ             0
PPI_YoY                    0
CPI_MoM                    0
CPI_YoY                    0
overall_sentiment_ffill    0
vol_12h                    0
dtype: int64


In [12]:
# Cell 12: Train/Val/Test split by global time boundaries

prices_all = prices_all.sort_values(["Datetime", "ticker"]).reset_index(drop=True)
unique_times = np.array(sorted(prices_all["Datetime"].unique()))
n = len(unique_times)

train_end = unique_times[int(n * 0.70)]
val_end   = unique_times[int(n * 0.85)]

prices_all["split"] = "test"
prices_all.loc[prices_all["Datetime"] <= train_end, "split"] = "train"
prices_all.loc[(prices_all["Datetime"] > train_end) & (prices_all["Datetime"] <= val_end), "split"] = "val"

print(prices_all["split"].value_counts())
print("\nSplit boundaries:")
print("Train end:", train_end)
print("Val end:", val_end)


split
train    16877
val       3612
test      3612
Name: count, dtype: int64

Split boundaries:
Train end: 2025-05-05 18:30:00+00:00
Val end: 2025-08-21 13:30:00+00:00


In [13]:
# Cell 13: Train-only scaling

from sklearn.preprocessing import StandardScaler

scalers = {}

def fit_scaler(cols):
    scaler = StandardScaler()
    scaler.fit(prices_all.loc[prices_all["split"] == "train", cols])
    return scaler

# Scale (same groups you did)
scalers["vol"]    = fit_scaler(VOL_COLS)
scalers["volume"] = fit_scaler(VOLUME_COLS)
scalers["macro"]  = fit_scaler(MACRO_COLS)

prices_all[VOL_COLS] = scalers["vol"].transform(prices_all[VOL_COLS])
prices_all[VOLUME_COLS] = scalers["volume"].transform(prices_all[VOLUME_COLS])
prices_all[MACRO_COLS] = scalers["macro"].transform(prices_all[MACRO_COLS])

print("Scaling complete.")


Scaling complete.


In [14]:
# Cell 14: Stats check

print("\nTRAIN stats (should be ~0 mean, ~1 std):")
print(prices_all.loc[prices_all["split"]=="train", VOL_COLS + VOLUME_COLS + MACRO_COLS].describe().loc[["mean","std"]])

print("\nVAL stats (should be shifted, NOT zero-mean):")
print(prices_all.loc[prices_all["split"]=="val", VOL_COLS + VOLUME_COLS + MACRO_COLS].describe().loc[["mean","std"]])

print("\nTarget distribution (train):")
print(prices_all.loc[prices_all["split"]=="train", TARGET_COL].describe())



TRAIN stats (should be ~0 mean, ~1 std):
           vol_12h       vol_24h    log_volume     vol_z_24h       CPI_YoY       CPI_MoM  PPI_YoY  GDP_Growth_QoQ  Fed_Funds_Rate  \
mean  1.818774e-16 -8.757059e-17  5.186874e-16  2.105062e-18  9.161231e-16 -9.430679e-17  0.00000         0.00000    5.388960e-17   
std   1.000030e+00  1.000030e+00  1.000030e+00  1.000030e+00  1.000030e+00  1.000030e+00  1.00003         1.00003    1.000030e+00   

      NonFarm_Payrolls_Change  
mean             9.430679e-17  
std              1.000030e+00  

VAL stats (should be shifted, NOT zero-mean):
       vol_12h   vol_24h  log_volume  vol_z_24h   CPI_YoY   CPI_MoM   PPI_YoY  GDP_Growth_QoQ  Fed_Funds_Rate  NonFarm_Payrolls_Change
mean -0.267244 -0.286369    0.078932  -0.010606 -1.460197 -1.279862  0.277359       -2.586379   -1.421673e+00                -1.197264
std   0.775399  0.762925    0.905342   0.993989  0.523301  0.553439  0.329701        1.541397    5.329808e-15                 0.833862

Target di

In [15]:
# Cell 15: Sliding window creation (per ticker), pooled output

SEQ_LEN = 24  # 24 bars (not calendar hours)

def make_windows_strict(df: pd.DataFrame,
                        feature_cols: list[str],
                        target_col: str,
                        seq_len: int) -> tuple[np.ndarray, np.ndarray, pd.DataFrame]:
    """
    Strict split isolation: df should already be filtered to one split.
    Builds windows per ticker. No window crosses ticker boundaries.
    Returns:
      X: (N, seq_len, n_features)
      y: (N,)
      meta: DataFrame with [ticker, end_time] per sample
    """
    X_list, y_list, meta_rows = [], [], []

    for t, g in df.groupby("ticker", sort=False):
        g = g.sort_values("Datetime").reset_index(drop=True)
        feat = g[feature_cols].to_numpy(dtype=np.float32)
        targ = g[target_col].to_numpy(dtype=np.float32)
        times = g["Datetime"].to_numpy()

        if len(g) < seq_len:
            continue

        for end_idx in range(seq_len - 1, len(g)):
            start_idx = end_idx - seq_len + 1
            X_list.append(feat[start_idx:end_idx+1])
            y_list.append(targ[end_idx])
            meta_rows.append((t, times[end_idx]))

    X = np.stack(X_list, axis=0) if X_list else np.empty((0, seq_len, len(feature_cols)), dtype=np.float32)
    y = np.array(y_list, dtype=np.float32) if y_list else np.empty((0,), dtype=np.float32)
    meta = pd.DataFrame(meta_rows, columns=["ticker", "end_time"])
    return X, y, meta

train_df = prices_all.loc[prices_all["split"]=="train"].copy()
val_df   = prices_all.loc[prices_all["split"]=="val"].copy()
test_df  = prices_all.loc[prices_all["split"]=="test"].copy()

X_train, y_train, meta_train = make_windows_strict(train_df, ALL_FEATURES, TARGET_COL, SEQ_LEN)
X_val,   y_val,   meta_val   = make_windows_strict(val_df,   ALL_FEATURES, TARGET_COL, SEQ_LEN)
X_test,  y_test,  meta_test  = make_windows_strict(test_df,  ALL_FEATURES, TARGET_COL, SEQ_LEN)

print("Shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)
print("X_test: ", X_test.shape,  "y_test: ", y_test.shape)


Shapes:
X_train: (16716, 24, 16) y_train: (16716,)
X_val:   (3451, 24, 16) y_val:   (3451,)
X_test:  (3451, 24, 16) y_test:  (3451,)


In [16]:
# Cell 16: Sanity checks

def sanity_check_Xy(X, y, name=""):
    assert X.ndim == 3, f"{name}: X must be 3D (N, seq_len, n_features). Got {X.shape}"
    assert y.ndim == 1, f"{name}: y must be 1D (N,). Got {y.shape}"
    assert X.shape[0] == y.shape[0], f"{name}: X and y sample count mismatch"
    assert np.isfinite(X).all(), f"{name}: X contains NaN/inf"
    assert np.isfinite(y).all(), f"{name}: y contains NaN/inf"

sanity_check_Xy(X_train, y_train, "train")
sanity_check_Xy(X_val, y_val, "val")
sanity_check_Xy(X_test, y_test, "test")

print("All (X,y) checks passed.")

print("\nTarget stats by split:")
print("train:", pd.Series(y_train).describe())
print("val:  ", pd.Series(y_val).describe())
print("test: ", pd.Series(y_test).describe())

print("\nSamples per ticker (train):")
print(meta_train["ticker"].value_counts())


All (X,y) checks passed.

Target stats by split:
train: count    16716.000000
mean         0.000456
std          0.019781
min         -0.190422
25%         -0.007235
50%          0.000749
75%          0.008103
max          0.194717
dtype: float64
val:   count    3451.000000
mean        0.001587
std         0.015644
min        -0.132553
25%        -0.005264
50%         0.000915
75%         0.007236
max         0.115137
dtype: float64
test:  count    3451.000000
mean        0.000882
std         0.016093
min        -0.124975
25%        -0.006402
50%         0.000567
75%         0.007751
max         0.099293
dtype: float64

Samples per ticker (train):
ticker
AAPL     2388
AMZN     2388
GOOGL    2388
META     2388
MSFT     2388
NVDA     2388
TSLA     2388
Name: count, dtype: int64


In [17]:
# Cell 17: Optional PyTorch Dataset/DataLoaders

import torch
from torch.utils.data import Dataset, DataLoader

class WindowDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

BATCH_SIZE = 256

train_loader = DataLoader(WindowDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = DataLoader(WindowDataset(X_val, y_val),     batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader  = DataLoader(WindowDataset(X_test, y_test),   batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

# Quick batch shape proof
xb, yb = next(iter(train_loader))
print("One batch shapes:", xb.shape, yb.shape)


One batch shapes: torch.Size([256, 24, 16]) torch.Size([256])


In [19]:
df = prices_all.sort_values(["ticker","Datetime"]).copy()
same_close = df.groupby("ticker")["Close"].diff().fillna(0).eq(0).mean()
same_ohlc = (
    df.groupby("ticker")[["Open","High","Low","Close"]]
      .diff()
      .fillna(0)
      .eq(0)
      .all(axis=1)
      .mean()
)

print("Share same Close as previous bar:", same_close)
print("Share OHLC all unchanged:", same_ohlc)


Share same Close as previous bar: 0.0017426662794074934
Share OHLC all unchanged: 0.0002904443799012489


In [20]:
# Cell 4A-1: Baseline metrics helpers

import numpy as np

def mse(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float(np.mean((y_true - y_pred) ** 2))

def mae(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float(np.mean(np.abs(y_true - y_pred)))

def rmse(y_true, y_pred):
    return float(np.sqrt(mse(y_true, y_pred)))

def directional_accuracy(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float(np.mean(np.sign(y_true) == np.sign(y_pred)))

def pearson_corr(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if np.std(y_true) == 0 or np.std(y_pred) == 0:
        return np.nan
    return float(np.corrcoef(y_true, y_pred)[0, 1])

def print_report(name, y_true, y_pred):
    print(f"\n{name}")
    print("  MAE :", mae(y_true, y_pred))
    print("  MSE :", mse(y_true, y_pred))
    print("  RMSE:", rmse(y_true, y_pred))
    print("  DirAcc:", directional_accuracy(y_true, y_pred))
    print("  Corr  :", pearson_corr(y_true, y_pred))


In [21]:
# Cell 4A-2: Baseline 0 (predict 0 for all)

yhat0_train = np.zeros_like(y_train)
yhat0_val   = np.zeros_like(y_val)
yhat0_test  = np.zeros_like(y_test)

print_report("Zero baseline - TRAIN", y_train, yhat0_train)
print_report("Zero baseline - VAL",   y_val,   yhat0_val)
print_report("Zero baseline - TEST",  y_test,  yhat0_test)



Zero baseline - TRAIN
  MAE : 0.012491372413933277
  MSE : 0.0003914755943696946
  RMSE: 0.019785742199111323
  DirAcc: 0.0004187604690117253
  Corr  : nan

Zero baseline - VAL
  MAE : 0.009927444159984589
  MSE : 0.00024717237101867795
  RMSE: 0.015721716541735445
  DirAcc: 0.0
  Corr  : nan

Zero baseline - TEST
  MAE : 0.010859946720302105
  MSE : 0.0002596811973489821
  RMSE: 0.01611462681382917
  DirAcc: 0.0
  Corr  : nan


In [22]:
# Cell 4A-3: Baseline 1 (persistence using last-bar log_return_1h)

# Identify index of log_return_1h inside ALL_FEATURES
try:
    idx_lr = ALL_FEATURES.index("log_return_1h")
except ValueError:
    raise ValueError("log_return_1h is not in ALL_FEATURES; cannot run persistence baseline.")

# last timestep feature: X[:, -1, idx_lr]
yhat1_train = X_train[:, -1, idx_lr]
yhat1_val   = X_val[:,   -1, idx_lr]
yhat1_test  = X_test[:,  -1, idx_lr]

print_report("Persistence (use last 1h return) - TRAIN", y_train, yhat1_train)
print_report("Persistence (use last 1h return) - VAL",   y_val,   yhat1_val)
print_report("Persistence (use last 1h return) - TEST",  y_test,  yhat1_test)



Persistence (use last 1h return) - TRAIN
  MAE : 0.014576387591660023
  MSE : 0.0004961505765095353
  RMSE: 0.022274437737225497
  DirAcc: 0.48145489351519505
  Corr  : -0.01075625577536466

Persistence (use last 1h return) - VAL
  MAE : 0.011461262591183186
  MSE : 0.00030745944241061807
  RMSE: 0.017534521448007018
  DirAcc: 0.48507678933642423
  Corr  : -0.0019926925883315405

Persistence (use last 1h return) - TEST
  MAE : 0.012575560249388218
  MSE : 0.0003410863864701241
  RMSE: 0.01846852420931689
  DirAcc: 0.5030425963488844
  Corr  : -0.03699378013858804


In [23]:
# Cell 4A-4: Baseline 2 (Ridge on last-bar features)

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Use last timestep of each window as a flat feature vector
Xtr_last = X_train[:, -1, :]  # (N, n_features)
Xva_last = X_val[:,   -1, :]
Xte_last = X_test[:,  -1, :]

ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(Xtr_last, y_train)

yhat2_train = ridge.predict(Xtr_last)
yhat2_val   = ridge.predict(Xva_last)
yhat2_test  = ridge.predict(Xte_last)

print_report("Ridge (last-bar features) - TRAIN", y_train, yhat2_train)
print_report("Ridge (last-bar features) - VAL",   y_val,   yhat2_val)
print_report("Ridge (last-bar features) - TEST",  y_test,  yhat2_test)

# Optional: show top coefficients by absolute magnitude
coef = ridge.coef_
top_idx = np.argsort(np.abs(coef))[::-1][:10]
print("\nTop 10 Ridge coefficients (by abs value):")
for i in top_idx:
    print(f"  {ALL_FEATURES[i]:<28} {coef[i]: .6f}")



Ridge (last-bar features) - TRAIN
  MAE : 0.012507694773375988
  MSE : 0.0003877466660924256
  RMSE: 0.019691284013299528
  DirAcc: 0.5247068676716918
  Corr  : 0.09486055164873466

Ridge (last-bar features) - VAL
  MAE : 0.010682265274226665
  MSE : 0.00038478767964988947
  RMSE: 0.01961600570069986
  DirAcc: 0.5609968125181107
  Corr  : 0.025233728980882733

Ridge (last-bar features) - TEST
  MAE : 0.015241938643157482
  MSE : 0.0031056422740221024
  RMSE: 0.05572828971018313
  DirAcc: 0.4682700666473486
  Corr  : -0.017856564918283493

Top 10 Ridge coefficients (by abs value):
  log_return_1h                -0.018048
  overall_sentiment_ffill       0.004717
  news_count                    0.004507
  has_news                     -0.003562
  ticker_sentiment_ffill       -0.002174
  PPI_YoY                      -0.002050
  GDP_Growth_QoQ               -0.001555
  CPI_YoY                      -0.001252
  ticker_relevance_ffill        0.001251
  vol_24h                       0.000725


In [24]:
# Cell 4A-5: Baseline 3 (Ridge on window summaries: mean and std over time)

from sklearn.linear_model import Ridge

def window_summary(X):
    # X: (N, T, F)
    mu = X.mean(axis=1)  # (N, F)
    sd = X.std(axis=1)   # (N, F)
    return np.concatenate([mu, sd], axis=1)  # (N, 2F)

Xtr_sum = window_summary(X_train)
Xva_sum = window_summary(X_val)
Xte_sum = window_summary(X_test)

ridge2 = Ridge(alpha=1.0, random_state=42)
ridge2.fit(Xtr_sum, y_train)

yhat3_train = ridge2.predict(Xtr_sum)
yhat3_val   = ridge2.predict(Xva_sum)
yhat3_test  = ridge2.predict(Xte_sum)

print_report("Ridge (window mean+std) - TRAIN", y_train, yhat3_train)
print_report("Ridge (window mean+std) - VAL",   y_val,   yhat3_val)
print_report("Ridge (window mean+std) - TEST",  y_test,  yhat3_test)



Ridge (window mean+std) - TRAIN
  MAE : 0.012498813681304455
  MSE : 0.00038633879739791155
  RMSE: 0.01965550298002856
  DirAcc: 0.5284158889686528
  Corr  : 0.11223751222412386

Ridge (window mean+std) - VAL
  MAE : 0.010703432373702526
  MSE : 0.0003014605026692152
  RMSE: 0.017362617967035247
  DirAcc: 0.5517241379310345
  Corr  : 0.03651384602171301

Ridge (window mean+std) - TEST
  MAE : 0.017132820561528206
  MSE : 0.001490810769610107
  RMSE: 0.038611018759029225
  DirAcc: 0.4789915966386555
  Corr  : 0.020403861943042886
