In [2]:
import pandas as pd

# List of tickers we have data for
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA"]

price_dfs = []
for ticker in tickers:
    fname = f"data/raw/market/{ticker}_1h.csv"
    # The CSV has 3 header lines, skip them and assign column names manually
    df = pd.read_csv(fname, skiprows=3,
                     names=["Datetime", "Close", "High", "Low", "Open", "Volume"])
    # Convert to datetime and set UTC
    df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True, errors='coerce')
    df = df.dropna(subset=['Datetime'])  # drop any rows that didn't parse to datetime
    df = df.set_index('Datetime')
    df['ticker'] = ticker  # add ticker column
    price_dfs.append(df)

# Concatenate all tickers data
prices_all = pd.concat(price_dfs)
# Sort by datetime (and ticker, though ticker is not in index yet)
prices_all = prices_all.sort_index()
prices_all.head()


Unnamed: 0_level_0,Close,High,Low,Open,Volume,ticker
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-12-06 14:30:00+00:00,192.419998,194.759995,192.205002,194.449997,11260893,AAPL
2023-12-06 14:30:00+00:00,46.444,47.387001,46.361,47.2145,11433656,NVDA
2023-12-06 14:30:00+00:00,145.289993,147.850006,145.089996,147.580002,10481937,AMZN
2023-12-06 14:30:00+00:00,318.5,322.25,317.670105,321.929993,3490719,META
2023-12-06 14:30:00+00:00,242.029999,246.199997,241.345093,242.919998,39241808,TSLA


In [3]:
# Derive the full hourly date range from min to max timestamp in data
# Robustly handle either a DatetimeIndex or a MultiIndex where one level is datetime
def _infer_datetime_bounds(idx):
    if isinstance(idx, pd.DatetimeIndex):
        return idx.min(), idx.max()
    if isinstance(idx, pd.MultiIndex):
        # try to find a datetime level by dtype
        for i in range(idx.nlevels):
            lvl = idx.get_level_values(i)
            if pd.api.types.is_datetime64_any_dtype(lvl):
                return lvl.min(), lvl.max()
        # try converting each level to datetime
        for i in range(idx.nlevels):
            lvl = pd.to_datetime(idx.get_level_values(i), utc=True, errors='coerce')
            if not lvl.isna().all():
                return lvl.min(), lvl.max()
    # fallback: try converting the index itself
    vals = pd.to_datetime(idx, utc=True, errors='coerce')
    if isinstance(vals, (pd.DatetimeIndex, pd.Series)) and not vals.isna().all():
        return vals.min(), vals.max()
    raise TypeError('Could not determine datetime bounds from index')

start, end = _infer_datetime_bounds(prices_all.index)
full_index = pd.date_range(start=start, end=end, freq='H', tz='UTC')
# Reindex each ticker sub-DataFrame to include all hours, forward-fill price data if needed
# Choose grouping method dynamically to avoid ambiguity when 'ticker' is both an index level and a column
if 'ticker' in prices_all.index.names:
    gb = prices_all.groupby(level='ticker')
else:
    gb = prices_all.groupby('ticker')
def _reindex_group(g):
    if isinstance(g.index, pd.MultiIndex):
        for i in range(g.index.nlevels):
            vals = pd.to_datetime(g.index.get_level_values(i), utc=True, errors='coerce')
            if not vals.isna().all():
                g = g.copy()
                g.index = vals
                return g.reindex(full_index, method='ffill')
        idx = pd.to_datetime([x[1] if isinstance(x, tuple) and len(x) > 1 else x for x in g.index], utc=True, errors='coerce')
        g = g.copy()
        g.index = idx
        return g.reindex(full_index, method='ffill')
    else:
        idx = pd.to_datetime(g.index, utc=True, errors='coerce')
        if idx.isna().all():
            idx = pd.to_datetime([x[1] if isinstance(x, tuple) and len(x) > 1 else x for x in g.index], utc=True, errors='coerce')
        g = g.copy()
        g.index = idx
        return g.reindex(full_index, method='ffill')
prices_all = gb.apply(_reindex_group)
# After groupby.apply the group key may become an index level named 'ticker',
# but a 'ticker' column may already exist. Drop duplicate column before reset_index.
prices_all = prices_all.drop(columns='ticker', errors='ignore')
# The above creates a multi-index (ticker, datetime); bring the index levels back to columns
prices_all = prices_all.reset_index(level=0).rename_axis('Datetime').reset_index()
prices_all.head()


  full_index = pd.date_range(start=start, end=end, freq='H', tz='UTC')
  prices_all = gb.apply(_reindex_group)


Unnamed: 0,Datetime,ticker,Close,High,Low,Open,Volume
0,2023-12-06 14:30:00+00:00,AAPL,192.419998,194.759995,192.205002,194.449997,11260893
1,2023-12-06 15:30:00+00:00,AAPL,193.095001,193.339996,192.360107,192.419998,4374474
2,2023-12-06 16:30:00+00:00,AAPL,192.830002,193.130005,192.470001,193.100006,3252326
3,2023-12-06 17:30:00+00:00,AAPL,192.905899,192.979996,192.369995,192.839996,3389634
4,2023-12-06 18:30:00+00:00,AAPL,192.779999,193.235001,192.740005,192.910004,2713794


In [4]:
import pandas as pd

# 1) Load PPI hourly
ppi = pd.read_csv("PPI_hourly.csv")
ppi["Datetime"] = pd.to_datetime(ppi["Datetime"], utc=True, errors="coerce")
ppi = ppi.dropna(subset=["Datetime"]).sort_values("Datetime")

# Keep only what we need
ppi = ppi[["Datetime", "PPI_YoY"]]

# 2) Merge into your existing prices_all
# prices_all currently has columns: Datetime, ticker, Close, High, Low, Open, Volume
prices_all["Datetime"] = pd.to_datetime(prices_all["Datetime"], utc=True, errors="coerce")
prices_all = prices_all.dropna(subset=["Datetime"]).sort_values(["ticker", "Datetime"])

prices_all = prices_all.merge(ppi, on="Datetime", how="left")

# 3) Forward-fill PPI across time (global series), then (optional) backfill at the very start
prices_all["PPI_YoY"] = prices_all["PPI_YoY"].ffill().bfill()

# 4) Sanity checks (do these after every merge)
print("PPI missing after fill:", prices_all["PPI_YoY"].isna().sum())
print("PPI unique values sample:", prices_all["PPI_YoY"].dropna().head(5).tolist())
print(prices_all[["Datetime","ticker","Close","PPI_YoY"]].head(10))


PPI missing after fill: 0
PPI unique values sample: [1.1056048120471784, 1.1056048120471784, 1.1056048120471784, 1.1056048120471784, 1.1056048120471784]
                   Datetime ticker       Close   PPI_YoY
0 2023-12-06 14:30:00+00:00   AAPL  192.419998  1.105605
1 2023-12-06 15:30:00+00:00   AAPL  193.095001  1.105605
2 2023-12-06 16:30:00+00:00   AAPL  192.830002  1.105605
3 2023-12-06 17:30:00+00:00   AAPL  192.905899  1.105605
4 2023-12-06 18:30:00+00:00   AAPL  192.779999  1.105605
5 2023-12-06 19:30:00+00:00   AAPL  192.425995  1.105605
6 2023-12-06 20:30:00+00:00   AAPL  192.309998  1.105605
7 2023-12-06 21:30:00+00:00   AAPL  192.309998  1.105605
8 2023-12-06 22:30:00+00:00   AAPL  192.309998  1.105605
9 2023-12-06 23:30:00+00:00   AAPL  192.309998  1.105605


In [5]:
# 1) Load CPI hourly
cpi = pd.read_csv("CPI_hourly.csv")
cpi["Datetime"] = pd.to_datetime(cpi["Datetime"], utc=True, errors="coerce")
cpi = cpi.dropna(subset=["Datetime"]).sort_values("Datetime")

# Keep only needed columns
cpi = cpi[["Datetime", "CPI_YoY", "CPI_MoM"]]

# 2) Merge into prices_all
prices_all = prices_all.merge(cpi, on="Datetime", how="left")

# 3) Forward-fill CPI (global macro)
prices_all[["CPI_YoY", "CPI_MoM"]] = (
    prices_all[["CPI_YoY", "CPI_MoM"]]
    .ffill()
    .bfill()
)

# 4) Sanity checks
print("CPI YoY missing:", prices_all["CPI_YoY"].isna().sum())
print("CPI MoM missing:", prices_all["CPI_MoM"].isna().sum())

print(
    prices_all[
        ["Datetime", "ticker", "Close", "PPI_YoY", "CPI_YoY", "CPI_MoM"]
    ].head(10)
)


CPI YoY missing: 0
CPI MoM missing: 0
                   Datetime ticker       Close   PPI_YoY   CPI_YoY   CPI_MoM
0 2023-12-06 14:30:00+00:00   AAPL  192.419998  1.105605  3.246538  0.244648
1 2023-12-06 15:30:00+00:00   AAPL  193.095001  1.105605  3.246538  0.244648
2 2023-12-06 16:30:00+00:00   AAPL  192.830002  1.105605  3.246538  0.244648
3 2023-12-06 17:30:00+00:00   AAPL  192.905899  1.105605  3.246538  0.244648
4 2023-12-06 18:30:00+00:00   AAPL  192.779999  1.105605  3.246538  0.244648
5 2023-12-06 19:30:00+00:00   AAPL  192.425995  1.105605  3.246538  0.244648
6 2023-12-06 20:30:00+00:00   AAPL  192.309998  1.105605  3.246538  0.244648
7 2023-12-06 21:30:00+00:00   AAPL  192.309998  1.105605  3.246538  0.244648
8 2023-12-06 22:30:00+00:00   AAPL  192.309998  1.105605  3.246538  0.244648
9 2023-12-06 23:30:00+00:00   AAPL  192.309998  1.105605  3.246538  0.244648


In [6]:
# 1) Load FOMC-aligned hourly rate (THIS is the correct file)
fomc = pd.read_csv("FOMC_rate_hourly.csv")
fomc["Datetime"] = pd.to_datetime(fomc["Datetime"], utc=True, errors="coerce")
fomc = fomc.dropna(subset=["Datetime"]).sort_values("Datetime")

# Inspect columns
print(fomc.columns)
# Expect: ["Datetime", "Fed_Funds_Rate"]

# 2) Merge into prices_all
prices_all = prices_all.merge(
    fomc[["Datetime", "Fed_Funds_Rate"]],
    on="Datetime",
    how="left"
)

# 3) Forward-fill & backfill (policy regime)
prices_all["Fed_Funds_Rate"] = (
    prices_all["Fed_Funds_Rate"]
    .ffill()
    .bfill()
)

# 4) Sanity checks
print("Fed Funds missing:", prices_all["Fed_Funds_Rate"].isna().sum())
print(
    prices_all[
        ["Datetime", "ticker", "Close", "CPI_YoY", "PPI_YoY", "Fed_Funds_Rate"]
    ].head(10)
)


Index(['Datetime', 'Fed_Funds_Rate'], dtype='object')
Fed Funds missing: 0
                   Datetime ticker       Close   CPI_YoY   PPI_YoY  \
0 2023-12-06 14:30:00+00:00   AAPL  192.419998  3.246538  1.105605   
1 2023-12-06 15:30:00+00:00   AAPL  193.095001  3.246538  1.105605   
2 2023-12-06 16:30:00+00:00   AAPL  192.830002  3.246538  1.105605   
3 2023-12-06 17:30:00+00:00   AAPL  192.905899  3.246538  1.105605   
4 2023-12-06 18:30:00+00:00   AAPL  192.779999  3.246538  1.105605   
5 2023-12-06 19:30:00+00:00   AAPL  192.425995  3.246538  1.105605   
6 2023-12-06 20:30:00+00:00   AAPL  192.309998  3.246538  1.105605   
7 2023-12-06 21:30:00+00:00   AAPL  192.309998  3.246538  1.105605   
8 2023-12-06 22:30:00+00:00   AAPL  192.309998  3.246538  1.105605   
9 2023-12-06 23:30:00+00:00   AAPL  192.309998  3.246538  1.105605   

   Fed_Funds_Rate  
0             5.5  
1             5.5  
2             5.5  
3             5.5  
4             5.5  
5             5.5  
6             

In [7]:
# 1) Load NFP hourly
nfp = pd.read_csv("NFP_hourly.csv")
nfp["Datetime"] = pd.to_datetime(nfp["Datetime"], utc=True, errors="coerce")
nfp = nfp.dropna(subset=["Datetime"]).sort_values("Datetime")

# Inspect columns
print("NFP columns:", nfp.columns.tolist())

# Attempt to find the Non-Farm Payrolls change column automatically
candidate = None
patterns = ["non", "nfp", "payroll"]
for col in nfp.columns:
    if col.lower() == "datetime":
        continue
    low = col.lower()
    if ("non" in low and "farm" in low) or "payroll" in low or "nfp" in low or "nonfarm" in low:
        candidate = col
        break
# If none matched, pick first numeric column (excluding Datetime)
if candidate is None:
    for col in nfp.columns:
        if col.lower() == "datetime":
            continue
        if pd.api.types.is_numeric_dtype(nfp[col]):
            candidate = col
            break
# Fallback to any non-datetime column
if candidate is None:
    cols = [c for c in nfp.columns if c.lower() != "datetime"]
    if cols:
        candidate = cols[0]
if candidate is None:
    raise KeyError("Could not find a suitable NonFarm_Payrolls_Change column in NFP file")

print("Using NFP column:", candidate)
nfp = nfp[["Datetime", candidate]].rename(columns={candidate: "NonFarm_Payrolls_Change"})

# 3) Merge into prices_all
prices_all = prices_all.merge(nfp, on="Datetime", how="left")

# 4) Forward-fill & backfill (labor regime)
prices_all["NonFarm_Payrolls_Change"] = (
    prices_all["NonFarm_Payrolls_Change"]
    .ffill()
    .bfill()
)

# 5) Sanity checks
print("NFP missing:", prices_all["NonFarm_Payrolls_Change"].isna().sum())
print(prices_all[["Datetime","ticker","Close","NonFarm_Payrolls_Change"]].head(10))


NFP columns: ['Datetime', 'NonFarm_Payrolls_Change']
Using NFP column: NonFarm_Payrolls_Change
NFP missing: 0
                   Datetime ticker       Close  NonFarm_Payrolls_Change
0 2023-12-06 14:30:00+00:00   AAPL  192.419998                    141.0
1 2023-12-06 15:30:00+00:00   AAPL  193.095001                    141.0
2 2023-12-06 16:30:00+00:00   AAPL  192.830002                    141.0
3 2023-12-06 17:30:00+00:00   AAPL  192.905899                    141.0
4 2023-12-06 18:30:00+00:00   AAPL  192.779999                    141.0
5 2023-12-06 19:30:00+00:00   AAPL  192.425995                    141.0
6 2023-12-06 20:30:00+00:00   AAPL  192.309998                    141.0
7 2023-12-06 21:30:00+00:00   AAPL  192.309998                    141.0
8 2023-12-06 22:30:00+00:00   AAPL  192.309998                    141.0
9 2023-12-06 23:30:00+00:00   AAPL  192.309998                    141.0


In [8]:
import pandas as pd

gdp_hourly = pd.read_csv("GDP_hourly.csv")
gdp_hourly["Datetime"] = pd.to_datetime(gdp_hourly["Datetime"], utc=True, errors="coerce")
gdp_hourly = gdp_hourly.dropna(subset=["Datetime"]).sort_values("Datetime")

# Merge: GDP is global, so merge on Datetime only
prices_all["Datetime"] = pd.to_datetime(prices_all["Datetime"], utc=True, errors="coerce")
prices_all = prices_all.dropna(subset=["Datetime"]).sort_values(["ticker", "Datetime"])

prices_all = prices_all.merge(
    gdp_hourly[["Datetime", "GDP_Growth_QoQ"]],
    on="Datetime",
    how="left"
)

# Safety fill (should already be filled, but keep this pattern consistent)
prices_all["GDP_Growth_QoQ"] = prices_all["GDP_Growth_QoQ"].ffill().bfill()

# Sanity checks
print("GDP missing:", prices_all["GDP_Growth_QoQ"].isna().sum())
print(prices_all[["Datetime", "ticker", "Close", "GDP_Growth_QoQ"]].head(10))


GDP missing: 0
                   Datetime ticker       Close  GDP_Growth_QoQ
0 2023-12-06 14:30:00+00:00   AAPL  192.419998             4.9
1 2023-12-06 15:30:00+00:00   AAPL  193.095001             4.9
2 2023-12-06 16:30:00+00:00   AAPL  192.830002             4.9
3 2023-12-06 17:30:00+00:00   AAPL  192.905899             4.9
4 2023-12-06 18:30:00+00:00   AAPL  192.779999             4.9
5 2023-12-06 19:30:00+00:00   AAPL  192.425995             4.9
6 2023-12-06 20:30:00+00:00   AAPL  192.309998             4.9
7 2023-12-06 21:30:00+00:00   AAPL  192.309998             4.9
8 2023-12-06 22:30:00+00:00   AAPL  192.309998             4.9
9 2023-12-06 23:30:00+00:00   AAPL  192.309998             4.9


In [9]:
import pandas as pd

# Try opening your produced sentiment file (adjust filename as needed)
sent = pd.read_csv("news_sentiment_hourly.csv")  # rename if yours differs
print(sent.columns.tolist())
print(sent.head(5))
print(sent.tail(5))

# Confirm timestamp parsing
sent["Datetime"] = pd.to_datetime(sent["Datetime"], utc=True, errors="coerce")
print("Sentiment NaT count:", sent["Datetime"].isna().sum())


['Datetime', 'ticker', 'news_count', 'overall_sentiment_mean', 'ticker_sentiment_mean', 'ticker_relevance_mean']
                    Datetime ticker  news_count  overall_sentiment_mean  \
0  2023-01-01 00:00:00+00:00   AAPL           0                     NaN   
1  2023-01-01 01:00:00+00:00   AAPL           0                     NaN   
2  2023-01-01 02:00:00+00:00   AAPL           0                     NaN   
3  2023-01-01 03:00:00+00:00   AAPL           0                     NaN   
4  2023-01-01 04:00:00+00:00   AAPL           0                     NaN   

   ticker_sentiment_mean  ticker_relevance_mean  
0                    NaN                    NaN  
1                    NaN                    NaN  
2                    NaN                    NaN  
3                    NaN                    NaN  
4                    NaN                    NaN  
                         Datetime ticker  news_count  overall_sentiment_mean  \
180602  2025-12-10 20:00:00+00:00   TSLA           1    

In [10]:
print("News tickers:", sorted(sent["ticker"].unique().tolist()))
print("Market tickers:", sorted(prices_all["ticker"].unique().tolist()))

sent = sent.sort_values(["ticker", "Datetime"])
prices_all = prices_all.sort_values(["ticker", "Datetime"])


News tickers: ['AAPL', 'AMZN', 'GOOGL', 'META', 'MSFT', 'NVDA', 'TSLA']
Market tickers: ['AAPL', 'AMZN', 'GOOGL', 'META', 'MSFT', 'NVDA', 'TSLA']


In [11]:
# Align news timestamps to market bar timestamps (HH:30)
sent = sent.copy()
sent["Datetime"] = pd.to_datetime(sent["Datetime"], utc=True, errors="coerce")
sent = sent.dropna(subset=["Datetime"])

# shift from :00 to :30 so it matches your OHLCV timestamps
sent["Datetime"] = sent["Datetime"] + pd.Timedelta(minutes=30)

# sanity check: should now show :30 minutes
print(sent["Datetime"].dt.minute.value_counts().head())


Datetime
30    180607
Name: count, dtype: int64


### Don't run unless you are sure

In [12]:
# Do inner merge but avoid column-name collisions by specifying suffixes
inner = prices_all.merge(
    sent[["Datetime","ticker","news_count"]],
    on=["Datetime","ticker"],
    how="inner",
    suffixes=("_price", "_sent")
)
print('inner columns:', list(inner.columns))
print('inner-merged rows:', len(inner))
# Prefer the sent-side news_count if it exists after the merge
news_col = None
for candidate in ['news_count_sent', 'news_count_y', 'news_count']:
    if candidate in inner.columns:
        news_col = candidate
        break
if news_col is None:
    print('No news_count column found in inner; available cols:', inner.columns.tolist())
else:
    print('using', news_col, '-> positive news rows:', int((inner[news_col] > 0).sum()))


inner columns: ['Datetime', 'ticker', 'Close', 'High', 'Low', 'Open', 'Volume', 'PPI_YoY', 'CPI_YoY', 'CPI_MoM', 'Fed_Funds_Rate', 'NonFarm_Payrolls_Change', 'GDP_Growth_QoQ', 'news_count']
inner-merged rows: 122689
using news_count -> positive news rows: 7791


### Don't run unless you are sure

In [13]:
# Remove previously-merged sentiment columns to avoid duplicates
to_drop = [
    "news_count", "has_news",
    "overall_sentiment_ffill",
    "ticker_sentiment_ffill",
    "ticker_relevance_ffill",
    # if prior merges created suffixed columns, drop them too:
    "news_count_x", "news_count_y",
    "has_news_x", "has_news_y",
    "overall_sentiment_ffill_x", "overall_sentiment_ffill_y",
    "ticker_sentiment_ffill_x", "ticker_sentiment_ffill_y",
    "ticker_relevance_ffill_x", "ticker_relevance_ffill_y",
]
prices_all = prices_all.drop(columns=to_drop, errors="ignore")

print("Sentiment columns now present in prices_all:",
      [c for c in prices_all.columns if "sentiment" in c.lower() or "news_count" in c.lower() or "has_news" in c.lower()])


Sentiment columns now present in prices_all: []


In [14]:
import pandas as pd

# ============================
# 0) Clean copies
# ============================
prices_all = prices_all.copy()
sent = sent.copy()

prices_all["Datetime"] = pd.to_datetime(prices_all["Datetime"], utc=True, errors="coerce")
sent["Datetime"] = pd.to_datetime(sent["Datetime"], utc=True, errors="coerce")

prices_all = prices_all.dropna(subset=["Datetime"]).sort_values(["ticker","Datetime"])
sent = sent.dropna(subset=["Datetime"]).sort_values(["ticker","Datetime"])

# ============================
# 1) ALIGNMENT FIX (FINAL)
# ============================
# Sentiment is hourly at HH:00, market bars close at HH:30
sent["Datetime"] = sent["Datetime"] + pd.Timedelta(minutes=30)

# ============================
# 2) Option A features
# ============================
sent["has_news"] = (sent["news_count"] > 0).astype(int)

sent["overall_sentiment_ffill"] = sent.groupby("ticker")["overall_sentiment_mean"].ffill()
sent["ticker_sentiment_ffill"]  = sent.groupby("ticker")["ticker_sentiment_mean"].ffill()
sent["ticker_relevance_ffill"]  = sent.groupby("ticker")["ticker_relevance_mean"].ffill()

sent[[
    "overall_sentiment_ffill",
    "ticker_sentiment_ffill",
    "ticker_relevance_ffill"
]] = sent[[
    "overall_sentiment_ffill",
    "ticker_sentiment_ffill",
    "ticker_relevance_ffill"
]].fillna(0.0)

# ============================
# 3) Drop old sentiment cols (safety)
# ============================
sentiment_cols = [
    "news_count","has_news",
    "overall_sentiment_ffill",
    "ticker_sentiment_ffill",
    "ticker_relevance_ffill"
]
prices_all = prices_all.drop(columns=sentiment_cols, errors="ignore")

# ============================
# 4) Merge (this WILL work now)
# ============================
sent_merge = sent[[
    "Datetime","ticker",
    "news_count","has_news",
    "overall_sentiment_ffill",
    "ticker_sentiment_ffill",
    "ticker_relevance_ffill"
]]

prices_all = prices_all.merge(
    sent_merge,
    on=["Datetime","ticker"],
    how="left"
)

# ============================
# 5) Fill gaps safely
# ============================
prices_all["news_count"] = prices_all["news_count"].fillna(0)
prices_all["has_news"] = prices_all["has_news"].fillna(0)
prices_all["overall_sentiment_ffill"] = prices_all["overall_sentiment_ffill"].fillna(0.0)
prices_all["ticker_sentiment_ffill"] = prices_all["ticker_sentiment_ffill"].fillna(0.0)
prices_all["ticker_relevance_ffill"] = prices_all["ticker_relevance_ffill"].fillna(0.0)

# ============================
# 6) HARD PROOF
# ============================
inner = prices_all.merge(
    sent[["Datetime","ticker","news_count"]].rename(columns={"news_count":"probe"}),
    on=["Datetime","ticker"],
    how="inner"
)

print("inner-merged rows:", len(inner))
print("inner rows with probe>0:", int((inner["probe"] > 0).sum()))
print(inner.loc[inner["probe"] > 0, ["Datetime","ticker","probe"]].head(10))

print("\nPreview:")
print(prices_all[
    ["Datetime","ticker","Close","news_count","has_news","ticker_sentiment_ffill"]
].head(15))


inner-merged rows: 0
inner rows with probe>0: 0
Empty DataFrame
Columns: [Datetime, ticker, probe]
Index: []

Preview:
                    Datetime ticker       Close  news_count  has_news  \
0  2023-12-06 14:30:00+00:00   AAPL  192.419998         0.0       0.0   
1  2023-12-06 15:30:00+00:00   AAPL  193.095001         0.0       0.0   
2  2023-12-06 16:30:00+00:00   AAPL  192.830002         0.0       0.0   
3  2023-12-06 17:30:00+00:00   AAPL  192.905899         0.0       0.0   
4  2023-12-06 18:30:00+00:00   AAPL  192.779999         0.0       0.0   
5  2023-12-06 19:30:00+00:00   AAPL  192.425995         0.0       0.0   
6  2023-12-06 20:30:00+00:00   AAPL  192.309998         0.0       0.0   
7  2023-12-06 21:30:00+00:00   AAPL  192.309998         0.0       0.0   
8  2023-12-06 22:30:00+00:00   AAPL  192.309998         0.0       0.0   
9  2023-12-06 23:30:00+00:00   AAPL  192.309998         0.0       0.0   
10 2023-12-07 00:30:00+00:00   AAPL  192.309998         0.0       0.0   
11 20

In [15]:
import numpy as np

# Ensure proper sorting
prices_all = prices_all.sort_values(["ticker", "Datetime"]).reset_index(drop=True)

HORIZON = 4  # 4-hour ahead target

# Compute future close per ticker
prices_all["Close_t_plus_4"] = (
    prices_all
    .groupby("ticker")["Close"]
    .shift(-HORIZON)
)

# Compute next-4-hour log return
prices_all["target_log_return_4h"] = np.log(
    prices_all["Close_t_plus_4"] / prices_all["Close"]
)

# Drop rows without a valid target (last 4 rows per ticker)
before = len(prices_all)
prices_all = prices_all.dropna(subset=["target_log_return_4h"]).reset_index(drop=True)
after = len(prices_all)

print(f"Dropped {before - after} rows without 4h-ahead target.")

# Sanity checks
print("\nTarget summary statistics:")
print(prices_all["target_log_return_4h"].describe())

print("\nSample rows:")
print(
    prices_all[
        ["Datetime","ticker","Close","Close_t_plus_4","target_log_return_4h"]
    ].head(10)
)


Dropped 28 rows without 4h-ahead target.

Target summary statistics:
count    122661.000000
mean          0.000154
std           0.008425
min          -0.190422
25%           0.000000
50%           0.000000
75%           0.000000
max           0.199039
Name: target_log_return_4h, dtype: float64

Sample rows:
                   Datetime ticker       Close  Close_t_plus_4  \
0 2023-12-06 14:30:00+00:00   AAPL  192.419998      192.779999   
1 2023-12-06 15:30:00+00:00   AAPL  193.095001      192.425995   
2 2023-12-06 16:30:00+00:00   AAPL  192.830002      192.309998   
3 2023-12-06 17:30:00+00:00   AAPL  192.905899      192.309998   
4 2023-12-06 18:30:00+00:00   AAPL  192.779999      192.309998   
5 2023-12-06 19:30:00+00:00   AAPL  192.425995      192.309998   
6 2023-12-06 20:30:00+00:00   AAPL  192.309998      192.309998   
7 2023-12-06 21:30:00+00:00   AAPL  192.309998      192.309998   
8 2023-12-06 22:30:00+00:00   AAPL  192.309998      192.309998   
9 2023-12-06 23:30:00+00:00   

In [17]:
import numpy as np

# Ensure proper ordering
prices_all = prices_all.sort_values(["ticker", "Datetime"]).reset_index(drop=True)

# 1-hour log return per ticker (index-safe)
prices_all["log_return_1h"] = prices_all.groupby("ticker")["Close"].transform(
    lambda x: np.log(x / x.shift(1))
)

# Drop the first row per ticker (no previous hour)
before = len(prices_all)
prices_all = prices_all.dropna(subset=["log_return_1h"]).reset_index(drop=True)
after = len(prices_all)

print(f"Dropped {before - after} rows due to 1h return NaN.")

# Sanity checks
print("\nlog_return_1h summary:")
print(prices_all["log_return_1h"].describe())

print("\nSample rows:")
print(
    prices_all[
        ["Datetime","ticker","Close","log_return_1h","target_log_return_4h"]
    ].head(10)
)


Dropped 7 rows due to 1h return NaN.

log_return_1h summary:
count    122654.000000
mean          0.000039
std           0.004219
min          -0.139523
25%           0.000000
50%           0.000000
75%           0.000000
max           0.182174
Name: log_return_1h, dtype: float64

Sample rows:
                   Datetime ticker       Close  log_return_1h  \
0 2023-12-06 15:30:00+00:00   AAPL  193.095001       0.003502   
1 2023-12-06 16:30:00+00:00   AAPL  192.830002      -0.001373   
2 2023-12-06 17:30:00+00:00   AAPL  192.905899       0.000394   
3 2023-12-06 18:30:00+00:00   AAPL  192.779999      -0.000653   
4 2023-12-06 19:30:00+00:00   AAPL  192.425995      -0.001838   
5 2023-12-06 20:30:00+00:00   AAPL  192.309998      -0.000603   
6 2023-12-06 21:30:00+00:00   AAPL  192.309998       0.000000   
7 2023-12-06 22:30:00+00:00   AAPL  192.309998       0.000000   
8 2023-12-06 23:30:00+00:00   AAPL  192.309998       0.000000   
9 2023-12-07 00:30:00+00:00   AAPL  192.309998       0.

In [18]:
# Ensure sorted
prices_all = prices_all.sort_values(["ticker", "Datetime"]).reset_index(drop=True)

# Rolling vol per ticker (std of 1h log returns)
# min_periods makes early values available; weâ€™ll drop remaining NaNs after.
prices_all["vol_12h"] = prices_all.groupby("ticker")["log_return_1h"].transform(
    lambda x: x.rolling(window=12, min_periods=12).std()
)

prices_all["vol_24h"] = prices_all.groupby("ticker")["log_return_1h"].transform(
    lambda x: x.rolling(window=24, min_periods=24).std()
)

# Drop rows where vol_24h is NaN (first 23 rows per ticker)
before = len(prices_all)
prices_all = prices_all.dropna(subset=["vol_24h"]).reset_index(drop=True)
after = len(prices_all)

print(f"Dropped {before - after} rows due to rolling volatility warm-up.")

# Sanity checks
print("\nvol_12h summary:")
print(prices_all["vol_12h"].describe())

print("\nvol_24h summary:")
print(prices_all["vol_24h"].describe())

print("\nSample rows:")
print(
    prices_all[
        ["Datetime","ticker","log_return_1h","vol_12h","vol_24h","target_log_return_4h"]
    ].head(10)
)


Dropped 161 rows due to rolling volatility warm-up.

vol_12h summary:
count    122493.000000
mean          0.002144
std           0.003635
min           0.000000
25%           0.000000
50%           0.000325
75%           0.003054
max           0.052930
Name: vol_12h, dtype: float64

vol_24h summary:
count    122493.000000
mean          0.002738
std           0.003213
min           0.000000
25%           0.000000
50%           0.002025
75%           0.003834
max           0.037465
Name: vol_24h, dtype: float64

Sample rows:
                   Datetime ticker  log_return_1h   vol_12h   vol_24h  \
0 2023-12-07 14:30:00+00:00   AAPL       0.012480  0.003603  0.002705   
1 2023-12-07 15:30:00+00:00   AAPL      -0.002185  0.003713  0.002679   
2 2023-12-07 16:30:00+00:00   AAPL       0.003288  0.003765  0.002724   
3 2023-12-07 17:30:00+00:00   AAPL      -0.002620  0.003910  0.002796   
4 2023-12-07 18:30:00+00:00   AAPL       0.000206  0.003906  0.002788   
5 2023-12-07 19:30:00+00:00   AA

In [19]:
import numpy as np

# Ensure proper ordering
prices_all = prices_all.sort_values(["ticker", "Datetime"]).reset_index(drop=True)

# ----------------------------
# 1) Log volume
# ----------------------------
# Add +1 to avoid log(0)
prices_all["log_volume"] = np.log(prices_all["Volume"] + 1)

# ----------------------------
# 2) Rolling volume z-score (24h)
# ----------------------------
def rolling_zscore(x, window=24):
    mu = x.rolling(window, min_periods=window).mean()
    sd = x.rolling(window, min_periods=window).std()
    return (x - mu) / sd

prices_all["vol_z_24h"] = prices_all.groupby("ticker")["log_volume"].transform(
    lambda x: rolling_zscore(x, window=24)
)

# Drop warm-up NaNs
before = len(prices_all)
prices_all = prices_all.dropna(subset=["vol_z_24h"]).reset_index(drop=True)
after = len(prices_all)

print(f"Dropped {before - after} rows due to volume z-score warm-up.")

# ----------------------------
# Sanity checks
# ----------------------------
print("\nlog_volume summary:")
print(prices_all["log_volume"].describe())

print("\nvol_z_24h summary:")
print(prices_all["vol_z_24h"].describe())

print("\nSample rows:")
print(
    prices_all[
        [
            "Datetime","ticker","Volume",
            "log_volume","vol_z_24h",
            "news_count","has_news",
            "target_log_return_4h"
        ]
    ].head(10)
)


Dropped 34853 rows due to volume z-score warm-up.

log_volume summary:
count    87640.000000
mean        15.246394
std          0.947637
min          0.000000
25%         14.641866
50%         15.185104
75%         15.754516
max         19.311836
Name: log_volume, dtype: float64

vol_z_24h summary:
count    87640.000000
mean        -0.006338
std          1.166965
min         -4.694855
25%         -0.509627
50%         -0.076844
75%          0.400778
max          4.694855
Name: vol_z_24h, dtype: float64

Sample rows:
                   Datetime ticker    Volume  log_volume  vol_z_24h  \
0 2023-12-08 13:30:00+00:00   AAPL   4479168   15.314948  -0.079984   
1 2023-12-08 14:30:00+00:00   AAPL  11185810   16.230157   4.083575   
2 2023-12-08 15:30:00+00:00   AAPL   5158425   15.456142   0.572241   
3 2023-12-08 16:30:00+00:00   AAPL   5398492   15.501630   0.739879   
4 2023-12-08 17:30:00+00:00   AAPL   4520094   15.324044  -0.059071   
5 2023-12-08 18:30:00+00:00   AAPL   3904469   15.17

In [20]:
# ============================
# Feature groups
# ============================

RETURN_COLS = [
    "log_return_1h"
]

VOL_COLS = [
    "vol_12h",
    "vol_24h"
]

VOLUME_COLS = [
    "log_volume",
    "vol_z_24h"
]

SENTIMENT_COLS = [
    "news_count",
    "has_news",
    "ticker_sentiment_ffill",
    "overall_sentiment_ffill",
    "ticker_relevance_ffill"
]

MACRO_COLS = [
    "CPI_YoY", "CPI_MoM",
    "PPI_YoY",
    "GDP_Growth_QoQ",
    "Fed_Funds_Rate",
    "NonFarm_Payrolls_Change"
]

TARGET_COL = "target_log_return_4h"

ALL_FEATURES = (
    RETURN_COLS
    + VOL_COLS
    + VOLUME_COLS
    + SENTIMENT_COLS
    + MACRO_COLS
)

print("Total features:", len(ALL_FEATURES))
print(ALL_FEATURES)


Total features: 16
['log_return_1h', 'vol_12h', 'vol_24h', 'log_volume', 'vol_z_24h', 'news_count', 'has_news', 'ticker_sentiment_ffill', 'overall_sentiment_ffill', 'ticker_relevance_ffill', 'CPI_YoY', 'CPI_MoM', 'PPI_YoY', 'GDP_Growth_QoQ', 'Fed_Funds_Rate', 'NonFarm_Payrolls_Change']


In [21]:
# ============================
# Temporal split
# ============================

prices_all = prices_all.sort_values(["Datetime","ticker"]).reset_index(drop=True)

unique_times = prices_all["Datetime"].sort_values().unique()
n = len(unique_times)

train_end = unique_times[int(n * 0.70)]
val_end   = unique_times[int(n * 0.85)]

prices_all["split"] = "test"
prices_all.loc[prices_all["Datetime"] <= train_end, "split"] = "train"
prices_all.loc[
    (prices_all["Datetime"] > train_end) &
    (prices_all["Datetime"] <= val_end),
    "split"
] = "val"

print(prices_all["split"].value_counts())
print("\nSplit boundaries:")
print("Train end:", train_end)
print("Val end:", val_end)


split
train    61355
val      13146
test     13139
Name: count, dtype: int64

Split boundaries:
Train end: 2025-05-02 17:30:00+00:00
Val end: 2025-08-20 14:30:00+00:00


In [22]:
from sklearn.preprocessing import StandardScaler

# ============================
# Fit scalers on TRAIN only
# ============================

scalers = {}

def fit_scaler(cols):
    scaler = StandardScaler()
    scaler.fit(prices_all.loc[prices_all["split"] == "train", cols])
    return scaler

scalers["vol"]    = fit_scaler(VOL_COLS)
scalers["volume"] = fit_scaler(VOLUME_COLS)
scalers["macro"]  = fit_scaler(MACRO_COLS)

# Apply transforms
prices_all[VOL_COLS] = scalers["vol"].transform(prices_all[VOL_COLS])
prices_all[VOLUME_COLS] = scalers["volume"].transform(prices_all[VOLUME_COLS])
prices_all[MACRO_COLS] = scalers["macro"].transform(prices_all[MACRO_COLS])

print("Scaling complete.")


Scaling complete.


In [23]:
print("\nTRAIN stats (should be ~0 mean, ~1 std):")
print(prices_all.loc[prices_all["split"]=="train", VOL_COLS + VOLUME_COLS + MACRO_COLS].describe().loc[["mean","std"]])

print("\nVAL stats (should be shifted, NOT zero-mean):")
print(prices_all.loc[prices_all["split"]=="val", VOL_COLS + VOLUME_COLS + MACRO_COLS].describe().loc[["mean","std"]])



TRAIN stats (should be ~0 mean, ~1 std):
       vol_12h       vol_24h    log_volume     vol_z_24h       CPI_YoY  \
mean  0.000000  8.894089e-17  8.745854e-16 -1.227570e-17  7.708210e-16   
std   1.000008  1.000008e+00  1.000008e+00  1.000008e+00  1.000008e+00   

           CPI_MoM       PPI_YoY  GDP_Growth_QoQ  Fed_Funds_Rate  \
mean -7.411741e-17  5.929393e-17    1.482348e-16    5.336453e-16   
std   1.000008e+00  1.000008e+00    1.000008e+00    1.000008e+00   

      NonFarm_Payrolls_Change  
mean                 0.000000  
std                  1.000008  

VAL stats (should be shifted, NOT zero-mean):
       vol_12h   vol_24h  log_volume  vol_z_24h   CPI_YoY   CPI_MoM   PPI_YoY  \
mean -0.152178 -0.235011    0.105932  -0.015499 -1.510318 -1.229532  0.304840   
std   0.796431  0.792317    0.961873   1.013466  0.517528  0.580867  0.329469   

      GDP_Growth_QoQ  Fed_Funds_Rate  NonFarm_Payrolls_Change  
mean       -2.686173   -1.436314e+00                -1.179440  
std         1.4

In [25]:
import numpy as np

# Choose which features go into X (exclude raw OHLC unless you explicitly want them)
FEATURE_COLS = [
    # price dynamics
    "log_return_1h",
    "vol_12h", "vol_24h",
    "log_volume", "vol_z_24h",

    # sentiment / events
    "news_count", "has_news",
    "overall_sentiment_ffill",
    "ticker_sentiment_ffill",
    "ticker_relevance_ffill",

    # macro regime (already scaled)
    "CPI_YoY", "CPI_MoM",
    "PPI_YoY",
    "GDP_Growth_QoQ",
    "Fed_Funds_Rate",
    "NonFarm_Payrolls_Change",
]

TARGET_COL = "target_log_return_4h"

missing = [c for c in FEATURE_COLS + [TARGET_COL, "ticker", "Datetime", "split"] if c not in prices_all.columns]
print("Missing columns:", missing)

assert not missing, f"Missing required columns: {missing}"

print("Feature count:", len(FEATURE_COLS))


Missing columns: []
Feature count: 16


In [26]:
LOOKBACK = 24  # you can later test 48, 72, etc.

def build_sequences_panel(df, feature_cols, target_col, lookback):
    """
    Build sequences per ticker, preserving temporal order.
    The sequence label y corresponds to the row at the END of the window.
    """
    X, y, meta = [], [], []  # meta will store (Datetime, ticker, split) for auditing

    for ticker, g in df.groupby("ticker", sort=False):
        g = g.sort_values("Datetime").reset_index(drop=True)

        feat = g[feature_cols].to_numpy(dtype=np.float32)
        targ = g[target_col].to_numpy(dtype=np.float32)
        dt   = g["Datetime"].to_numpy()
        spl  = g["split"].to_numpy()

        n = len(g)
        if n < lookback:
            continue

        for end in range(lookback - 1, n):
            start = end - lookback + 1

            X.append(feat[start:end+1, :])
            y.append(targ[end])

            meta.append((dt[end], ticker, spl[end]))

    X = np.stack(X)
    y = np.array(y, dtype=np.float32)
    meta = np.array(meta, dtype=object)

    return X, y, meta

X_all, y_all, meta_all = build_sequences_panel(
    prices_all, FEATURE_COLS, TARGET_COL, LOOKBACK
)

print("X_all shape:", X_all.shape)  # (num_samples, lookback, num_features)
print("y_all shape:", y_all.shape)
print("meta_all shape:", meta_all.shape)

# Split by meta (end-of-window split)
splits = meta_all[:, 2]
train_mask = splits == "train"
val_mask   = splits == "val"
test_mask  = splits == "test"

X_train, y_train = X_all[train_mask], y_all[train_mask]
X_val, y_val     = X_all[val_mask], y_all[val_mask]
X_test, y_test   = X_all[test_mask], y_all[test_mask]

print("\nSequence counts:")
print("train:", X_train.shape[0], "val:", X_val.shape[0], "test:", X_test.shape[0])

# Quick audit: show first few meta rows
print("\nMeta sample (Datetime, ticker, split):")
print(meta_all[:10])


X_all shape: (87479, 24, 16)
y_all shape: (87479,)
meta_all shape: (87479, 3)

Sequence counts:
train: 61194 val: 13146 test: 13139

Meta sample (Datetime, ticker, split):
[[Timestamp('2023-12-09 12:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-09 13:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-09 14:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-09 15:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-09 16:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-09 17:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-09 18:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-11 14:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-11 15:30:00+0000', tz='UTC') 'AAPL' 'train']
 [Timestamp('2023-12-11 16:30:00+0000', tz='UTC') 'AAPL' 'train']]


In [27]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, X, y):
        # Ensure float32 tensors
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)  # (N, 1) for regression

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [28]:
BATCH_SIZE = 256

train_ds = SequenceDataset(X_train, y_train)
val_ds   = SequenceDataset(X_val, y_val)
test_ds  = SequenceDataset(X_test, y_test)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=0
)

val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

test_loader = DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

print("Train batches:", len(train_loader))
print("Val batches:", len(val_loader))
print("Test batches:", len(test_loader))


Train batches: 239
Val batches: 52
Test batches: 52


In [29]:
xb, yb = next(iter(train_loader))
print("Batch X shape:", xb.shape)  # expected (B, 24, 16)
print("Batch y shape:", yb.shape)  # expected (B, 1)
print("X dtype:", xb.dtype, "y dtype:", yb.dtype)


Batch X shape: torch.Size([256, 24, 16])
Batch y shape: torch.Size([256, 1])
X dtype: torch.float32 y dtype: torch.float32


In [30]:
feature_order = FEATURE_COLS.copy()
print("Feature order:", feature_order)


Feature order: ['log_return_1h', 'vol_12h', 'vol_24h', 'log_volume', 'vol_z_24h', 'news_count', 'has_news', 'overall_sentiment_ffill', 'ticker_sentiment_ffill', 'ticker_relevance_ffill', 'CPI_YoY', 'CPI_MoM', 'PPI_YoY', 'GDP_Growth_QoQ', 'Fed_Funds_Rate', 'NonFarm_Payrolls_Change']


In [31]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, X, y):
        # Ensure float32 tensors
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)  # (N, 1) for regression

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [32]:
BATCH_SIZE = 256

train_ds = SequenceDataset(X_train, y_train)
val_ds   = SequenceDataset(X_val, y_val)
test_ds  = SequenceDataset(X_test, y_test)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=0
)

val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

test_loader = DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

print("Train batches:", len(train_loader))
print("Val batches:", len(val_loader))
print("Test batches:", len(test_loader))


Train batches: 239
Val batches: 52
Test batches: 52


In [33]:
xb, yb = next(iter(train_loader))
print("Batch X shape:", xb.shape)  # expected (B, 24, 16)
print("Batch y shape:", yb.shape)  # expected (B, 1)
print("X dtype:", xb.dtype, "y dtype:", yb.dtype)


Batch X shape: torch.Size([256, 24, 16])
Batch y shape: torch.Size([256, 1])
X dtype: torch.float32 y dtype: torch.float32


In [34]:
import torch
import torch.nn as nn

class LSTMRegressor(nn.Module):
    def __init__(self, n_features, hidden_size=64, num_layers=1, dropout=0.0):
        super().__init__()
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # If num_layers == 1, PyTorch ignores dropout in LSTM, so handle gracefully
        lstm_dropout = dropout if num_layers > 1 else 0.0

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout
        )

        self.head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, x):
        # x: (B, T, F)
        out, (hn, cn) = self.lstm(x)
        # hn: (num_layers, B, hidden_size) -> take last layer
        h_last = hn[-1]  # (B, hidden_size)
        yhat = self.head(h_last)  # (B, 1)
        return yhat


In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

n_features = X_train.shape[2]

model = LSTMRegressor(
    n_features=n_features,
    hidden_size=64,
    num_layers=1,
    dropout=0.0
).to(device)

print(model)


Using device: cpu
LSTMRegressor(
  (lstm): LSTM(16, 64, batch_first=True)
  (head): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [36]:
xb, yb = next(iter(train_loader))
xb = xb.to(device)
yb = yb.to(device)

with torch.no_grad():
    pred = model(xb)

print("pred shape:", pred.shape)
print("yb shape:", yb.shape)

# sanity: should be finite
print("pred finite:", torch.isfinite(pred).all().item())
print("pred sample:", pred[:5].view(-1).cpu().numpy())


pred shape: torch.Size([256, 1])
yb shape: torch.Size([256, 1])
pred finite: True
pred sample: [-0.01507325 -0.03253574 -0.0279959  -0.03488839 -0.02618472]


In [37]:
import math
import numpy as np
import torch
import torch.nn as nn

loss_fn = nn.MSELoss()

def rmse_from_mse(mse):
    return math.sqrt(mse)

def run_one_epoch(model, loader, optimizer=None, device="cpu"):
    """
    If optimizer is provided -> train mode (backprop).
    Else -> eval mode (no grad).
    Returns average MSE and RMSE.
    """
    is_train = optimizer is not None
    model.train(is_train)

    total_mse = 0.0
    n = 0

    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)

        if is_train:
            optimizer.zero_grad()

        pred = model(xb)
        loss = loss_fn(pred, yb)

        if is_train:
            loss.backward()
            # Optional: gradient clipping for stability (good for LSTMs)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        bs = xb.size(0)
        total_mse += loss.item() * bs
        n += bs

    avg_mse = total_mse / n
    return avg_mse, rmse_from_mse(avg_mse)


In [38]:
import copy

# Hyperparameters (baseline)
LR = 1e-3
EPOCHS = 30
PATIENCE = 5  # early stop if val doesn't improve for this many epochs

optimizer = torch.optim.Adam(model.parameters(), lr=LR)

best_val_mse = float("inf")
best_state = None
pat = 0

history = []

for epoch in range(1, EPOCHS + 1):
    train_mse, train_rmse = run_one_epoch(model, train_loader, optimizer=optimizer, device=device)
    val_mse, val_rmse     = run_one_epoch(model, val_loader, optimizer=None, device=device)

    history.append({
        "epoch": epoch,
        "train_mse": train_mse, "train_rmse": train_rmse,
        "val_mse": val_mse,     "val_rmse": val_rmse
    })

    print(
        f"Epoch {epoch:02d} | "
        f"train RMSE={train_rmse:.6f} | val RMSE={val_rmse:.6f}"
    )

    # Early stopping on val MSE
    if val_mse < best_val_mse - 1e-10:
        best_val_mse = val_mse
        best_state = copy.deepcopy(model.state_dict())
        pat = 0
    else:
        pat += 1
        if pat >= PATIENCE:
            print(f"Early stopping triggered at epoch {epoch}. Best val RMSE={math.sqrt(best_val_mse):.6f}")
            break

# Load best model
if best_state is not None:
    model.load_state_dict(best_state)


Epoch 01 | train RMSE=0.010045 | val RMSE=0.015472
Epoch 02 | train RMSE=0.009530 | val RMSE=0.011719
Epoch 03 | train RMSE=0.009488 | val RMSE=0.010813
Epoch 04 | train RMSE=0.009436 | val RMSE=0.011997
Epoch 05 | train RMSE=0.009410 | val RMSE=0.016487
Epoch 06 | train RMSE=0.009408 | val RMSE=0.009695
Epoch 07 | train RMSE=0.009360 | val RMSE=0.010491
Epoch 08 | train RMSE=0.009332 | val RMSE=0.009478
Epoch 09 | train RMSE=0.009338 | val RMSE=0.008548
Epoch 10 | train RMSE=0.009313 | val RMSE=0.010093
Epoch 11 | train RMSE=0.009288 | val RMSE=0.015195
Epoch 12 | train RMSE=0.009279 | val RMSE=0.008012
Epoch 13 | train RMSE=0.009227 | val RMSE=0.009376
Epoch 14 | train RMSE=0.009204 | val RMSE=0.009270
Epoch 15 | train RMSE=0.009200 | val RMSE=0.010687
Epoch 16 | train RMSE=0.009174 | val RMSE=0.007975
Epoch 17 | train RMSE=0.009104 | val RMSE=0.008742
Epoch 18 | train RMSE=0.009093 | val RMSE=0.009372
Epoch 19 | train RMSE=0.009039 | val RMSE=0.010277
Epoch 20 | train RMSE=0.009049 

In [39]:
test_mse, test_rmse = run_one_epoch(model, test_loader, optimizer=None, device=device)
print(f"\nFinal TEST RMSE: {test_rmse:.6f}")



Final TEST RMSE: 0.008293


In [40]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Rebuild meta masks from earlier variables
splits = meta_all[:, 2]
test_mask = splits == "test"

# True y for test
y_true = y_all[test_mask].astype(np.float64)

# Extract last-step features for baselines (timestep = -1)
X_last = X_all[test_mask, -1, :].astype(np.float64)  # shape: (N_test, F)

# Feature index helpers
feat_index = {f:i for i,f in enumerate(FEATURE_COLS)}
idx_lr1h = feat_index["log_return_1h"]

print("Test samples:", len(y_true))
print("X_last shape:", X_last.shape)


Test samples: 13139
X_last shape: (13139, 16)


In [41]:
y_pred_zero = np.zeros_like(y_true)
rmse_zero = np.sqrt(mean_squared_error(y_true, y_pred_zero))
print("Baseline (predict 0) TEST RMSE:", rmse_zero)


Baseline (predict 0) TEST RMSE: 0.007579834704195661


In [42]:
y_pred_lr1h = X_last[:, idx_lr1h]
rmse_lr1h = np.sqrt(mean_squared_error(y_true, y_pred_lr1h))
print("Baseline (predict log_return_1h) TEST RMSE:", rmse_lr1h)


Baseline (predict log_return_1h) TEST RMSE: 0.00888768808856586


In [43]:
# Train/val split masks for fitting ridge properly
train_mask = splits == "train"
val_mask   = splits == "val"

X_train_last = X_all[train_mask, -1, :].astype(np.float64)
y_train_last = y_all[train_mask].astype(np.float64)

X_val_last = X_all[val_mask, -1, :].astype(np.float64)
y_val_last = y_all[val_mask].astype(np.float64)

# Small alpha grid, choose best on val
alphas = [0.1, 1.0, 10.0, 100.0]
best = None

for a in alphas:
    reg = Ridge(alpha=a, random_state=42)
    reg.fit(X_train_last, y_train_last)
    val_pred = reg.predict(X_val_last)
    val_rmse = np.sqrt(mean_squared_error(y_val_last, val_pred))
    print(f"Ridge alpha={a:<6} val RMSE={val_rmse:.6f}")
    if best is None or val_rmse < best["val_rmse"]:
        best = {"alpha": a, "model": reg, "val_rmse": val_rmse}

# Evaluate best ridge on test
ridge_best = best["model"]
y_pred_ridge = ridge_best.predict(X_last)
rmse_ridge = np.sqrt(mean_squared_error(y_true, y_pred_ridge))

print("\nBest Ridge alpha:", best["alpha"], "val RMSE:", best["val_rmse"])
print("Baseline (Ridge last-step) TEST RMSE:", rmse_ridge)


Ridge alpha=0.1    val RMSE=0.007399
Ridge alpha=1.0    val RMSE=0.007399
Ridge alpha=10.0   val RMSE=0.007399
Ridge alpha=100.0  val RMSE=0.007398

Best Ridge alpha: 100.0 val RMSE: 0.007397902718953849
Baseline (Ridge last-step) TEST RMSE: 0.007598440937625124


In [44]:
rmse_lstm = 0.008293  # <-- your reported test RMSE

summary = pd.DataFrame({
    "model": ["Zero", "Last log_return_1h", "Ridge(last-step)", "LSTM(sequence)"],
    "test_rmse": [rmse_zero, rmse_lr1h, rmse_ridge, rmse_lstm]
}).sort_values("test_rmse")

print(summary)


                model  test_rmse
0                Zero   0.007580
2    Ridge(last-step)   0.007598
3      LSTM(sequence)   0.008293
1  Last log_return_1h   0.008888
