### Current Idea (open to other ideas of course!):


Traditional factor models can struggle to capture short-horizon "micro-regimes" in stock price behavior. The goal of this project is to use a small autoencoder to learn regimes from 20-day sliding windows of price/volume features *(20 is just a starting point; we can change this)*, and then study:

- What the regimes look like
- How they transition from one to another
- Whether transitions have predictive power for near-term returns or volatility
- Potential comparisons to simple baselines (momentum, volatility)

### Gathering Data 


####  Optional Download (you shouldn't need to do this). Please use the link in the README to get the dataset.

In [1]:
# Optional: uncomment the command below to download the data.
# Not needed if you already have the .parquet file in the data/historical folder.
# (Forward slashes are used so the path works on Windows, macOS and Linux.)
# !python qualifier/utils/download_stock_data.py

#### Load Dataset from file

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Silence every warning for the rest of the session to keep cell output short.
warnings.filterwarnings("ignore")

# Plot defaults: seaborn grid theme and a wide default figure.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Seed NumPy's global RNG for reproducibility.
np.random.seed(42)

# Parquet file that holds the historical rows for all tickers.
PARQUET_PATH = Path("data/historical/all_stocks_historical.parquet")

print("Loading data from:", PARQUET_PATH)
df = pd.read_parquet(PARQUET_PATH)
print("\nDataFrame shape:", df.shape)
print("\nColumn dtypes:", df.dtypes, sep="\n")

Loading data from: data/historical/all_stocks_historical.parquet

DataFrame shape: (89859344, 7)

Column dtypes:
ticker            object
date      datetime64[us]
open             float64
high             float64
low              float64
close            float64
volume           float64
dtype: object


In [3]:
# Standardize the raw frame and print a short summary of what was loaded.
print("Available columns:", list(df.columns))

# Make sure the date column has a datetime dtype.
df['date'] = pd.to_datetime(df['date'])

# Order rows by ticker, then chronologically within each ticker.
df = df.sort_values(['ticker', 'date']).reset_index(drop=True)

# The date bounds are used by two summary lines, so compute them once.
first_date = df['date'].min()
last_date = df['date'].max()
summary_rule = "=" * 60

print("\n" + summary_rule)
print("Data Summary:")
print(summary_rule)
print(f"Total rows: {len(df):,}")
print(f"Unique tickers: {df['ticker'].nunique():,}")
print(f"Date range: {first_date.date()} to {last_date.date()}")
print(f"Days covered: {(last_date - first_date).days:,} days")
print(f"Date column type: {df['date'].dtype}")

Available columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']

Data Summary:
Total rows: 89,859,344
Unique tickers: 7,144
Date range: 1962-01-02 to 2025-11-13
Days covered: 23,326 days
Date column type: datetime64[us]


### Cleaning Data

#### Handling Nans

In [4]:
# Keep only rows where both close and volume are present.
has_close_and_volume = df[['close', 'volume']].notna().all(axis=1)
cleaned_df = df[has_close_and_volume]

# Preview of the raw frame (NaN rows still included).
print("\n" + "=" * 60)
print("Original Data:")
print("=" * 60)
display(df.head())

# Preview of the frame after the NaN rows were removed.
print("\n" + "=" * 60)
print("valid (non-null) data:")
print("=" * 60)
preview_cols = ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']
display(cleaned_df[preview_cols].head())


Original Data:


Unnamed: 0,ticker,date,open,high,low,close,volume
0,A,1962-01-02,,,,,
1,A,1962-01-03,,,,,
2,A,1962-01-04,,,,,
3,A,1962-01-05,,,,,
4,A,1962-01-08,,,,,



valid (non-null) data:


Unnamed: 0,ticker,date,open,high,low,close,volume
9539,A,1999-11-18,27.245214,29.939798,23.951837,26.347021,62546380.0
9540,A,1999-11-19,25.710805,25.748231,23.839568,24.176392,15234146.0
9541,A,1999-11-22,24.737756,26.347021,23.989261,26.347021,6577870.0
9542,A,1999-11-23,25.448832,26.122478,23.951841,23.951841,5975611.0
9543,A,1999-11-24,24.026691,25.112009,23.951841,24.588062,4843231.0


#### Trimming dates

In [5]:
# Trim cleaned_df so it starts at the year 2000.
trim_start = pd.Timestamp('2000-01-01')
cleaned_trimmed_df = cleaned_df.loc[cleaned_df['date'] >= trim_start].copy()

trimmed_first = cleaned_trimmed_df['date'].min()
trimmed_last = cleaned_trimmed_df['date'].max()

print("\n" + "=" * 60)
print("Filtered data (Year 2000 onwards):")
print("=" * 60)
print(f"Total rows: {len(cleaned_trimmed_df):,}")
print(f"Date range: {trimmed_first.date()} to {trimmed_last.date()}")
display(
    cleaned_trimmed_df[['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']].head()
)


Filtered data (Year 2000 onwards):
Total rows: 21,521,135
Date range: 2000-01-03 to 2025-11-13


Unnamed: 0,ticker,date,open,high,low,close,volume
9569,A,2000-01-03,47.155195,47.26747,40.343889,43.113323,4674353.0
9570,A,2000-01-04,40.792992,41.242092,38.772056,39.81995,4765083.0
9571,A,2000-01-05,39.670253,39.819954,36.114903,37.349918,5758642.0
9572,A,2000-01-06,36.900802,37.125352,34.805015,35.927757,2534434.0
9573,A,2000-01-07,35.366397,39.483121,35.328974,38.921749,2819626.0


#### Removing stocks that do not have at least 1000 days of data and Filtering for highest volume stocks

In [6]:
# Two-stage universe selection to shrink the data set for the MVP:
#   stage 1 - drop tickers that have too few rows of history
#   stage 2 - keep the top N surviving tickers by average volume

min_days_required = 1000  # min number of days (rows) per ticker
top_n = 50  # target universe size

print("\n" + "=" * 60)
print(f"Stage 1: Quality Filter (min {min_days_required} trading days)")
print("=" * 60)

# Row count per ticker in the full trimmed data set.
rows_per_ticker_all = cleaned_trimmed_df.groupby('ticker').size()
has_enough_history = rows_per_ticker_all >= min_days_required
valid_tickers = rows_per_ticker_all.loc[has_enough_history].index.tolist()

n_tickers_before = len(rows_per_ticker_all)
n_tickers_after = len(valid_tickers)
print(f"Tickers before quality filter: {n_tickers_before:,}")
print(f"Tickers after quality filter:  {n_tickers_after:,}")
print(f"Tickers removed: {n_tickers_before - n_tickers_after:,}")

# Restrict the frame to tickers that passed stage 1.
passes_quality = cleaned_trimmed_df['ticker'].isin(valid_tickers)
quality_filtered_df = cleaned_trimmed_df.loc[passes_quality].copy()

print("\n" + "=" * 60)
print(f"Stage 2: Liquidity Ranking (top {top_n} by avg volume)")
print("=" * 60)

# Mean volume per surviving ticker, largest first.
avg_volume_by_ticker = (
    quality_filtered_df
    .groupby('ticker')['volume']
    .mean()
    .sort_values(ascending=False)
)

# The first top_n entries of the ranking form the universe.
universe = avg_volume_by_ticker.iloc[:top_n].index.tolist()



Stage 1: Quality Filter (min 1000 trading days)
Tickers before quality filter: 7,093
Tickers after quality filter:  5,182
Tickers removed: 1,911

Stage 2: Liquidity Ranking (top 50 by avg volume)


In [7]:
# Build `data`: the quality-filtered rows that belong to the selected universe,
# ordered by ticker and date.
in_universe = quality_filtered_df['ticker'].isin(universe)
data = (
    quality_filtered_df.loc[in_universe]
    .copy()
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

print(f"Selected universe size: {len(universe)} tickers (target: {top_n})")
print(f"{len(universe)} tickers have >= {min_days_required} days of history")

# List the ten highest-ranked tickers of the average-volume ranking.
print("-" * 60)
top_ten_by_volume = avg_volume_by_ticker.iloc[:10]
for rank, (ticker, vol) in enumerate(top_ten_by_volume.items(), start=1):
    print(f"  {rank:2d}. {ticker:6s} - {vol:>15,.0f} shares/day")

Selected universe size: 50 tickers (target: 50)
50 tickers have >= 1000 days of history
------------------------------------------------------------
   1. NVDA   -     600,617,217 shares/day
   2. AAPL   -     374,911,010 shares/day
   3. AMZN   -     116,037,570 shares/day
   4. GOOGL  -     112,346,485 shares/day
   5. GOOG   -     110,036,509 shares/day
   6. TSLA   -      96,896,489 shares/day
   7. TQQQ   -      84,554,877 shares/day
   8. BAC    -      77,327,168 shares/day
   9. QQQ    -      66,639,305 shares/day
  10. NIO    -      61,121,488 shares/day


In [8]:
# Summary of the final universe: size, date span and per-ticker row counts.
heavy_rule = "=" * 60
light_rule = "-" * 60
universe_first_date = data['date'].min()
universe_last_date = data['date'].max()
rows_per_ticker = data.groupby('ticker').size()

print("\n" + heavy_rule)
print(f"Final Universe: {len(universe)} stocks")
print(heavy_rule)
print(f"Total rows: {len(data):,}")
print(f"Unique tickers: {data['ticker'].nunique()}")
print(f"Date range: {universe_first_date.date()} to {universe_last_date.date()}")

print("\n" + light_rule)
print("Data Completeness (per ticker):")
print(light_rule)
print(f"  Min:    {rows_per_ticker.min():>5,} days  (>= {min_days_required} guaranteed)")
print(f"  Median: {rows_per_ticker.median():>5,.0f} days")
print(f"  Mean:   {rows_per_ticker.mean():>5,.0f} days")
print(f"  Max:    {rows_per_ticker.max():>5,} days")

print("\n" + heavy_rule)
print("Selected Tickers (sorted by avg volume):")
print(f"  {', '.join(universe)}")

display(data.head(10))


Final Universe: 50 stocks
Total rows: 244,098
Unique tickers: 50
Date range: 2000-01-03 to 2025-11-13

------------------------------------------------------------
Data Completeness (per ticker):
------------------------------------------------------------
  Min:    1,007 days  (>= 1000 guaranteed)
  Median: 6,506 days
  Mean:   4,882 days
  Max:    6,507 days

Selected Tickers (sorted by avg volume):
  NVDA, AAPL, AMZN, GOOGL, GOOG, TSLA, TQQQ, BAC, QQQ, NIO, PLTR, INTC, MSFT, F, SOFI, CSCO, OPEN, AMD, PFE, RIVN, T, SNAP, CMG, WMT, META, EBAY, ORCL, AVGO, ITUB, CMCSA, CSX, LRCX, HPQ, UBER, WFC, VALE, MU, BBAI, TLRY, GRAB, NOK, RGTI, HOOD, JPM, BABA, C, AMAT, PBR, XOM, BBD


Unnamed: 0,ticker,date,open,high,low,close,volume
0,AAPL,2000-01-03,0.786328,0.843498,0.762428,0.839281,535796800.0
1,AAPL,2000-01-04,0.811633,0.82944,0.75868,0.768521,512377600.0
2,AAPL,2000-01-05,0.777892,0.828971,0.772269,0.779767,778321600.0
3,AAPL,2000-01-06,0.7957,0.80226,0.712287,0.712287,767972800.0
4,AAPL,2000-01-07,0.723534,0.757274,0.716037,0.746027,460734400.0
5,AAPL,2000-01-10,0.764771,0.766646,0.710413,0.732906,505064000.0
6,AAPL,2000-01-11,0.719316,0.74509,0.678547,0.695417,441548800.0
7,AAPL,2000-01-12,0.712287,0.716036,0.648556,0.653711,976068800.0
8,AAPL,2000-01-13,0.708421,0.740403,0.693543,0.725408,1032685000.0
9,AAPL,2000-01-14,0.749776,0.766646,0.74509,0.753056,390376000.0


### Exploring data

#### Feature engineering

In [9]:
# Build the per-day feature table from the selected universe.
# Work on a copy so `data` itself is left untouched.
feat_df = data.copy()
feat_df = feat_df.sort_values(['ticker', 'date']).reset_index(drop=True)

# 1. log price & 1-day log return (the diff is taken within each ticker)
feat_df['log_price'] = np.log(feat_df['close'])
feat_df['log_ret_1d'] = feat_df.groupby('ticker')['log_price'].diff()

# 2. rolling volatilities of returns, per ticker.
#    min_periods equals the window, so rows without a full window are NaN.
feat_df['vol_5d'] = (
    feat_df.groupby('ticker')['log_ret_1d']
           .rolling(window=5, min_periods=5)
           .std()
           .reset_index(level=0, drop=True)
)

feat_df['vol_20d'] = (
    feat_df.groupby('ticker')['log_ret_1d']
           .rolling(window=20, min_periods=20)
           .std()
           .reset_index(level=0, drop=True)
)

# 3. volume z-score against expanding statistics lagged by one row, so the
#    current row's volume is excluded from its own mean/std.
#    The shift has to happen INSIDE each ticker group: shifting the
#    concatenated groupby result instead would give the first row of every
#    ticker the final statistics of the previous ticker.
g = feat_df.groupby('ticker')['volume']
exp_mean = g.transform(lambda s: s.expanding().mean().shift(1))
exp_std  = g.transform(lambda s: s.expanding().std(ddof=0).shift(1))

feat_df['volume_mean_exp'] = exp_mean
feat_df['volume_std_exp']  = exp_std

# small epsilon in the denominator avoids dividing by 0
feat_df['volume_z'] = (
    (feat_df['volume'] - feat_df['volume_mean_exp'])
    / (feat_df['volume_std_exp'] + 1e-8)
)


# 4. intraday range as a fraction of the close
feat_df['range_frac'] = (feat_df['high'] - feat_df['low']) / feat_df['close']

# Keep only needed columns
feature_cols = ['log_ret_1d', 'vol_5d', 'vol_20d', 'volume_z', 'range_frac']

feat_df = feat_df[['ticker', 'date'] + feature_cols]

# drop the NaN rows created by the diff / rolling / lagged features
feat_df = feat_df.dropna(subset=feature_cols).reset_index(drop=True)

print("Feature DF shape:", feat_df.shape)
feat_df.head()

Feature DF shape: (243098, 7)


Unnamed: 0,ticker,date,log_ret_1d,vol_5d,vol_20d,volume_z,range_frac
0,AAPL,2000-02-01,-0.034317,0.037641,0.054475,-0.927957,0.049875
1,AAPL,2000-02-02,-0.014443,0.037796,0.050977,-0.447783,0.051866
2,AAPL,2000-02-03,0.044535,0.048172,0.051917,-0.401788,0.038718
3,AAPL,2000-02-04,0.044373,0.035449,0.048214,-0.547381,0.059028
4,AAPL,2000-02-07,0.054615,0.04039,0.048611,-0.482604,0.072876


#### Feature Correlation

In [10]:
import plotly.express as px
import plotly.io as pio
# Correlation matrix of the engineered features, shown as an annotated heatmap.
corr = feat_df[feature_cols].corr()

heatmap_options = dict(
    text_auto=".2f",
    color_continuous_scale="RdBu_r",
    origin="lower",
    title="Correlation between features",
)
fig = px.imshow(corr, **heatmap_options)
fig.update_layout(width=600, height=500)
fig.show()


### Modeling 

####  Unsupervised learning

##### Creating windows

In [11]:
# Window-level features: summary statistics over a rolling block of daily rows.

window_len = 20  # window size: number of consecutive rows (days) per ticker

# function to make windows of 20 days for each ticker
def make_window_features(feat_df, feature_cols, window_len=20):
    """Summarize each ticker's daily features over rolling windows.

    For every ticker the rows are ordered by date, and for each column in
    ``feature_cols`` the rolling mean, std, min and max over ``window_len``
    rows are computed (columns ``<feature>_mean``, ``<feature>_std``,
    ``<feature>_min``, ``<feature>_max``). Rows that do not yet have a full
    window, or that contain any NaN, are dropped.

    Parameters
    ----------
    feat_df : DataFrame with 'ticker', 'date' and the columns in feature_cols.
    feature_cols : list of column names to summarize.
    window_len : number of rows per window.

    Returns
    -------
    DataFrame with 'ticker', 'date' (the END date of each window) and the
    window statistics, with a fresh 0..n-1 index.
    """
    stat_names = ('mean', 'std', 'min', 'max')
    per_ticker_frames = []

    for _, ticker_rows in feat_df.groupby('ticker'):
        ordered = ticker_rows.sort_values('date').reset_index(drop=True)
        roller = ordered[feature_cols].rolling(window_len, min_periods=window_len)

        # One block of columns per statistic, suffixed with the statistic name.
        stat_blocks = [
            getattr(roller, stat)().add_suffix(f'_{stat}')
            for stat in stat_names
        ]

        # 'date' on each row is the last day covered by that row's window.
        combined = pd.concat([ordered[['ticker', 'date']], *stat_blocks], axis=1)
        per_ticker_frames.append(combined.dropna())

    return pd.concat(per_ticker_frames, ignore_index=True)


# Build the window table from the daily features.
# Reuse `feature_cols` (defined with the daily features) instead of repeating
# the list here, so the two cannot drift apart.
window_df = make_window_features(
    feat_df,
    feature_cols=feature_cols,
    window_len=window_len
)

print(window_df.shape)
window_df.tail()


(242148, 22)


Unnamed: 0,ticker,date,log_ret_1d_mean,vol_5d_mean,vol_20d_mean,volume_z_mean,range_frac_mean,log_ret_1d_std,vol_5d_std,vol_20d_std,...,log_ret_1d_min,vol_5d_min,vol_20d_min,volume_z_min,range_frac_min,log_ret_1d_max,vol_5d_max,vol_20d_max,volume_z_max,range_frac_max
242143,XOM,2025-11-07,0.002848,0.009964,0.011296,-0.503225,0.015047,0.010124,0.001926,0.001072,...,-0.015229,0.00535,0.010075,-0.879765,0.007676,0.023478,0.012299,0.013176,0.24323,0.023295
242144,XOM,2025-11-10,0.002595,0.009853,0.011132,-0.489155,0.015875,0.009904,0.001848,0.001018,...,-0.015229,0.00535,0.009904,-0.879765,0.007676,0.023478,0.012299,0.012419,0.24323,0.025292
242145,XOM,2025-11-11,0.003229,0.009738,0.011027,-0.464075,0.0157,0.010161,0.001758,0.001004,...,-0.015229,0.00535,0.009904,-0.879765,0.007676,0.023478,0.012299,0.012419,0.24323,0.025292
242146,XOM,2025-11-12,0.002835,0.009807,0.010951,-0.455845,0.015166,0.01068,0.001886,0.000967,...,-0.015229,0.00535,0.009904,-0.879765,0.007676,0.023478,0.013675,0.012419,0.24323,0.025292
242147,XOM,2025-11-13,0.003802,0.009884,0.010869,-0.485967,0.014892,0.010451,0.002015,0.000934,...,-0.015229,0.00535,0.009904,-1.219489,0.007676,0.023478,0.013731,0.012419,0.24323,0.025292


##### Clustering: K means + latent components (PCA). Both in zybooks

In [12]:
# clustering + latent components (PCA)

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# use same date splits as the rest of the notebook
train_end = pd.Timestamp('2015-12-31')
val_end   = pd.Timestamp('2018-12-31')

# masks based on the WINDOW END DATE (window_df['date'] is the last date of each window)
train_mask_win = window_df['date'] <= train_end
val_mask_win   = (window_df['date'] > train_end) & (window_df['date'] <= val_end)
test_mask_win  = window_df['date'] > val_end

# Model inputs are the window statistics only. The 'regime_*' label columns
# are written back into window_df further down, so they are excluded here;
# otherwise re-running this cell would feed earlier labels in as features.
window_feature_cols = [
    c for c in window_df.columns
    if c not in ('ticker', 'date') and not c.startswith('regime_')
]

# splitting the features by time
X_train_win = window_df.loc[train_mask_win, window_feature_cols].to_numpy()
X_val_win   = window_df.loc[val_mask_win,   window_feature_cols].to_numpy()
X_test_win  = window_df.loc[test_mask_win,  window_feature_cols].to_numpy()

# Very IMPORTANT: only use the train data to fit the scaler.
# ******standardize using TRAIN ONLY******
scaler_win = StandardScaler()
X_train_scaled = scaler_win.fit_transform(X_train_win)
X_val_scaled   = scaler_win.transform(X_val_win)
X_test_scaled  = scaler_win.transform(X_test_win)

# K-means on the standardized window features (no PCA).
# The model is fit on the TRAIN split only to avoid leakage.

k = 6  # initial guess for number of regimes. Could be tuned maybe?

kmeans_raw = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_raw.fit(X_train_scaled)

# Label every split with the train-fitted model, then store as nullable ints.
window_df['regime_k6_raw'] = np.nan
for split_mask, split_features in (
    (train_mask_win, X_train_scaled),
    (val_mask_win, X_val_scaled),
    (test_mask_win, X_test_scaled),
):
    window_df.loc[split_mask, 'regime_k6_raw'] = kmeans_raw.predict(split_features)
window_df['regime_k6_raw'] = window_df['regime_k6_raw'].astype('Int64')

# Very IMPORTANT: only use the train data to fit the PCA.
# PCA as latent-variable model (in zybooks). ONLY USE TRAIN DATA TO FIT PCA
pca = PCA(n_components=5, random_state=42)
Z_train = pca.fit_transform(X_train_scaled)  # latent features
Z_val   = pca.transform(X_val_scaled)
Z_test  = pca.transform(X_test_scaled)

# k-means in the latent PCA space ***AGAIN: train-only fit***
kmeans_pca = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_pca.fit(Z_train)

# Label every split with the train-fitted model, then store as nullable ints.
window_df['regime_k6_pca'] = np.nan
for split_mask, split_latent in (
    (train_mask_win, Z_train),
    (val_mask_win, Z_val),
    (test_mask_win, Z_test),
):
    window_df.loc[split_mask, 'regime_k6_pca'] = kmeans_pca.predict(split_latent)
window_df['regime_k6_pca'] = window_df['regime_k6_pca'].astype('Int64')

# Profile of each PCA regime: mean of every window feature plus the number of
# windows assigned to that regime.
windows_by_regime = window_df.groupby('regime_k6_pca')
regime_sizes = windows_by_regime.size()
regime_profile_pca = windows_by_regime[window_feature_cols].mean().assign(count=regime_sizes)

print("Regime counts (PCA space):")
print(window_df['regime_k6_pca'].value_counts().sort_index())
regime_profile_pca


Regime counts (PCA space):
regime_k6_pca
0     71776
1    123313
2      4787
3     17316
4     24916
5        40
Name: count, dtype: Int64


Unnamed: 0_level_0,log_ret_1d_mean,vol_5d_mean,vol_20d_mean,volume_z_mean,range_frac_mean,log_ret_1d_std,vol_5d_std,vol_20d_std,volume_z_std,range_frac_std,...,vol_5d_min,vol_20d_min,volume_z_min,range_frac_min,log_ret_1d_max,vol_5d_max,vol_20d_max,volume_z_max,range_frac_max,count
regime_k6_pca,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000817,0.024734,0.026691,0.036509,0.033333,0.026392,0.009463,0.003629,0.558293,0.013221,...,0.010941,0.020919,-0.667522,0.016179,0.054776,0.041968,0.032029,1.554841,0.066924,71776
1,0.000763,0.012635,0.013989,-0.262693,0.018643,0.013449,0.004636,0.001873,0.414756,0.00712,...,0.00558,0.011048,-0.807259,0.009236,0.02728,0.021037,0.016821,0.85153,0.036597,123313
2,-0.002133,0.085532,0.087558,2.181337,0.111124,0.095315,0.041464,0.017259,1.930224,0.055047,...,0.033461,0.060768,0.331056,0.047394,0.204041,0.162654,0.110679,8.03119,0.258995,4787
3,-0.001224,0.0259,0.026811,1.860115,0.035461,0.028641,0.01077,0.003953,1.551739,0.014989,...,0.010891,0.02061,0.242648,0.016614,0.054579,0.045503,0.032623,6.555109,0.074537,17316
4,-0.001055,0.046744,0.04963,0.58257,0.062972,0.050264,0.01877,0.007417,0.851407,0.025801,...,0.02045,0.037954,-0.401601,0.029827,0.102892,0.08126,0.06027,2.996232,0.128865,24916
5,0.01052,0.050824,0.044026,10.390311,0.064332,0.059886,0.027352,0.013936,27.251441,0.031888,...,0.020328,0.025579,0.555159,0.02429,0.178874,0.10286,0.06423,124.082659,0.146008,40


#### Supervised learning

##### Creating Future Targets

In [13]:
# Creating future targets

# Start from a copy of feat_df so the targets can be lined up by ticker and date
feat_with_target = feat_df.copy()

# how many rows (days) ahead the target looks
horizon = 5

def add_targets(g, horizon=horizon):
    """Add the forward log return over the next `horizon` rows of one ticker.

    The rows are ordered by date and re-indexed from 0. Two columns are added:
    'cum_log_ret' (running sum of 'log_ret_1d') and 'ret_{horizon}d_fwd'
    (sum of 'log_ret_1d' over rows t+1 .. t+horizon). The last `horizon`
    rows have no forward return and are NaN.

    Parameters
    ----------
    g : DataFrame for a single ticker with 'date' and 'log_ret_1d' columns.
    horizon : number of rows to look ahead.

    Returns
    -------
    The date-ordered frame with the two extra columns.
    """
    ordered = g.sort_values('date').reset_index(drop=True)

    # running total of the 1-day log returns
    running_total = ordered['log_ret_1d'].cumsum()
    ordered['cum_log_ret'] = running_total

    # forward return = running total at t+horizon minus running total at t
    ordered[f'ret_{horizon}d_fwd'] = running_total.shift(-horizon) - running_total

    return ordered

# Add the forward-return columns ticker by ticker.
# An explicit per-group concat is used instead of groupby(...).apply(...):
# apply operating on the grouping column is deprecated in recent pandas, and
# the 'ticker' column must survive because the later merge joins on it.
feat_with_target = pd.concat(
    [
        add_targets(ticker_rows, horizon=horizon)
        for _, ticker_rows in feat_with_target.groupby('ticker')
    ]
)

# binary label: will the horizon return be positive (classification)
# (rows with an undefined forward return get 0 here, but are dropped just below)
feat_with_target[f'up_{horizon}d'] = (feat_with_target[f'ret_{horizon}d_fwd'] > 0).astype(int)

# drop helper column and rows where forward return isn't defined
feat_with_target = (
    feat_with_target
    .drop(columns=['cum_log_ret'])
    .dropna(subset=[f'ret_{horizon}d_fwd'])
    .reset_index(drop=True)
)

feat_with_target.head()


Unnamed: 0,ticker,date,log_ret_1d,vol_5d,vol_20d,volume_z,range_frac,ret_5d_fwd,up_5d
0,AAPL,2000-02-01,-0.034317,0.037641,0.054475,-0.927957,0.049875,0.136179,1
1,AAPL,2000-02-02,-0.014443,0.037796,0.050977,-0.447783,0.051866,0.13084,1
2,AAPL,2000-02-03,0.044535,0.048172,0.051917,-0.401788,0.038718,0.094045,1
3,AAPL,2000-02-04,0.044373,0.035449,0.048214,-0.547381,0.059028,0.00692,1
4,AAPL,2000-02-07,0.054615,0.04039,0.048611,-0.482604,0.072876,0.015226,1


##### Merging windows with targets

In [14]:
# Merge window features and regimes with the future targets.
# The inner join keeps only (ticker, date) pairs present on both sides.

target_cols = ['ticker', 'date', f'ret_{horizon}d_fwd', f'up_{horizon}d']
targets_only = feat_with_target[target_cols]

supervised_df = pd.merge(window_df, targets_only, on=['ticker', 'date'], how='inner')
supervised_df = supervised_df.sort_values(['ticker', 'date']).reset_index(drop=True)

print(supervised_df.shape)
supervised_df.head()



(241898, 26)


Unnamed: 0,ticker,date,log_ret_1d_mean,vol_5d_mean,vol_20d_mean,volume_z_mean,range_frac_mean,log_ret_1d_std,vol_5d_std,vol_20d_std,...,range_frac_min,log_ret_1d_max,vol_5d_max,vol_20d_max,volume_z_max,range_frac_max,regime_k6_raw,regime_k6_pca,ret_5d_fwd,up_5d
0,AAPL,2000-02-29,0.004984,0.035715,0.042681,-0.624599,0.052269,0.033529,0.006877,0.006093,...,0.020674,0.062921,0.048172,0.054475,-0.150583,0.090061,0,0,0.069501,1
1,AAPL,2000-03-01,0.013113,0.037051,0.042063,-0.47312,0.054979,0.042111,0.009401,0.005424,...,0.020674,0.128269,0.064356,0.051917,2.101617,0.104076,0,4,-0.065913,0
2,AAPL,2000-03-02,0.01054,0.038934,0.041781,-0.485537,0.055357,0.045333,0.012739,0.005071,...,0.020674,0.128269,0.075465,0.051917,2.101617,0.104076,0,4,0.002047,1
3,AAPL,2000-03-03,0.010714,0.040018,0.041459,-0.497518,0.056638,0.045476,0.014384,0.004573,...,0.020674,0.128269,0.075465,0.048611,2.101617,0.104076,0,4,-0.017734,0
4,AAPL,2000-03-06,0.007583,0.041904,0.041308,-0.521612,0.055327,0.045191,0.01612,0.004384,...,0.020674,0.128269,0.075465,0.048611,2.101617,0.104076,0,4,-0.035429,0


##### Adding regime transition features

In [16]:

# PCA regime of the preceding window for the same ticker
# (missing for the first window of every ticker).
supervised_df['regime_prev_pca'] = supervised_df.groupby('ticker')['regime_k6_pca'].shift(1)

# 1 if a previous regime exists and differs from the current one, else 0
has_prev_regime = supervised_df['regime_prev_pca'].notna()
regime_differs = supervised_df['regime_prev_pca'].ne(supervised_df['regime_k6_pca'])
supervised_df['regime_changed_pca'] = (has_prev_regime & regime_differs).astype(int)

# The first window per ticker has no previous regime, so drop those rows.
# NOTE: supervised_df is reassigned here, so re-running this cell removes one
# more leading row per ticker each time.
supervised_df = supervised_df.dropna(subset=['regime_prev_pca']).reset_index(drop=True)

supervised_df.tail(40)

Unnamed: 0,ticker,date,log_ret_1d_mean,vol_5d_mean,vol_20d_mean,volume_z_mean,range_frac_mean,log_ret_1d_std,vol_5d_std,vol_20d_std,...,vol_5d_max,vol_20d_max,volume_z_max,range_frac_max,regime_k6_raw,regime_k6_pca,ret_5d_fwd,up_5d,regime_prev_pca,regime_changed_pca
241758,XOM,2025-09-12,0.002641,0.009854,0.010084,-0.321895,0.015672,0.011397,0.005166,0.001123,...,0.017798,0.011613,0.144033,0.026539,1,1,0.005867,1,1,0
241759,XOM,2025-09-15,0.002678,0.009825,0.010169,-0.354366,0.015423,0.011392,0.005177,0.001156,...,0.017798,0.011613,0.112398,0.026539,1,1,-0.002942,0,1,0
241760,XOM,2025-09-16,0.003597,0.010056,0.010288,-0.333127,0.0158,0.012069,0.005101,0.001224,...,0.017798,0.012069,0.112398,0.026539,1,1,-0.006386,0,1,0
241761,XOM,2025-09-17,0.003535,0.010212,0.010403,-0.347399,0.015815,0.012056,0.005021,0.001278,...,0.017798,0.012069,0.112398,0.026539,1,1,-0.006352,0,1,0
241762,XOM,2025-09-18,0.002428,0.010552,0.010544,-0.369386,0.01591,0.012416,0.004867,0.001339,...,0.017798,0.012416,0.029718,0.026539,1,1,0.014465,1,1,0
241763,XOM,2025-09-19,0.001617,0.011017,0.010701,-0.234189,0.015767,0.012668,0.004622,0.001397,...,0.017798,0.012668,2.128391,0.026539,1,1,0.038259,1,1,0
241764,XOM,2025-09-22,0.000331,0.011386,0.010789,-0.223868,0.015507,0.012148,0.004509,0.001431,...,0.017798,0.012668,2.128391,0.026539,1,1,0.019449,1,1,0
241765,XOM,2025-09-23,0.000979,0.011713,0.010912,-0.144345,0.015886,0.012694,0.004308,0.001485,...,0.017798,0.012694,2.128391,0.026539,1,1,-0.010587,0,1,0
241766,XOM,2025-09-24,0.001358,0.011939,0.011056,-0.161566,0.015799,0.012706,0.004204,0.001514,...,0.017798,0.012706,2.128391,0.026539,1,1,-0.022689,0,1,0
241767,XOM,2025-09-25,0.001244,0.01211,0.011192,-0.162826,0.015751,0.012623,0.004096,0.001527,...,0.017798,0.012706,2.128391,0.026539,1,1,-0.03791,0,1,0


##### Supervised feature matrix and targets 

In [None]:
# TODO: placeholder cell - the supervised feature matrix and targets are not built yet.
'''
Still working on this one 

-Cale

'''
