### Current Idea (open to other ideas of course!):


Traditional factor models can struggle to capture short-horizon "micro-regimes" in stock price behavior. The goal of this project is to use a small autoencoder to learn regimes from 20-day *(I just picked a number; we can obviously change this)* sliding windows of price/volume features, and then study:

- What the regimes look like
- How they transition from one to another
- Whether transitions have predictive power for near-term returns or volatility
- Potential comparisons to simple baselines (momentum, volatility)

### Gathering Data 


####  Optional Download (you shouldn't need to do this): please use the link in the README to get the dataset

In [None]:
# uncomment to download data. 
# Not needed if you have the .parquet file in the data/historical folder

# !python qualifier/utils/download_stock_data.py

#### Load Dataset from file

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Silence all warnings so library noise does not clutter the cell output.
warnings.filterwarnings(action="ignore")

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Seed NumPy's global random state so reruns are reproducible.
np.random.seed(seed=42)

# Parquet file holding the historical data (relative to the working directory).
PARQUET_PATH = Path("data/historical/all_stocks_historical.parquet")

print(f"Loading data from: {PARQUET_PATH}")
df = pd.read_parquet(path=PARQUET_PATH)

# Report the frame's dimensions, then the dtype of every column.
print(f"\nDataFrame shape: {df.shape}")
print("\nColumn dtypes:")
print(df.dtypes)

In [None]:
# Normalise the raw frame: list its columns, parse the dates, order the rows.
print("Available columns:", list(df.columns))

# Make sure the date column holds datetimes rather than raw values.
df['date'] = pd.to_datetime(df['date'])

# Order rows by ticker, then chronologically, with a fresh 0..n-1 index.
df = df.sort_values(by=['ticker', 'date'], ignore_index=True)

summary_rule = "=" * 60
print("\n" + summary_rule)
print("Data Summary:")
print(summary_rule)
print(f"Total rows: {len(df):,}")
print(f"Unique tickers: {df['ticker'].nunique():,}")

first_day = df['date'].min()
last_day = df['date'].max()
print(f"Date range: {first_day.date()} to {last_day.date()}")
print(f"Days covered: {(last_day - first_day).days:,} days")
print(f"Date column type: {df['date'].dtype}")

### Cleaning Data

#### Handling Nans

In [None]:
# Preview the raw frame before any rows are removed.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("Original Data:")
print(banner_rule)
display(df.head())

print("\n" + banner_rule)
print("valid (non-null) data:")
print(banner_rule)

# Keep only the rows where both the close price and the volume are present.
has_close_and_volume = df[['close', 'volume']].notna().all(axis=1)
cleaned_df = df[has_close_and_volume]

preview_cols = ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']
display(cleaned_df[preview_cols].head())

#### Trimming dates

In [None]:
# Restrict cleaned_df to rows dated 2000-01-01 or later (independent copy).
start_cutoff = '2000-01-01'
on_or_after_cutoff = cleaned_df['date'] >= start_cutoff
cleaned_trimmed_df = cleaned_df.loc[on_or_after_cutoff].copy()

trim_rule = "=" * 60
print("\n" + trim_rule)
print("Filtered data (Year 2000 onwards):")
print(trim_rule)
print(f"Total rows: {len(cleaned_trimmed_df):,}")

first_kept = cleaned_trimmed_df['date'].min().date()
last_kept = cleaned_trimmed_df['date'].max().date()
print(f"Date range: {first_kept} to {last_kept}")

trimmed_preview_cols = ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']
display(cleaned_trimmed_df[trimmed_preview_cols].head())

#### Removing stocks that do not have at least 1000 days of data and Filtering for highest volume stocks

In [None]:
# Two-stage universe selection to shrink the data set for the MVP:
#   stage 1 - drop tickers with too few rows of history
#   stage 2 - keep the top N survivors ranked by average volume

min_days_required = 1000  # minimum number of rows (days) a ticker must have
top_n = 50  # target universe size

section_rule = "=" * 60

print("\n" + section_rule)
print(f"Stage 1: Quality Filter (min {min_days_required} trading days)")
print(section_rule)

# Row count per ticker across the whole trimmed data set.
rows_per_ticker_all = cleaned_trimmed_df.groupby('ticker').size()
has_enough_history = rows_per_ticker_all >= min_days_required
valid_tickers = rows_per_ticker_all.loc[has_enough_history].index.tolist()

n_tickers_before = len(rows_per_ticker_all)
n_tickers_after = len(valid_tickers)
print(f"Tickers before quality filter: {n_tickers_before:,}")
print(f"Tickers after quality filter:  {n_tickers_after:,}")
print(f"Tickers removed: {n_tickers_before - n_tickers_after:,}")

# Keep only the rows of tickers that passed stage 1.
passes_quality = cleaned_trimmed_df['ticker'].isin(valid_tickers)
quality_filtered_df = cleaned_trimmed_df.loc[passes_quality].copy()

print("\n" + section_rule)
print(f"Stage 2: Liquidity Ranking (top {top_n} by avg volume)")
print(section_rule)

# Mean volume per surviving ticker, largest first.
mean_volume = quality_filtered_df.groupby('ticker')['volume'].mean()
avg_volume_by_ticker = mean_volume.sort_values(ascending=False)

# The first top_n tickers of that ranking form the universe.
universe = avg_volume_by_ticker.head(top_n).index.tolist()


In [None]:
# Working frame: rows of the selected universe, ordered by ticker then date.
belongs_to_universe = quality_filtered_df['ticker'].isin(universe)
data = quality_filtered_df.loc[belongs_to_universe].copy()
data = data.sort_values(by=['ticker', 'date'], ignore_index=True)

universe_size = len(universe)
print(f"Selected universe size: {universe_size} tickers (target: {top_n})")
print(f"{universe_size} tickers have >= {min_days_required} days of history")

# List the ten tickers with the highest average volume.
print("-" * 60)
ten_most_liquid = avg_volume_by_ticker.iloc[:10]
for rank, (symbol, shares) in enumerate(ten_most_liquid.items(), start=1):
    print(f"  {rank:2d}. {symbol:6s} - {shares:>15,.0f} shares/day")

In [None]:
# Summary of the final universe: size, date span and per-ticker row counts.
heavy_rule = "=" * 60
light_rule = "-" * 60

print("\n" + heavy_rule)
print(f"Final Universe: {len(universe)} stocks")
print(heavy_rule)
print(f"Total rows: {len(data):,}")
print(f"Unique tickers: {data['ticker'].nunique()}")

span_start = data['date'].min().date()
span_end = data['date'].max().date()
print(f"Date range: {span_start} to {span_end}")


print("\n" + light_rule)
print("Data Completeness (per ticker):")
print(light_rule)

# Number of rows each ticker contributes to the working frame.
rows_per_ticker = data.groupby('ticker').size()
fewest_rows = rows_per_ticker.min()
most_rows = rows_per_ticker.max()
print(f"  Min:    {fewest_rows:>5,} days  (>= {min_days_required} guaranteed)")
print(f"  Median: {rows_per_ticker.median():>5,.0f} days")
print(f"  Mean:   {rows_per_ticker.mean():>5,.0f} days")
print(f"  Max:    {most_rows:>5,} days")

print("\n" + heavy_rule)
print("Selected Tickers (sorted by avg volume):")
print("  " + ", ".join(universe))

display(data.head(10))

### Exploring data

#### Feature engineering

In [None]:
# Build the per-ticker daily feature table from the selected universe.
# Rows are ordered by ticker then date because every diff / rolling /
# expanding step below walks each ticker's rows in that order.
feat_df = data.copy()
feat_df = feat_df.sort_values(['ticker', 'date']).reset_index(drop=True)

# 1. log price & 1-day log return (the first row of each ticker has no
#    previous price, so its return is NaN)
feat_df['log_price'] = np.log(feat_df['close'])
feat_df['log_ret_1d'] = feat_df.groupby('ticker')['log_price'].diff()

# 2. rolling volatilities of returns (NaN until a full window is available)
feat_df['vol_5d'] = (
    feat_df.groupby('ticker')['log_ret_1d']
           .rolling(window=5, min_periods=5)
           .std()
           .reset_index(level=0, drop=True)
)

feat_df['vol_20d'] = (
    feat_df.groupby('ticker')['log_ret_1d']
           .rolling(window=20, min_periods=20)
           .std()
           .reset_index(level=0, drop=True)
)

# 3. volume z-score against the ticker's own expanding history, lagged by one
#    row so the current row is excluded from its own mean/std.
#    The lag is applied INSIDE each ticker. Shifting the concatenated
#    groupby-expanding result instead would push the last value of one ticker
#    onto the first row of the next ticker.
g = feat_df.groupby('ticker')['volume']
exp_mean = g.transform(lambda s: s.expanding().mean().shift(1))
exp_std = g.transform(lambda s: s.expanding().std(ddof=0).shift(1))

feat_df['volume_mean_exp'] = exp_mean
feat_df['volume_std_exp'] = exp_std

# the small epsilon avoids dividing by 0 when the lagged std is exactly zero
feat_df['volume_z'] = (
    (feat_df['volume'] - feat_df['volume_mean_exp'])
    / (feat_df['volume_std_exp'] + 1e-8)
)


# 4. intraday range as a fraction of the close
feat_df['range_frac'] = (feat_df['high'] - feat_df['low']) / feat_df['close']

# Keep only needed columns
feature_cols = ['log_ret_1d', 'vol_5d', 'vol_20d', 'volume_z', 'range_frac']

feat_df = feat_df[['ticker', 'date'] + feature_cols]

# drop the NaN rows created by the diff / rolling / lagged features
feat_df = feat_df.dropna(subset=feature_cols).reset_index(drop=True)

print("Feature DF shape:", feat_df.shape)
feat_df.head()

#### Feature Correlation

In [None]:
import plotly.express as px
import plotly.io as pio
# Pairwise correlation between the engineered features, drawn as an
# annotated heatmap.
corr = feat_df.loc[:, feature_cols].corr()

heatmap_options = {
    "text_auto": ".2f",
    "color_continuous_scale": "RdBu_r",
    "origin": "lower",
    "title": "Correlation between features",
}
fig = px.imshow(corr, **heatmap_options)
fig.update_layout(width=600, height=500)
fig.show()


### Modeling 

####  Unsupervised learning

##### Creating windows

In [None]:
window_len = 20 # number of consecutive per-ticker rows (days) summarised by each rolling window


def make_window_features(feat_df, feature_cols, window_len=20):
    """Summarise each ticker's trailing window of features.

    For every ticker the rows are ordered by date, and each column in
    ``feature_cols`` is reduced over a trailing window of ``window_len`` rows
    to its mean, standard deviation, minimum and maximum. The resulting
    columns are named ``<feature>_mean``, ``<feature>_std``, ``<feature>_min``
    and ``<feature>_max``.

    Args:
        feat_df: frame holding ``ticker``, ``date`` and the feature columns.
        feature_cols: names of the columns to summarise.
        window_len: number of rows in each window.

    Returns:
        One row per (ticker, date) that has a complete window and no missing
        statistic, with a fresh 0..n-1 index.
    """
    stat_names = ('mean', 'std', 'min', 'max')

    per_ticker_frames = []
    for _, ticker_rows in feat_df.groupby('ticker'):
        ordered = ticker_rows.sort_values('date').reset_index(drop=True)

        # min_periods == window size: incomplete windows yield NaN statistics.
        roller = ordered[feature_cols].rolling(window_len, min_periods=window_len)
        stat_frames = [
            getattr(roller, stat)().add_suffix(f'_{stat}')
            for stat in stat_names
        ]

        labelled = pd.concat([ordered[['ticker', 'date']], *stat_frames], axis=1)
        # Rows with any NaN (e.g. before the first full window) are removed.
        per_ticker_frames.append(labelled.dropna())

    return pd.concat(per_ticker_frames, ignore_index=True)

# Build the window-level table from the same feature list that feature
# engineering produced (feature_cols), rather than a second hard-coded copy,
# so the two cannot drift apart.
window_df = make_window_features(
    feat_df,
    feature_cols=feature_cols,
    window_len=window_len
)

print(window_df.shape)
window_df.head()


##### Creating regimes using k-means

In [None]:
#  window_df we created above is what we'll use for the Unsuper. learning
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Model inputs: every window statistic in window_df.
# Identifier columns are excluded, and so is any regime label column left over
# from an earlier run of this cell. Otherwise re-running the cell would feed
# the previous cluster labels back in as a clustering feature.
window_feature_cols = [
    c for c in window_df.columns
    if c not in ('ticker', 'date') and not str(c).startswith('regime')
]

X_win = window_df[window_feature_cols].values

# Standardise each column so no statistic dominates the distance metric.
# NOTE(review): the scaler and k-means are fit on every window at once,
# including windows later paired with forward-return targets - confirm this
# look-ahead is acceptable for the supervised step.
scaler_win = StandardScaler()
X_win_scaled = scaler_win.fit_transform(X_win)

k = 6  # first guess for number of regimes; you can try 4, 6, 8, etc.
# NOTE: the label column below is named 'regime_k6' regardless of k, because
# other cells refer to it by that name.

kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
window_df['regime_k6'] = kmeans.fit_predict(X_win_scaled)

# Number of windows assigned to each regime, ordered by regime id.
window_df['regime_k6'].value_counts().sort_index()

In [None]:
# Profile each regime: the average of every window feature within the regime,
# plus the number of windows assigned to it.
windows_by_regime = window_df.groupby('regime_k6')

regime_profile = windows_by_regime[window_feature_cols].mean()
regime_profile['count'] = windows_by_regime.size()

regime_profile

#### Supervised learning

##### Creating Future Targets

In [None]:
##### Creating Future Targets

# using feat_df so we can line it up by ticker and date
feat_with_target = feat_df.copy()

# number of days ahead
horizon = 5

def add_targets(g, horizon=horizon):
    """Add the forward log return over ``horizon`` rows to one ticker's frame.

    Args:
        g: rows of a single ticker, holding ``date`` and ``log_ret_1d``.
        horizon: number of rows ahead to accumulate returns over.

    Returns:
        A date-sorted copy of ``g`` with a fresh index and two extra columns:
        ``cum_log_ret`` (running sum of ``log_ret_1d``) and
        ``ret_{horizon}d_fwd``, which is NaN for the last ``horizon`` rows.
    """
    g = g.sort_values('date').reset_index(drop=True)

    # cumulative sum of 1-day log returns
    g['cum_log_ret'] = g['log_ret_1d'].cumsum()

    # future horizon log return: sum_{t+1..t+horizon} log_ret_1d
    # = cum_log_ret at t+horizon minus cum_log_ret at t
    g[f'ret_{horizon}d_fwd'] = g['cum_log_ret'].shift(-horizon) - g['cum_log_ret']

    return g

# Apply per ticker by iterating the groups instead of groupby.apply.
# Iteration always hands the function the full frame including 'ticker',
# whereas groupby.apply on the grouping column is deprecated (pandas >= 2.2)
# and newer versions no longer pass that column to the function.
feat_with_target = pd.concat(
    [
        add_targets(ticker_rows, horizon=horizon)
        for _, ticker_rows in feat_with_target.groupby('ticker')
    ],
    ignore_index=True,
)

fwd_ret_col = f'ret_{horizon}d_fwd'

# binary label: will the horizon return be positive (classification target)
feat_with_target[f'up_{horizon}d'] = (feat_with_target[fwd_ret_col] > 0).astype(int)

# drop the helper column and the rows that have no forward return
# (the last `horizon` rows of every ticker)
feat_with_target = (
    feat_with_target
    .drop(columns=['cum_log_ret'])
    .dropna(subset=[fwd_ret_col])
    .reset_index(drop=True)
)

feat_with_target.head()


##### Merging windows with targets

In [None]:
#### Merging window features, regimes, and future targets

# Target column names are derived from `horizon`, the same way they are
# generated, so changing the horizon does not break this merge.
target_cols = [f'ret_{horizon}d_fwd', f'up_{horizon}d']

# window features, regimes, and future targets combined; the inner join keeps
# only the (ticker, date) pairs present on both sides
supervised_df = (
    window_df
    .merge(
        feat_with_target[['ticker', 'date'] + target_cols],
        on=['ticker', 'date'],
        how='inner'
    )
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

print(supervised_df.shape)
supervised_df.head()

##### Adding regime transition features

In [None]:
from itertools import groupby

# previous row's regime within each ticker (NaN on a ticker's first row)
supervised_df['regime_prev'] = supervised_df.groupby('ticker')['regime_k6'].shift(1)

# 1 when a previous regime exists and differs from the current one, else 0.
# The whole boolean expression is parenthesised before the cast; without the
# parentheses `.astype(int)` binds only to the right-hand comparison and the
# column does not come out as the intended 0/1 integers.
has_prev_regime = supervised_df['regime_prev'].notna()
differs_from_prev = supervised_df['regime_prev'] != supervised_df['regime_k6']
supervised_df['regime_changed'] = (has_prev_regime & differs_from_prev).astype(int)

# each ticker's first row has no regime before it, so drop it and reset the index
supervised_df = supervised_df.dropna(subset=['regime_prev']).reset_index(drop=True)

supervised_df.head()

##### Supervised feature matrix and targets 

In [None]:
'''
Still working on this one 

-Cale

'''
