In [21]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from matplotlib.collections import LineCollection
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import seaborn as sns
import math

import umap
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score
)

from scipy.stats import chi2

from config.regime_tickers import custom_vol_subset

# For Alpaca API
from datetime import datetime
from zoneinfo import ZoneInfo

from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
from alpaca.data.historical.stock import StockHistoricalDataClient

from alpaca.data.requests import (
    StockBarsRequest
)

from alpaca.data.enums import Adjustment

# to run async code in jupyter notebook
import nest_asyncio
nest_asyncio.apply()

# For API Keys
import os
from dotenv import load_dotenv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
API_KEY = None
SECRET_KEY = None

load_dotenv(override=True)

if API_KEY is None:
    API_KEY = os.environ.get('ALP_API_KEY')

if SECRET_KEY is None:
    SECRET_KEY = os.environ.get('ALP_SEC_KEY')

In [23]:
ticker_subsets = {
    'Equity_Indices': ['SPY', 'QQQ', 'IWM', 'DIA'],
    'Sectors': ['XLF', 'XLK', 'XLE', 'XLV', 'XLI'],
    'Global_Equities': ['EEM', 'EFA', 'FXI', 'EWZ'],
    'Bonds_Rates': ['TLT', 'IEF', 'SHY', 'HYG', 'LQD'],
    'Commodities': ['GLD', 'SLV', 'USO', 'DBA'],
    'Currencies': ['UUP', 'FXE', 'FXY']
}

all_tickers = sum(ticker_subsets.values(), [])

In [24]:
stock_historical_data_client = StockHistoricalDataClient(API_KEY, SECRET_KEY)

# alpaca has no older data than 2016-01-04 for this symbol set
earliest_date = datetime(2016, 1, 4, tzinfo=ZoneInfo('America/New_York'))
last_date = datetime(2025, 7, 20, tzinfo=ZoneInfo('America/New_York'))

req = StockBarsRequest(
    symbol_or_symbols = all_tickers, 
    timeframe=TimeFrame(amount = 1, unit = TimeFrameUnit.Day), 
    start = earliest_date,  
    end = last_date,                  
    limit = None,    
    adjustment=Adjustment('all') # adjust for splits and dividends                                           
)
df_adj = stock_historical_data_client.get_stock_bars(req).df.reset_index().set_index('timestamp')
df_adj = df_adj.sort_values(by=['symbol', 'timestamp']) # Ensure sorted for correct rolling calcs


In [25]:
print("Total NANs:", df_adj.pivot(columns="symbol").isna().sum().sum())

Total NANs: 0


In [26]:
# Trend feature functions
def compute_trend_features(df):
    df = df.sort_index()
    df['ret_5d'] = df['close'].pct_change(5)
    df['ret_20d'] = df['close'].pct_change(20)
    df['ma_10'] = df['close'].rolling(10).mean()
    df['ma_50'] = df['close'].rolling(50).mean()
    df['ma_200'] = df['close'].rolling(200).mean()
    df['ma_ratio_10_50'] = df['ma_10'] / df['ma_50'] - 1
    df['ma_ratio_10_200'] = df['ma_10'] / df['ma_200'] - 1
    df['rsi_14'] = compute_rsi(df['close'], 14)
    macd_data = compute_macd(df['close'])
    df['macd'] = macd_data['macd']
    df['macd_signal'] = macd_data['signal']
    df['macd_diff'] = df['macd'] - df['macd_signal']
    df['trend_strength'] = np.abs(df['ma_ratio_10_200']) * df['rsi_14']
    return df

def compute_rsi(series, period):
    delta = series.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    ma_up = up.ewm(com=period - 1, min_periods=period).mean()
    ma_down = down.ewm(com=period - 1, min_periods=period).mean()
    rs = ma_up / ma_down
    return 100 - (100 / (1 + rs))

def compute_macd(series, short=12, long=26, signal=9):
    ema_short = series.ewm(span=short, min_periods=short).mean()
    ema_long = series.ewm(span=long, min_periods=long).mean()
    macd = ema_short - ema_long
    signal_line = macd.ewm(span=signal, min_periods=signal).mean()
    return pd.DataFrame({'macd': macd, 'signal': signal_line}, index=series.index)

# Apply feature extraction
features_by_subset = {}
for subset, tickers in ticker_subsets.items():
    df_subset = df_adj[df_adj['symbol'].isin(tickers)].copy()
    df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
    df_feat = df_feat.droplevel(0)  # remove groupby index level
    df_feat = df_feat.dropna()
    features_by_subset[subset] = df_feat

  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)


In [27]:
# Combine all subsets
df_all = pd.concat(features_by_subset.values(), axis=0)
df_all = df_all.sort_index()

# Pivot wide
pivot_cols = ['ret_5d', 'ret_20d', 'ma_ratio_10_50', 'ma_ratio_10_200', 'rsi_14', 'macd_diff', 'trend_strength']
df_pivot = df_all.pivot_table(index=df_all.index, columns='symbol', values=pivot_cols)
df_pivot = df_pivot.dropna()

# Final smoothed 10 features
df_features = pd.DataFrame(index=df_pivot.index)

# 1. Mean 5-day return across equity indices
df_features['eq_idx_ret5d_mean'] = df_pivot['ret_5d'][['SPY', 'QQQ', 'IWM', 'DIA']].mean(axis=1)

# 2. Std of 20-day return across global equities
df_features['glob_eq_ret20d_std'] = df_pivot['ret_20d'][['EEM', 'EFA', 'FXI', 'EWZ']].std(axis=1)

# 3. Trend spread: max - min 10/200 MA ratio across sectors
df_features['sector_ma_ratio_spread'] = df_pivot['ma_ratio_10_200'][['XLF', 'XLK', 'XLE', 'XLV', 'XLI']].max(axis=1) - \
                                        df_pivot['ma_ratio_10_200'][['XLF', 'XLK', 'XLE', 'XLV', 'XLI']].min(axis=1)

# 4. RSI divergence: QQQ - SPY
df_features['rsi_diff_qqq_spy'] = df_pivot['rsi_14']['QQQ'] - df_pivot['rsi_14']['SPY']

# 5. Bond-equity trend divergence (LQD vs SPY)
df_features['macd_diff_lqd_spy'] = df_pivot['macd_diff']['LQD'] - df_pivot['macd_diff']['SPY']

# 6. Avg trend strength across commodities
df_features['commod_trend_strength'] = df_pivot['trend_strength'][['GLD', 'SLV', 'USO', 'DBA']].mean(axis=1)

# 7. Momentum dispersion across sectors (stdev of 10/50 MA ratio)
df_features['sector_momentum_dispersion'] = df_pivot['ma_ratio_10_50'][['XLF', 'XLK', 'XLE', 'XLV', 'XLI']].std(axis=1)

# 8. Global mean RSI across global equities (smooth replacement)
df_features['glob_eq_rsi_mean'] = df_pivot['rsi_14'][['EEM', 'EFA', 'FXI', 'EWZ']].mean(axis=1)

# 9. Currency-trend tilt: USD vs FXE/FXY
df_features['usd_vs_fxe_fxy_trend'] = df_pivot['ma_ratio_10_200']['UUP'] - \
                                      df_pivot['ma_ratio_10_200'][['FXE', 'FXY']].mean(axis=1)

# 10. Equity trend consensus (mean MACD diff)
df_features['eq_idx_macd_diff_mean'] = df_pivot['macd_diff'][['SPY', 'QQQ', 'IWM', 'DIA']].mean(axis=1)

from scipy.stats import linregress

# 11. Volatility skew: std(20d ret) - std(5d ret) across global equities
ret_5d_glob = df_pivot['ret_5d'][['EEM', 'EFA', 'FXI', 'EWZ']]
ret_20d_glob = df_pivot['ret_20d'][['EEM', 'EFA', 'FXI', 'EWZ']]
vol_skew = ret_20d_glob.rolling(window=20).std() - ret_5d_glob.rolling(window=20).std()
df_features['glob_eq_vol_skew'] = vol_skew.mean(axis=1)

# 12. Commodity momentum slope: slope of 10-day returns linear fit for commodities
def slope_of_returns(series):
    x = np.arange(len(series))
    if len(series) < 10 or series.isnull().any():
        return np.nan
    slope, _, _, _, _ = linregress(x, series)
    return slope

# Compute rolling 10-day returns for commodities and apply slope function per timestamp
commodities = ['GLD', 'SLV', 'USO', 'DBA']
mom_slope = pd.DataFrame(index=df_pivot.index)

for c in commodities:
    mom_slope[c] = df_pivot['ret_5d'][c].rolling(window=10).apply(slope_of_returns, raw=False)

df_features['commod_momentum_slope'] = mom_slope.mean(axis=1)

# Drop any rows with NaNs introduced by rolling window
df_features = df_features.dropna()

# Updated final features dataframe: 12 total features
df_trend_factors = df_features.copy()


# Final DataFrame
df_trend_factors = df_features.copy()


In [28]:
# from scipy.stats.mstats import winsorize

# for col in final_pivot_df_for_umap_gmm.columns:
#     # Winsorize at the 0+limit% and 100-limit% quantiles
#     final_pivot_df_for_umap_gmm[col] = winsorize(final_pivot_df_for_umap_gmm[col], limits=[0.01, 0.01])

In [29]:
df_trend_factors.columns

Index(['eq_idx_ret5d_mean', 'glob_eq_ret20d_std', 'sector_ma_ratio_spread',
       'rsi_diff_qqq_spy', 'macd_diff_lqd_spy', 'commod_trend_strength',
       'sector_momentum_dispersion', 'glob_eq_rsi_mean',
       'usd_vs_fxe_fxy_trend', 'eq_idx_macd_diff_mean', 'glob_eq_vol_skew',
       'commod_momentum_slope'],
      dtype='object')

In [30]:
print(df_trend_factors.isna().sum().sum()) 

df_pivot_clean = df_trend_factors.copy()

0


In [31]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import umap

# Constants
N_COMP = 3
N_NEIG = 15
MIN_D = 0.01
METRIC = "correlation"
RANDOM_STATE = 0
N_ITER = 200  # Number of random search iterations
MIN_FEATURES = 5
CLUSTER_OPTIONS = [3, 4]

# Scalers dictionary
scalers = {
    "Robust": RobustScaler(),
    "Standard": StandardScaler()
}

# Prepare results list
results = []

# Get all feature names
all_features = df_pivot_clean.columns.tolist()

for i in range(N_ITER):
    # Random subset of features (at least MIN_FEATURES)
    n_features = random.randint(MIN_FEATURES, len(all_features)-2)
    selected_features = random.sample(all_features, n_features)
    df_subset = df_pivot_clean[selected_features]

    for scaler_name, scaler in scalers.items():
        scaled = scaler.fit_transform(df_subset)

        # UMAP dimensionality reduction
        X_umap = umap.UMAP(
            n_neighbors=N_NEIG,
            min_dist=MIN_D,
            n_components=N_COMP,
            metric=METRIC,
            random_state=RANDOM_STATE + i  # vary random state
        ).fit_transform(scaled)

        for k in CLUSTER_OPTIONS:
            gmm = GaussianMixture(n_components=k, init_params="k-means++", random_state=RANDOM_STATE + i)
            gmm_labels = gmm.fit_predict(X_umap)

            # Compute metrics
            silhouette = silhouette_score(X_umap, gmm_labels)
            ch_score = calinski_harabasz_score(X_umap, gmm_labels)
            db_score = davies_bouldin_score(X_umap, gmm_labels)

            # Save result
            results.append({
                "iteration": i + 1,
                "scaler": scaler_name,
                "n_features": n_features,
                "features": selected_features,
                "n_clusters": k,
                "silhouette": silhouette,
                "calinski_harabasz": ch_score,
                "davies_bouldin": db_score
            })

# Convert to DataFrame and save to CSV
results_df = pd.DataFrame(results).sort_values(by='silhouette', ascending=False)
results_df.to_csv("random_search_clustering_results.csv", index=False)
print("âœ… Results saved to 'random_search_clustering_results.csv'")


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


âœ… Results saved to 'random_search_clustering_results.csv'


In [33]:
results_df.sort_values(by='silhouette', ascending=False).iloc[0]

iteration                                                           15
scaler                                                        Standard
n_features                                                           7
features             [rsi_diff_qqq_spy, commod_momentum_slope, comm...
n_clusters                                                           3
silhouette                                                    0.575814
calinski_harabasz                                          3814.997559
davies_bouldin                                                0.632564
Name: 58, dtype: object

In [36]:
by_clusters = results_df.groupby("n_clusters").agg(
    mean_silhouette=("silhouette", "mean"),
    max_silhouette=("silhouette", "max"),
    count=("silhouette", "count")
).reset_index()

print("ðŸ”¢ Silhouette by n_clusters:")
print(by_clusters)


ðŸ”¢ Silhouette by n_clusters:
   n_clusters  mean_silhouette  max_silhouette  count
0           3         0.369688        0.575814    400
1           4         0.371567        0.526739    400


In [37]:
# Explode the 'features' list into individual rows
exploded = results_df.explode("features")

# Group by individual feature and compute mean silhouette
feature_silhouette = exploded.groupby("features").agg(
    mean_silhouette=("silhouette", "mean"),
    max_silhouette=("silhouette", "max"),
    count=("silhouette", "count")
).reset_index().rename(columns={"features": "feature"})

# Sort by mean silhouette score
feature_silhouette = feature_silhouette.sort_values(by="mean_silhouette", ascending=False)

print("ðŸ“Š Mean silhouette score per feature:")
print(feature_silhouette)


ðŸ“Š Mean silhouette score per feature:
                       feature  mean_silhouette  max_silhouette  count
9       sector_ma_ratio_spread         0.375941        0.575814    496
11        usd_vs_fxe_fxy_trend         0.375350        0.575814    484
7            macd_diff_lqd_spy         0.375096        0.575814    500
2        eq_idx_macd_diff_mean         0.371123        0.575814    464
3            eq_idx_ret5d_mean         0.371009        0.551491    516
1        commod_trend_strength         0.370905        0.575814    492
5             glob_eq_rsi_mean         0.369017        0.568750    504
10  sector_momentum_dispersion         0.368695        0.568750    492
6             glob_eq_vol_skew         0.367198        0.568750    488
0        commod_momentum_slope         0.363531        0.575814    544
4           glob_eq_ret20d_std         0.363427        0.529027    488
8             rsi_diff_qqq_spy         0.357904        0.575814    468


In [42]:
# Define the required feature set
required_features = {
    'glob_eq_ret20d_std',
    'commod_momentum_slope',
    'eq_idx_ret5d_mean',
    'sector_ma_ratio_spread',
    'usd_vs_fxe_fxy_trend'
}

# Filter rows where all required features are present in the 'features' list
filtered_df = results_df[results_df["features"].apply(lambda x: required_features.issubset(set(x)))]

print(f"âœ… Found {len(filtered_df)} rows with all required features.")
print(filtered_df.head())

# Print only the 'features' column fully
for i, row in filtered_df.iterrows():
    print(f"Row {i} features: {row['features']}")

âœ… Found 76 rows with all required features.
     iteration    scaler  n_features  \
253         64    Robust           9   
458        115  Standard           7   
254         64  Standard           9   
62          16  Standard           8   
651        163  Standard           8   

                                              features  n_clusters  \
253  [eq_idx_macd_diff_mean, commod_trend_strength,...           4   
458  [eq_idx_ret5d_mean, commod_momentum_slope, usd...           3   
254  [eq_idx_macd_diff_mean, commod_trend_strength,...           3   
62   [macd_diff_lqd_spy, eq_idx_ret5d_mean, glob_eq...           3   
651  [usd_vs_fxe_fxy_trend, glob_eq_ret20d_std, eq_...           4   

     silhouette  calinski_harabasz  davies_bouldin  
253    0.510232        2374.834717        0.711624  
458    0.484738        2571.223389        0.723725  
254    0.472847        2384.023682        0.860103  
62     0.469544        2303.833496        0.897958  
651    0.458301        3448