In [5]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Locate the sibling `utils` directory: one level above the notebook's working
# directory, then into `utils`.
current_dir = os.getcwd()
utils_path = os.path.abspath(os.path.join(current_dir, '..', 'utils'))

# Register the directory on sys.path only once. This cell gets re-executed in the
# same kernel, and an unconditional append would add a duplicate entry each time.
if utils_path not in sys.path:
    sys.path.append(utils_path)

from trend_regime_utils import load_trend_data, process_trend_data, create_advanced_feat, mayority_vote_cluster_smooth
from bull_trend_regime_utils import load_bull_trend_data, create_advanced_bull_feat, merge_clean_final_clusters

import pandas as pd
import numpy as np
from datetime import datetime
from zoneinfo import ZoneInfo
import joblib

# For API Keys
from dotenv import load_dotenv
# Alpaca API keys are read from the environment. load_dotenv(override=True) runs
# first so that values from a local .env file are loaded and take precedence over
# variables already set. Each name ends up as None when its variable is not defined.
load_dotenv(override=True)

API_KEY = os.environ.get('ALP_API_KEY')
SECRET_KEY = os.environ.get('ALP_SEC_KEY')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Date window for the analysis, both bounds expressed in New York time.
EARLIEST_DATE = datetime(2016, 1, 16, tzinfo=ZoneInfo('America/New_York'))
# The end bound reuses the timezone object attached to the start bound.
LAST_DATE = datetime(2025, 7, 20, tzinfo=EARLIEST_DATE.tzinfo)

# Three-stage trend pipeline: load -> process -> feature creation. Each stage's
# output is kept under its own name so intermediate results stay inspectable.
df_trend_raw = load_trend_data(
    API_KEY,
    SECRET_KEY,
    EARLIEST_DATE,
    LAST_DATE,
)
df_trend_processed = process_trend_data(df_trend_raw)
df_trend_feat = create_advanced_feat(df_trend_processed)

  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)
  df_feat = df_subset.groupby('symbol').apply(compute_trend_features)


In [7]:
# The project root is the parent of the current working directory; persisted
# models live in its `models` sub-directory (created here if it does not exist).
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
MODEL_DIR = os.path.join(PROJECT_ROOT, 'models')
os.makedirs(MODEL_DIR, exist_ok=True)

# Prediction path: load the previously saved scaler, UMAP and GMM artifacts.
# NOTE: joblib.load unpickles arbitrary objects, so only load files from a trusted source.
scaler, umap_model, gmm_model = (
    joblib.load(os.path.join(MODEL_DIR, artifact))
    for artifact in ("trend_scaler.pkl", "trend_umap_model.pkl", "trend_gmm_model.pkl")
)

# Apply the three stages in order: scale -> UMAP transform -> cluster prediction.
trend_scaled = scaler.transform(df_trend_feat)
trend_umap = umap_model.transform(trend_scaled)
trend_gmm_labels = gmm_model.predict(trend_umap)

# Optional diagnostic, left disabled:
# from sklearn.metrics import silhouette_score
# print(silhouette_score(trend_umap, trend_gmm_labels))

# Attach the predicted labels to the feature frame's index as a single "cluster" column.
df_with_clusters = pd.DataFrame(
    data=trend_gmm_labels,
    index=df_trend_feat.index,
    columns=["cluster"],
)

# Smooth the raw labels with the project's majority-vote helper.
df_cluster_smooth = mayority_vote_cluster_smooth(df_with_clusters)

In [8]:
# Load the bull-trend inputs for the same credentials and date window used for the
# trend data, then derive the bull feature set from them.
df_bull_raw = load_bull_trend_data(
    API_KEY,
    SECRET_KEY,
    EARLIEST_DATE,
    LAST_DATE,
)

bull_features_df = create_advanced_bull_feat(df_bull_raw)

In [9]:
# Keep only the days whose smoothed trend cluster equals 1 (treated here as bull days).
# NOTE(review): if df_cluster_smooth is a DataFrame rather than a Series, this boolean
# mask keeps every row (non-matching cells become NaN), so bull_days.index would be the
# full index and the filter below would be a no-op. Confirm the return type of
# mayority_vote_cluster_smooth.
bull_days = df_cluster_smooth[df_cluster_smooth==1]

# Keep only rows in bull_features_df where the index exists in bull_days
only_bull_features_df = bull_features_df[bull_features_df.index.isin(bull_days.index)]

In [10]:
# Prediction path for the bull sub-regimes: load the saved scaler, UMAP and
# spectral-clustering artifacts.
# NOTE: joblib.load unpickles arbitrary objects, so only load files from a trusted source.
bull_scaler, bull_umap_model, spectral_model = (
    joblib.load(os.path.join(MODEL_DIR, artifact))
    for artifact in (
        "bull_trend_scaler.pkl",
        "bull_trend_umap_model.pkl",
        "bull_trend_spectral_model.pkl",
    )
)

# Scale the bull-only features, then project them with the fitted UMAP model.
bull_trend_scaled = bull_scaler.transform(only_bull_features_df)
bull_trend_umap = bull_umap_model.transform(bull_trend_scaled)

# SpectralClustering doesn't have a .predict() method for unseen data.
# Every call to fit_predict() re-computes the clusters from scratch, so for new data
# it has to be re-run on all data (old + new).
# NOTE(review): because this refits, cluster ids may not line up with earlier runs
# unless the saved model pins random_state. Confirm before relying on specific labels.
bull_trend_spectral_labels = spectral_model.fit_predict(bull_trend_umap)

# Optional diagnostic, left disabled:
# from sklearn.metrics import silhouette_score
# print(silhouette_score(bull_trend_umap, bull_trend_spectral_labels))



In [11]:
# Compute the final clusters from the bull sub-cluster labels, the bull-only feature
# frame they were computed on, and the trend cluster frame.
# NOTE(review): bull days were selected from df_cluster_smooth, but the unsmoothed
# df_with_clusters is passed here. Confirm this is what merge_clean_final_clusters expects.
df_final_clusters = merge_clean_final_clusters(
    bull_trend_spectral_labels,
    only_bull_features_df,
    df_with_clusters,
)

# Displayed as the cell output: proportion of rows per final cluster, with missing
# labels counted rather than dropped.
df_final_clusters.value_counts(normalize=True, dropna=False)

final_cluster
1                0.251719
0                0.240257
2                0.187987
3                0.175608
4                0.144429
Name: proportion, dtype: float64