# Regime Identification (Similarity + HMM)

This notebook applies several statistical-model approaches to identify regimes and their historical analogs:
- Euclidean distance matrix (baseline similarity)
- KNN with Mahalanobis distance
- KNN with Correlation distance
- GaussianHMM for latent regime states

All methods are applied to the engineered feature matrix.

In [62]:
import sys
from pathlib import Path
import importlib
import pandas as pd
import plotly.express as px

# Locate the repository root, i.e. the directory that contains `src/`.
# The notebook may be started from the root itself or from a folder one
# level below it, so fall back to the parent directory when needed.
repo_root = Path.cwd()
if not (repo_root / 'src').exists() and (repo_root.parent / 'src').exists():
    repo_root = repo_root.parent

# Make `src` importable. Any earlier copies of the entry are dropped before
# it is re-inserted at the front, so re-running this cell keeps the same
# import precedence without growing sys.path by one duplicate per run.
if (repo_root / 'src').exists():
    root_str = str(repo_root)
    sys.path[:] = [entry for entry in sys.path if entry != root_str]
    sys.path.insert(0, root_str)

import src.regime_model as regime_model
# Reload so the aliases below are bound to the freshly re-executed module
# rather than to whatever version the kernel imported earlier.
importlib.reload(regime_model)

# Short aliases for the helpers used throughout the notebook. They are bound
# after the reload on purpose, so they always point at the current objects.
RegimeModelConfig = regime_model.RegimeModelConfig
load_feature_matrix = regime_model.load_feature_matrix
euclidean_distance_matrix = regime_model.euclidean_distance_matrix
knn_mahalanobis = regime_model.knn_mahalanobis
knn_correlation = regime_model.knn_correlation
neighbors_to_frame = regime_model.neighbors_to_frame
fit_hmm = regime_model.fit_hmm
transition_matrix = regime_model.transition_matrix
nearest_regimes_by_hmm_no_gap = regime_model.nearest_regimes_by_hmm_no_gap


In [63]:
# Read the cleaned feature matrix from the processed-data folder and preview
# its first rows.
feature_path = repo_root.joinpath('data', 'processed', 'feature_matrix_clean.csv')
features = load_feature_matrix(feature_path)
features.head()

Unnamed: 0_level_0,market_level,market_trend,market_vol,yield_curve_level,yield_curve_trend,yield_curve_vol,oil ($/bbl)_level,oil ($/bbl)_trend,oil ($/bbl)_vol,copper ($/metric ton)_level,...,copper ($/metric ton)_vol,monetary_policy_level,monetary_policy_trend,monetary_policy_vol,volatility_level,volatility_trend,volatility_vol,stock_bond_corr_level,stock_bond_corr_trend,stock_bond_corr_vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1964-12-31,1.746676,0.138897,-0.514002,-1.898958,0.284913,-0.070939,-1.39548,1.475356,-1.685585,2.68422,...,2.883487,1.890044,-0.312922,-0.102421,-0.533933,0.845219,-1.761487,-1.113488,-2.995419,0.74916
1965-01-31,2.024544,0.193961,-0.344187,-1.931071,-0.12275,0.267958,-2.344524,0.814402,-0.555233,1.154654,...,2.209314,1.913656,0.074663,0.221331,-0.71434,0.683306,-1.625352,-1.472129,-3.641215,1.931925
1965-02-28,1.876295,0.150978,-0.433696,-2.134447,-0.810059,0.764713,-2.154738,0.318341,-0.185438,1.953986,...,1.814574,2.118497,0.741972,0.706316,-0.252741,0.718275,-1.478315,-1.557477,-3.233518,2.775352
1965-03-31,1.613205,-0.033994,-0.776245,-1.835985,-1.069766,0.94066,-2.004818,-0.060432,0.045415,2.094114,...,1.533152,1.821838,1.003596,0.881214,-0.702252,0.527501,-1.394257,-1.346901,-2.800504,2.751225
1965-04-30,1.89757,-0.021983,-0.723903,-1.683984,-1.027575,0.908057,-1.882513,-0.330461,0.192075,2.437331,...,1.454749,1.675595,0.974285,0.858106,-0.688971,0.447677,-1.303385,-0.94749,-2.270211,2.272496


In [64]:
# Baseline similarity: pairwise Euclidean distances between feature rows.
cfg = RegimeModelConfig(knn_k=5, standardize=True)
dist = euclidean_distance_matrix(features, cfg)

# Thin the matrix before plotting so the heatmap stays responsive.
max_points = 200
n_rows = len(dist)
# Ceiling division gives the smallest stride that keeps at most `max_points`
# rows/columns. Floor division could overshoot the cap (e.g. 731 rows with a
# stride of 3 still leaves 244 points).
step = max(1, -(-n_rows // max_points))
is_downsampled = step > 1
dist_plot = dist.iloc[::step, ::step] if is_downsampled else dist

# Only label the figure as downsampled when rows were actually dropped.
plot_title = (
    'Euclidean Distance Matrix (Downsampled)'
    if is_downsampled
    else 'Euclidean Distance Matrix'
)

fig = px.imshow(
    dist_plot,
    color_continuous_scale='Viridis',
    title=plot_title
)
fig.update_layout(height=500)
fig.show()

In [65]:
# Run both KNN variants on the feature matrix. The first element of each
# returned triple is not needed in this notebook and is discarded.
_, maha_dist, maha_idx = knn_mahalanobis(features, cfg)
_, corr_dist, corr_idx = knn_correlation(features, cfg)

# Turn the neighbor index arrays into date-indexed frames.
maha_neighbors = neighbors_to_frame(maha_idx, features.index, k=cfg.knn_k)
corr_neighbors = neighbors_to_frame(corr_idx, features.index, k=cfg.knn_k)

# Inspect the analogs of the most recent observation.
# NOTE(review): in the saved output the five neighbors of the last date are
# simply the five preceding months for both metrics; it may be worth excluding
# a window around the query date -- confirm what regime_model offers for this.
target_date = features.index[-1]
print('Target date:', target_date.date())
for heading, neighbor_frame in (
    ('Mahalanobis neighbors:', maha_neighbors),
    ('Correlation neighbors:', corr_neighbors),
):
    print(heading)
    display(neighbor_frame.loc[target_date])

Target date: 2025-10-31
Mahalanobis neighbors:


neighbor_1   2025-09-30
neighbor_2   2025-08-31
neighbor_3   2025-07-31
neighbor_4   2025-06-30
neighbor_5   2025-05-31
Name: 2025-10-31 00:00:00, dtype: datetime64[ns]

Correlation neighbors:


neighbor_1   2025-09-30
neighbor_2   2025-08-31
neighbor_3   2025-07-31
neighbor_4   2025-06-30
neighbor_5   2025-05-31
Name: 2025-10-31 00:00:00, dtype: datetime64[ns]

In [66]:
# For every date, collect its single nearest analog under each metric.
analog_df = pd.DataFrame({
    'date': features.index,
    'maha_neighbor_1': maha_neighbors['neighbor_1'].values,
    'corr_neighbor_1': corr_neighbors['neighbor_1'].values,
})

# One scatter per metric: x is the observation date, y is the date of its
# nearest analog. Both figures share the same styling.
analog_plots = (
    ('maha_neighbor_1', 'Regime Analog Transitions (Mahalanobis Nearest Neighbor)'),
    ('corr_neighbor_1', 'Regime Analog Transitions (Correlation Nearest Neighbor)'),
)
for neighbor_col, plot_title in analog_plots:
    fig = px.scatter(
        analog_df,
        x='date',
        y=neighbor_col,
        title=plot_title,
        labels={neighbor_col: 'Nearest Analog Date'},
    )
    fig.update_traces(marker=dict(size=5, opacity=0.6))
    fig.update_layout(height=450)
    fig.show()

In [67]:
# HMM neighbors (analogous to KNN neighbors)
# NOTE(review): no random seed is visible here; if fit_hmm is stochastic the
# regime labels may change between runs -- confirm how seeding is handled.
hmm_cfg = RegimeModelConfig(n_components=3, standardize=True)
hmm_model, X_hmm, hmm_states = fit_hmm(features, hmm_cfg)

# One probability column per hidden state, plus the state code returned by
# fit_hmm for each date.
regime_columns = [f'regime_{state}' for state in range(hmm_cfg.n_components)]
regime_probs = hmm_model.predict_proba(X_hmm)
regime_df = pd.DataFrame(regime_probs, index=features.index, columns=regime_columns)
regime_df['regime_code'] = hmm_states

# Look up the closest dates to the latest observation. `k` deliberately reuses
# the KNN config (`cfg`, defined in an earlier cell) so both approaches return
# the same number of neighbors.
target_date = features.index[-1]
nearest_hmm = nearest_regimes_by_hmm_no_gap(regime_df, target_date, k=cfg.knn_k)

print('Target date:', target_date.date())
print('HMM neighbors:')
display(nearest_hmm)

regime_df.head()

Target date: 2025-10-31
HMM neighbors:


date
2019-07-31    2.607724e-26
2019-08-31    3.377927e-26
2019-11-30    3.420610e-26
2019-10-31    3.437324e-26
2019-09-30    3.457703e-26
dtype: float64

Unnamed: 0_level_0,regime_0,regime_1,regime_2,regime_code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1964-12-31,0.0,0.0,1.0,2
1965-01-31,0.0,0.0,1.0,2
1965-02-28,0.0,0.0,1.0,2
1965-03-31,0.0,0.0,1.0,2
1965-04-30,0.0,0.0,1.0,2


In [68]:
# Timeline of the HMM state code assigned to each date.
hmm_plot_df = pd.DataFrame({
    'date': regime_df.index,
    'regime_code': regime_df['regime_code'].values,
})

fig = px.scatter(
    hmm_plot_df,
    x='date',
    y='regime_code',
    title='HMM Regime Transitions Over Time',
    labels={'regime_code': 'Regime'},
)
# The plotly update_* methods return the figure, so the styling calls chain.
fig.update_traces(marker=dict(size=6, opacity=0.7)).update_layout(height=450)
fig.show()

# Heatmap of the transition matrix derived from the regime-code sequence.
tm = transition_matrix(regime_df['regime_code'])
fig = px.imshow(
    tm,
    text_auto=True,
    color_continuous_scale='Blues',
    title='HMM Regime Transition Matrix',
)
fig.update_layout(height=450)
fig.show()

In [69]:
# Persist the neighbor tables and HMM regime labels as CSV files in the
# processed-data folder, then show the paths that were written.
processed_dir = repo_root / 'data' / 'processed'
out_maha = processed_dir / 'regime_neighbors_mahalanobis.csv'
out_corr = processed_dir / 'regime_neighbors_correlation.csv'
out_hmm = processed_dir / 'regime_labels_hmm.csv'

for frame, destination in (
    (maha_neighbors, out_maha),
    (corr_neighbors, out_corr),
    (regime_df, out_hmm),
):
    frame.to_csv(destination)
out_maha, out_corr, out_hmm

(PosixPath('/Users/hippo/Downloads/nfs-regime-based-predictive-modelling/data/processed/regime_neighbors_mahalanobis.csv'),
 PosixPath('/Users/hippo/Downloads/nfs-regime-based-predictive-modelling/data/processed/regime_neighbors_correlation.csv'),
 PosixPath('/Users/hippo/Downloads/nfs-regime-based-predictive-modelling/data/processed/regime_labels_hmm.csv'))