# Regime Identification (Similarity + HMM)

This notebook applies several statistical approaches to identify regimes and their historical analogs:
- Euclidean distance matrix (baseline similarity)
- KNN with Mahalanobis distance
- KNN with Correlation distance
- GaussianHMM for latent regime states

All methods are applied on the **transformed economic state variables** (12-month change → rolling 10-year z-score → winsorized at ±3).

In [1]:
import sys
from pathlib import Path
import importlib
import pandas as pd
import plotly.express as px

# Locate the repository root: prefer the working directory when it contains
# `src`, otherwise its parent when that does; fall back to the working
# directory if neither candidate has a `src` entry.
repo_root = next(
    (
        candidate
        for candidate in (Path.cwd(), Path.cwd().parent)
        if (candidate / 'src').exists()
    ),
    Path.cwd(),
)

# Only put the root on sys.path when `src` was actually found there.
if (repo_root / 'src').exists():
    sys.path.insert(0, str(repo_root))

import src.regime_model as regime_model

# Reload so that edits to the module are picked up without a kernel restart.
regime_model = importlib.reload(regime_model)

# Bind the helpers used by the rest of the notebook as short, top-level names.
from src.regime_model import (
    RegimeModelConfig,
    load_feature_matrix,
    euclidean_distance_matrix,
    knn_mahalanobis,
    knn_correlation,
    neighbors_to_frame,
    fit_hmm,
    transition_matrix,
    nearest_regimes_by_hmm_no_gap,
)


In [2]:
# Read the cleaned feature matrix from the processed-data folder and preview it.
feature_path = repo_root.joinpath('data', 'processed', 'feature_matrix_clean.csv')
features = load_feature_matrix(feature_path)
features.head()

Unnamed: 0_level_0,market_transformed,yield_curve_transformed,oil ($/bbl)_transformed,copper ($/metric ton)_transformed,monetary_policy_transformed,volatility_transformed,stock_bond_corr_transformed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1973-12-31,-2.249706,-1.388954,3.0,2.531366,1.35073,2.869834,0.64847
1974-01-31,-2.093306,-1.00179,3.0,1.788305,0.976164,1.914132,0.978212
1974-02-28,-1.689956,-0.862877,3.0,2.037113,0.836261,0.439552,0.995444
1974-03-31,-1.835159,-1.023341,3.0,2.369871,1.011982,-0.051967,0.874601
1974-04-30,-1.717254,-1.542605,3.0,2.712843,1.530578,-0.046426,0.396054


In [3]:
# Shared configuration for the similarity methods; `cfg` is reused by later cells.
cfg = RegimeModelConfig(knn_k=5, standardize=True)
dist = euclidean_distance_matrix(features, cfg)

# Thin the matrix before plotting so the heatmap stays light.
# `max_points` is an upper bound on the number of rows/columns drawn, so the
# stride must be a *ceiling* division: with floor division a matrix of size
# between max_points and 2 * max_points would get stride 1 (no thinning at
# all), and larger ones could keep up to ~2x max_points rows.
max_points = 200
step = max(1, (len(dist) + max_points - 1) // max_points)
dist_plot = dist.iloc[::step, ::step] if step > 1 else dist

# Only label the figure as downsampled when rows were actually dropped.
title = 'Euclidean Distance Matrix'
if step > 1:
    title += ' (Downsampled)'

fig = px.imshow(
    dist_plot,
    color_continuous_scale='Viridis',
    title=title
)
fig.update_layout(height=500)
fig.show()

In [4]:
# Run both KNN variants; the first returned element is not used in this notebook.
_, maha_dist, maha_idx = knn_mahalanobis(features, cfg)
_, corr_dist, corr_idx = knn_correlation(features, cfg)

# Convert each neighbor-index result into a date-indexed frame of neighbor dates.
maha_neighbors, corr_neighbors = (
    neighbors_to_frame(neighbor_idx, features.index, k=cfg.knn_k)
    for neighbor_idx in (maha_idx, corr_idx)
)

# Show the analogs of the most recent observation under each metric.
target_date = features.index[-1]
print('Target date:', target_date.date())
for heading, neighbor_frame in (
    ('Mahalanobis neighbors:', maha_neighbors),
    ('Correlation neighbors:', corr_neighbors),
):
    print(heading)
    display(neighbor_frame.loc[target_date])

Target date: 2025-10-31
Mahalanobis neighbors:


neighbor_1   2025-09-30
neighbor_2   2025-08-31
neighbor_3   2025-07-31
neighbor_4   2025-06-30
neighbor_5   2010-11-30
Name: 2025-10-31 00:00:00, dtype: datetime64[ns]

Correlation neighbors:


neighbor_1   2025-09-30
neighbor_2   2025-08-31
neighbor_3   1985-06-30
neighbor_4   2025-07-31
neighbor_5   1985-05-31
Name: 2025-10-31 00:00:00, dtype: datetime64[ns]

In [5]:
# One row per date with the closest analog date under each distance metric.
analog_df = pd.DataFrame({
    'date': features.index,
    'maha_neighbor_1': maha_neighbors['neighbor_1'].values,
    'corr_neighbor_1': corr_neighbors['neighbor_1'].values,
})

# Same scatter layout for both metrics: date on x, nearest analog date on y.
for analog_column, plot_title in (
    ('maha_neighbor_1', 'Regime Analog Transitions (Mahalanobis Nearest Neighbor)'),
    ('corr_neighbor_1', 'Regime Analog Transitions (Correlation Nearest Neighbor)'),
):
    fig = px.scatter(
        analog_df,
        x='date',
        y=analog_column,
        title=plot_title,
        labels={analog_column: 'Nearest Analog Date'}
    )
    fig.update_traces(marker={'size': 5, 'opacity': 0.6})
    fig.update_layout(height=450)
    fig.show()

In [6]:
# HMM counterpart of the KNN neighbor lookup above.
hmm_cfg = RegimeModelConfig(n_components=3, standardize=True)
hmm_model, X_hmm, hmm_states = fit_hmm(features, hmm_cfg)

# One probability column per HMM component, plus the state label from fit_hmm.
regime_columns = [f'regime_{i}' for i in range(hmm_cfg.n_components)]
regime_df = pd.DataFrame(
    hmm_model.predict_proba(X_hmm),
    index=features.index,
    columns=regime_columns,
)
regime_df['regime_code'] = hmm_states

# Look up analogs for the latest date; k comes from the KNN config (`cfg`)
# so the number of analogs matches the KNN cells.
target_date = features.index[-1]
nearest_hmm = nearest_regimes_by_hmm_no_gap(regime_df, target_date, k=cfg.knn_k)

print('Target date:', target_date.date())
print('HMM neighbors:')
display(nearest_hmm)

regime_df.head()

Target date: 2025-10-31
HMM neighbors:


date
1975-09-30    0.000060
2011-05-31    0.000138
1977-02-28    0.000194
1984-05-31    0.000200
1992-06-30    0.000204
dtype: float64

Unnamed: 0_level_0,regime_0,regime_1,regime_2,regime_code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1973-12-31,1.0,1.328667e-08,1.7852359999999997e-176,0
1974-01-31,0.001276,0.9987244,9.920792e-15,1
1974-02-28,0.988716,0.01128394,4.441351e-10,0
1974-03-31,0.011517,0.988483,2.148142e-11,1
1974-04-30,0.979317,0.02068296,1.34135e-10,0


In [7]:
# Flatten the index and the state labels into a plain two-column frame for plotting.
hmm_plot_df = pd.DataFrame({
    'date': regime_df.index,
    'regime_code': regime_df['regime_code'].to_numpy(),
})

# Regime label over time.
fig = (
    px.scatter(
        hmm_plot_df,
        x='date',
        y='regime_code',
        title='HMM Regime Transitions Over Time',
        labels={'regime_code': 'Regime'},
    )
    .update_traces(marker={'size': 6, 'opacity': 0.7})
    .update_layout(height=450)
)
fig.show()

# Heatmap of the transition matrix computed from the label sequence.
tm = transition_matrix(regime_df['regime_code'])
fig = px.imshow(
    tm,
    text_auto=True,
    color_continuous_scale='Blues',
    title='HMM Regime Transition Matrix',
).update_layout(height=450)
fig.show()

In [8]:
# Write the neighbor tables and HMM labels next to the other processed data.
processed_dir = repo_root / 'data' / 'processed'
out_maha = processed_dir / 'regime_neighbors_mahalanobis.csv'
out_corr = processed_dir / 'regime_neighbors_correlation.csv'
out_hmm = processed_dir / 'regime_labels_hmm.csv'

for frame, destination in (
    (maha_neighbors, out_maha),
    (corr_neighbors, out_corr),
    (regime_df, out_hmm),
):
    frame.to_csv(destination)

# Echo the written paths as the cell output.
out_maha, out_corr, out_hmm

(PosixPath('/Users/bachnguyen/nfs-regime-based-predictive-modelling/data/processed/regime_neighbors_mahalanobis.csv'),
 PosixPath('/Users/bachnguyen/nfs-regime-based-predictive-modelling/data/processed/regime_neighbors_correlation.csv'),
 PosixPath('/Users/bachnguyen/nfs-regime-based-predictive-modelling/data/processed/regime_labels_hmm.csv'))