# Regime-Aware LOB Midterm

This notebook does the following...

- Handles either 1-second or 1-minute LOB CSVs

- Builds robust features (depth, imbalances, flows, realized vol)

- Creates labels for short-horizon mid-price direction (±1 / 0)

- Discovers regimes with **HMM** (if available) or **KMeans** fallback

- Trains **Logistic Regression** and **XGBoost** (or RandomForest fallback)

- Evaluates via **walk-forward** splits with hour-based windows to get *many folds*

- Prints compact metrics + one clean bar chart (balanced vs toxic hit-rate)



In [None]:
import warnings, os, sys, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

from hmmlearn.hmm import GaussianHMM
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, f1_score

## 0. Parameters

In [None]:
# Path of the LOB CSV that is read with pd.read_csv in the loading cell.
DATA_CSV = 'BTC_1sec.csv'

# Column names in the CSV. If TIME_COL is missing, the loading cell falls back
# to the first column whose name contains "time".
TIME_COL = 'timestamp'
MID_COL = 'midpoint'
SPREAD_COL = 'spread'

# Number of per-level columns (suffixes _0 … _{TOP_K-1}) summed per feature.
TOP_K = 15

# Label look-ahead in seconds; converted to a row shift in the labelling cell.
HORIZON_SEC = 30

# Label threshold in basis points (multiplied by 1e-4 to get a return threshold).
THRESH_BPS = 1.0

# Walk-forward geometry: training window length (days), test window length
# (hours) and the gap in seconds inserted between train end and test start.
TRAIN_DAYS  = 3
TEST_HOURS  = 4
EMBARGO_SEC = 60

# Hours by which the training window start advances from one fold to the next.
STRIDE_HRS  = 12

# Defaults of scale_features: whether to apply PCA after standardisation, and
# the value passed as PCA(n_components=...).
USE_PCA = True
PCA_VARIANCE = 0.95

# Training folds with more rows than this are randomly subsampled to this size.
MAX_TRAIN_SAMPLES = 120000
# Passed to GaussianHMM as covariance_type and n_iter.
HMM_COVARIANCE = 'diag'
HMM_ITERATIONS = 150
# Passed to XGBClassifier as n_estimators, max_depth, learning_rate and
# early_stopping_rounds.
XGB_N_ESTIMATORS = 300
XGB_MAX_DEPTH = 6
XGB_LEARNING_RATE = 0.05
XGB_EARLY_STOPPING = 30

# Seed for NumPy's global RNG, PCA and the training subsample.
# NOTE(review): the XGBClassifier calls and fit_hmm's default use a literal 42
# rather than this constant — keep them in sync if this value changes.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Load & Inspect

The data we are using is the following Kaggle dataset: https://www.kaggle.com/datasets/martinsn/high-frequency-crypto-limit-order-book-data. We download the data and add it to the project locally, as the dataset is too big to push to GitHub.

In [None]:
df = pd.read_csv(DATA_CSV)

# When the configured timestamp column is absent, use the first column whose
# name contains "time"; if there is none, TIME_COL is left unchanged.
if TIME_COL not in df.columns:
    TIME_COL = next(
        (name for name in df.columns if 'time' in name.lower()),
        TIME_COL,
    )

# Parse timestamps as UTC; unparseable values become NaT and are dropped.
df[TIME_COL] = pd.to_datetime(df[TIME_COL], utc=True, errors='coerce')
df = df.dropna(subset=[TIME_COL])
df = df.sort_values(TIME_COL)
df = df.reset_index(drop=True)

first_ts = df[TIME_COL].min()
last_ts = df[TIME_COL].max()
print("Rows:", len(df))
print("Time range:", first_ts, "→", last_ts)
print("Columns:", list(df.columns)[:12], "...")

## 2. Feature Engineering

The raw data contains columns such as the midpoint, buys, sells, spread, and many more. Some of the features we need (for example, `AskDepth`) are not provided directly but are easy to derive from the raw columns, so we compute them here.

In [None]:

def level_cols(prefix, k=TOP_K, in_df=None):
    """Return the column names ``{prefix}_0`` … ``{prefix}_{k-1}``.

    Args:
        prefix: Common column-name prefix.
        k: Number of levels to enumerate.
        in_df: Optional DataFrame; when given, only names that are present in
            ``in_df.columns`` are returned.

    Returns:
        List of column names in ascending level order.
    """
    names = []
    for level in range(k):
        name = f"{prefix}_{level}"
        if in_df is None or name in in_df.columns:
            names.append(name)
    return names

def agg_levels(df, prefix, k=TOP_K):
    """Sum the first ``k`` level columns of ``prefix`` row by row.

    Level columns that do not exist in ``df`` are ignored. If none exist, a
    Series of 0.0 with ``df``'s index is returned.
    """
    present = level_cols(prefix, k, in_df=df)
    if present:
        return df[present].sum(axis=1)
    return pd.Series(0.0, index=df.index)

# Spread relative to the mid price; ±inf and NaN from the division become 0.
rel_spread = df[SPREAD_COL] / df[MID_COL]
df['spread_pct'] = rel_spread.replace([np.inf, -np.inf], np.nan).fillna(0)

# Per-side sums of the *_limit_notional level columns and their imbalance
# (bid minus ask); the 1e-9 guards against a zero denominator.
bid_depth = agg_levels(df, "bids_limit_notional", TOP_K)
ask_depth = agg_levels(df, "asks_limit_notional", TOP_K)
df['BidDepth_k'] = bid_depth
df['AskDepth_k'] = ask_depth
df['DepthImb_k'] = (bid_depth - ask_depth) / (bid_depth + ask_depth + 1e-9)

# Per-side sums of the *_market_notional level columns; the imbalance here is
# ask minus bid (opposite orientation to DepthImb_k).
mo_bid = agg_levels(df, "bids_market_notional", TOP_K)
mo_ask = agg_levels(df, "asks_market_notional", TOP_K)
df['MO_bid_tau'] = mo_bid
df['MO_ask_tau'] = mo_ask
df['MO_imb_tau'] = (mo_ask - mo_bid) / (mo_ask + mo_bid + 1e-9)

# Per-side sums of the *_cancel_notional level columns; imbalance is ask minus bid.
ca_bid = agg_levels(df, "bids_cancel_notional", TOP_K)
ca_ask = agg_levels(df, "asks_cancel_notional", TOP_K)
df['CA_bid_tau'] = ca_bid
df['CA_ask_tau'] = ca_ask
df['CA_imb_tau'] = (ca_ask - ca_bid) / (ca_ask + ca_bid + 1e-9)

# Rolling standard deviation of mid-price returns over the last 5 rows.
ret = df[MID_COL].pct_change()
df['rv_5s'] = ret.rolling(5, min_periods=1).std().fillna(0)

feature_cols = [
    'spread_pct', 'DepthImb_k', 'MO_imb_tau', 'CA_imb_tau', 'rv_5s',
    'BidDepth_k', 'AskDepth_k',
    'MO_bid_tau', 'MO_ask_tau',
    'CA_bid_tau', 'CA_ask_tau',
]

# Sanity check: show which of the expected feature columns were created.
feature_present = [name in df.columns for name in feature_cols]
print(pd.DataFrame({'exists': feature_present}, index=feature_cols))


## 3. Labels (short-horizon direction)

In [None]:

df = df.sort_values(TIME_COL).reset_index(drop=True)

# Estimate the sampling interval from the median gap between consecutive rows.
# Using only the first two rows is fragile: a duplicated or gapped first pair
# would give a wrong row shift for the whole dataset, and fewer than two rows
# would raise IndexError.
gaps_sec = df[TIME_COL].diff().dt.total_seconds()
approx_dt = gaps_sec.median()
if not np.isfinite(approx_dt):
    # No usable gap (0 or 1 rows): fall back to a 1-second grid.
    approx_dt = 1.0

# Convert the horizon in seconds to a whole number of rows (at least 1).
# Intervals below one second are treated as one second.
shift_n = max(1, int(round(HORIZON_SEC / max(1, approx_dt))))
df['mid_fwd'] = df[MID_COL].shift(-shift_n)

# Forward return over the horizon, bucketed into +1 / -1 / 0 using a
# symmetric threshold expressed in basis points.
thresh = THRESH_BPS * 1e-4
r = (df['mid_fwd'] - df[MID_COL]) / df[MID_COL]
y = np.where(r >  thresh,  1, np.where(r < -thresh, -1, 0))
df['y'] = y

# The last shift_n rows have no forward price (their y is a placeholder 0),
# so they are excluded from the labelled set.
df_lbl = df.dropna(subset=['mid_fwd']).copy()
print("Label distribution:", df_lbl['y'].value_counts(normalize=True).round(3).to_dict())


## 4. Scaling helper
This function standardizes the features of the training and test sets so that each feature has a mean of 0 and a standard deviation of 1. The scaling parameters are fitted on the training data only, and the same transformation is then applied to the test data, which keeps the two sets consistent and prevents data leakage.

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_features(df_train, df_test, cols=None, use_pca=USE_PCA, pca_var=PCA_VARIANCE):
    """Standardise features (and optionally apply PCA), fitting on train only.

    Args:
        df_train: DataFrame used to fit the scaler (and PCA).
        df_test: DataFrame transformed with the fitted scaler (and PCA).
        cols: Feature columns to use; defaults to the module-level
            ``feature_cols``.
        use_pca: Whether to apply PCA after standardisation.
        pca_var: Value passed to ``PCA(n_components=...)``.

    Returns:
        ``(Xtr, Xte, pca)``: the transformed train and test arrays and the
        fitted PCA object, or ``None`` in its place when ``use_pca`` is false.
    """
    selected = feature_cols if cols is None else cols

    # Missing feature values are replaced by 0 before scaling.
    raw_train = df_train[selected].fillna(0.0).values
    raw_test = df_test[selected].fillna(0.0).values

    scaler = StandardScaler()
    Xtr = scaler.fit_transform(raw_train)
    Xte = scaler.transform(raw_test)

    if not use_pca:
        return Xtr, Xte, None

    pca = PCA(n_components=pca_var, random_state=RANDOM_STATE)
    Xtr = pca.fit_transform(Xtr)
    Xte = pca.transform(Xte)
    return Xtr, Xte, pca

## 5. Walk-forward splitters
This helper function generates rolling train-test time splits for time-series cross-validation using a walk-forward approach.

In [None]:

def walk_forward_splits_hours_v2(
    timestamps, train_days=3, test_hours=4, embargo_sec=60, stride_hours=None
):
    """Yield rolling ``(train_start, train_end, test_start, test_end)`` windows.

    Each training window is ``train_days`` long. The test window starts
    ``embargo_sec`` seconds after the training window ends and lasts
    ``test_hours`` hours. Successive folds advance the training start by
    ``stride_hours`` (``test_hours`` when ``stride_hours`` is None or 0).
    Generation stops at the first fold whose test window would end after the
    latest timestamp.

    Args:
        timestamps: Anything accepted by ``pd.to_datetime``; parsed as UTC.
        train_days: Length of each training window in days.
        test_hours: Length of each test window in hours.
        embargo_sec: Gap in seconds between training end and test start.
        stride_hours: Hours between consecutive fold starts.

    Yields:
        Tuples of four UTC ``pd.Timestamp`` values.

    Raises:
        ValueError: If the effective stride is not positive, since the
            generator could then never terminate.
    """
    stride = pd.Timedelta(hours=(stride_hours or test_hours))
    if stride <= pd.Timedelta(0):
        raise ValueError("stride_hours (or test_hours as its fallback) must be positive")

    # Only the extremes are needed, so the values are not sorted.
    ts = pd.to_datetime(timestamps, utc=True)
    tmin = ts.min()
    tmax = ts.max()
    # With no valid timestamps both bounds are NaT; every comparison against
    # NaT is False, so the loop below would yield NaT windows forever.
    if pd.isna(tmin) or pd.isna(tmax):
        return

    cur = tmin
    while True:
        tr_s = cur
        tr_e = tr_s + pd.Timedelta(days=train_days)
        te_s = tr_e + pd.Timedelta(seconds=embargo_sec)
        te_e = te_s + pd.Timedelta(hours=test_hours)
        if te_e > tmax:
            break
        yield (tr_s, tr_e, te_s, te_e)
        cur = cur + stride


## 6. Regime discovery (HMM or KMeans fallback)

In [None]:
def fit_hmm(X_train, df_train, n_states=2, random_state=42):
    """Fit a Gaussian HMM on ``X_train`` and identify its "toxic" state.

    A per-row toxicity score is the sum of the absolute values of the
    ``MO_imb_tau``, ``CA_imb_tau`` and ``DepthImb_k`` columns of ``df_train``
    (NaNs treated as 0). The toxic state is the decoded state with the highest
    mean score.

    Args:
        X_train: Observation matrix passed to ``GaussianHMM.fit`` / ``predict``.
        df_train: DataFrame holding the three imbalance columns; assumed to be
            row-aligned with ``X_train`` — verify against the caller.
        n_states: Number of hidden states.
        random_state: Seed for the HMM.

    Returns:
        ``(hmm, tox_state)``: the fitted model and the toxic state's index.
    """
    imbalances = df_train[['MO_imb_tau', 'CA_imb_tau', 'DepthImb_k']].fillna(0)
    tox_score = (
        imbalances['MO_imb_tau'].abs()
        + imbalances['CA_imb_tau'].abs()
        + imbalances['DepthImb_k'].abs()
    )

    model = GaussianHMM(
        n_components=n_states,
        covariance_type=HMM_COVARIANCE,
        n_iter=HMM_ITERATIONS,
        random_state=random_state,
    )
    # Suppress warnings whose message starts with "Model is not converging"
    # for the duration of the fit.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Model is not converging")
        model.fit(X_train)

    decoded = model.predict(X_train)
    mean_tox_by_state = pd.Series(tox_score.values).groupby(decoded).mean()
    return model, int(mean_tox_by_state.idxmax())

## 7. Supervised models (baseline vs regime-aware)

In [None]:

# Direction labels and their mapping to/from the contiguous class indices
# 0..2 used when fitting the classifiers.
CLASSES = [-1, 0, 1]
ENC_MAP = {label: code for code, label in enumerate(CLASSES)}
DEC_MAP = dict(enumerate(CLASSES))

def metrics_from_proba_encoded(proba, y_true_enc):
    """Score class probabilities against encoded true labels.

    Args:
        proba: 2-D array of class probabilities; the arg-max column index is
            taken as the predicted encoded class.
        y_true_enc: Iterable of encoded true classes (keys of ``DEC_MAP``).

    Returns:
        ``(accuracy, macro_f1, yhat, ytrue)`` where ``yhat`` and ``ytrue`` are
        arrays of decoded labels.
    """
    def decode(codes):
        return np.array([DEC_MAP[code] for code in codes])

    yhat = decode(proba.argmax(axis=1))
    ytrue = decode(y_true_enc)
    return (
        accuracy_score(ytrue, yhat),
        f1_score(ytrue, yhat, average='macro'),
        yhat,
        ytrue,
    )

def regime_hits(yhat, ytrue, regime_flag):
    """Return the hit rate separately for regime 0 and regime 1 rows.

    Args:
        yhat: Array of predicted labels.
        ytrue: Array of true labels, same length as ``yhat``.
        regime_flag: Array of regime indicators (0 or 1), same length.

    Returns:
        ``(hr_bal, hr_tox)``: fraction of correct predictions where
        ``regime_flag == 0`` and where ``regime_flag == 1``; ``np.nan`` for a
        regime with no rows.
    """
    correct = (yhat == ytrue)

    def hit_rate(mask):
        if not mask.any():
            return np.nan
        return float(correct[mask].mean())

    return hit_rate(regime_flag == 0), hit_rate(regime_flag == 1)

# Supervised dataset: labelled rows in chronological order.
df_sup = df_lbl.dropna(subset=['y']).copy().sort_values(TIME_COL)

# Materialise every walk-forward window up front so the count can be reported.
folds = list(
    walk_forward_splits_hours_v2(
        df_sup[TIME_COL],
        train_days=TRAIN_DAYS,
        test_hours=TEST_HOURS,
        embargo_sec=EMBARGO_SEC,
        stride_hours=STRIDE_HRS,
    )
)
print("Planned folds:", len(folds))

# Preview the first five windows.
for i, (train_from, train_to, test_from, test_to) in enumerate(folds[:5]):
    print(f"Fold {i}: TRAIN {train_from}→{train_to}  TEST {test_from}→{test_to}")


## 8. Walk-forward evaluation

In [None]:
import time

results = []
fold = 0
# Folds with fewer rows than these in train / test are skipped.
MIN_TRAIN = 300
MIN_TEST  = 100


def _fit_xgb_early_stop(X, y_enc):
    """Fit a 3-class XGBoost model with early stopping on the last 20% of rows.

    The first 80% of rows (class-balanced via sample weights) are used for
    fitting and the remaining rows form the early-stopping evaluation set, so
    ``X`` should be in chronological order.
    """
    weights = compute_sample_weight(class_weight='balanced', y=y_enc)
    model = XGBClassifier(
        n_estimators=XGB_N_ESTIMATORS,
        max_depth=XGB_MAX_DEPTH,
        learning_rate=XGB_LEARNING_RATE,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softprob',
        num_class=3,
        tree_method='hist',
        reg_lambda=1.0,
        random_state=42,
        early_stopping_rounds=XGB_EARLY_STOPPING
    )
    split_idx = int(0.8 * len(X))
    model.fit(
        X[:split_idx], y_enc[:split_idx],
        sample_weight=weights[:split_idx],
        eval_set=[(X[split_idx:], y_enc[split_idx:])],
        verbose=False
    )
    return model


for (tr_s, tr_e, te_s, te_e) in folds:
    fold_start = time.time()
    train = df_sup[(df_sup[TIME_COL] >= tr_s) & (df_sup[TIME_COL] < tr_e)].copy()
    test  = df_sup[(df_sup[TIME_COL] >= te_s) & (df_sup[TIME_COL] < te_e)].copy()
    if len(train) < MIN_TRAIN or len(test) < MIN_TEST:
        continue

    if len(train) > MAX_TRAIN_SAMPLES:
        # DataFrame.sample returns rows in random order. Restore chronological
        # order so the HMM is fitted on a time-ordered sequence and the XGBoost
        # early-stopping hold-out is the most recent 20% of the training
        # window rather than a random 20%.
        train = (
            train.sample(n=MAX_TRAIN_SAMPLES, random_state=RANDOM_STATE)
            .sort_values(TIME_COL, kind='stable')
        )
        print(f"\n[Fold {fold+1}/{len(folds)}] Train: {len(train):,} samples (subsampled), Test: {len(test):,}")
    else:
        print(f"\n[Fold {fold+1}/{len(folds)}] Train: {len(train):,} samples, Test: {len(test):,}")

    # Standardise (and optionally PCA-reduce) features, fitted on train only.
    t0 = time.time()
    Xtr_base, Xte_base, pca_model = scale_features(train, test)
    if USE_PCA and pca_model is not None and fold == 0:
        print(f"PCA: {len(feature_cols)} → {pca_model.n_components_} components ({PCA_VARIANCE*100}% var)")
    print(f"  Scaling+PCA: {time.time()-t0:.1f}s")

    # Map labels {-1, 0, 1} to class indices {0, 1, 2}.
    ytr_enc = np.array([ENC_MAP[v] for v in train['y'].values])
    yte_enc = np.array([ENC_MAP[v] for v in test['y'].values])

    # Baseline models: features only, no regime indicator.
    t0 = time.time()
    lr_base = LogisticRegression(max_iter=1000, C=0.5, class_weight='balanced', solver='lbfgs')
    lr_base.fit(Xtr_base, ytr_enc)
    acc_lr_base, f1_lr_base, _, _ = metrics_from_proba_encoded(lr_base.predict_proba(Xte_base), yte_enc)
    print(f"  LR baseline: {time.time()-t0:.1f}s")

    t0 = time.time()
    xgb_base = _fit_xgb_early_stop(Xtr_base, ytr_enc)
    acc_xgb_base, f1_xgb_base, _, _ = metrics_from_proba_encoded(xgb_base.predict_proba(Xte_base), yte_enc)
    print(f"  XGB baseline: {time.time()-t0:.1f}s")

    # Regime discovery: fit the HMM on train, then decode both train and test.
    t0 = time.time()
    model_reg, tox_state = fit_hmm(Xtr_base, train, n_states=2)
    print(f"  HMM: {time.time()-t0:.1f}s")

    s_tr = model_reg.predict(Xtr_base)
    s_te = model_reg.predict(Xte_base)

    # 1 where the decoded state is the toxic state, 0 otherwise.
    train['regime_hmm'] = (s_tr == tox_state).astype(int)
    test['regime_hmm']  = (s_te == tox_state).astype(int)

    # Regime-aware models: same features plus the regime flag as a last column.
    Xtr = np.c_[Xtr_base, train['regime_hmm'].values]
    Xte = np.c_[Xte_base, test['regime_hmm'].values]

    t0 = time.time()
    lr = LogisticRegression(max_iter=1000, C=0.5, class_weight='balanced', solver='lbfgs')
    lr.fit(Xtr, ytr_enc)
    acc_lr, f1_lr, yhat_lr, ytrue_lr = metrics_from_proba_encoded(lr.predict_proba(Xte), yte_enc)
    print(f"  LR regime-aware: {time.time()-t0:.1f}s")

    t0 = time.time()
    xgb = _fit_xgb_early_stop(Xtr, ytr_enc)
    acc_xgb, f1_xgb, yhat_main, ytrue_main = metrics_from_proba_encoded(xgb.predict_proba(Xte), yte_enc)
    print(f"  XGB regime-aware: {time.time()-t0:.1f}s")

    # Hit rate of the regime-aware XGBoost model, split by test-set regime.
    hr_bal, hr_tox = regime_hits(yhat_main, ytrue_main, test['regime_hmm'].values)

    results.append({
        'fold': fold,
        'test_start': te_s, 'test_end': te_e,
        'acc_lr_base': acc_lr_base,   'f1_lr_base': f1_lr_base,
        'acc_xgb_base': acc_xgb_base, 'f1_xgb_base': f1_xgb_base,
        'acc_lr': acc_lr,   'f1_lr': f1_lr,
        'acc_xgb': acc_xgb, 'f1_xgb': f1_xgb,
        'hr_bal': hr_bal, 'hr_tox': hr_tox,
        'lift': hr_tox - hr_bal,
        'pct_toxic_test': float((test['regime_hmm'] == 1).mean())
    })

    print(f"  FOLD TOTAL: {time.time()-fold_start:.1f}s")
    fold += 1

# Collect the per-fold dictionaries into one table.
df_res = pd.DataFrame(results)
print("\nActual folds evaluated:", len(df_res))
if not len(df_res):
    raise RuntimeError("No folds evaluated.")

# Metrics shown in both the per-fold table and the cross-fold means.
metric_cols = ['acc_lr_base', 'acc_lr', 'acc_xgb_base', 'acc_xgb', 'hr_bal', 'hr_tox', 'lift']
cols = ['fold', 'test_start', 'test_end'] + metric_cols + ['pct_toxic_test']
display(df_res[cols].round(3))

print("\nMeans across folds:")
fold_means = df_res[metric_cols].mean()
display(fold_means.round(3))

## 9. Plot: Regime-conditioned hit-rate per fold

In [None]:

# Grouped bar chart: one pair of bars (balanced vs toxic hit rate) per fold.
fig, ax = plt.subplots(figsize=(7.5,4.2))
x = np.arange(len(df_res))
for offset, column, legend_name in ((-0.2, 'hr_bal', 'Balanced'), (0.2, 'hr_tox', 'Toxic')):
    ax.bar(x + offset, df_res[column], width=0.4, label=legend_name, alpha=0.85)

# Label each fold by the start of its test window.
labels = pd.to_datetime(df_res['test_start']).dt.strftime('%m-%d %H:%M')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=40, ha='right')

mean_lift = df_res['lift'].mean()
ax.set_xlabel("Fold (Test Window Start)")
ax.set_ylabel("Hit Rate")
ax.set_title(f"Regime-Conditioned Hit Rate • Mean Lift = {mean_lift:.3f}")
ax.legend()
plt.tight_layout()
plt.show()
