# Step 4 / Point 4 — Model + Tests

This notebook uses the snapshot feature layer (`features.parquet`) to:

- run hypothesis tests (maker-like vs taker-like, regime splits)
- discover trade archetypes via clustering
- fit a simple model for **directional** Up/Down decisions (walk-forward evaluation)

You should have already run:

1. `01_extract_snapshot.ipynb`
2. `02_feature_layer_and_regimes.ipynb`


In [1]:
username = "gabagool22"

# If None, auto-pick latest snapshot for this username.
snapshot_dir = None

# Cluster params
k_clusters = 8

# Walk-forward params
n_splits = 5


In [2]:
import os
from pathlib import Path


def _find_repo_root(start: Path) -> Path:
    for p in [start, *start.parents]:
        if (p / "research" / "snapshot.py").exists():
            return p
    raise RuntimeError(
        "Cannot locate repo root (expected to find `research/snapshot.py`). "
        "Start Jupyter from the repo root or set REPO_ROOT=/path/to/polybot."
    )


repo_root_env = os.getenv("REPO_ROOT")
repo_root = Path(repo_root_env) if repo_root_env else _find_repo_root(Path.cwd())
snapshots_root = repo_root / "research" / "data" / "snapshots"

print("repo_root:", repo_root)
print("snapshots_root:", snapshots_root)


repo_root: /Users/antoniostano/programming/polybot
snapshots_root: /Users/antoniostano/programming/polybot/research/data/snapshots


In [3]:
import pandas as pd


def _pick_latest_snapshot_dir(root: Path, username: str) -> Path:
    candidates = sorted(root.glob(f"{username}-*"))
    if not candidates:
        raise RuntimeError(f"No snapshots found under {root} for username={username}")
    return max(candidates, key=lambda p: p.stat().st_mtime)


snapshot_path = Path(snapshot_dir) if snapshot_dir else _pick_latest_snapshot_dir(snapshots_root, username)
print("snapshot_path:", snapshot_path)

features_path = snapshot_path / "features.parquet"
if not features_path.exists():
    raise RuntimeError(
        f"Missing {features_path}. Run `02_feature_layer_and_regimes.ipynb` first to generate features.parquet."
    )

df = pd.read_parquet(features_path)
df.shape

snapshot_path: /Users/antoniostano/programming/polybot/research/data/snapshots/gabagool22-20251214T200720+0000


(7221, 56)

In [4]:
import numpy as np

df = df.copy()
df["ts"] = pd.to_datetime(df["ts"], errors="coerce", utc=True)

df[["ts", "series", "asset", "side", "outcome", "exec_type", "regime", "seconds_to_end", "price", "size", "realized_pnl"]].head()

Unnamed: 0,ts,series,asset,side,outcome,exec_type,regime,seconds_to_end,price,size,realized_pnl
0,2025-12-14 12:00:21+00:00,updown-15m,BTC,BUY,Down,UNKNOWN,DIRECTIONAL,879,0.58,20.0,
1,2025-12-14 12:00:33+00:00,updown-15m,BTC,BUY,Down,UNKNOWN,COMPLETE_SET_ARBITRAGE,867,0.61,20.0,
2,2025-12-14 12:00:33+00:00,updown-15m,BTC,BUY,Down,UNKNOWN,COMPLETE_SET_ARBITRAGE,867,0.59,10.1,
3,2025-12-14 12:00:43+00:00,updown-15m,BTC,BUY,Down,UNKNOWN,COMPLETE_SET_ARBITRAGE,857,0.64,20.0,
4,2025-12-14 12:00:43+00:00,updown-15m,BTC,BUY,Down,UNKNOWN,COMPLETE_SET_ARBITRAGE,857,0.64,20.0,


## Hypothesis tests (resolved trades)

We start with simple, high-signal questions:

- Is maker-like execution higher PnL than taker-like?
- Does that differ by regime (COMPLETE_SET_ARBITRAGE vs DIRECTIONAL)?


In [5]:
from scipy.stats import mannwhitneyu


def bootstrap_mean_diff(a: np.ndarray, b: np.ndarray, *, iters: int = 50_000, seed: int = 7) -> dict:
    rng = np.random.default_rng(seed)
    a = a.astype(float)
    b = b.astype(float)
    a = a[np.isfinite(a)]
    b = b[np.isfinite(b)]
    if len(a) == 0 or len(b) == 0:
        return {"n_a": len(a), "n_b": len(b)}

    boot = []
    for _ in range(iters):
        aa = rng.choice(a, size=len(a), replace=True)
        bb = rng.choice(b, size=len(b), replace=True)
        boot.append(float(np.mean(aa) - np.mean(bb)))
    boot = np.array(boot)
    return {
        "n_a": int(len(a)),
        "n_b": int(len(b)),
        "mean_a": float(np.mean(a)),
        "mean_b": float(np.mean(b)),
        "mean_diff": float(np.mean(a) - np.mean(b)),
        "ci_95": [float(np.quantile(boot, 0.025)), float(np.quantile(boot, 0.975))],
    }


resolved = df[df["realized_pnl"].notna()].copy()
resolved = resolved[resolved["exec_type"].isin(["MAKER_LIKE", "TAKER_LIKE", "INSIDE"])]

def compare_exec(a_label: str, b_label: str, subset: pd.DataFrame) -> dict:
    a = subset.loc[subset["exec_type"] == a_label, "realized_pnl"].to_numpy()
    b = subset.loc[subset["exec_type"] == b_label, "realized_pnl"].to_numpy()
    out = {"a": a_label, "b": b_label}
    out.update(bootstrap_mean_diff(a, b, iters=20_000))
    if len(a) > 0 and len(b) > 0:
        u = mannwhitneyu(a, b, alternative="two-sided")
        out["mw_p"] = float(u.pvalue)
    return out


overall = compare_exec("MAKER_LIKE", "TAKER_LIKE", resolved)
overall

{'a': 'MAKER_LIKE',
 'b': 'TAKER_LIKE',
 'n_a': 1623,
 'n_b': 1373,
 'mean_a': 1.7316525200246458,
 'mean_b': -1.839948994901675,
 'mean_diff': 3.5716015149263205,
 'ci_95': [3.044840473062987, 4.094600520798638],
 'mw_p': 5.458599227950962e-34}

In [6]:
results = []
for regime in ["COMPLETE_SET_ARBITRAGE", "DIRECTIONAL"]:
    sub = resolved[resolved["regime"] == regime]
    results.append({"regime": regime, **compare_exec("MAKER_LIKE", "TAKER_LIKE", sub)})

pd.DataFrame(results)

Unnamed: 0,regime,a,b,n_a,n_b,mean_a,mean_b,mean_diff,ci_95,mw_p
0,COMPLETE_SET_ARBITRAGE,MAKER_LIKE,TAKER_LIKE,560,535,1.926553,-2.779204,4.705758,"[3.788787936786548, 5.618876404910714]",9.024099e-19
1,DIRECTIONAL,MAKER_LIKE,TAKER_LIKE,1063,838,1.628977,-1.240305,2.869282,"[2.231551947227418, 3.512006507899272]",5.533444e-16


## Clustering: discover trade archetypes

We cluster trade feature vectors and then look at PnL/behavior by cluster.


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans


# Use only trades with TOB + market context for cleaner clustering.
cluster_df = df.copy()
cluster_df = cluster_df[cluster_df["tob_known"] == 1]

# Some snapshots might not have market-trade features; include only existing cols.
numeric_candidates = [
    "seconds_to_end",
    "price",
    "size",
    "notional_usd",
    "spread",
    "tob_imbalance",
    "edge_vs_mid",
    "effective_spread_ratio",
    "hour_utc",
    "mt_last_age_ms",
    "mt_return_30s",
    "mt_return_300s",
    "mt_trades_60s",
    "mt_volume_60s",
    "mt_notional_60s",
]
numeric_features = [c for c in numeric_candidates if c in cluster_df.columns]

categorical_candidates = ["series", "asset", "outcome", "exec_type", "regime"]
categorical_features = [c for c in categorical_candidates if c in cluster_df.columns]

preprocess = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    ("impute", SimpleImputer(strategy="median")),
                    ("scale", StandardScaler()),
                ]
            ),
            numeric_features,
        ),
        (
            "cat",
            Pipeline(
                steps=[
                    ("impute", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_features,
        ),
    ]
)

model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("kmeans", KMeans(n_clusters=int(k_clusters), n_init=10, random_state=7)),
    ]
)

cluster_labels = model.fit_predict(cluster_df)
cluster_df["cluster"] = cluster_labels
cluster_df[["cluster", "series", "asset", "exec_type", "regime"]].head()

Unnamed: 0,cluster,series,asset,exec_type,regime
48,1,updown-15m,BTC,MAKER_LIKE,COMPLETE_SET_ARBITRAGE
49,1,updown-15m,BTC,MAKER_LIKE,COMPLETE_SET_ARBITRAGE
50,0,updown-15m,BTC,MAKER_LIKE,DIRECTIONAL
51,0,updown-15m,BTC,MAKER_LIKE,DIRECTIONAL
52,0,updown-15m,BTC,MAKER_LIKE,DIRECTIONAL


In [8]:
summary = (
    cluster_df.assign(win=cluster_df["realized_pnl"].fillna(0) > 0)
    .groupby("cluster", as_index=False)
    .agg(
        trades=("event_key", "count"),
        resolved_trades=("realized_pnl", lambda s: int(s.notna().sum())),
        pnl_usd=("realized_pnl", "sum"),
        avg_pnl=("realized_pnl", "mean"),
        win_rate=("win", "mean"),
        avg_seconds_to_end=("seconds_to_end", "mean"),
        avg_spread=("spread", "mean"),
        avg_imbalance=("tob_imbalance", "mean"),
    )
    .sort_values("pnl_usd", ascending=False)
)
summary

Unnamed: 0,cluster,trades,resolved_trades,pnl_usd,avg_pnl,win_rate,avg_seconds_to_end,avg_spread,avg_imbalance
3,3,567,459,834.92122,1.819,0.398589,807.010582,0.98,0.002154
5,5,487,475,562.129295,1.18343,0.572895,768.667351,0.043717,0.012116
1,1,981,939,412.72425,0.439536,0.632008,648.979613,0.015607,-0.189981
6,6,5,5,25.0215,5.0043,1.0,3382.6,0.026,0.181909
7,7,82,82,-39.3992,-0.480478,0.609756,337.243902,0.276463,-0.067704
0,0,1091,1059,-158.216095,-0.149401,0.368469,614.92484,0.016434,0.107169
4,4,818,683,-384.33786,-0.56272,0.378973,2424.594132,0.256308,0.014802
2,2,453,411,-876.77643,-2.133276,0.461369,725.728477,0.98,-0.004651


## Directional decision model (Up vs Down)

Here we try to predict his *directional* choice (Up/Down) from pre-trade features.

This is an early proxy for his signal: if a small model can predict the choice, it suggests his behavior is systematic.


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score


dir_df = df.copy()
dir_df = dir_df[
    (dir_df["side"] == "BUY")
    & (dir_df["outcome"].isin(["Up", "Down"]))
    & (dir_df["regime"] == "DIRECTIONAL")
    & (dir_df["series"].isin(["updown-15m", "up-or-down"]))
].copy()

dir_df = dir_df.sort_values("ts")
y = (dir_df["outcome"] == "Up").astype(int)

# Feature set for this model: exclude outcome + realized pnl.
drop_cols = {"outcome", "realized_pnl", "resolved_outcome", "settle_price", "transaction_hash", "event_key"}
X = dir_df.drop(columns=[c for c in drop_cols if c in dir_df.columns])

numeric_candidates = [
    "seconds_to_end",
    "price",
    "size",
    "notional_usd",
    "spread",
    "tob_imbalance",
    "edge_vs_mid",
    "effective_spread_ratio",
    "hour_utc",
    "mt_last_age_ms",
    "mt_return_30s",
    "mt_return_300s",
    "mt_trades_60s",
    "mt_volume_60s",
    "mt_notional_60s",
]
num = [c for c in numeric_candidates if c in X.columns]

cat_candidates = ["series", "asset", "exec_type"]
cat = [c for c in cat_candidates if c in X.columns]

pre = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline([
                ("impute", SimpleImputer(strategy="median")),
                ("scale", StandardScaler()),
            ]),
            num,
        ),
        (
            "cat",
            Pipeline([
                ("impute", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]),
            cat,
        ),
    ]
)

clf = Pipeline(
    [
        ("pre", pre),
        ("lr", LogisticRegression(max_iter=500, solver="lbfgs")),
    ]
)

tscv = TimeSeriesSplit(n_splits=int(n_splits))
accs = []
aucs = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    clf.fit(X_train, y_train)
    proba = clf.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)

    acc = accuracy_score(y_test, pred)
    auc = roc_auc_score(y_test, proba) if len(np.unique(y_test)) > 1 else float("nan")
    accs.append(acc)
    aucs.append(auc)

print("directional_rows:", len(dir_df))
print("accuracy mean±std:", float(np.mean(accs)), float(np.std(accs)))
print("roc_auc mean±std:", float(np.nanmean(aucs)), float(np.nanstd(aucs)))

directional_rows: 4556
accuracy mean±std: 0.4885375494071146 0.05004784364251645
roc_auc mean±std: 0.49527869225366816 0.07348919145467332


 'mt_return_30s' 'mt_return_300s' 'mt_trades_60s' 'mt_volume_60s'
 'mt_notional_60s']. At least one non-missing value is needed for imputation with strategy='median'.
 'mt_return_30s' 'mt_return_300s' 'mt_trades_60s' 'mt_volume_60s'
 'mt_notional_60s']. At least one non-missing value is needed for imputation with strategy='median'.
 'mt_return_30s' 'mt_return_300s']. At least one non-missing value is needed for imputation with strategy='median'.
 'mt_return_30s' 'mt_return_300s']. At least one non-missing value is needed for imputation with strategy='median'.
