# In [None]:
import pandas as pd
import numpy as np

# --- Load the raw book updates and put them in replay order ---------------
path = "/mnt/data/tba_futures_5.mbp10.sample.csv"
df = pd.read_csv(path)
df["ts_event"] = pd.to_datetime(df["ts_event"], utc=True)
df = df.sort_values(by=["instrument_id", "ts_event", "sequence"]).reset_index(drop=True)

# --- Constants --------------------------------------------------------------
K = 10                               # number of book levels read per side
eps = 1e-12                          # guard added to denominators below
w = np.exp(-0.5 * np.arange(K))      # per-level weights, decaying with level index

# Column names per level: "<side>_<px|sz>_00" ... "<side>_<px|sz>_09".
lvl = [str(i).zfill(2) for i in range(K)]
bid_px_cols = ["bid_px_" + tag for tag in lvl]
ask_px_cols = ["ask_px_" + tag for tag in lvl]
bid_sz_cols = ["bid_sz_" + tag for tag in lvl]
ask_sz_cols = ["ask_sz_" + tag for tag in lvl]

# One (n_rows, K) float matrix per side and field.
bid_px, ask_px, bid_sz, ask_sz = (
    df[cols].to_numpy(dtype=float)
    for cols in (bid_px_cols, ask_px_cols, bid_sz_cols, ask_sz_cols)
)

# --- Top-of-book quantities -------------------------------------------------
b1, a1 = bid_px[:, 0], ask_px[:, 0]
qb1, qa1 = bid_sz[:, 0], ask_sz[:, 0]

s = a1 - b1                          # quoted spread
m = (a1 + b1) / 2.0                  # mid price
rel_s = s / (m + eps)                # spread relative to mid

# Signed size imbalance at level 0: (bid - ask) / (bid + ask).
imb1 = (qb1 - qa1) / (qb1 + qa1 + eps)

# Same imbalance over all K levels, with sizes weighted by `w`.
wb = np.sum(bid_sz * w, axis=1)
wa = np.sum(ask_sz * w, axis=1)
imbK = (wb - wa) / (wb + wa + eps)

# Unweighted total size per side and the log of their ratio.
Qb = np.sum(bid_sz, axis=1)
Qa = np.sum(ask_sz, axis=1)
depth_ratio = np.log((Qb + eps) / (Qa + eps))

# Size-weighted price: the ask is weighted by bid size and vice versa.
micro = (a1 * qb1 + b1 * qa1) / (qb1 + qa1 + eps)
micro_prem = micro - m

# --- Book shape -------------------------------------------------------------
# Per-level price distance from mid (NaN levels are skipped by nansum below).
mid_col = m.reshape(-1, 1)
d_a = ask_px - mid_col
d_b = mid_col - bid_px

# Cumulative size walking away from the touch.
C_a = ask_sz.cumsum(axis=1)
C_b = bid_sz.cumsum(axis=1)

# Summed price distance divided by summed cumulative size, per side.
slope_a = np.nansum(d_a, axis=1) / (np.nansum(C_a, axis=1) + eps)
slope_b = np.nansum(d_b, axis=1) / (np.nansum(C_b, axis=1) + eps)

# Size in the first K//2 levels divided by size in the remaining levels.
h = K // 2
conv_a = np.sum(ask_sz[:, :h], axis=1) / (np.sum(ask_sz[:, h:], axis=1) + eps)
conv_b = np.sum(bid_sz[:, :h], axis=1) / (np.sum(bid_sz[:, h:], axis=1) + eps)

def book_wap(px_mat, sz_mat, V):
    """Return the size-weighted average price of sweeping ``V`` units of depth.

    Each row is one book snapshot; columns are levels ordered from the touch
    outwards. Quantity is consumed level by level until ``V`` units are taken
    or the visible levels run out; if the row holds less than ``V`` in total,
    the average is taken over the quantity that is available.

    Args:
        px_mat: 2-D array-like of level prices, shape (n_rows, n_levels).
        sz_mat: 2-D array-like of level sizes, same shape as ``px_mat``.
        V: Target quantity to sweep.

    Returns:
        1-D float array of length n_rows. A row with nothing to fill (all
        sizes zero/NaN, or ``V`` <= 0) yields NaN rather than a fake price
        of 0.
    """
    px_raw = np.asarray(px_mat, dtype=float)
    sz = np.nan_to_num(np.asarray(sz_mat, dtype=float), nan=0.0)
    # A level without a price cannot be traded against: drop its size so it
    # is not filled at a price of 0, which would drag the average down.
    sz = np.where(np.isnan(px_raw), 0.0, sz)
    px = np.nan_to_num(px_raw, nan=0.0)

    cum = np.cumsum(sz, axis=1)
    # Quantity already consumed before reaching each level.
    prev = np.concatenate([np.zeros((sz.shape[0], 1)), cum[:, :-1]], axis=1)
    # Per-level fill: whatever of V is still outstanding, capped by level size.
    fill = np.clip(V - prev, 0.0, sz)

    cost = (fill * px).sum(axis=1)
    filled = fill.sum(axis=1)

    # Divide only where something was filled; everything else stays NaN.
    wap = np.full(filled.shape, np.nan)
    np.divide(cost, filled, out=wap, where=filled > 0.0)
    return wap

# --- Sweep cost for a fixed quantity ----------------------------------------
# Average price of taking V units from each side, and its distance from mid.
V = 5.0
wap_buy = book_wap(ask_px, ask_sz, V)
wap_sell = book_wap(bid_px, bid_sz, V)
imp_buy = wap_buy - m
imp_sell = m - wap_sell

# --- Per-event feature table (one row per book update, same order as df) ----
feat = pd.DataFrame(
    {
        "instrument_id": df["instrument_id"].to_numpy(),
        "symbol": df["symbol"].to_numpy(),
        "ts_event": df["ts_event"].to_numpy(),
        "mid": m,
        "spread": s,
        "rel_spread": rel_s,
        "imb_1": imb1,
        "imb_10_wexp": imbK,
        "microprice": micro,
        "micro_prem": micro_prem,
        "depth_bid_10": Qb,
        "depth_ask_10": Qa,
        "depth_ratio_log": depth_ratio,
        "slope_ask": slope_a,
        "slope_bid": slope_b,
        "convex_ask": conv_a,
        "convex_bid": conv_b,
        "wap_buy_V5": wap_buy,
        "wap_sell_V5": wap_sell,
        "imp_buy_V5": imp_buy,
        "imp_sell_V5": imp_sell,
        "bid_px_00": b1,
        "ask_px_00": a1,
        "bid_sz_00": qb1,
        "ask_sz_00": qa1,
    }
)

# --- Level-0 changes between consecutive updates of the same instrument -----
# The first update of each instrument has no predecessor, so its diff is NaN.
g = feat.groupby("instrument_id", sort=False)
feat["d_bid_sz_00"] = g["bid_sz_00"].diff()
feat["d_ask_sz_00"] = g["ask_sz_00"].diff()
feat["d_bid_px_00"] = g["bid_px_00"].diff()
feat["d_ask_px_00"] = g["ask_px_00"].diff()

# Bid size change minus ask size change, raw and scaled by level-0 depth.
feat["nofi"] = feat["d_bid_sz_00"] - feat["d_ask_sz_00"]
feat["nnofi"] = feat["nofi"] / (feat["bid_sz_00"] + feat["ask_sz_00"] + eps)

# Size removed at level 0 (size decreases only, reported as positive numbers);
# rows with no previous update count as zero depletion.
feat["deplete_bid_1"] = (-feat["d_bid_sz_00"].clip(upper=0.0)).fillna(0.0)
feat["deplete_ask_1"] = (-feat["d_ask_sz_00"].clip(upper=0.0)).fillna(0.0)

# --- Same flow measures across all K levels ---------------------------------
# Per-level size change within each instrument, computed in one grouped diff
# instead of a Python loop with per-group .loc writes. ascontiguousarray keeps
# the row-major layout so the row sums below are accumulated in the same
# order as a plain numpy array would be.
inst_key = feat["instrument_id"].to_numpy()
dbs = np.ascontiguousarray(
    pd.DataFrame(bid_sz).groupby(inst_key, sort=False).diff().to_numpy(dtype=float)
)
daz = np.ascontiguousarray(
    pd.DataFrame(ask_sz).groupby(inst_key, sort=False).diff().to_numpy(dtype=float)
)

# Weighted bid-side change minus weighted ask-side change; NaN on rows with
# no previous update, like the level-0 "nofi" column.
feat["flow_10_wexp"] = (dbs * w).sum(axis=1) - (daz * w).sum(axis=1)

# Total size removed over the K levels. Missing diffs count as zero depletion
# so these columns follow the same convention as deplete_bid_1/deplete_ask_1
# (previously the first row of every instrument was left as NaN here).
feat["deplete_bid_10"] = np.nan_to_num(-np.minimum(dbs, 0.0), nan=0.0).sum(axis=1)
feat["deplete_ask_10"] = np.nan_to_num(-np.minimum(daz, 0.0), nan=0.0).sum(axis=1)

# Columns that describe a change between updates (as opposed to book state).
flow_cols = {
    "d_bid_sz_00",
    "d_ask_sz_00",
    "d_bid_px_00",
    "d_ask_px_00",
    "nofi",
    "nnofi",
    "deplete_bid_1",
    "deplete_ask_1",
    "flow_10_wexp",
    "deplete_bid_10",
    "deplete_ask_10",
}
# Everything else except the timestamp is treated as point-in-time state.
snapshot_cols = [c for c in feat.columns if c not in flow_cols and c != "ts_event"]

def make_bars(feat, freq, snap_cols=None):
    """Resample per-event book features into fixed-width, right-labelled bars.

    For every instrument a regular grid of bar end times is built from the
    floor of its first ``ts_event`` to the ceiling of its last. Each bar
    covers the half-open interval ``(bar_end - freq, bar_end]`` and carries:

    * the snapshot columns as last observed at or before ``bar_end``,
    * the sum over the bar of the flow columns (0 for bars without updates),
    * ``n_updates`` (rows in the bar), ``upd_rate`` (``n_updates`` per second)
      and ``age_sec`` (seconds between ``bar_end`` and the most recent update
      at or before it).

    Grid points that precede the instrument's first update are dropped.

    Args:
        feat: Per-event feature frame with ``instrument_id``, a UTC
            ``ts_event`` column, the snapshot columns and the flow columns
            summed below.
        freq: Bar width as a pandas frequency string, e.g. ``"1s"``.
        snap_cols: Columns to carry forward as state. Must include
            ``"instrument_id"``. Defaults to the module-level
            ``snapshot_cols`` list.

    Returns:
        DataFrame with one row per (instrument, bar), columns ``bar_end``,
        the snapshot columns, the summed flow columns, ``n_updates``,
        ``upd_rate`` and ``age_sec``. Empty (with those columns) when
        ``feat`` has no rows.
    """
    if snap_cols is None:
        snap_cols = snapshot_cols
    snap_cols = list(snap_cols)
    sum_cols = [
        "nofi",
        "nnofi",
        "deplete_bid_1",
        "deplete_ask_1",
        "flow_10_wexp",
        "deplete_bid_10",
        "deplete_ask_10",
        "d_bid_px_00",
        "d_ask_px_00",
    ]
    bar_sec = pd.Timedelta(freq).total_seconds()
    id_dtype = feat["instrument_id"].dtype

    out = []
    for _, sub in feat.groupby("instrument_id", sort=False):
        # Stable sort keeps the incoming order of rows sharing a timestamp.
        sub = sub.set_index("ts_event").sort_index(kind="stable")
        idx = pd.date_range(
            sub.index.min().floor(freq),
            sub.index.max().ceil(freq),
            freq=freq,
            tz="UTC",
        )

        # Several updates can share one ts_event, and reindexing a frame with
        # duplicate labels raises. Carry values forward in event order first,
        # then keep only the final state at each distinct timestamp.
        state = sub[snap_cols].ffill()
        state = state[~state.index.duplicated(keep="last")]
        snap = state.reindex(state.index.union(idx)).sort_index().ffill().reindex(idx)

        binned = sub.resample(freq, label="right", closed="right")
        agg = binned.agg({col: "sum" for col in sum_cols}).reindex(idx).fillna(0.0)
        cnt = binned.size().reindex(idx).fillna(0).astype(int).rename("n_updates")

        # Timestamp of the latest update at or before each bar end.
        last_time = (
            sub.index.to_series()
            .resample(freq, label="right", closed="right")
            .last()
            .reindex(idx)
            .ffill()
        )
        age = (last_time.index - last_time).dt.total_seconds().rename("age_sec")
        upd_rate = (cnt / bar_sec).rename("upd_rate")

        # All pieces share the grid `idx`, so they are joined positionally.
        bar = pd.concat(
            [
                snap.rename_axis("bar_end").reset_index(),
                agg.reset_index(drop=True),
                cnt.reset_index(drop=True),
                upd_rate.reset_index(drop=True),
                age.reset_index(drop=True),
            ],
            axis=1,
        )
        # Bars before the first update have no state to report.
        bar = bar.dropna(subset=["instrument_id"]).reset_index(drop=True)
        # The NaN rows introduced by reindexing upcast integer ids to float;
        # none remain after the dropna, so the original dtype can be restored.
        bar["instrument_id"] = bar["instrument_id"].astype(id_dtype)
        out.append(bar)

    if not out:
        # pd.concat([]) raises; hand back an empty frame with the same layout.
        return pd.DataFrame(
            columns=["bar_end", *snap_cols, *sum_cols, "n_updates", "upd_rate", "age_sec"]
        )
    return pd.concat(out, ignore_index=True)

# Aggregate the per-event features into 1-second and 5-second bars, then show
# the first 20 rows of each table.
bars_1s, bars_5s = (make_bars(feat, width) for width in ("1s", "5s"))

print(bars_1s.head(n=20), bars_5s.head(n=20), sep="\n")