In [1]:
import pandas as pd
import numpy as np
import os

from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Date tag shared by the input and output file names of this notebook.
ts = "20240207"

# Location of the pre-processed event table for that date tag.
mbo_pkl = "processed/mbo.%s.pkl" % (ts,)

# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
mbo = pd.read_pickle(mbo_pkl)

In [9]:
# First-passage times per order: when it was added, and when it first shows up
# in a fill ('F') or trade ('T') event.

# Add events -- the assert checks that every order_id is added at most once.
add_time = mbo[mbo.action == 'A'][["order_id", "ts_event"]].rename(columns={"ts_event": "add_time"})
assert not add_time.order_id.duplicated().any()

# Fill events: earliest 'F' timestamp per order.
fill_time = mbo[mbo.action == 'F'][["order_id", "ts_event"]].rename(columns={"ts_event": "fill_time"})
first_fill_time = fill_time.groupby('order_id', as_index=False)["fill_time"].min()

# Trade events: earliest 'T' timestamp per order (stored under the same
# "fill_time" column name so the two tables can be stacked).
trade_time = mbo[mbo.action == 'T'][["order_id", "ts_event"]].rename(columns={"ts_event": "fill_time"})
first_trade_time = trade_time.groupby('order_id', as_index=False)["fill_time"].min()

# Stack both tables, order by (order_id, time) and keep the first row per order,
# i.e. the earliest of the first fill and the first trade.
first_fill_trade_time = (
    pd.concat([first_fill_time, first_trade_time], axis=0)
    .sort_values(by=["order_id", "fill_time"])
    .reset_index(drop=True)
    .drop_duplicates(subset=["order_id"], keep="first")
)

In [11]:
# Preview the earliest fill/trade timestamp per order.
first_fill_trade_time.head(n=5)

Unnamed: 0,order_id,fill_time
0,0,2024-02-06 19:01:00.608812567-05:00
1,8413617207205,2024-02-07 06:55:46.910571859-05:00
2,8413617247219,2024-02-07 09:51:07.062097265-05:00
3,8413621618936,2024-02-07 09:49:04.657986843-05:00
4,8413621619757,2024-02-07 09:49:04.657986843-05:00


In [12]:
# Keep only orders that have both an add event and a fill/trade event (inner join),
# then measure the elapsed time between the two in seconds.
df_time = add_time.merge(first_fill_trade_time, how="inner", on="order_id")
df_time["delta_fill_time"] = df_time["fill_time"].sub(df_time["add_time"]).dt.total_seconds()

In [15]:
# Preview add time, first fill/trade time and the delay between them.
df_time.head(n=5)

Unnamed: 0,order_id,add_time,fill_time,delta_fill_time
0,8413621757838,2024-02-04 19:11:40.527145045-05:00,2024-02-07 09:51:10.162059201-05:00,225569.634914
1,8413621757844,2024-02-04 19:11:40.527297047-05:00,2024-02-07 09:51:10.162059201-05:00,225569.634762
2,8413621757950,2024-02-04 19:11:40.532551941-05:00,2024-02-07 09:51:10.162059201-05:00,225569.629507
3,8413621758807,2024-02-04 19:11:42.660741725-05:00,2024-02-07 09:51:10.162059201-05:00,225567.501317
4,8413621759000,2024-02-04 19:11:42.959864489-05:00,2024-02-07 09:51:10.162059201-05:00,225567.202195


In [18]:
# Summary statistics of the add-to-first-fill delay (seconds).
df_time["delta_fill_time"].describe()

count    257164.000000
mean        771.554596
std        7105.815916
min           0.000000
25%           0.453363
50%           9.559208
75%          73.849420
max      225569.634914
Name: delta_fill_time, dtype: float64

# Simulation

In [19]:
# ---------------------------------------------------------------------------
# Bookkeeping state for the order-book replay ("Model the queue").
# ---------------------------------------------------------------------------

# List to collect the number of orders at a price after each relevant event.
depth_samples = list()

# Orders currently active in the book and their state.
#   order_id -> {'price': ..., 'side': ..., 'size': ...}
#   e.g. {8413639623048: {'price': 104.53125, 'side': 'B', 'size': 128},
#         8413639623602: {'price': 107.421875, 'side': 'B', 'size': 20}, ...}
active_orders = dict()

# Number of orders at each price level.
#   (side, price) -> count
#   e.g. {('B', -1.5625): 1, ('A', 0.8125): 1, ...}
count_per_price = defaultdict(int)

# Total volume at each price level.
#   (side, price) -> volume
#   e.g. {('A', -0.4453125): 500, ('A', -0.4375): 500, ...}
volume_per_price = defaultdict(int)

# FIFO queue of order ids at each price level.
#   (side, price) -> [order_id, ...] in arrival order
#   e.g. {('A', -0.421875): [8413644184881], ('A', -0.40625): [8413644184885], ...}
price_queues = defaultdict(list)

# Distinct prices currently present on each side of the book.
#   bids e.g. {-1.5625, -0.8125, 0.8046875, ...}
#   asks e.g. {119.0625, 122.265625, 127.65625, ...}
bid_prices_set, ask_prices_set = set(), set()

# Current best bid / best ask price (None until that side has a price).
best_bid_price = best_ask_price = None

# Features captured for each order at the moment it is added.
initial_order_features = dict()

In [None]:
# Replay the event table row by row, maintaining a per-price-level view of the
# book and recording a feature snapshot for every order at the moment it is added.
#
# Reads : mbo (columns action, side, order_id, price, size)
# Writes: active_orders, count_per_price, volume_per_price, price_queues,
#         bid_prices_set, ask_prices_set, best_bid_price, best_ask_price,
#         initial_order_features

# Start from an empty book so that re-running this cell (e.g. after an
# interrupted run) does not double count orders left over from the previous run.
active_orders.clear()
count_per_price.clear()
volume_per_price.clear()
price_queues.clear()
bid_prices_set.clear()
ask_prices_set.clear()
initial_order_features.clear()
best_bid_price = None
best_ask_price = None

for row in mbo.itertuples(index=False):
    action, side, order_id, price, size = row.action, row.side, row.order_id, row.price, row.size

    if action == 'A':
        # ADD: new order added to active orders, at the back of its price level
        active_orders[order_id] = {"price": price, "side": side, "size": size}
        count_per_price[(side, price)] += 1
        volume_per_price[(side, price)] += size
        price_queues[(side, price)].append(order_id)
        # update best bid/ask if necessary
        # NOTE(review): only sides 'B' and 'A' are handled; for any other side value
        # diff_to_best_price / price_level / bid_ask_imbalance would be undefined or
        # carried over from an earlier row -- confirm Add rows always carry a side.
        if side == 'B':
            bid_prices_set.add(price)
            if best_bid_price is None or price > best_bid_price:
                best_bid_price = price
            diff_to_best_price = best_bid_price - price
            assert diff_to_best_price >= 0
        elif side == 'A':
            ask_prices_set.add(price)
            if best_ask_price is None or price < best_ask_price:
                best_ask_price = price
            diff_to_best_price = price - best_ask_price
            assert diff_to_best_price >= 0

        # queue position at this price (1 = front; the new order is always last)
        queue_position = len(price_queues[(side, price)])
        # price level rank (relative level on that side of book, 1 = best)
        if side == 'B':
            price_level = 1 + sum(1 for p in bid_prices_set if p > price) # count how many bid prices are higher than this price
        elif side == 'A':
            price_level = 1 + sum(1 for p in ask_prices_set if p < price) # count how many ask prices are lower than this price
        # depth and volume at this price (both include the new order itself)
        orders_at_price = count_per_price[(side, price)]
        volume_at_price = volume_per_price[(side, price)]

        # orders / volume queued ahead of the new order at the same price
        orders_ahead_same = queue_position - 1
        volume_ahead_same = 0
        if orders_ahead_same > 0:
            # sum the sizes of all orders ahead in the queue
            for oid in price_queues[(side, price)][:orders_ahead_same]:
                volume_ahead_same += active_orders[oid]['size']
        # total orders/volume ahead on the same side (better prices + same price ahead)
        orders_ahead_total = orders_ahead_same
        volume_ahead_total = volume_ahead_same
        if side == 'B':
            # all orders at higher bid prices are ahead in priority
            for p in bid_prices_set:
                if p > price:
                    orders_ahead_total += count_per_price[('B', p)]
                    volume_ahead_total += volume_per_price[('B', p)]
        elif side == 'A':
            # all orders at lower ask prices are ahead in priority
            for p in ask_prices_set:
                if p < price:
                    orders_ahead_total += count_per_price[('A', p)]
                    volume_ahead_total += volume_per_price[('A', p)]

        # whole-book totals per side; only the volumes feed the imbalance feature
        num_bid_orders = sum(v for k, v in count_per_price.items() if k[0] == 'B')
        volume_bid_orders = sum(v for k, v in volume_per_price.items() if k[0] == 'B')
        num_ask_orders = sum(v for k, v in count_per_price.items() if k[0] == 'A')
        volume_ask_orders = sum(v for k, v in volume_per_price.items() if k[0] == 'A')
        # imbalance is signed from the point of view of the order's own side:
        # positive means more resting volume on the same side than on the opposite side
        if side == 'B':
            bid_ask_imbalance = (volume_bid_orders - volume_ask_orders) / (volume_bid_orders + volume_ask_orders) if (volume_bid_orders + volume_ask_orders) > 0 else 0
        elif side == 'A':
            bid_ask_imbalance = (volume_ask_orders - volume_bid_orders) / (volume_bid_orders + volume_ask_orders) if (volume_bid_orders + volume_ask_orders) > 0 else 0

        # summary: initial features when an order is added to the book
        initial_order_features[order_id] = {
            "side": side,
            "price": price,
            "size": size,
            "price_level": price_level,
            "queue_position": queue_position,
            "orders_at_price": orders_at_price,
            "volume_at_price": volume_at_price,
            "orders_ahead_total": orders_ahead_total,
            "volume_ahead_total": volume_ahead_total,
            # "is_best": (price_level == 1),
            "bid_ask_imbalance": bid_ask_imbalance,
            "diff_to_best_price": diff_to_best_price,
        }

    elif action == "C":
        # CANCEL: partial or full removal, partial or full fill.
        # `size` is interpreted as the quantity removed from the resting order.
        if order_id in active_orders:
            current_price = active_orders[order_id]['price']
            current_side = active_orders[order_id]['side']
            current_size = active_orders[order_id]['size']
            cancel_size = size
            if cancel_size < current_size:
                # partial cancel: reduce order size
                active_orders[order_id]['size'] = current_size - cancel_size
                volume_per_price[(current_side, current_price)] -= cancel_size
                # count_per_price remains the same, order stays in queue
            else:
                # full cancel: remove order completely.
                # A cancel larger than the tracked size is treated as a full removal
                # too; previously such a row matched neither branch and left a
                # phantom order in the book. Only the tracked size is subtracted so
                # the level volume stays equal to the sum of its orders' sizes.
                del active_orders[order_id]
                count_per_price[(current_side, current_price)] -= 1
                volume_per_price[(current_side, current_price)] -= current_size
                price_queues[(current_side, current_price)].remove(order_id)
                new_depth = count_per_price[(current_side, current_price)]
                # depth: drop the level entirely once its last order is gone
                if new_depth == 0:
                    count_per_price.pop((current_side, current_price), None)
                    volume_per_price.pop((current_side, current_price), None)
                    if current_side == 'B':
                        bid_prices_set.discard(current_price)
                    elif current_side == 'A':
                        ask_prices_set.discard(current_price)
                # update best price (only needed when the order sat at the best level)
                if current_side == 'B' and current_price == best_bid_price:
                    best_bid_price = max(bid_prices_set) if bid_prices_set else None
                if current_side == 'A' and current_price == best_ask_price:
                    best_ask_price = min(ask_prices_set) if ask_prices_set else None

    elif action == "M":
        # MODIFY: modify order (price and/or size); the side is taken from the
        # stored order, not from the event row
        if order_id in active_orders:
            current_price = active_orders[order_id]['price']
            current_side = active_orders[order_id]['side']
            current_size = active_orders[order_id]['size']
            new_price = price
            new_size = size
            if new_price != current_price:
                # leave the old level ...
                count_per_price[(current_side, current_price)] -= 1
                volume_per_price[(current_side, current_price)] -= current_size
                price_queues[(current_side, current_price)].remove(order_id)
                if count_per_price[(current_side, current_price)] == 0:
                    count_per_price.pop((current_side, current_price), None)
                    volume_per_price.pop((current_side, current_price), None)
                    if current_side == 'B':
                        bid_prices_set.discard(current_price)
                    elif current_side == 'A':
                        ask_prices_set.discard(current_price)
                # ... and join the back of the queue at the new level
                count_per_price[(current_side, new_price)] += 1
                volume_per_price[(current_side, new_price)] += new_size
                price_queues[(current_side, new_price)].append(order_id)
                active_orders[order_id]['price'] = new_price
                # update best bid/ask if necessary
                if current_side == 'B':
                    bid_prices_set.add(new_price)
                    if best_bid_price is None or new_price > best_bid_price:
                        best_bid_price = new_price
                    elif current_price == best_bid_price:
                        # The order moved away from the best level, which may now be
                        # empty; re-derive the best bid instead of keeping a stale one.
                        best_bid_price = max(bid_prices_set)
                if current_side == 'A':
                    ask_prices_set.add(new_price)
                    if best_ask_price is None or new_price < best_ask_price:
                        best_ask_price = new_price
                    elif current_price == best_ask_price:
                        # Same as above for the ask side.
                        best_ask_price = min(ask_prices_set)

            # update the order's size (for both price-changed or same-price modifications)
            active_orders[order_id]['size'] = new_size
            if new_price == current_price:
                volume_per_price[(current_side, current_price)] += (new_size - current_size)

    elif action in ("F", "T"):
        # FILL/TRADE: does not impact active orders directly
        pass

    elif action == "R":
        # CLEAR: (e.g., end of session) -- wipe the book but keep the features
        # already recorded in initial_order_features
        active_orders.clear()
        count_per_price.clear()
        volume_per_price.clear()
        price_queues.clear()
        bid_prices_set.clear()
        ask_prices_set.clear()
        best_bid_price = None
        best_ask_price = None

In [30]:
# Initial features: one row per order, with the dict key exposed as the "order_id" column.
features = (
    pd.DataFrame.from_dict(initial_order_features, orient='index')
    .rename_axis('order_id')
    .reset_index()
)

In [31]:
# dataset for modeling
#
# One row per order in `features`, extended with:
#   y              1 if the order has at least one fill ('F') or trade ('T') event, else 0
#   ts_event       timestamp of the order's add event
#   remaining_size size still resting in the book at the end of the replay
#   fill_size      size - remaining_size (filled orders only)
#   fill_rate      fill_size / size
#
# The fill/trade ids are de-duplicated *before* the join so an order with many
# fill events does not fan out into many intermediate rows.
df = (
    features
    .merge(
        mbo.loc[mbo["action"].isin(['F', 'T']), ["order_id"]].drop_duplicates().assign(y=1),
        on="order_id",
        how="left",
    )
    .drop_duplicates(subset=["order_id"], keep="first")
    .merge(mbo.loc[mbo["action"] == "A", ["order_id", "ts_event"]], on="order_id", how="left")
)
df["y"] = df["y"].fillna(0) # binary: fill or not


# orders still in the book by the end of this simulation (kept as a table for inspection)
df_active_orders = pd.DataFrame.from_dict(active_orders, orient='index')\
                    .reset_index()\
                    .rename(columns={'index': 'order_id', "size": "remaining_size"})

# Look the remaining size up by order_id only. `features` holds the price at add
# time while `active_orders` holds the current price, so also matching on price
# dropped every order that was re-priced by a modify and made it look fully
# filled. A plain lookup also works when the book is empty at the end of the replay.
remaining_by_order = {oid: state["size"] for oid, state in active_orders.items()}
df["remaining_size"] = df["order_id"].map(remaining_by_order).astype("float64")

# A filled order that is no longer in the book has nothing remaining.
df.loc[ (df["y"]==1) & (df["remaining_size"].isnull()), "remaining_size"] = 0
# NOTE(review): fill_size is inferred as "initial size minus what is still resting",
# so quantity removed by cancels (or a size-changing modify) of a filled order is
# counted as filled as well -- confirm this is the intended definition.
df.loc[ (df["y"]==1), "fill_size"] = df["size"] - df["remaining_size"]
df.loc[ (df["y"]==0), "fill_size"] = np.nan
df["fill_rate"] = df["fill_size"] / df["size"] # actual fill rate for an order

In [32]:
# Display the assembled modeling table (one row per order_id; the notebook truncates the rendering).
df

Unnamed: 0,order_id,side,price,size,price_level,queue_position,orders_at_price,volume_at_price,orders_ahead_same_price,volume_ahead_same_price,orders_ahead_total,volume_ahead_total,is_best,bid_ask_imbalance,diff_to_best_price,y,ts_event,remaining_size,fill_size,fill_rate
0,8413561265369,B,110.703125,74,1,1,1,74,0,0,0,0,True,1.000000,0.0,0.0,2024-02-04 08:00:04.267042851-05:00,74.0,,
1,8413561265463,B,110.703125,111,1,2,2,185,1,74,1,74,True,1.000000,0.0,0.0,2024-02-04 08:00:04.267042851-05:00,111.0,,
2,8413561265577,B,110.703125,80,1,3,3,265,2,185,2,185,True,1.000000,0.0,0.0,2024-02-04 08:00:04.267042851-05:00,80.0,,
3,8413561265622,B,110.703125,20,1,4,4,285,3,265,3,265,True,1.000000,0.0,0.0,2024-02-04 08:00:04.267042851-05:00,20.0,,
4,8413613296221,B,110.703125,258,1,5,5,543,4,285,4,285,True,1.000000,0.0,0.0,2024-02-04 08:00:04.267042851-05:00,258.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118209,8413644236921,A,111.140625,1,1,86,86,797,85,796,85,796,True,0.000610,0.0,0.0,2024-02-07 18:59:47.891600609-05:00,1.0,,
1118210,8413644236950,B,111.125000,1,1,80,80,473,79,472,79,472,True,-0.000607,0.0,0.0,2024-02-07 18:59:55.070144357-05:00,1.0,,
1118211,8413644236951,B,111.125000,22,1,80,80,494,79,472,79,472,True,-0.000581,0.0,0.0,2024-02-07 18:59:57.501117585-05:00,22.0,,
1118212,8413644236952,B,111.125000,1,1,81,81,495,80,494,80,494,True,-0.000579,0.0,0.0,2024-02-07 18:59:57.501256723-05:00,1.0,,


In [None]:
# df.to_csv("train_set.csv", index=False)

In [None]:
# Persist the modeling table for this date tag. The DataFrame index is written
# too (no index=False), so the file starts with an unnamed index column.
df.to_csv(path_or_buf="processed/mbo.train.%s.csv" % ts)

NameError: name 'df' is not defined