In [1]:
import json
from typing import Dict, List, Tuple
import pandas as pd
import os
from sortedcontainers import SortedDict

# Number of price levels per side used when generating column names.
N_LEVELS = 25
# Directory that holds the raw JSON captures. The default is machine-specific;
# set the ORDERBOOK_DATA_DIR environment variable to point somewhere else.
PREFIX_PATH = os.environ.get(
    "ORDERBOOK_DATA_DIR", "/home/davide/Desktop/phd/bitfinex-api-py/data/"
)
# Sub-directory of PREFIX_PATH that receives the processed .tsv files.
ORDERBOOK_CHANGES_SUBDIRECTORY = "orderbook_changes"

class OrderBook:
    """Two-sided order book keyed by price.

    ``self._book`` maps ``"bids"`` and ``"asks"`` to a ``SortedDict`` of
    ``price -> size``. Bid amounts are stored as given; ask amounts are
    negated before being stored, so a negative ask amount ends up as a
    positive size.
    """

    def __init__(self):
        self._book = {"bids": SortedDict(), "asks": SortedDict()}

    def insert_snapshot(self, snapshot: Dict):
        """Load every level found under ``snapshot["bids"]`` and ``snapshot["asks"]``."""
        self._insert_side_snapshot(snapshot["bids"], is_bid=True)
        self._insert_side_snapshot(snapshot["asks"], is_bid=False)

    def _insert_side_snapshot(self, snapshot: Dict, is_bid: bool):
        """Insert all levels (the values of ``snapshot``) into one side of the book."""
        side = self.get_side_str(is_bid)

        for order in snapshot.values():
            self._insert_level(order, side)

    def get_side_str(self, is_bid):
        """Return the ``_book`` key for the requested side."""
        return "bids" if is_bid else "asks"

    def _set_level(self, side: str, price, amount):
        """Store ``amount`` at ``price``, negating it when ``side`` is not "bids"."""
        self._book[side][price] = amount if side == "bids" else -amount

    def _insert_level(self, order: Dict, side: str):
        """Insert one snapshot level; price is read from "p" and amount from "a"."""
        self._set_level(side, order["p"], order["a"])

    def insert_updates_for_timestamp(self, update: Dict[str, List[float]]):
        """Apply a batch of updates given as parallel lists under "p", "a" and "c".

        Raises:
            ValueError: if the three lists do not have the same length. Nothing
                is applied in that case.
        """
        prices, amounts, counts = update["p"], update["a"], update["c"]
        if not (len(prices) == len(amounts) == len(counts)):
            raise ValueError("Update is not valid")

        for price, amount, count in zip(prices, amounts, counts):
            self.insert_update(price, amount, count)

    def insert_update(self, price, amount, count):
        """Apply a single update.

        The side is derived from the sign of ``amount``. A positive ``count``
        sets (or overwrites) the level; a ``count`` of zero removes the level
        if it exists. Any other ``count`` leaves the book untouched.
        """
        side = self.get_side_str_for_update(amount)
        if count > 0:
            self._set_level(side, price, amount)
        elif count == 0:
            # Removing a level that is not in the book is a no-op.
            self._book[side].pop(price, None)

    def get_side_str_for_update(self, amount: float):
        """Return "bids" for a strictly positive amount, "asks" otherwise."""
        return "bids" if amount > 0 else "asks"

    def get_column_names(self, side: str, info_type: str):
        """Return ``["<Bid|Ask><info_type>1", ..., "<Bid|Ask><info_type>N_LEVELS"]``."""
        side = "Bid" if side == "bids" else "Ask"

        return [f"{side}{info_type}{i}" for i in range(1, N_LEVELS + 1)]

    def get_best_bid_price(self) -> float:
        """Return the highest bid price (last key of the sorted bid side)."""
        return self._book["bids"].peekitem(index=-1)[0]

    def get_best_ask(self) -> float:
        """Return the lowest ask price (first key of the sorted ask side)."""
        return self._book["asks"].peekitem(index=0)[0]

    def get_mid_price(self) -> float:
        """Return the average of the best bid price and the best ask price."""
        return (self.get_best_bid_price() + self.get_best_ask()) / 2

    def get_row_book(self) -> Dict[str, float]:
        """Flatten the whole book into a single ``{column: value}`` dict.

        Asks are emitted first, in the iteration order of the ask side, as
        ``AskPrice{i}`` / ``AskSize{i}``; bids follow in reversed iteration
        order as ``BidPrice{i}`` / ``BidSize{i}``. ``i`` starts at 1 and every
        stored level is emitted (the output is not capped at ``N_LEVELS``).
        """
        row_dict = {}

        for i, (price, size) in enumerate(self._book["asks"].items(), start=1):
            row_dict[f"AskPrice{i}"] = price
            row_dict[f"AskSize{i}"] = size

        # Reversed so that level 1 is the last (highest-priced) bid.
        for i, (price, size) in enumerate(
            reversed(self._book["bids"].items()), start=1
        ):
            row_dict[f"BidPrice{i}"] = price
            row_dict[f"BidSize{i}"] = size

        return row_dict


def get_changes_orderbook_df(
    timestamp_snapshot: int,
    snapshot_dict: Dict[int, Dict],
    timestamp_updates_map: SortedDict[int, Dict],
) -> pd.DataFrame:
    """Replay a snapshot plus its updates and tabulate the book after each step.

    The first row is the book right after loading ``snapshot_dict`` (stamped
    with ``timestamp_snapshot``). One further row is appended for every entry
    of ``timestamp_updates_map``, in iteration order, after that entry's
    updates have been applied.
    """
    book = OrderBook()
    book.insert_snapshot(snapshot_dict)

    rows = []
    _append_new_row(timestamp_snapshot, book, rows)

    for update_timestamp, update_batch in timestamp_updates_map.items():
        book.insert_updates_for_timestamp(update_batch)
        _append_new_row(update_timestamp, book, rows)

    return pd.DataFrame(rows)


def _append_new_row(timestamp: int, order_book: OrderBook, row_dicts: List[Dict]):
    """Append the current flattened book, tagged with ``timestamp``, to ``row_dicts``."""
    snapshot_row = order_book.get_row_book()
    snapshot_row.update(Timestamp=timestamp)
    row_dicts.append(snapshot_row)


def read_orderbook_json(path: str) -> SortedDict[int, Dict]:
    """Load the JSON object at ``path`` into a ``SortedDict`` with integer keys.

    Every top-level key is converted with ``int``; the values are kept as parsed.
    """
    with open(path, "r") as json_file:
        raw_entries = json.load(json_file)

    entries_by_key = SortedDict()
    for raw_key, entry in raw_entries.items():
        entries_by_key[int(raw_key)] = entry
    return entries_by_key


def pop_final_snapshot(orderbook_json: SortedDict[int, Dict]) -> Dict:
    """Remove and return the entry stored under key ``-1``.

    Returns ``None`` (and leaves ``orderbook_json`` untouched) when there is
    no such key.
    """
    return orderbook_json.pop(-1, None)


def pop_first_timestamp_and_snapshot(
    orderbook_json: SortedDict[int, Dict]
) -> Tuple[int, Dict]:
    """Remove the entry at index 0 of ``orderbook_json`` and return it as ``(key, value)``."""
    first_entry = orderbook_json.popitem(index=0)
    return first_entry


def get_orderbook_changes_df(orderbook_json: SortedDict[int, Dict]) -> pd.DataFrame:
    """Build the per-timestamp book table from a loaded capture.

    The first entry of ``orderbook_json`` is popped and used as the initial
    snapshot; whatever remains in the mapping is replayed as updates. The
    mapping is therefore modified in place.
    """
    first_timestamp, first_snapshot = pop_first_timestamp_and_snapshot(orderbook_json)

    return get_changes_orderbook_df(
        timestamp_snapshot=first_timestamp,
        snapshot_dict=first_snapshot,
        timestamp_updates_map=orderbook_json,
    )

def is_final_snapshot_correct(final_snapshot_row, orderbook_changes_df):
    """Check that the last row of the replayed book matches the final snapshot.

    Args:
        final_snapshot_row: mapping ``column -> value`` built from the final snapshot.
        orderbook_changes_df: frame whose last row is the replayed book.

    Returns:
        ``True`` when every key of ``final_snapshot_row`` is a column of the
        frame and holds an equal value in its last row; ``False`` otherwise.
        Columns of the frame that are absent from ``final_snapshot_row`` are
        not inspected.
    """
    last_row = orderbook_changes_df.iloc[-1].to_dict()

    for key, value in final_snapshot_row.items():
        # A level the replay never produced is a mismatch, not a KeyError.
        if key not in last_row or value != last_row[key]:
            return False
    return True

def is_data_collection_in_file_interrupted(filename: str) -> bool:
    """Return ``True`` when ``filename`` ends with ``interrupted.json``."""
    interrupted_suffix = "interrupted.json"
    has_suffix = filename.endswith(interrupted_suffix)
    return has_suffix

def get_file_timestamp(filename: str) -> int:
    """Return the second ``_``-separated field of the name, as an ``int``.

    Only the part of ``filename`` before its first ``.`` is considered.
    """
    stem, _, _ = filename.partition(".")
    fields = stem.split("_")
    return int(fields[1])

def get_file_timestamp_for_processed_file(filename: str) -> int:
    """Return the third ``_``-separated field of the name, as an ``int``.

    Only the part of ``filename`` before its first ``.`` is considered.
    """
    stem, _, _ = filename.partition(".")
    fields = stem.split("_")
    return int(fields[2])

def get_orderbook_changes_filename(prefix_path: str, timestamp: int, is_interrupted: bool) -> str:
    """Build the output path ``<prefix_path>/orderbook_changes_<timestamp>[_interrupted].tsv``."""
    if is_interrupted:
        interrupted_tag = "_interrupted"
    else:
        interrupted_tag = ""
    base = os.path.join(prefix_path, "orderbook_changes_")
    return base + f"{timestamp}{interrupted_tag}.tsv"

def save_orderbook_changes_df(orderbook_changes_df: pd.DataFrame, timestamp: int, is_interrupted: bool, directory: str):
    """Write ``orderbook_changes_df`` as a tab-separated file, without the index.

    The target path is built by ``get_orderbook_changes_filename`` from
    ``directory``, ``timestamp`` and ``is_interrupted``.
    """
    target_path = get_orderbook_changes_filename(directory, timestamp, is_interrupted)
    orderbook_changes_df.to_csv(target_path, sep="\t", index=False)

def get_final_snapshot_row_from_json_data(orderbook_data):
    """Pop the final snapshot out of ``orderbook_data`` and flatten it into a row.

    The final snapshot is the entry ``pop_final_snapshot`` removes from
    ``orderbook_data``; it is loaded into a fresh ``OrderBook`` and returned in
    the ``get_row_book`` layout.

    Raises:
        ValueError: if ``orderbook_data`` holds no final snapshot.
    """
    final_snapshot = pop_final_snapshot(orderbook_data)
    if final_snapshot is None:
        # Fail with a clear message instead of a TypeError inside insert_snapshot.
        raise ValueError("Final snapshot is missing from the order book data")

    final_snapshot_orderbook = OrderBook()
    final_snapshot_orderbook.insert_snapshot(final_snapshot)
    return final_snapshot_orderbook.get_row_book()

def get_files(directory: str) -> List[str]:
    """Return the names (not paths) of the entries of ``directory`` that are files.

    Entries for which ``os.path.isfile`` is false, such as sub-directories,
    are left out.
    """
    file_names = []
    for entry_name in os.listdir(directory):
        entry_path = os.path.join(directory, entry_name)
        if os.path.isfile(entry_path):
            file_names.append(entry_name)
    return file_names

def get_timestamp_of_already_processed_files(directory: str) -> List[int]:
    """Return the timestamp parsed from the name of every file in ``directory``.

    Names are parsed with ``get_file_timestamp_for_processed_file``.
    """
    timestamps = []
    for processed_name in get_files(directory):
        timestamps.append(get_file_timestamp_for_processed_file(processed_name))
    return timestamps

def get_json_files_to_process(main_directory: str, processed_directory: str) -> List[str]:
    """List the ``.json`` files of ``main_directory`` that still need processing.

    A file needs processing when the timestamp parsed from its name does not
    appear among the timestamps parsed from the files of ``processed_directory``.

    Args:
        main_directory: directory containing the raw JSON captures.
        processed_directory: directory containing the already produced outputs.

    Returns:
        File names (not paths), in the order returned by ``get_files``.
    """
    # A set keeps the membership test constant-time per file.
    already_processed_timestamps = set(
        get_timestamp_of_already_processed_files(processed_directory)
    )
    return [
        f
        for f in get_files(main_directory)
        # Skip anything that is not a JSON capture so that stray files in the
        # directory cannot break the timestamp parser.
        if f.endswith(".json")
        and get_file_timestamp(f) not in already_processed_timestamps
    ]

In [4]:
# Directory that receives the processed .tsv files; it is also scanned to
# find out which captures have been converted already.
full_path_subdirectory_orderbook_changes = os.path.join(
    PREFIX_PATH, ORDERBOOK_CHANGES_SUBDIRECTORY
)

for filename in get_json_files_to_process(
    PREFIX_PATH, full_path_subdirectory_orderbook_changes
):
    print(filename)

    timestamp = get_file_timestamp(filename)
    is_interrupted = is_data_collection_in_file_interrupted(filename)

    full_path_json = os.path.join(PREFIX_PATH, filename)
    orderbook_data = read_orderbook_json(full_path_json)

    # The final snapshot (key -1) must be popped before the replay: the replay
    # takes the first entry of the sorted mapping as the initial snapshot.
    if not is_interrupted:
        final_snapshot_row = get_final_snapshot_row_from_json_data(orderbook_data)

    orderbook_changes_df = get_orderbook_changes_df(orderbook_data)

    # Captures that were not interrupted are validated against their final snapshot.
    if not is_interrupted and not is_final_snapshot_correct(
        final_snapshot_row, orderbook_changes_df
    ):
        raise Exception("Final snapshot is not correct")

    save_orderbook_changes_df(
        orderbook_changes_df,
        timestamp,
        is_interrupted,
        full_path_subdirectory_orderbook_changes,
    )



data_1707309508831_interrupted.json
data_1707280963252.json
data_1707287965872.json
data_1707220547100_interrupted.json
data_1707294968667.json
data_1707311863855_interrupted.json
data_1707216507382_interrupted.json
data_1707308973008.json
data_1707168700790.json
data_1707210945493.json
data_1707301970170.json
data_1707266958409.json
data_1707203716149.json
data_1707182706614.json
data_1707273960781.json
data_1707227557759.json
data_1707175703667.json
data_1707234560871.json


In [34]:
# Hand-written sample snapshot used to exercise OrderBook: 25 bid and 25 ask
# levels keyed by price string, each level carrying price "p", count "c" and
# amount "a" (ask amounts are negative).
snapshot = {"bids": {"45300": {"p": 45300, "c": 6, "a": 50.07262815}, "45298": {"p": 45298, "c": 1, "a": 0.00755105}, "45296": {"p": 45296, "c": 2, "a": 0.0278}, "45295": {"p": 45295, "c": 1, "a": 0.03294997}, "45294": {"p": 45294, "c": 1, "a": 0.1}, "45293": {"p": 45293, "c": 1, "a": 0.0180239}, "45292": {"p": 45292, "c": 1, "a": 0.00807552}, "45286": {"p": 45286, "c": 1, "a": 0.04}, "45282": {"p": 45282, "c": 2, "a": 0.6871}, "45281": {"p": 45281, "c": 3, "a": 0.0295111}, "45272": {"p": 45272, "c": 1, "a": 0.00265}, "45271": {"p": 45271, "c": 1, "a": 0.0268}, "45269": {"p": 45269, "c": 1, "a": 0.0268}, "45268": {"p": 45268, "c": 2, "a": 0.10821855}, "45267": {"p": 45267, "c": 1, "a": 0.0268}, "45265": {"p": 45265, "c": 1, "a": 0.2298}, "45264": {"p": 45264, "c": 1, "a": 0.2962}, "45263": {"p": 45263, "c": 1, "a": 0.0268}, "45261": {"p": 45261, "c": 2, "a": 0.12013445}, "45260": {"p": 45260, "c": 1, "a": 0.0268}, "45259": {"p": 45259, "c": 1, "a": 0.08836}, "45258": {"p": 45258, "c": 1, "a": 0.11047}, "45257": {"p": 45257, "c": 2, "a": 0.0616639}, "45256": {"p": 45256, "c": 1, "a": 0.05254534}, "45255": {"p": 45255, "c": 2, "a": 0.1268}}, "asks": {"45301": {"p": 45301, "c": 15, "a": -3.16936927}, "45304": {"p": 45304, "c": 2, "a": -0.11301}, "45309": {"p": 45309, "c": 1, "a": -0.011}, "45310": {"p": 45310, "c": 1, "a": -0.011}, "45312": {"p": 45312, "c": 1, "a": -0.16962}, "45313": {"p": 45313, "c": 1, "a": -0.1103}, "45314": {"p": 45314, "c": 2, "a": -1.1905}, "45317": {"p": 45317, "c": 2, "a": -0.40166426}, "45320": {"p": 45320, "c": 1, "a": -0.05}, "45332": {"p": 45332, "c": 2, "a": -0.00924}, "45333": {"p": 45333, "c": 1, "a": -0.0054}, "45335": {"p": 45335, "c": 1, "a": -0.19472161}, "45343": {"p": 45343, "c": 1, "a": -0.05147264}, "45345": {"p": 45345, "c": 1, "a": -0.07720897}, "45346": {"p": 45346, "c": 1, "a": -0.10294529}, "45347": {"p": 45347, "c": 1, "a": -0.12868161}, "45352": {"p": 45352, "c": 1, "a": -0.15441794}, "45353": {"p": 45353, "c": 1, "a": 
-0.0536}, "45356": {"p": 45356, "c": 1, "a": -0.17372018}, "45359": {"p": 45359, "c": 2, "a": -0.13808101}, "45364": {"p": 45364, "c": 2, "a": -1.75334193}, "45371": {"p": 45371, "c": 2, "a": -0.00924}, "45373": {"p": 45373, "c": 2, "a": -0.36687013}, "45374": {"p": 45374, "c": 2, "a": -0.059}, "45375": {"p": 45375, "c": 1, "a": -0.0536}}}

In [44]:
# Sample update batch: parallel lists of prices ("p"), amounts ("a") and counts ("c").
diff_update = {
    "p": [45264, 45343, 45345, 45352, 45356, 45373, 45374, 45375, 45300, 45282, 45267, 45258, 45257, 45256, 45254, 45301, 45332, 45333, 45346, 45347, 45348, 45350, 45354, 45358, 45359, 45360, 45364, 45367, 45371, 45372],
    "a": [1, -1, -1, -1, -1, -1, -1, -1, 50.07062815, 1.0804, 0.7267, 0.16301534, 0.14048191, 0.10509068, 0.29105749, -3.11958927, -0.00378, -0.01086, -0.05147264, -0.07720897, -0.10294529, -0.12868161, -0.15441794, -0.17372018, -0.0233, -0.11014645, -1.5, -0.25334193, -0.00378, -0.00546],
    "c": [0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 2, 2, 3, 1, 4, 11, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
}
# Keep only the first (price, amount, count) triple of the batch.
diff_update = {field: values[0] for field, values in diff_update.items()}
diff_update

{'p': 45264, 'a': 1, 'c': 0}

In [45]:
# Flatten the book before and after applying a single update, for comparison.
ob = OrderBook()
ob.insert_snapshot(snapshot)

first = ob.get_row_book()

# insert_update takes (price, amount, count); the side is derived from the
# sign of the amount, so no side argument is passed.
ob.insert_update(diff_update['p'], diff_update['a'], diff_update['c'])

second = ob.get_row_book()

In [46]:
# Flattened book captured before the single update was applied.
first

{'AskPrice1': 45301,
 'AskSize1': 3.16936927,
 'AskPrice2': 45304,
 'AskSize2': 0.11301,
 'AskPrice3': 45309,
 'AskSize3': 0.011,
 'AskPrice4': 45310,
 'AskSize4': 0.011,
 'AskPrice5': 45312,
 'AskSize5': 0.16962,
 'AskPrice6': 45313,
 'AskSize6': 0.1103,
 'AskPrice7': 45314,
 'AskSize7': 1.1905,
 'AskPrice8': 45317,
 'AskSize8': 0.40166426,
 'AskPrice9': 45320,
 'AskSize9': 0.05,
 'AskPrice10': 45332,
 'AskSize10': 0.00924,
 'AskPrice11': 45333,
 'AskSize11': 0.0054,
 'AskPrice12': 45335,
 'AskSize12': 0.19472161,
 'AskPrice13': 45343,
 'AskSize13': 0.05147264,
 'AskPrice14': 45345,
 'AskSize14': 0.07720897,
 'AskPrice15': 45346,
 'AskSize15': 0.10294529,
 'AskPrice16': 45347,
 'AskSize16': 0.12868161,
 'AskPrice17': 45352,
 'AskSize17': 0.15441794,
 'AskPrice18': 45353,
 'AskSize18': 0.0536,
 'AskPrice19': 45356,
 'AskSize19': 0.17372018,
 'AskPrice20': 45359,
 'AskSize20': 0.13808101,
 'AskPrice21': 45364,
 'AskSize21': 1.75334193,
 'AskPrice22': 45371,
 'AskSize22': 0.00924,
 'AskP

In [47]:
# Flattened book captured after the single update was applied.
second

{'AskPrice1': 45301,
 'AskSize1': 3.16936927,
 'AskPrice2': 45304,
 'AskSize2': 0.11301,
 'AskPrice3': 45309,
 'AskSize3': 0.011,
 'AskPrice4': 45310,
 'AskSize4': 0.011,
 'AskPrice5': 45312,
 'AskSize5': 0.16962,
 'AskPrice6': 45313,
 'AskSize6': 0.1103,
 'AskPrice7': 45314,
 'AskSize7': 1.1905,
 'AskPrice8': 45317,
 'AskSize8': 0.40166426,
 'AskPrice9': 45320,
 'AskSize9': 0.05,
 'AskPrice10': 45332,
 'AskSize10': 0.00924,
 'AskPrice11': 45333,
 'AskSize11': 0.0054,
 'AskPrice12': 45335,
 'AskSize12': 0.19472161,
 'AskPrice13': 45343,
 'AskSize13': 0.05147264,
 'AskPrice14': 45345,
 'AskSize14': 0.07720897,
 'AskPrice15': 45346,
 'AskSize15': 0.10294529,
 'AskPrice16': 45347,
 'AskSize16': 0.12868161,
 'AskPrice17': 45352,
 'AskSize17': 0.15441794,
 'AskPrice18': 45353,
 'AskSize18': 0.0536,
 'AskPrice19': 45356,
 'AskSize19': 0.17372018,
 'AskPrice20': 45359,
 'AskSize20': 0.13808101,
 'AskPrice21': 45364,
 'AskSize21': 1.75334193,
 'AskPrice22': 45371,
 'AskSize22': 0.00924,
 'AskP

In [2]:
# Load the JSON mapping stored in file_densities_map.json and display it.
density_map_path = '/home/davide/Desktop/phd/hawkes/file_densities_map.json'
with open(density_map_path, 'r') as f:
    data = json.load(f)
data

{'orderbook_changes_1705778125223.tsv': [['2024-01-20 19:08:21', '47']],
 'orderbook_changes_1705265652734.tsv': [['2024-01-14 20:35:13', '60']],
 'orderbook_changes_1705186708306.tsv': [['2024-01-13 22:35:59', '31']],
 'orderbook_changes_1705588937894_interrupted.tsv': [['2024-01-18 14:24:47',
   '66']],
 'orderbook_changes_1705803721515.tsv': [['2024-01-21 01:43:53', '77'],
  ['2024-01-21 01:08:47', '57'],
  ['2024-01-21 02:18:53', '21']],
 'orderbook_changes_1705449324601.tsv': [['2024-01-16 23:44:33', '93'],
  ['2024-01-16 22:39:09', '58']],
 'orderbook_changes_1705180936343.tsv': [['2024-01-13 21:15:36', '84']],
 'orderbook_changes_1705268954407.tsv': [['2024-01-14 21:24:28', '116']],
 'orderbook_changes_1705171545500.tsv': [['2024-01-13 18:25:23', '44']],
 'orderbook_changes_1705379075888.tsv': [['2024-01-16 03:25:03', '95'],
  ['2024-01-16 04:04:31', '64']],
 'orderbook_changes_1705262349261.tsv': [['2024-01-14 19:34:16', '88']],
 'orderbook_changes_1706102874539_interrupted.tsv

In [17]:
# Read one processed order-book file and keep the mid-price moves inside a
# two-minute window.
df = pd.read_csv('/home/davide/Desktop/phd/bitfinex-api-py/data/orderbook_changes/orderbook_changes_1705778125223.tsv', sep='\t')
df['Datetime'] = pd.to_datetime(df['Timestamp'], unit='ms')
# .copy() makes the filtered window an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[(df['Datetime'] >= '2024-01-20 19:08:21') & (df['Datetime'] <= '2024-01-20 19:10:21')].copy()
df['Datetime_truncated'] = df['Datetime'].dt.floor('s')
df['MidPrice'] = (df["AskPrice1"]+df["BidPrice1"])/2
# Forward difference: next row's mid price minus this row's mid price.
df['Difference'] = (-df["MidPrice"]+df["MidPrice"].shift(-1))
# NOTE(review): dropna() removes rows with a NaN in *any* column, not only the
# last row's NaN Difference; confirm this is intended.
df = df.dropna()
# Keep only the rows that are followed by a change of the mid price.
df = df[df['Difference'] != 0]


df.head()

Unnamed: 0,AskPrice1,AskSize1,AskPrice2,AskSize2,AskPrice3,AskSize3,AskPrice4,AskSize4,AskPrice5,AskSize5,...,BidSize23,BidPrice24,BidSize24,BidPrice25,BidSize25,Timestamp,Datetime,Datetime_truncated,MidPrice,Difference
11051,41710,1.198754,41712,0.63,41713,0.506195,41714,0.6785,41715,0.01083,...,0.255027,41668.0,0.007155,41667.0,3.477809,1705777702608,2024-01-20 19:08:22.608,2024-01-20 19:08:22,41708.5,-0.5
11058,41709,0.283664,41710,0.922322,41712,0.63,41713,0.506195,41714,0.6785,...,0.255027,41668.0,0.007155,41667.0,3.473429,1705777703878,2024-01-20 19:08:23.878,2024-01-20 19:08:23,41708.0,-0.5
11059,41708,0.283664,41710,0.922322,41712,0.63,41713,0.506195,41714,0.6785,...,3.473429,41666.0,0.1852,41665.0,0.363946,1705777704093,2024-01-20 19:08:24.093,2024-01-20 19:08:24,41707.5,0.5
11061,41709,0.283664,41710,0.922322,41712,0.63,41713,0.506195,41714,0.6785,...,0.255027,41668.0,0.007155,41667.0,3.473429,1705777704535,2024-01-20 19:08:24.535,2024-01-20 19:08:24,41708.0,-0.5
11062,41708,0.283664,41710,0.922322,41712,0.63,41713,0.506195,41714,0.6785,...,0.255027,41668.0,0.007155,41667.0,3.473429,1705777704754,2024-01-20 19:08:24.754,2024-01-20 19:08:24,41707.5,0.5


In [18]:
# Distinct values of the Datetime_truncated column in df.
df['Datetime_truncated'].unique()

<DatetimeArray>
['2024-01-20 19:08:22', '2024-01-20 19:08:23', '2024-01-20 19:08:24',
 '2024-01-20 19:08:32', '2024-01-20 19:08:33', '2024-01-20 19:08:34',
 '2024-01-20 19:08:35', '2024-01-20 19:08:36', '2024-01-20 19:08:37',
 '2024-01-20 19:08:38', '2024-01-20 19:08:43', '2024-01-20 19:08:44',
 '2024-01-20 19:08:45', '2024-01-20 19:08:49', '2024-01-20 19:08:51',
 '2024-01-20 19:08:57', '2024-01-20 19:08:58', '2024-01-20 19:09:00',
 '2024-01-20 19:09:02', '2024-01-20 19:09:08', '2024-01-20 19:09:09',
 '2024-01-20 19:09:10', '2024-01-20 19:09:11', '2024-01-20 19:09:12',
 '2024-01-20 19:09:13', '2024-01-20 19:09:18', '2024-01-20 19:09:19',
 '2024-01-20 19:09:20', '2024-01-20 19:09:22', '2024-01-20 19:09:23',
 '2024-01-20 19:09:27', '2024-01-20 19:09:28', '2024-01-20 19:09:34',
 '2024-01-20 19:09:35', '2024-01-20 19:09:41', '2024-01-20 19:10:03',
 '2024-01-20 19:10:04', '2024-01-20 19:10:05', '2024-01-20 19:10:06',
 '2024-01-20 19:10:07', '2024-01-20 19:10:08', '2024-01-20 19:10:10',
 '20