# Combine raw Limit Order Book and Market Message data

Only rows that correspond to executions of visible orders (message type 4) are kept.

In [None]:
import os
import re
import pathlib
import pandas as pd

from pprint import pprint
from datetime import date, time

import LOB_analysis

In [None]:
# Compiled once at definition time instead of on every call.
_DATE_REGEX = re.compile(r".*_(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})_.*")


def parse_date(filename):
    """Extract the date embedded in a data file name.

    The name must contain a date written as ``_YYYY-MM-DD_``, i.e. delimited
    by an underscore on both sides. Because the leading ``.*`` of the pattern
    is greedy, the last such date in the string is the one returned.

    Args:
        filename: File name or path (as a string) to inspect.

    Returns:
        A ``(year, month, day)`` tuple of ints.

    Raises:
        ValueError: If ``filename`` contains no ``_YYYY-MM-DD_`` date.
    """
    match = _DATE_REGEX.match(filename)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"No date of the form '_YYYY-MM-DD_' found in file name: {filename!r}"
        )

    year = int(match.group("year"))
    month = int(match.group("month"))
    day = int(match.group("day"))

    return (year, month, day)

In [None]:
# Root folder of the raw data. Every sub-directory found directly inside it is
# processed as one batch (called a "month" directory in this notebook).
BASE_DIR = pathlib.Path(r"../data/drive")

# Keep directories only: a stray file next to the month folders (e.g. a README
# or an OS metadata file) contains no orderbook/message files to combine.
# Sorted so the processing order is reproducible across runs and platforms.
month_dirs = sorted(path for path in BASE_DIR.glob("*") if path.is_dir())

In [None]:
# For every month directory: pair each orderbook file with the message file of
# the same date, keep only the rows whose message type is 4, rescale prices,
# and write the concatenated result as two pickles inside that directory.
for month_dir in month_dirs:
    print(f"Loading directory: {month_dir}")
    orderbook_files = list(month_dir.glob("*orderbook*"))
    message_files = list(month_dir.glob("*message*"))

    if not orderbook_files:
        # pd.concat raises on an empty list, so there is nothing to write here.
        print(f"\tNo orderbook files found in {month_dir}, skipping.")
        continue

    # Index message files by date once instead of rescanning the list for
    # every orderbook file. setdefault keeps the first file seen for a date.
    message_file_by_date = {}
    for candidate in message_files:
        message_file_by_date.setdefault(parse_date(str(candidate)), candidate)

    messages   = []
    orderbooks = []
    for orderbook_file in orderbook_files:
        year, month, day = parse_date(str(orderbook_file))
        print(f"\tLoading files for date: {year}-{month}-{day}", end="\r")

        message_file = message_file_by_date.get((year, month, day))
        if message_file is None:
            # Also covers a directory with no message files at all.
            raise FileNotFoundError(f"Directory: {str(month_dir)}:\tOrderbook for date {year}-{month}-{day} has no matching message book.")

        odb = LOB_analysis.load_LOB(orderbook_file)
        msg = LOB_analysis.load_message(message_file)

        # Turn the numeric 'time' column into absolute timestamps on the
        # file's date (values are interpreted as seconds, per unit="s").
        # Whole-column assignment is used because the dtype changes; setting
        # through .loc[:, col] tries to write into the existing column.
        session_date = pd.to_datetime(date(year=year, month=month, day=day))
        msg["time"] = session_date + pd.to_timedelta(msg["time"], unit="s")

        # Rows to keep: message type 4 only.
        # NOTE(review): the same mask is applied to both frames, which assumes
        # the orderbook and message frames are row-aligned - confirm in LOB_analysis.
        is_type_4 = msg["type"] == 4

        odb = odb.loc[is_type_4, :].reset_index(drop=True)
        msg = msg.loc[is_type_4, :].drop("type", axis=1).reset_index(drop=True)

        # Rescale prices by 1e4 (presumably raw prices are stored as
        # currency * 10000 - verify against the data source).
        msg["price"] = msg["price"] / 1e4
        price_columns = odb.columns[odb.columns.str.contains("price")]
        odb[price_columns] = odb[price_columns] / 1e4

        messages.append(msg)
        orderbooks.append(odb)

    # Stack all days of the month. The per-day indices are kept as they are,
    # so index values repeat across days in the combined frames.
    messages   = pd.concat(messages, axis=0)
    orderbooks = pd.concat(orderbooks, axis=0)

    # Output files live inside the month directory: <dir>_msg.pickle / <dir>_odb.pickle
    full_month_dir = os.path.abspath(month_dir)
    messages_fn    = os.path.join(full_month_dir, month_dir.name + "_msg.pickle")
    orderbooks_fn  = os.path.join(full_month_dir, month_dir.name + "_odb.pickle")

    messages.to_pickle(messages_fn)
    orderbooks.to_pickle(orderbooks_fn)