# Merging trades and bbos with polars

The data for SPY.P covers 2 years (2009 and 2010).

In [1]:
import polars as pl
import numpy as np

import os

# Directory of the raw TRTH data and of its cleaned counterpart
# (same path with "raw" replaced by "clean").
dirData_raw = "data/raw/TRTH/equities/US/"
dirData_clean = dirData_raw.replace("raw", "clean")

ticker = "SPY.P"

# Lazily scan the cleaned quote (bbo) and trade files, each ordered by 'index'.
bbo = pl.scan_parquet(f"{dirData_clean}/bbo/{ticker}_bbo.parquet").sort("index")
trade = pl.scan_parquet(f"{dirData_clean}/trade/{ticker}_trade.parquet").sort("index")

# Full join on 'index': every quote row and every trade row is kept, and the
# two key columns are coalesced into a single 'index' column.
events = bbo.join(trade, on="index", how="full", coalesce=True).sort("index")

# Calendar date of each event, used as the grouping key below.
events = events.with_columns(date=pl.col("index").dt.date())

# Forward-fill the quote columns within each date, so a quote is never
# carried over from one date to the next.
events = events.with_columns(
    [
        pl.col(name).forward_fill().over("date")
        for name in ("bid-price", "bid-volume", "ask-price", "ask-volume")
    ]
)
events

In [2]:
# Destination of the merged events table, next to the bbo/ and trade/ folders.
file_events = f"{dirData_clean}/{ticker}_events.parquet"

# Execute the lazy query and write the result to parquet in one expression,
# so that no extra reference to the collected DataFrame is kept around.
(
    events
    .collect(streaming=True)
    .write_parquet(file_events)
)


How many rows does the merged events file contain?

In [3]:
# Count the rows of the written file through a lazy scan.
(
    pl.scan_parquet(file_events)
    .select(pl.len())
    .collect()
)

len
u32
188758365


The file takes up about 1 GB on disk (compressed with zstd):

In [4]:
import os

# Size of the events file on disk, in whole MiB (2**20 bytes).
os.stat(file_events).st_size // 2**20

945

On some occasions, we do not need all the quotes between the trades. Let us generate a smaller events DataFrame that keeps only the rows with no missing values.

In [5]:

# Lazily scan the cleaned quote (bbo) and trade files, each ordered by 'index'.
bbo = pl.scan_parquet(f"{dirData_clean}/bbo/{ticker}_bbo.parquet").sort("index")
trade = pl.scan_parquet(f"{dirData_clean}/trade/{ticker}_trade.parquet").sort("index")

events = (
    # Full join on 'index', with the two key columns coalesced into one.
    bbo.join(trade, on="index", how="full", coalesce=True)
    .sort("index")
    # Calendar date of each event, used as the forward-fill grouping key.
    .with_columns(pl.col("index").dt.date().alias("date"))
    # Forward-fill the quote columns within each date.
    .with_columns(
        [
            pl.col(name).forward_fill().over("date")
            for name in ("bid-price", "bid-volume", "ask-price", "ask-volume")
        ]
    )
    # Keep only the rows in which no column is null.
    .drop_nulls()
)
events

In [6]:
# Destination of the reduced events table.
file_events_short = f"{dirData_clean}/{ticker}_events_short.parquet"

# Execute the lazy query and write the result to parquet in one expression,
# so that no extra reference to the collected DataFrame is kept around.
(
    events
    .collect(streaming=True)
    .write_parquet(file_events_short)
)

In [7]:
# Count the rows of the reduced file through a lazy scan.
(
    pl.scan_parquet(file_events_short)
    .select(pl.len())
    .collect()
)

len
u32
18589042


A more manageable file size:

In [8]:
import os

# Size of the reduced events file on disk, in whole MiB (2**20 bytes).
os.stat(file_events_short).st_size // (2**20)

146