In [2]:
import polars as pl

Data/data 2/extraction/TRTH/raw/equities/US/bbo/AAP/2010-05-03-AAP-bbo.parquet\
Data/data/extraction/TRTH/raw/equities/US/trade/AAP/2010-05-03-AAP-trade.parquet

In [8]:
def load_TRTH_trade(filename,
            tz_exchange="America/New_York",
            only_non_special_trades=True,
            only_regular_trading_hours=True,
            merge_sub_trades=True,
            hhmmss_open="09:30:00",
            hhmmss_close="16:00:00"):
    """Load a TRTH trade file and return cleaned trades as a polars DataFrame.

    Parameters
    ----------
    filename : str or path-like
        File to read. Names ending in ``csv`` / ``csv.gz`` are read with
        ``pl.read_csv``; names ending in ``parquet`` with ``pl.read_parquet``.
    tz_exchange : str
        Time zone the ``index`` timestamps are converted to.
    only_non_special_trades : bool
        Keep only rows whose ``trade-stringflag`` equals ``"uncategorized"``.
    only_regular_trading_hours : bool
        Keep only trades whose local time of day lies between ``hhmmss_open``
        and ``hhmmss_close`` (both inclusive, whole-second resolution).
    merge_sub_trades : bool
        Collapse trades sharing a timestamp into one row holding the
        volume-weighted average price and the summed volume.
    hhmmss_open, hhmmss_close : str
        Session bounds as ``"HH:MM:SS"``.

    Returns
    -------
    polars.DataFrame or None
        ``None`` when the format is unknown or the file cannot be read (a
        message is printed in both cases).
    """
    # Accept pathlib.Path as well as str; the suffix checks and the messages
    # below need a plain string.
    filename = str(filename)
    try:
        if filename.endswith(("csv", "csv.gz")):
            DF = pl.read_csv(filename)
        elif filename.endswith("parquet"):
            DF = pl.read_parquet(filename)
        else:
            print("cannot load file "+filename+" : unknown format")
            return None
    except Exception as exc:  # best effort, but do not swallow KeyboardInterrupt/SystemExit
        print(filename+" cannot be loaded: "+repr(exc))
        return None

    # xltime is an Excel serial date (days, fractional part = time of day).
    # Excel's day 0 corresponds to 1899-12-30.
    excel_base_date = pl.datetime(1899, 12, 30)
    DF = DF.with_columns(
        (pl.col("xltime") * pl.duration(days=1) + excel_base_date).alias("index")
    )
    # The computed timestamps carry no time zone; convert_time_zone treats them
    # as UTC, so xltime is assumed to be expressed in UTC.
    DF = DF.with_columns(pl.col("index").dt.convert_time_zone(tz_exchange))

    # Filter on the flag BEFORE dropping it.
    if only_non_special_trades:
        DF = DF.filter(pl.col("trade-stringflag") == "uncategorized")

    # DataFrame.drop returns a new frame; the result must be re-assigned.
    DF = DF.drop(["xltime", "trade-rawflag", "trade-stringflag"])

    if only_regular_trading_hours:
        hh_open, mm_open, ss_open = [int(x) for x in hhmmss_open.split(":")]
        hh_close, mm_close, ss_close = [int(x) for x in hhmmss_close.split(":")]
        seconds_open = hh_open * 3600 + mm_open * 60 + ss_open
        seconds_close = hh_close * 3600 + mm_close * 60 + ss_close

        # Cast to Int32 first: hour * 3600 does not fit the small integer
        # types returned by the dt accessors.
        seconds_of_day = (
            pl.col("index").dt.hour().cast(pl.Int32) * 3600
            + pl.col("index").dt.minute().cast(pl.Int32) * 60
            + pl.col("index").dt.second().cast(pl.Int32)
        )
        DF = DF.filter((seconds_of_day >= seconds_open) & (seconds_of_day <= seconds_close))

    if merge_sub_trades:
        # Volume-weighted average price; the alias is applied to the whole
        # ratio, not just to the denominator.
        vwap = (
            (pl.col("trade-price") * pl.col("trade-volume")).sum()
            / pl.col("trade-volume").sum()
        ).alias("trade-price")
        DF = DF.group_by("index", maintain_order=True).agg([vwap, pl.sum("trade-volume")])

    return DF


In [9]:
def load_TRTH_bbo(filename,
            tz_exchange="America/New_York",
            only_regular_trading_hours=True,
            hhmmss_open="09:30:00",
            hhmmss_close="16:00:00",
            merge_same_index=True):
    """Load a TRTH best-bid/offer file and return cleaned quotes.

    Parameters
    ----------
    filename : str or path-like
        File to read. Names ending in ``csv`` / ``csv.gz`` are read with
        ``pl.read_csv``; names ending in ``parquet`` with ``pl.read_parquet``.
    tz_exchange : str
        Time zone the ``index`` timestamps are converted to.
    only_regular_trading_hours : bool
        Keep only quotes whose local time of day lies between ``hhmmss_open``
        and ``hhmmss_close`` (both inclusive, whole-second resolution).
    hhmmss_open, hhmmss_close : str
        Session bounds as ``"HH:MM:SS"``.
    merge_same_index : bool
        Keep only the last quote for each distinct timestamp.

    Returns
    -------
    polars.DataFrame or None
        ``None`` when the format is unknown or the file cannot be read (a
        message is printed in both cases).
    """
    # Accept pathlib.Path as well as str; the suffix checks and the messages
    # below need a plain string.
    filename = str(filename)
    try:
        if filename.endswith(("csv", "csv.gz")):
            DF = pl.read_csv(filename)
        elif filename.endswith("parquet"):
            DF = pl.read_parquet(filename)
        else:
            print("cannot load file "+filename+" : unknown format")
            return None
    except Exception as exc:  # best effort, but do not swallow KeyboardInterrupt/SystemExit
        print(filename+" cannot be loaded: "+repr(exc))
        return None

    # xltime is an Excel serial date (days, fractional part = time of day).
    # Excel's day 0 corresponds to 1899-12-30.
    excel_base_date = pl.datetime(1899, 12, 30)
    DF = DF.with_columns(
        (pl.col("xltime") * pl.duration(days=1) + excel_base_date).alias("index")
    )
    # The computed timestamps carry no time zone; convert_time_zone treats them
    # as UTC, so xltime is assumed to be expressed in UTC.
    DF = DF.with_columns(pl.col("index").dt.convert_time_zone(tz_exchange))
    # DataFrame.drop returns a new frame; the result must be re-assigned.
    DF = DF.drop("xltime")

    # Sanity filter: strictly positive prices and an uncrossed book.
    DF = DF.filter(pl.col("ask-price")>0).filter(pl.col("bid-price")>0).filter(pl.col("ask-price")>pl.col("bid-price"))

    if merge_same_index:
        DF = DF.group_by('index',maintain_order=True).last()   # last quote of the same timestamp

    if only_regular_trading_hours:
        hh_open,mm_open,ss_open = [float(x) for x in hhmmss_open.split(":")]
        hh_close,mm_close,ss_close = [float(x) for x in hhmmss_close.split(":")]

        seconds_open=hh_open*3600+mm_open*60+ss_open
        seconds_close=hh_close*3600+mm_close*60+ss_close

        # Build the time-of-day expression once. The casts widen the small
        # integer types returned by the dt accessors before multiplying.
        seconds_of_day = (
            pl.col('index').dt.hour().cast(pl.Float64)*3600
            + pl.col('index').dt.minute().cast(pl.Float64)*60
            + pl.col('index').dt.second().cast(pl.Float64)
        )
        DF = DF.filter((seconds_of_day >= seconds_open) & (seconds_of_day <= seconds_close))

    return DF

In [14]:
def wrangle_trade(DF,
            tz_exchange="America/New_York",
            only_non_special_trades=True,
            only_regular_trading_hours=True,
            merge_sub_trades=True,
            hhmmss_open="09:30:00",
            hhmmss_close="16:00:00"):
    """Clean a raw TRTH trade DataFrame.

    Parameters
    ----------
    DF : polars.DataFrame
        Raw trades. The code reads the columns ``xltime``, ``trade-price``,
        ``trade-volume``, ``trade-rawflag`` and ``trade-stringflag``.
    tz_exchange : str
        Time zone the ``index`` timestamps are converted to.
    only_non_special_trades : bool
        Keep only rows whose ``trade-stringflag`` equals ``"uncategorized"``.
    only_regular_trading_hours : bool
        Keep only trades whose local time of day lies between ``hhmmss_open``
        and ``hhmmss_close`` (both inclusive, whole-second resolution).
    merge_sub_trades : bool
        Collapse trades sharing a timestamp into one row holding the
        volume-weighted average price and the summed volume.
    hhmmss_open, hhmmss_close : str
        Session bounds as ``"HH:MM:SS"``.

    Returns
    -------
    polars.DataFrame
        With ``merge_sub_trades`` the columns are ``index``, ``trade-price``
        and ``trade-volume``.
    """
    # xltime is an Excel serial date (days, fractional part = time of day).
    # Excel's day 0 corresponds to 1899-12-30.
    excel_base_date = pl.datetime(1899, 12, 30)
    DF = DF.with_columns(
        (pl.col("xltime") * pl.duration(days=1) + excel_base_date).alias("index")
    )
    # The computed timestamps carry no time zone; convert_time_zone treats them
    # as UTC, so xltime is assumed to be expressed in UTC.
    DF = DF.with_columns(pl.col("index").dt.convert_time_zone(tz_exchange))

    # Filter on the flag BEFORE dropping it.
    if only_non_special_trades:
        DF = DF.filter(pl.col("trade-stringflag") == "uncategorized")

    DF = DF.drop(["xltime", "trade-rawflag", "trade-stringflag"])

    if only_regular_trading_hours:
        hh_open, mm_open, ss_open = [int(x) for x in hhmmss_open.split(":")]
        hh_close, mm_close, ss_close = [int(x) for x in hhmmss_close.split(":")]
        seconds_open = hh_open * 3600 + mm_open * 60 + ss_open
        seconds_close = hh_close * 3600 + mm_close * 60 + ss_close

        # Cast to Int32 first: hour * 3600 does not fit the small integer
        # types returned by the dt accessors.
        seconds_of_day = (
            pl.col("index").dt.hour().cast(pl.Int32) * 3600
            + pl.col("index").dt.minute().cast(pl.Int32) * 60
            + pl.col("index").dt.second().cast(pl.Int32)
        )
        DF = DF.filter((seconds_of_day >= seconds_open) & (seconds_of_day <= seconds_close))

    if merge_sub_trades:
        # Volume-weighted average price; the alias is applied to the whole
        # ratio, not just to the denominator.
        vwap = (
            (pl.col("trade-price") * pl.col("trade-volume")).sum()
            / pl.col("trade-volume").sum()
        ).alias("trade-price")
        DF = DF.group_by("index", maintain_order=True).agg([vwap, pl.sum("trade-volume")])

    return DF




def load_TRTH_trade_file(filename,
            tz_exchange="America/New_York",
            only_non_special_trades=True,
            only_regular_trading_hours=True,
            merge_sub_trades=True):
    """Read a TRTH trade file and clean it with ``wrangle_trade``.

    Parameters
    ----------
    filename : str or path-like
        File to read. Names ending in ``csv`` / ``csv.gz`` are read with
        ``pl.read_csv``; names ending in ``parquet`` with ``pl.read_parquet``.
    tz_exchange, only_non_special_trades, only_regular_trading_hours, merge_sub_trades
        Forwarded unchanged to ``wrangle_trade``.

    Returns
    -------
    polars.DataFrame or None
        The wrangled trades, or ``None`` when the format is unknown or the
        file cannot be read (a message is printed in both cases).
    """
    # Accept pathlib.Path as well as str; the suffix checks and the messages
    # below need a plain string.
    filename = str(filename)
    try:
        if filename.endswith(("csv", "csv.gz")):
            DF = pl.read_csv(filename)
        elif filename.endswith("parquet"):
            DF = pl.read_parquet(filename)
        else:
            print("cannot load file "+filename+" : unknown format")
            return None
    except Exception as exc:  # best effort, but do not swallow KeyboardInterrupt/SystemExit
        print(filename+" cannot be loaded: "+repr(exc))
        return None

    DF = wrangle_trade(DF,
            tz_exchange=tz_exchange,
            only_non_special_trades=only_non_special_trades,
            only_regular_trading_hours=only_regular_trading_hours,
            merge_sub_trades=merge_sub_trades)

    return DF

In [19]:
def wrangle_bbo(DF,
            tz_exchange="America/New_York",
            only_regular_trading_hours=True,
            hhmmss_open="09:30:00",
            hhmmss_close="16:00:00",
            merge_same_index=True):
    """Clean a raw TRTH best-bid/offer DataFrame.

    Steps, in order:
      1. build a tz-aware ``index`` column from the Excel serial date in
         ``xltime`` and drop ``xltime``;
      2. keep rows with ask > 0, bid > 0 and ask > bid;
      3. if ``merge_same_index``, keep the last row of each timestamp;
      4. if ``only_regular_trading_hours``, keep rows whose local time of day
         is within [``hhmmss_open``, ``hhmmss_close``].

    The session test works on whole seconds (hour/minute/second only), so a
    quote stamped within the second that starts at ``hhmmss_close`` is kept.

    Returns the resulting polars DataFrame.
    """
    # Excel serial days -> timestamp; day 0 of the serial scale is 1899-12-30.
    day_zero = pl.datetime(1899, 12, 30)
    timestamp = (pl.col("xltime") * pl.duration(days=1) + day_zero).alias("index")
    DF = (
        DF.with_columns(timestamp)
        .with_columns(pl.col("index").dt.convert_time_zone(tz_exchange))
        .drop("xltime")
    )

    # Sanity checks on the quotes: positive prices, uncrossed book.
    ask = pl.col("ask-price")
    bid = pl.col("bid-price")
    DF = DF.filter(ask > 0).filter(bid > 0).filter(ask > bid)

    if merge_same_index:
        # Several quotes can share a timestamp; the last one wins.
        DF = DF.group_by('index', maintain_order=True).last()

    if not only_regular_trading_hours:
        return DF

    hh_open, mm_open, ss_open = [int(part) for part in hhmmss_open.split(":")]
    hh_close, mm_close, ss_close = [int(part) for part in hhmmss_close.split(":")]
    seconds_open = hh_open*3600 + mm_open*60 + ss_open
    seconds_close = hh_close*3600 + mm_close*60 + ss_close

    # Seconds since local midnight, widened to Int32 before the arithmetic.
    local_time = pl.col('index').dt
    seconds_of_day = (
        local_time.hour().cast(pl.Int32)*3600
        + local_time.minute().cast(pl.Int32)*60
        + local_time.second().cast(pl.Int32)
    )
    return DF.filter(seconds_of_day >= seconds_open,
                     seconds_of_day <= seconds_close)

def load_TRTH_bbo_file(filename,
            tz_exchange="America/New_York",
            only_regular_trading_hours=True,
            hhmmss_open="09:30:00",
            hhmmss_close="16:00:00",
            merge_same_index=True):
    """Read a TRTH best-bid/offer file and clean it with ``wrangle_bbo``.

    Parameters
    ----------
    filename : str or path-like
        File to read. Names ending in ``csv`` / ``csv.gz`` are read with
        ``pl.read_csv``; names ending in ``parquet`` with ``pl.read_parquet``.
    tz_exchange, only_regular_trading_hours, hhmmss_open, hhmmss_close, merge_same_index
        Forwarded unchanged to ``wrangle_bbo``.

    Returns
    -------
    polars.DataFrame or None
        The wrangled quotes, or ``None`` when the format is unknown or the
        file cannot be read (a message is printed in both cases).
    """
    # Accept pathlib.Path as well as str; the suffix checks and the messages
    # below need a plain string.
    filename = str(filename)
    try:
        if filename.endswith(("csv", "csv.gz")):
            DF = pl.read_csv(filename)
        elif filename.endswith("parquet"):
            DF = pl.read_parquet(filename)
        else:
            print("cannot load file "+filename+" : unknown format")
            return None
    except Exception as exc:  # best effort, but do not swallow KeyboardInterrupt/SystemExit
        print(filename+" cannot be loaded: "+repr(exc))
        return None

    DF = wrangle_bbo(DF,
            tz_exchange=tz_exchange,
            only_regular_trading_hours=only_regular_trading_hours,
            hhmmss_open=hhmmss_open,
            hhmmss_close=hhmmss_close,
            merge_same_index=merge_same_index)

    return DF

In [20]:
# Load one day (2010-05-03) of AAP trades and best bid/offer quotes with the
# loaders' default options.
# NOTE(review): the two files live under different roots ("Data/data" vs
# "Data/data 2") - confirm this is intentional.
trade_df = load_TRTH_trade_file(
    filename="Data/data/extraction/TRTH/raw/equities/US/trade/AAP/2010-05-03-AAP-trade.parquet",
)
bbo_df = load_TRTH_bbo_file(
    filename="Data/data 2/extraction/TRTH/raw/equities/US/bbo/AAP/2010-05-03-AAP-bbo.parquet",
)

In [21]:
# Show the cleaned trades (the bare last expression is rendered as a table).
trade_df

index,trade-price,trade-volume
"datetime[μs, America/New_York]",f64,i32
2010-05-03 09:30:00.638 EDT,45.13,200
2010-05-03 09:30:03.871 EDT,45.21,100
2010-05-03 09:30:06.277 EDT,45.2,100
2010-05-03 09:30:06.289 EDT,45.316,500
2010-05-03 09:30:10.254 EDT,45.21,100
…,…,…
2010-05-03 15:59:53.012999 EDT,46.29,600
2010-05-03 15:59:54.416 EDT,46.28,100
2010-05-03 15:59:57.126 EDT,46.3,200
2010-05-03 15:59:57.520 EDT,46.28,100


In [22]:
# Show the cleaned best bid/offer quotes (the bare last expression is rendered as a table).
bbo_df

index,bid-price,bid-volume,ask-price,ask-volume
"datetime[μs, America/New_York]",f64,i32,f64,i32
2010-05-03 09:30:00.623 EDT,45.06,1,45.53,1
2010-05-03 09:30:00.645 EDT,44.99,1,45.53,1
2010-05-03 09:30:01.105 EDT,45.17,5,45.39,13
2010-05-03 09:30:01.421 EDT,45.18,2,45.39,13
2010-05-03 09:30:01.663 EDT,45.18,2,45.38,2
…,…,…,…,…
2010-05-03 15:59:59.204 EDT,46.28,275,46.29,2
2010-05-03 15:59:59.253 EDT,46.28,275,46.29,1
2010-05-03 15:59:59.607 EDT,46.28,275,46.29,4
2010-05-03 15:59:59.800 EDT,46.28,276,46.29,4


In [23]:
# Put quotes and trades on one timeline, carry the latest quote forward within
# each calendar day, then keep only rows where every column is populated.
# After the full join, quote-only rows have null trade columns, so drop_nulls
# removes them, along with any trade that precedes the first quote of its day.
events = (
    bbo_df
    .join(trade_df, on='index', how="full", coalesce=True)
    .sort('index')
    .with_columns(pl.col('index').dt.date().alias('date'))
    .with_columns([
        pl.col(quote_field).forward_fill().over('date')
        for quote_field in ('bid-price', 'bid-volume', 'ask-price', 'ask-volume')
    ])
    .drop_nulls()
)
events

index,bid-price,bid-volume,ask-price,ask-volume,trade-price,trade-volume,date
"datetime[μs, America/New_York]",f64,i32,f64,i32,f64,i32,date
2010-05-03 09:30:00.638 EDT,45.06,1,45.53,1,45.13,200,2010-05-03
2010-05-03 09:30:03.871 EDT,45.21,1,45.37,4,45.21,100,2010-05-03
2010-05-03 09:30:06.277 EDT,45.19,4,45.36,2,45.2,100,2010-05-03
2010-05-03 09:30:06.289 EDT,45.19,6,45.36,2,45.316,500,2010-05-03
2010-05-03 09:30:10.254 EDT,45.21,1,45.34,2,45.21,100,2010-05-03
…,…,…,…,…,…,…,…
2010-05-03 15:59:53.012999 EDT,46.28,249,46.3,4,46.29,600,2010-05-03
2010-05-03 15:59:54.416 EDT,46.28,249,46.3,4,46.28,100,2010-05-03
2010-05-03 15:59:57.126 EDT,46.28,247,46.3,7,46.3,200,2010-05-03
2010-05-03 15:59:57.520 EDT,46.28,247,46.3,7,46.28,100,2010-05-03
