In [1]:
# default_exp core

# ParseDeep

> Parse IEX DEEP files

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export
from pcap import pcap
from struct import unpack, unpack_from, calcsize

In [4]:
#hide
import pyarrow as pa 
import pyarrow.parquet as pq
import pandas as pd


In [5]:
#export
class Msg():
    "Base class for a DEEP message: stores its sequence number and its leading type byte"
    def __init__(self, sequence_number, msg):
        # A one-element slice (not msg[0]) is used, so an empty `msg` gives an
        # empty slice instead of raising IndexError.
        self.type = msg[:1]
        self.sequence_number = sequence_number

In [6]:
#export
class PriceLevelUpdateMsg(Msg):
    "Class for working with a Price Level Update message"

    def __init__(self, sequence_number, msg):
        """Decode the fields that follow the leading type byte of `msg`.

        Sets `event_flags`, `timestamp`, `symbol` (bytes, trailing whitespace
        removed), `size` and `price` (the raw integer divided by 10000), then
        lets `Msg.__init__` store `sequence_number` and `type`.

        Raises `struct.error` if `msg` is too short for the layout.
        """
        # Little-endian, unpadded: int8, int64, 8 raw bytes, int32, int64.
        fmt = "<bq8siq"

        # Read from offset 1 to skip the type byte without copying the buffer.
        self.event_flags, self.timestamp, raw_symbol, self.size, raw_price = \
            unpack_from(fmt, msg, 1)

        self.symbol = raw_symbol.rstrip()
        # Divide by the exactly representable 10000 rather than multiplying by
        # the inexact .0001: the quotient is correctly rounded, whereas the
        # product can be one ulp off (3 * .0001 == 0.00030000000000000003).
        self.price = raw_price / 10000

        super().__init__(sequence_number, msg)

In [7]:
#export
class BuySidePriceLevelUpdageMsg(PriceLevelUpdateMsg):
    """Class for working with a Buy Side Price Level Update message.

    Inherits all parsing from `PriceLevelUpdateMsg` and adds no fields or
    behaviour of its own.
    """
    # NOTE(review): "Updage" looks like a typo of "Update"; the name is left
    # unchanged because `Pkt.next_msg` refers to it.

In [8]:
#export
class SellSidePriceLevelUpdageMsg(PriceLevelUpdateMsg):
    """Class for working with a Sell Side Price Level Update message.

    Inherits all parsing from `PriceLevelUpdateMsg` and adds no fields or
    behaviour of its own.
    """
    # NOTE(review): "Updage" looks like a typo of "Update"; the name is left
    # unchanged because `Pkt.next_msg` refers to it.

In [9]:
#export
class UnsupportedMsg(Msg):
    """Class for catching unsupported messages.

    Adds nothing to `Msg`, so an instance carries only `sequence_number` and
    `type`; the rest of the message is not decoded.
    """

In [10]:
#export
class Pkt():
    "Class for working with a DEEP packet"
    def __init__(self, pkt):
        """Locate the 8-byte header marker inside `pkt` and decode what follows.

        `pkt` must support `find` and slicing (e.g. `bytes`). Everything up to
        and including the marker (version, reserved, protocol id, channel id)
        is discarded; the fixed-size fields after it are unpacked and the
        remaining bytes are kept in `self.msgs` for `next_msg`.

        Raises `ValueError` if the marker is not present, and `struct.error`
        if fewer bytes follow it than the fixed fields need.
        """
        # Position of the most recently returned message; -1 means none yet.
        self.index = -1

        self.version     = b"\x01"
        self.reserved    = b"\x00"
        self.protocol_id = b"\x04\x80"
        self.channel_id  = b"\x01\x00\x00\x00"

        header = (
            self.version +
            self.reserved +
            self.protocol_id +
            self.channel_id
        )

        # find() returns -1 on a miss; slicing with that would silently parse
        # unrelated bytes as header fields, so fail loudly instead.
        start = pkt.find(header)
        if start < 0:
            raise ValueError("DEEP transport header not found in packet")
        pkt = pkt[start + len(header):]

        # Little-endian, unpadded: int32, int16, int16, int64, int64, int64.
        # NOTE(review): all codes are signed; confirm against the feed
        # specification whether the 16/32-bit fields should be unsigned.
        fmt = "<lhhqqq"

        (self.session_id, self.payload_len, self.msg_count,
         self.stream_offset, self.first_msg_seq_num, self.send_time) = \
            unpack_from(fmt, pkt)

        self.msgs = pkt[calcsize(fmt):]

    def next_msg(self):
        """Return the next message of the packet, or `None` once all are consumed.

        Each message is preceded by a 2-byte little-endian length. The result
        is a `BuySidePriceLevelUpdageMsg` for type `b'8'`, a
        `SellSidePriceLevelUpdageMsg` for type `b'5'`, and an `UnsupportedMsg`
        otherwise. Calling again after `None` keeps returning `None`.
        """
        # Test before advancing so repeated calls after exhaustion stay on this
        # branch instead of running past msg_count into an empty buffer.
        if self.index + 1 >= self.msg_count:
            self.index = self.msg_count
            return None
        self.index += 1

        frag, rest = self.msgs[0:2], self.msgs[2:]
        msg_len = unpack("<h", frag)[0]
        msg, self.msgs = rest[0:msg_len], rest[msg_len:]

        seq_num = self.first_msg_seq_num + self.index
        msg_type = msg[0:1]
        if msg_type == b'8': # buy side
            return BuySidePriceLevelUpdageMsg(seq_num, msg)
        if msg_type == b'5': # sell side
            return SellSidePriceLevelUpdageMsg(seq_num, msg)

        return UnsupportedMsg(seq_num, msg)

In [11]:
#export 
class Deep():
    "Reader that hands out the packets of a DEEP pcap export file one at a time"
    def __init__(self, path):
        self.pcap = pcap(path)
        # Slot filled by the dispatch callback with the latest parsed packet.
        self._pkt = None

    def __set_pkt(self, ts, _pkt):
        # Callback given to dispatch: it is called with (ts, _pkt); `ts` is
        # not used, `_pkt` is wrapped in a Pkt and stored.
        self._pkt = Pkt(_pkt)

    def next_pkt(self):
        "Return the next packet as a `Pkt`, or `None` when dispatch does not report exactly one packet"
        handled = self.pcap.dispatch(1, self.__set_pkt)
        if handled != 1:
            return None
        return self._pkt

In [12]:
# Forward slash instead of a backslash: "\d" is an invalid escape sequence in a
# normal string literal, and "/" is accepted as a separator on Windows as well
# as on POSIX systems.
deep = Deep('input/data_feeds_20210924_20210924_IEXTP1_DEEP1.0.pcap')

In [13]:
# Number of messages buffered in memory before they are written out as one file.
chunk_size = 1_000_000

In [14]:
def write_chunk(msgs):
    """Write a batch of messages to a single parquet file under `output/`.

    `msgs` is a sequence of objects exposing `timestamp`, `symbol`, `size` and
    `price`; each attribute becomes one column. The file is named after the
    timestamp of the first message in the batch. An empty batch writes nothing.
    """
    # Nothing to write, and msgs[0] below would raise IndexError.
    if not msgs:
        return

    df = pd.DataFrame({
        'timestamp': [m.timestamp for m in msgs],
        'symbol': [m.symbol for m in msgs],
        'size': [m.size for m in msgs],
        'price': [m.price for m in msgs]
    })
    table = pa.Table.from_pandas(df)
    # Forward slash: "\p" is an invalid escape sequence, and "/" works as a
    # path separator on both Windows and POSIX.
    pq.write_table(table, 'output/price_level_updates_{}.parquet'.format(msgs[0].timestamp))

In [15]:
# Chunk counter: bumped after every flushed chunk, not read inside this cell.
i = 1
msgs = []
while pkt := deep.next_pkt():
    while msg := pkt.next_msg():
        # Only decoded price level updates are kept.
        if isinstance(msg, UnsupportedMsg):
            continue
        msgs.append(msg)
        if len(msgs) >= chunk_size:
            write_chunk(msgs)
            msgs = []
            i += 1
            print(".", end="")

# Write the leftovers (fewer than chunk_size). Skip the call when nothing is
# left, e.g. when the total was an exact multiple of chunk_size.
if msgs:
    write_chunk(msgs)
print(".")

.....................................................................................