# Pre-processing Tick Data

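Let's collect all of our cleanup and pre-processing of tick data in one notebook: load the raw trades and quotes, index them by timestamp, merge a single day of both, and save the results. Of course, this really belongs in a separate Python script...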

In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import datetime

def show_time( label_string ):
    """Print `label_string` followed by the current local wall-clock time.

    The printed line has the form
    '<label_string> : YYYY-MM-DD HH:MM:SS:ffffff', where the last field is
    microseconds. Nothing is returned.
    """
    now = datetime.datetime.now()
    stamp = now.strftime( '%Y-%m-%d %H:%M:%S:%f' )
    print( ' : '.join( ( label_string, stamp ) ) )

### Get Market Data

In [2]:
# load trades (pandas reads the zipped CSV directly)
show_time( 'started loading trades' )
trades = pd.read_csv( 'data/TWTR-trades-201901.zip' )
show_time( 'finished loading trades')

# index trades
show_time( 'started fixing trade index')

# fix column names
# Only the columns are renamed. The index is replaced wholesale a few lines
# below, so converting every index label to a string first (index=str) was
# wasted work and has no effect on the final frame.
trades = trades.rename( columns={ 'SIZE' : 'trade_qty', 'PRICE' : 'trade_price' } )

# fix index: concatenate the DATE and TIME_M text and parse it as a timestamp.
# The format string expects YYYYMMDD immediately followed by HH:MM:SS.fraction.
times = trades[ 'DATE' ].astype( str ) + trades[ 'TIME_M' ].astype( str )
formatStr = '%Y%m%d%H:%M:%S.%f'
trades.index = pd.to_datetime( times, format = formatStr )

# clean up unused columns (DATE/TIME_M now live in the index)
trades = trades.drop( columns=[ 'DATE', 'TIME_M', 'SYM_ROOT', 'SYM_SUFFIX', 'TR_CORR', 'TR_SEQNUM', 'TR_RF' ] )
show_time( 'finished fixing trade index' )

started loading trades : 2019-04-19 16:56:01:272872
finished loading trades : 2019-04-19 16:56:04:169623
started fixing trade index : 2019-04-19 16:56:04:169774
finished fixing trade index : 2019-04-19 16:56:12:071125


In [3]:
# load quotes (pandas reads the zipped CSV directly)
show_time( 'started loading quotes' )
quotes = pd.read_csv( 'data/TWTR-quotes-201901.zip' )
show_time( 'finished loading quotes')

# index quotes
show_time( 'started fixing quotes index' )

# fix column names
# Only the columns are renamed. The index is replaced wholesale a few lines
# below, so converting every index label to a string first (index=str) was
# wasted work and has no effect on the final frame.
quotes = quotes.rename( columns={ 'BID' : 'bid_price', 'BIDSIZ' : 'bid_qty', 'ASK' : 'ask_price', 'ASKSIZ' : 'ask_qty' } )

# fix index: concatenate the DATE and TIME_M text and parse it as a timestamp.
# The format string expects YYYYMMDD immediately followed by HH:MM:SS.fraction.
times = quotes[ 'DATE' ].astype( str ) + quotes[ 'TIME_M' ].astype( str )
formatStr = '%Y%m%d%H:%M:%S.%f'
quotes.index = pd.to_datetime( times, format = formatStr )

# cleanup unused columns (DATE/TIME_M now live in the index)
quotes = quotes.drop( columns=[ 'DATE', 'TIME_M', 'QU_SEQNUM', 'QU_SOURCE', 'SYM_ROOT', 'SYM_SUFFIX' ] )
show_time( 'finished fixing quotes index' )

started loading quotes : 2019-04-19 16:57:48:476146
finished loading quotes : 2019-04-19 16:59:36:075458
started fixing quotes index : 2019-04-19 16:59:36:096172
finished fixing quotes index : 2019-04-19 17:04:32:852462


In [7]:
# Cut both frames down to one window on 2019-01-31 using a label-based slice
# of the datetime index.
# NOTE: the end label is given to the second; the merged output shown later in
# this notebook contains rows stamped 16:00:00.9xx, so the window runs through
# the end of that second rather than stopping at 16:00:00.000 exactly.
session = slice( '2019-01-31 09:30:00', '2019-01-31 16:00:00' )
daytrades = trades.loc[ session ]
dayquotes = quotes.loc[ session ]

In [8]:
# combine the day's quotes and trades into one frame keyed by timestamp;
# an outer merge keeps rows that appear on only one side
# NOTE(review): when a timestamp repeats on both sides, an index merge pairs up
# the matching rows — confirm that is acceptable for this data
show_time( 'start merge' )
taq = pd.merge(
    left = dayquotes,
    right = daytrades,
    how = 'outer',
    left_index = True,
    right_index = True,
)
show_time( 'end merge' )

start merge : 2019-04-19 17:06:46:411107
end merge : 2019-04-19 17:06:47:711826


In [9]:
# show the final five rows of the merged frame
taq.tail( 5 )

Unnamed: 0,EX_x,bid_price,bid_qty,ask_price,ask_qty,QU_COND,NATBBO_IND,QU_CANCEL,EX_y,TR_SCOND,trade_qty,trade_price,TR_SOURCE
2019-01-31 16:00:00.481339704,K,33.23,10.0,33.55,1.0,R,A,,,,,,
2019-01-31 16:00:00.481352000,,,,,,,,,K,TI,5.0,33.55,C
2019-01-31 16:00:00.625473000,,,,,,,,,T,6 X,5000.0,33.44,C
2019-01-31 16:00:00.625593000,,,,,,,,,T,M,5000.0,33.44,C
2019-01-31 16:00:00.947814217,T,33.46,5.0,33.56,1.0,R,A,,,,,,


In [10]:
# persist the merged single-day frame as a gzip-compressed pickle
taq.to_pickle( "TWTR_taq_20190131.gz", compression = "gzip" )

In [11]:
# persist the indexed trades frame as a gzip-compressed pickle
trades.to_pickle( "TWTR_trades_201901.gz", compression = "gzip" )

In [12]:
# persist the indexed quotes frame as a gzip-compressed pickle
quotes.to_pickle( "TWTR_quotes_201901.gz", compression = "gzip" )

In [13]:
# row counts, one per line: merged frame, then trades, then quotes
print( len( taq ), len( trades ), len( quotes ), sep = '\n' )

2104802
1582553
45506707
