# Final Project Features

### Data Processing

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import datetime
import functools
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from statsmodels.stats.stattools import durbin_watson

In [2]:
# Directory holding the raw per-ticker trade CSVs. File names are appended to it
# by plain string concatenation in the cells below, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_to_trade_data = 'D:/academics/trading/final/trade_data/'
# Output directory prefix for the per-ticker metric CSVs (also concatenated as a
# string, so it must end with a slash as well).
where_to_save_data = 'features/'

### Functions

In [3]:
def clean_etf_df(etf_df):
    """Clean raw trade records and collapse them to one row per timestamp.

    The input frame is not modified; a cleaned copy is returned.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Trade records with the columns ``DATE`` (parsed with format
        ``%Y%m%d``), ``TIME_M`` (time-of-day string), ``TR_SEQNUM``,
        ``PRICE``, ``SIZE``, ``NBB``, ``NBO``, ``NBBqty``, ``NBOqty`` and
        ``LeeReady`` (trade sign, 0 when unclassified). ``SYM_SUFFIX`` is
        dropped if present; rows with any remaining missing value are removed.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``received`` (trade timestamp), sorted, restricted to
        09:30 <= time < 16:00, holding the last trade of every timestamp plus
        the derived columns: ``bid_ask_spread``, ``bid_ask_over_price``, their
        per-timestamp sums, ``timestamp_count``, ``timestamp_volume``,
        ``cumulative_trade_count``, ``cumulative_volume``, ``direction_size``,
        ``dollar_direction`` and ``order_imbalance``.
    """
    # Work on the function argument (not a notebook-level global) and on a
    # fresh copy so the caller's frame is left untouched.
    etf_df = etf_df.drop(columns='SYM_SUFFIX', errors='ignore').dropna().copy()

    etf_df['DATE'] = pd.to_datetime(etf_df['DATE'], format='%Y%m%d')
    etf_df['received'] = (etf_df['DATE'].astype(str) + ' ' + etf_df['TIME_M']).apply(pd.Timestamp)

    etf_df['bid_ask_spread'] = etf_df['NBO'] - etf_df['NBB']
    etf_df['bid_ask_over_price'] = etf_df['bid_ask_spread'] / etf_df['PRICE']

    # Per-timestamp aggregates, broadcast back onto every trade of that timestamp.
    # The string alias 'sum' avoids the deprecation of passing the builtin.
    per_timestamp = etf_df.groupby('received')
    etf_df['bid_ask_over_price_timestamp_sum'] = per_timestamp['bid_ask_over_price'].transform('sum')
    etf_df['bid_ask_spread_timestamp_sum'] = per_timestamp['bid_ask_spread'].transform('sum')
    etf_df['timestamp_count'] = per_timestamp['SIZE'].transform('count')  # trades per timestamp
    etf_df['timestamp_volume'] = per_timestamp['SIZE'].transform('sum')   # volume in timestamp

    etf_df = etf_df.sort_values(['received', 'TR_SEQNUM'])
    etf_df['cumulative_trade_count'] = np.arange(1, len(etf_df) + 1)
    etf_df['cumulative_volume'] = etf_df['SIZE'].cumsum()

    # Tick test for trades still unclassified (LeeReady == 0): compare the
    # price with the trade 1 row back, then (if still unclassified) 2 rows back.
    # +1 = price rose (buyer initiated), -1 = price fell (seller initiated).
    for lag in (1, 2):
        unclassified = etf_df['LeeReady'] == 0
        tick = np.sign(etf_df['PRICE'] - etf_df['PRICE'].shift(lag)).fillna(0)
        etf_df['LeeReady'] = etf_df['LeeReady'] + tick.where(unclassified, 0)

    # Signed share and dollar volume, summed per timestamp.
    etf_df['direction_size'] = etf_df['SIZE'] * etf_df['LeeReady']
    etf_df['dollar_direction'] = etf_df['PRICE'] * etf_df['SIZE'] * etf_df['LeeReady']
    per_timestamp = etf_df.groupby('received')
    etf_df['direction_size'] = per_timestamp['direction_size'].transform('sum')
    etf_df['dollar_direction'] = per_timestamp['dollar_direction'].transform('sum')

    # Keep the last trade of each timestamp and index by that timestamp.
    etf_df = etf_df.drop_duplicates(subset='received', keep='last').set_index('received')

    # Keep 09:30 (inclusive) to 16:00 (exclusive).
    minute_of_day = etf_df.index.hour * 60 + etf_df.index.minute
    in_session = (minute_of_day >= 9 * 60 + 30) & (minute_of_day < 16 * 60)
    etf_df = etf_df.loc[in_session].sort_index().copy()

    # Quote imbalance in [-1, 1]: (bid qty - ask qty) / (bid qty + ask qty).
    quoted_qty = etf_df['NBBqty'] + etf_df['NBOqty']
    etf_df['order_imbalance'] = (etf_df['NBBqty'] - etf_df['NBOqty']) / quoted_qty

    return etf_df

In [4]:
def calc_trades_in_period(trade_counts,Tau):
    """Count the trades that fall in the trailing window of length Tau.

    trade_counts: Series of per-timestamp trade counts on a sorted DatetimeIndex.
    Tau: window length, anything accepted by pd.Timedelta (e.g. '60s').

    For each timestamp t the running total at t is compared with the running
    total at the first observation at or after t - Tau (back-fill lookup), so
    the observation that opens the window is not counted itself.
    """
    running_total = trade_counts.cumsum()

    window_open = running_total.index - pd.Timedelta(Tau)
    total_at_open = running_total.reindex(window_open, method='bfill')
    # Put the looked-up totals back on the original timestamps so they line up.
    total_at_open.index = running_total.index

    return running_total - total_at_open

def tau_average(field,trade_counts,Tau):
    """Average of `field` per trade over the trailing window of length Tau.

    field: Series of per-timestamp sums on a sorted DatetimeIndex.
    trade_counts: Series of per-timestamp trade counts on the same index.
    Tau: window length, anything accepted by pd.Timedelta (e.g. '60s').

    The windowed sum of `field` is divided by the windowed trade count from
    calc_trades_in_period; both use the same back-fill lookup at t - Tau.
    """
    running_sum = field.cumsum()

    window_open = running_sum.index - pd.Timedelta(Tau)
    sum_at_open = running_sum.reindex(window_open, method='bfill')
    # Re-label with the original timestamps so the subtraction aligns row by row.
    sum_at_open.index = running_sum.index

    windowed_sum = running_sum - sum_at_open
    windowed_count = calc_trades_in_period(trade_counts, Tau)

    return windowed_sum / windowed_count

In [5]:
def tau_trade_flow(cummulative_trades,Tau):
    """Signed trade flow over the trailing window Tau, lagged by one row.

    cummulative_trades: cumulative signed trade series on a sorted DatetimeIndex.
    Tau: window length, anything accepted by pd.Timedelta (e.g. '60s').

    Returns a Series named 'flow'. The value at each row is the flow computed
    for the previous row, so the first row is NaN.
    """
    window_open = cummulative_trades.index - pd.Timedelta(Tau)

    # Cumulative value at the first observation at or after t - Tau.
    baseline = cummulative_trades.reindex(window_open, method='bfill')
    baseline.index = cummulative_trades.index

    lagged_flow = (cummulative_trades - baseline).shift(1)
    return lagged_flow.rename('flow')

def T_ewm_vol(price_series,T):
    """Calculate cum log returns up to and not including T.
        EWMVar: Mean squared T return with decay = T.
        Vol = square root of EWMVar."""
    #avoid jumps in the return series from close to open (could remove if we want to include these)
    new_day = np.where(price_series.reset_index()['received'].dt.day.diff()!=0,np.nan,1)
    returns = np.log(price_series/price_series.shift()) * new_day
    T_returns = returns.rolling(T,closed='left').sum()
    
    ewm_var = (T_returns**2).ewm(halflife = T, times=returns.index,ignore_na=True).mean()
    ewm_vol = np.sqrt(ewm_var)
    ewm_vol.name = 'exp weighted volatility'
    return ewm_vol

def T_fwd_rtn(price_series,T):
    """Simple return from each timestamp to the price prevailing T later.

    price_series: prices on a sorted DatetimeIndex.
    T: horizon, anything accepted by pd.Timedelta (e.g. '300s').

    The future price is the last observation at or before t + T (forward-fill
    lookup). Returns a Series named 'fwd_rtn' on the original index.
    """
    horizon = pd.Timedelta(T)

    future_prices = price_series.reindex(price_series.index + horizon, method='ffill')
    # Re-label with the original timestamps so the division aligns row by row.
    future_prices.index = price_series.index

    forward_returns = future_prices / price_series - 1
    return forward_returns.rename('fwd_rtn')

def T_rtn(price_series,T):
    """Calculate trailing (backward-looking) returns over the window T.

    price_series: prices on a sorted DatetimeIndex.
    T: look-back window, anything accepted by pd.Timedelta (e.g. '300s').

    The reference price is the first observation at or after t - T (back-fill
    lookup). Returns a Series named 'rtn' on the original index.
    """

    T_prices = price_series.reindex(price_series.index-pd.Timedelta(T), method='bfill')
    # Re-label with the original timestamps so the division aligns row by row.
    T_prices.index = price_series.index
    T_rtns = price_series/T_prices-1

    T_rtns.name = 'rtn'

    # The original returned the undefined name `T_fwd_rtns` (NameError).
    return T_rtns


In [6]:
def calc_fwd_rtns(etf_trades):
    """Forward returns of PRICE at 5, 10 and 15 minute horizons.

    etf_trades: frame with a 'PRICE' column on a DatetimeIndex.
    Returns a frame sorted by index with the columns
    'fwd_rtn_5min', 'fwd_rtn_10min' and 'fwd_rtn_15min'.
    """
    prices = etf_trades['PRICE']
    horizons = [('fwd_rtn_5min', '300s'),
                ('fwd_rtn_10min', '600s'),
                ('fwd_rtn_15min', '900s')]

    fwd_rtns = pd.concat([T_fwd_rtn(prices, horizon) for _, horizon in horizons], axis=1)
    fwd_rtns.columns = [label for label, _ in horizons]

    return fwd_rtns.sort_index()

In [7]:
def calc_ewm_vol(price_series):
    """Annualized EWM volatility of price_series for several window lengths.

    Windows are 1, 2, 4, 5, 15, 20, 25, 30, 45 and 60 minutes. Each column is
    named 'ewm_vol_<seconds>s' (e.g. 'ewm_vol_60s'). Exact zeros are replaced
    by NaN.
    """
    vols = pd.DataFrame(index=price_series.index)

    for minutes in (1, 2, 4, 5, 15, 20, 25, 30, 45, 60):
        window = f'{60 * minutes}s'
        # Number of such windows in a year: 6.5 hours per day, 252 days.
        windows_per_year = 6.5*(60/minutes)*252
        annualized = np.sqrt(T_ewm_vol(price_series, window)**2*windows_per_year)
        vols['ewm_vol_' + window] = annualized.replace(0, np.nan)

    return vols

In [8]:
def calc_flow_metrics(sized_directions,dollar_sized_directions):
    """Trailing signed-flow features for a fixed set of window lengths.

    sized_directions: per-timestamp signed share volume (DatetimeIndex).
    dollar_sized_directions: per-timestamp signed dollar volume (same index).

    For each window of 1, 2, 4, 5, 15, 20, 25, 30, 45 and 60 minutes this builds
      * 'flow_<m>min'        - share flow over the window (tau_trade_flow),
      * 'EWMA_flow_<m>min'   - time-weighted EWM mean of that flow with a
                               half-life of twice the window,
      * 'dollar_flow_<m>min' - dollar flow over the window.
    Columns are returned in that group order.
    """
    window_minutes = (1, 2, 4, 5, 15, 20, 25, 30, 45, 60)

    # The cumulative series do not depend on the window, so build them once.
    cumulative_shares = sized_directions.cumsum()
    cumulative_dollars = dollar_sized_directions.cumsum()

    share_flows = []
    ewma_share_flows = []
    dollar_flow_list = []
    for minutes in window_minutes:
        window = f'{60 * minutes}s'
        halflife = f'{120 * minutes}s'  # half-life is twice the window length

        share_flow = tau_trade_flow(cumulative_shares, window)
        share_flows.append(share_flow)
        ewma_share_flows.append(share_flow.ewm(halflife=halflife, times=share_flow.index).mean())
        dollar_flow_list.append(tau_trade_flow(cumulative_dollars, window))

    flows = pd.concat(share_flows, axis=1)
    flows.columns = [f'flow_{minutes}min' for minutes in window_minutes]

    EWMA_flows = pd.concat(ewma_share_flows, axis=1)
    EWMA_flows.columns = [f'EWMA_flow_{minutes}min' for minutes in window_minutes]

    dollar_flows = pd.concat(dollar_flow_list, axis=1)
    dollar_flows.columns = [f'dollar_flow_{minutes}min' for minutes in window_minutes]

    return flows.join([EWMA_flows, dollar_flows])

In [9]:
def calc_bid_ask_metrics(bid_ask_spread_sum, bid_ask_over_price_sum, trade_counts):
    """Trailing per-trade averages of the bid-ask spread and spread/price.

    bid_ask_spread_sum: per-timestamp sum of the bid-ask spread.
    bid_ask_over_price_sum: per-timestamp sum of spread divided by price.
    trade_counts: per-timestamp trade counts (same DatetimeIndex).

    Averages are taken with tau_average over 1, 2, 5, 10, 15, 30 and 60
    minute windows. Returns a frame with the 'avg_bid_ask_price_<m>min'
    columns first, followed by the 'avg_bid_ask_<m>min' columns.
    """
    window_minutes = (1, 2, 5, 10, 15, 30, 60)

    def _window_averages(field, prefix):
        # One tau_average column per window, named '<prefix>_<m>min'.
        columns = [tau_average(field=field, trade_counts=trade_counts, Tau=f'{60 * minutes}s')
                   for minutes in window_minutes]
        averages = pd.concat(columns, axis=1)
        averages.columns = [f'{prefix}_{minutes}min' for minutes in window_minutes]
        return averages

    # Building both frames from the same helper guarantees every
    # 'avg_bid_ask_price_*' column comes from the spread/price field; the
    # original put the raw 30-minute spread into 'avg_bid_ask_price_30min'.
    bid_ask_price = _window_averages(bid_ask_over_price_sum, 'avg_bid_ask_price')
    bid_ask = _window_averages(bid_ask_spread_sum, 'avg_bid_ask')

    bid_ask_met = bid_ask_price.join(bid_ask)

    return bid_ask_met

In [10]:
def calc_imbalance_metrics(imbalances, trade_counts):
    """Trailing per-trade average order imbalance for several windows.

    imbalances: per-timestamp order imbalance series (DatetimeIndex).
    trade_counts: per-timestamp trade counts on the same index.

    Windows are 1, 2, 4, 5, 10, 15 and 30 minutes; the columns are named
    'order_imbalance_<m>min'.
    """
    window_minutes = (1, 2, 4, 5, 10, 15, 30)

    averaged = [tau_average(imbalances, trade_counts, f'{60 * minutes}s')
                for minutes in window_minutes]

    order_imbalances = pd.concat(averaged, axis=1)
    order_imbalances.columns = [f'order_imbalance_{minutes}min' for minutes in window_minutes]

    return order_imbalances

In [11]:
def calc_all_metrics(etf_trades, iNAV):
    """Calculate all the trade data metrics, forward returns, and join it together.

    etf_trades: raw trade records as expected by clean_etf_df.
    iNAV: Series named 'iNAV' on a sorted DatetimeIndex.

    Returns a frame indexed by the iNAV timestamps containing the iNAV, the
    most recent trade-book features strictly before each iNAV timestamp, the
    bid/ask discount to iNAV, and the 5/10/15 minute forward returns. Rows
    with any missing value are dropped.
    """
    etf_trades = clean_etf_df(etf_trades)
    trade_counts = etf_trades['timestamp_count']

    flow_metrics = calc_flow_metrics(sized_directions = etf_trades['direction_size'],
                                     dollar_sized_directions = etf_trades['dollar_direction'])

    bid_ask_metrics = calc_bid_ask_metrics(bid_ask_spread_sum = etf_trades['bid_ask_spread_timestamp_sum'],
                                           bid_ask_over_price_sum = etf_trades['bid_ask_over_price_timestamp_sum'],
                                           trade_counts = trade_counts)

    vol_metrics = calc_ewm_vol(price_series = etf_trades['PRICE'])

    fields_from_trade_book = etf_trades.loc[:,['PRICE','NBB','NBO','NBOqty','NBBqty',
                                               'cumulative_trade_count','cumulative_volume','order_imbalance']]

    order_imbalances = calc_imbalance_metrics(etf_trades['order_imbalance'],
                                              trade_counts=trade_counts)

    # Join all the fields from the trade book together. The windowed order
    # imbalances were previously computed but never joined, so they were
    # missing from the output; they are appended last to keep the existing
    # column order intact.
    trade_book_variables = flow_metrics.join([bid_ask_metrics, vol_metrics,
                                              fields_from_trade_book, order_imbalances])

    # For every iNAV timestamp take the latest trade-book row strictly before it.
    independent_variables = pd.merge_asof(iNAV,
                                          trade_book_variables,
                                          left_index=True,
                                          right_index=True,
                                          direction='backward',
                                          allow_exact_matches=False)

    independent_variables['nav_discount_bid'] = independent_variables['NBB']/independent_variables['iNAV']-1
    independent_variables['nav_discount_ask'] = independent_variables['NBO']/independent_variables['iNAV']-1

    fwd_rtns = calc_fwd_rtns(etf_trades)

    # Shift the forward-return timestamps back by 50 milliseconds for latency:
    # combined with the strict forward match below, a forward return is only
    # attached if its trade occurs more than 50 ms after the iNAV timestamp.
    fwd_rtns.index = fwd_rtns.index - pd.Timedelta(50,'milli')

    all_vars = pd.merge_asof(independent_variables,
                             fwd_rtns,
                             left_index=True,
                             right_index=True,
                             direction='forward',
                             allow_exact_matches=False).dropna()

    return all_vars

#### HYG

In [12]:
# HYG
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'HYG_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/hygiv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'HYG_metrics.csv')

#### JNK

In [13]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'JNK_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/jnkiv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'JNK_metrics.csv')

#### BKLN

In [14]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'BKLN_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/bklniv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'BKLN_metrics.csv')

#### SRLN

In [15]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'SRLN_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/srlniv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'SRLN_metrics.csv')

#### PFF

In [16]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'PFF_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/pffiv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'PFF_metrics.csv')

#### PGX

In [17]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'PGX_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/pgxiv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'PGX_metrics.csv')

#### SPHY

In [18]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'SPHY_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/sphyiv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'SPHY_metrics.csv')

#### HYGH

In [19]:
# Load the raw trade records for this ticker.
etf_trades = pd.read_csv(path_to_trade_data+'HYGH_2020.csv')

# Load the iNAV file indexed by its parsed 'date' column; only the 'iNAV' column is used.
etf_iNAV = pd.read_csv('iNAVs/hyghiv.csv', index_col = 'date',parse_dates=True)
iNAV = etf_iNAV.loc[:,'iNAV']

# Build the feature / forward-return table and write it to the output directory.
all_vars = calc_all_metrics(etf_trades = etf_trades, iNAV=iNAV)
all_vars.to_csv(where_to_save_data+'HYGH_metrics.csv')