In [6]:
# default_exp feature_eng

In [13]:
# export
from mlfinlab.microstructural_features import (
    get_roll_measure,
    get_roll_impact,
    get_bar_based_kyle_lambda,
    get_bar_based_amihud_lambda,
)
from copy import deepcopy
import pandas as pd
import numpy as np
import logging
from mlbt.load_data import load_feat, save_feat
from mlbt.frac_diff import frac_diff_ffd
from mlbt.load_data import get_data, SYMBOLS_CSV, safe_feat_name, process_bars

# Rebind the name to a private copy so the column renaming below is applied to
# this module's frame rather than to the object imported from mlbt.load_data.
SYMBOLS_CSV = SYMBOLS_CSV.copy()
SYMBOLS_CSV.columns = SYMBOLS_CSV.columns.str.lower()
# Row-wise lookup table: {index label -> {lower-cased column name -> value}}.
# presumably the index labels are ticker symbols (compute_feature looks rows up
# by symbol) — verify against mlbt.load_data.
SYMBOLS_DICT = SYMBOLS_CSV.T.to_dict()


def roll_measure(df, window, price="Close"):
    """Roll measure: an estimate of the bid-ask spread (i.e. liquidity) of an instrument.

    Thin wrapper that passes ``df[price]`` and ``window`` to mlfinlab's
    ``get_roll_measure`` and returns its result unchanged.
    """
    prices = df[price]
    return get_roll_measure(prices, window)


def roll_impact(df, window, price="Close", dollar_volume="Dollar Volume"):
    """Roll measure divided by dollar volume, multiplied by 1e9.

    The spread estimate comes from ``roll_measure(df, window, price)`` and is
    divided element-wise by ``df[dollar_volume]`` before the 1e9 scaling.
    """
    spread_estimate = roll_measure(df, window, price)
    turnover = df[dollar_volume]
    return spread_estimate / turnover * 1e9


def kyle(df, window, price="Close", volume="Volume"):
    """Kyle (1985) lambda, a measure of market impact cost (i.e. liquidity).

    Passes ``df[price]``, ``df[volume]`` and ``window`` to mlfinlab's
    ``get_bar_based_kyle_lambda`` and multiplies the result by 1e9.
    """
    prices = df[price]
    volumes = df[volume]
    return get_bar_based_kyle_lambda(prices, volumes, window) * 1e9


def amihud(df, window, price="Close", dollar_volume="Dollar Volume"):
    """Amihud (2002) lambda, a measure of market impact cost (i.e. liquidity).

    Passes ``df[price]``, ``df[dollar_volume]`` and ``window`` to mlfinlab's
    ``get_bar_based_amihud_lambda`` and multiplies the result by 1e9.
    """
    prices = df[price]
    turnover = df[dollar_volume]
    return get_bar_based_amihud_lambda(prices, turnover, window) * 1e9


def autocorr(df, window, lag, column="Close"):
    """Rolling serial correlation of ``df[column]``.

    For every trailing window of ``window`` rows, ``Series.autocorr`` is
    evaluated at the given ``lag``; rows without a full window are NaN.
    """
    def _window_autocorr(chunk):
        # raw=False below means `chunk` arrives as a Series, so .autocorr exists
        return chunk.autocorr(lag=lag)

    rolling_view = df[column].rolling(window)
    return rolling_view.apply(_window_autocorr, raw=False)


def stdev(df, window, column="Close"):
    """Rolling standard deviation of ``df[column]`` over ``window`` rows."""
    rolling_view = df[column].rolling(window)
    return rolling_view.std()


def log_ret(df, periods=1, column="Close"):
    """Log return: difference of log-transformed ``df[column]`` over ``periods`` rows."""
    log_values = np.log(df[column])
    return log_values.diff(periods=periods)


def ffd(df, d, column="Close"):
    """Fractionally differentiated log prices.

    ``df[column]`` is wrapped in a one-column frame named "Close",
    log-transformed and handed to ``frac_diff_ffd`` together with ``d``;
    the "Close" column of that result is returned.
    """
    log_prices = np.log(df[column].to_frame('Close'))
    differentiated = frac_diff_ffd(log_prices, d)
    return differentiated['Close']


def volratio(df, com, volume="Volume", buy_volume="Buy Volume"):
    """
    Exponentially weighted mean of the per-bar buy-volume share.

    The share is ``df[buy_volume] / df[volume]`` and the smoothing uses
    ``ewm(com=com)``, so a value >0.50 would indicate buyers driving the market.
    """
    buy_share = df[buy_volume] / df[volume]
    smoothed = buy_share.ewm(com=com).mean()
    return smoothed


def stdev_from_ma(df, window, column="Close"):
    """Distance of ``df[column]`` from its rolling mean, in rolling standard deviations.

    Both the mean and the standard deviation are taken over the same trailing
    ``window`` rows; rows without a full window are NaN.
    """
    values = df[column]
    window_stats = values.rolling(window)
    deviation = values - window_stats.mean()
    return deviation / window_stats.std()


def close(df, column="Close"):
    """Pass-through feature: return ``df[column]`` unchanged."""
    selected = df[column]
    return selected


def lag(df, lag, column="Close"):
    """``df[column]`` shifted by ``lag`` rows (positive ``lag`` looks back)."""
    values = df[column]
    return values.shift(lag)


def lag_change(df, lag, column="Close"):
    """Fractional change of ``df[column]`` relative to its value ``lag`` rows earlier."""
    values = df[column]
    return values.pct_change(lag)


def lag_diff(df, lag, column="Close"):
    """Absolute change of ``df[column]`` relative to its value ``lag`` rows earlier."""
    current = df[column]
    previous = current.shift(lag)
    return current - previous


def ema(df, com, column="Close"):
    """Exponentially weighted mean of ``df[column]`` with centre of mass ``com``."""
    weighted = df[column].ewm(com=com)
    return weighted.mean()

    
# Dates
def month(df, column="Time"):
    """Calendar month of each timestamp in ``df[column]`` (via the ``.dt`` accessor)."""
    timestamps = df[column]
    return timestamps.dt.month


def week(df, column="Time"):
    """ISO week of the year (1-53) of each timestamp in ``df[column]``.

    ``Series.dt.week`` was deprecated in pandas 1.1 and removed in pandas 2.0,
    so ``Series.dt.isocalendar().week`` is used whenever it is available; the
    legacy accessor remains as a fallback for older pandas versions.

    Returns a Series carrying the name of the source column. The dtype is
    int64, or float64 with NaN when the column contains missing timestamps,
    which is what the legacy accessor produced.
    """
    timestamps = df[column]
    accessor = timestamps.dt
    if not hasattr(accessor, "isocalendar"):
        # pandas < 1.1: only the legacy accessor exists
        return accessor.week

    iso_week = accessor.isocalendar().week
    # isocalendar() yields a nullable UInt32 column called "week"; convert it
    # back to the plain dtype and series name callers got from .dt.week.
    plain_dtype = "float64" if iso_week.isna().any() else "int64"
    return iso_week.astype(plain_dtype).rename(timestamps.name)

    
def day(df, column="Time"):
    """Day of the month of each timestamp in ``df[column]`` (via the ``.dt`` accessor)."""
    timestamps = df[column]
    return timestamps.dt.day

    
def weekday(df, column="Time"):
    """Day of the week of each timestamp in ``df[column]`` (Monday=0 ... Sunday=6)."""
    timestamps = df[column]
    return timestamps.dt.weekday

    
def hour(df, column="Time"):
    """Hour of the day of each timestamp in ``df[column]`` (via the ``.dt`` accessor)."""
    timestamps = df[column]
    return timestamps.dt.hour


def tick_bars(df, size, column="Close"):
    """Re-bar ``df`` with ``process_bars(df, size, "tick")`` and return one column of the result."""
    rebarred = process_bars(df, size, "tick")
    return rebarred[column]

    
# Registry mapping the "name" field of a feature config to the function that
# computes it; compute_feature dispatches through FEATURES[name](df, **params).
FEATURES = {
    "auto": autocorr,
    "stdev": stdev,
    "roll": roll_measure,
    "rollimp": roll_impact,
    "kyle": kyle,
    "amihud": amihud,
    "volratio": volratio,
    "log_ret": log_ret,
    "ffd": ffd,
    "close": close,
    "lag": lag,
    "lag_change": lag_change,
    "lag_diff": lag_diff,
    "ema": ema,
    "stdev_from_ma": stdev_from_ma,

    # NOTE(review): registered as "time_bars" although tick_bars asks
    # process_bars for "tick" bars — presumably because external symbols are
    # loaded as minutely data, so N ticks span about N minutes; confirm.
    "time_bars": tick_bars,

    # Calendar features read from the bars' timestamp column
    "month": month,
    "week": week,
    "day": day,
    "weekday": weekday,
    "hour": hour,
}

def run_feature_engineering(config, deck):
    """Load or compute every configured feature for every symbol in ``deck``.

    Each entry of ``config["features"]`` is resolved through
    ``engineer_feature``; the resulting "Close" column is renamed to the
    feature's readable name, aligned to the index of the symbol's bars and
    joined column-wise with the other features.

    Parameters
    ----------
    config : dict
        Run configuration. ``config["features"]`` holds the feature config
        dicts; the whole dict is forwarded to ``engineer_feature``.
    deck : dict
        Maps symbol -> dict holding at least a ``"bars"`` DataFrame. It is
        modified in place: the feature matrix is stored under
        ``deck[symbol]["feats"]``.

    Returns
    -------
    dict
        The same ``deck`` object that was passed in.
    """
    feat_configs = config["features"]
    for symbol, symbol_deck in deck.items():
        logging.debug(f"{symbol}: Feature engineering for {len(feat_configs)} features")
        # Loop-invariant: every feature of this symbol is aligned to these rows
        bars_index = symbol_deck['bars'].index
        feats = []
        for feat_config in feat_configs:
            # engineer_feature works on its own deep copy of feat_config, so the
            # config entries stay untouched and reusable for the next symbol
            name = safe_feat_name(feat_config, safe_for_fs=False)
            feat = engineer_feature(deck, symbol, config, feat_config)["Close"]
            logging.debug(f'Got {feat.shape} shape for feature: {name}')
            feat.name = name
            # Compare the index labels, not just the length: a feature built
            # from another symbol can have as many rows as the bars while
            # sitting on different timestamps, which would make concat widen
            # the index instead of aligning to the bars.
            if not feat.index.equals(bars_index):
                # We're only interested in values we have prices for; carry the
                # last known value forward onto each bar timestamp
                feat = feat.reindex(index=bars_index, method='ffill')

            feats.append(feat)
        if feats:
            feats2 = pd.concat(feats, axis=1)
        else:
            # pd.concat rejects an empty list; no features means no columns
            feats2 = pd.DataFrame(index=bars_index)
        logging.debug(f"Joined {len(feats)} features into {feats2.shape} shape")
        symbol_deck['feats'] = feats2
    return deck

def get_bars(deck, symbol, config):
    """Return the bars a feature should be computed from.

    Symbols present in ``deck`` yield a deep copy of ``deck[symbol]['bars']``.
    Any other symbol is fetched with ``get_data`` as "minutely" data between
    ``config["start_date"]`` and ``config["end_date"]``.
    """
    if symbol not in deck:
        # We're loading a feature external to the price data of our trading universe
        return get_data(symbol, "minutely", config["start_date"], config["end_date"])

    # TODO: Remove deep copy
    return deck[symbol]['bars'].copy(deep=True)

def fill_out_symbol(feat_conf, for_symbol):
    """Make every level of a (possibly nested) feature config carry a 'symbol'.

    A missing 'symbol' is set to ``for_symbol``. When 'symbol' is itself a
    feature config (a dict), the same rule is applied to it recursively.
    ``feat_conf`` is modified in place and also returned.
    """
    source = feat_conf.setdefault('symbol', for_symbol)
    if isinstance(source, dict):
        # Nested config: mutated in place, so no reassignment is needed
        fill_out_symbol(source, for_symbol)
    return feat_conf
    

def engineer_feature(deck, for_symbol, config, feat_conf):
    """Resolve one feature config into a feature frame.

    Steps:
      1. Deep-copy ``feat_conf`` and fill in missing 'symbol' entries with
         ``for_symbol``; the caller's dict is never modified.
      2. Ask ``load_feat`` for the feature; anything other than ``None`` is
         returned as-is.
      3. Otherwise obtain the input data, either by recursing when 'symbol' is
         itself a feature config or from ``get_bars``, and hand it to
         ``compute_feature``.
      4. Pass the result to ``save_feat`` when ``config["save_to_disk"]`` is truthy.
    """
    resolved_conf = fill_out_symbol(deepcopy(feat_conf), for_symbol)
    source = resolved_conf['symbol']

    stored = load_feat(config, resolved_conf)
    if stored is not None:
        return stored

    if isinstance(source, dict):
        # We're computing a feature on a feature
        base = engineer_feature(deck, for_symbol, config, source)
    else:
        base = get_bars(deck, source, config)

    computed = compute_feature(deck, for_symbol, config, resolved_conf, source, base)
    if config["save_to_disk"]:
        save_feat(config, resolved_conf, computed)
    return computed

def compute_feature(deck, for_symbol, config, feat_conf, symbol, df):
    """Compute a single feature from ``df`` and return it as a one-column frame.

    'sector' and 'exchange' are encoded as the position of the symbol's
    category within the sorted unique values of that SYMBOLS_CSV column,
    repeated over ``df.index``. Every other name is looked up in FEATURES and
    called with ``df`` plus all config entries except 'name' and 'symbol'.
    """
    logging.debug(f"Computing {feat_conf['name']} for {for_symbol}: {feat_conf}")
    feat_name = feat_conf['name']

    if feat_name in ('sector', 'exchange'):
        categories = sorted(set(SYMBOLS_CSV[feat_name]))
        category = SYMBOLS_DICT[symbol][feat_name]
        values = pd.Series(categories.index(category), index=df.index)
    else:
        reserved = ('name', 'symbol')
        params = {key: value for key, value in feat_conf.items() if key not in reserved}
        values = FEATURES[feat_name](df, **params)

    # Every feature's column is called Close to enable easy recursion
    return values.to_frame("Close")


def define_feature_configs():
    """Build the list of feature configs that make up the features matrix.

    The list starts with three window-independent features: log returns, log
    returns of hourly VIX bars, and fractionally differentiated prices. Then,
    for each window in [10, 25, 50, 250], it adds windowed log returns, the
    liquidity measures, the volume ratio and its lag, several rolling
    standard deviations, and autocorrelations for every lag shorter than the
    window.
    """
    def _stdev_of(source, window):
        # Rolling-stdev config computed on top of another feature config
        return {"name": "stdev", "window": window, "symbol": source}

    frac_diff = {"name": "ffd", "d": 0.3}
    plain_log_ret = {"name": "log_ret"}
    hourly_vix = {"name": "time_bars", "size": 60, "symbol": 'VIX.XO'}
    vix_log_ret = {"name": "log_ret", "symbol": hourly_vix}

    windows = [10, 25, 50, 250]
    features = [plain_log_ret, vix_log_ret, frac_diff]

    for window in windows:
        vol_ratio = {"name": "volratio", "com": window}
        vol_ratio_stdev = _stdev_of(vol_ratio, window)
        log_ret_stdev = _stdev_of(plain_log_ret, window)

        features.extend([
            {"name": "log_ret", "periods": window},
            {"name": "roll", "window": window},
            {"name": "rollimp", "window": window},
            {"name": "amihud", "window": window},
            {"name": "kyle", "window": window},
            vol_ratio,
            {"name": "lag", "lag": window, "symbol": vol_ratio},
            # Volatility
            _stdev_of(vix_log_ret, window),
            log_ret_stdev,
            _stdev_of(log_ret_stdev, window),
            _stdev_of(frac_diff, window),
        ])

        # Autocorrelations only make sense for lags shorter than the window
        for shorter in (w for w in windows if w < window):
            for source in (vix_log_ret, vol_ratio, vol_ratio_stdev):
                features.append({"name": "auto", "window": window, "lag": shorter, "symbol": source})

    return features


In [14]:
# Build the list of feature configs used by the cells below
feats = define_feature_configs()

In [4]:
# Inspect the generated feature configs.
# NOTE(review): the saved output of this cell predates the current
# define_feature_configs (it shows ffd with d=0.5 and a 'close' feature, while
# the code now uses d=0.3) — restart the kernel and re-run to refresh it.
feats

[{'name': 'close', 'symbol': 'VIX.XO'},
 {'name': 'log_ret'},
 {'name': 'log_ret',
  'symbol': {'name': 'time_bars', 'size': 60, 'symbol': 'VIX.XO'}},
 {'name': 'ffd', 'd': 0.5},
 {'name': 'close', 'column': 'Num Ticks'},
 {'name': 'roll', 'window': 25},
 {'name': 'rollimp', 'window': 25},
 {'name': 'amihud', 'window': 25},
 {'name': 'kyle', 'window': 25},
 {'name': 'volratio', 'com': 25},
 {'name': 'stdev',
  'window': 25,
  'symbol': {'name': 'time_bars', 'size': 60, 'symbol': 'VIX.XO'}},
 {'name': 'stdev',
  'window': 25,
  'symbol': {'name': 'log_ret',
   'symbol': {'name': 'time_bars', 'size': 60, 'symbol': 'VIX.XO'}}},
 {'name': 'stdev', 'window': 25, 'symbol': {'name': 'log_ret'}},
 {'name': 'stdev',
  'window': 25,
  'symbol': {'name': 'stdev', 'window': 25, 'symbol': {'name': 'log_ret'}}},
 {'name': 'stdev', 'window': 25, 'symbol': {'name': 'ffd', 'd': 0.5}},
 {'name': 'stdev',
  'window': 25,
  'symbol': {'name': 'stdev',
   'window': 25,
   'symbol': {'name': 'ffd', 'd': 0.5

In [5]:
# Number of feature configs.
# NOTE(review): the saved value (265) comes from an earlier version of
# define_feature_configs; the current definition produces 65 entries.
len(feats)

265

In [6]:
# NOTE(review): consider moving this import into the notebook's import cell
from mlbt.load_data import load_bars

# Instrument whose bars the feature pipeline is exercised on
symbol = '@NQ#C'

# Run configuration; only the first 20 feature configs are engineered here
config = {
    'bar_type': 'dollar',
    'load_from_disk': True,
    'save_to_disk': True,
    'start_date': None,
    'end_date': None,
    'features': feats[:20],
}

# The bars are loaded with 'load_from_disk' forced to True, whatever config holds
nq = load_bars(symbol, dict(config, load_from_disk=True))

In [7]:
# Preview the bars returned by load_bars
nq

Unnamed: 0_level_0,Time,Open,High,Low,Close,Volume,Dollar Volume,Num Ticks,Buy Volume
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-04-13 11:08:00,2006-04-13 11:08:00,1981.75,1992.00,1979.25,1991.75,66106,1.313477e+08,53,37749
2006-04-13 13:19:00,2006-04-13 13:19:00,1991.25,1994.25,1986.50,1988.50,65487,1.303668e+08,131,32228
2006-04-17 09:32:00,2006-04-17 09:32:00,1988.25,1991.75,1982.00,1984.00,66338,1.317613e+08,526,31144
2006-04-17 11:04:00,2006-04-17 11:04:00,1983.75,1989.50,1983.75,1985.75,65487,1.300959e+08,92,36849
2006-04-17 13:21:00,2006-04-17 13:21:00,1986.25,1986.25,1968.75,1968.75,69706,1.378751e+08,137,21531
...,...,...,...,...,...,...,...,...,...
2020-01-16 09:32:00,2020-01-16 09:32:00,9097.50,9097.50,9091.75,9091.75,16330,1.485183e+08,51,3563
2020-01-16 09:37:00,2020-01-16 09:37:00,9101.25,9106.00,9099.25,9099.25,15156,1.379647e+08,5,12168
2020-01-16 09:45:00,2020-01-16 09:45:00,9103.75,9113.50,9103.75,9112.75,15522,1.413976e+08,8,11926
2020-01-16 09:53:00,2020-01-16 09:53:00,9114.75,9114.75,9103.25,9105.25,16058,1.462472e+08,8,7723


In [8]:
# Minimal deck: one symbol whose only entry so far is its bars
deck = {symbol: {'bars': nq}}
# NOTE(review): for_symbol is not used by any later cell visible here
for_symbol = symbol

In [9]:
# Timestamped DEBUG logging so the progress of each feature is visible below
FORMAT = "%(asctime)-15s %(message)s"
logging.basicConfig(format=FORMAT, level=logging.DEBUG)

# Stores a 'feats' frame under deck[symbol] in place and returns the same deck
x = run_feature_engineering(config, deck)

2020-02-10 19:48:05,301 @NQ#C: Feature engineering for 20 features
2020-02-10 19:48:05,302 Getting close(symbol=VIX.XO)
2020-02-10 19:48:05,321 Got (1632576, 1) for close(symbol=VIX.XO)
2020-02-10 19:48:05,469 Getting log_ret(symbol=@NQ#C)
2020-02-10 19:48:05,474 Got (31156, 1) for log_ret(symbol=@NQ#C)
2020-02-10 19:48:05,475 Getting log_ret(symbol=time_bars(size=60,symbol=VIX.XO))
2020-02-10 19:48:05,475 Got None for log_ret(symbol=time_bars(size=60,symbol=VIX.XO))
2020-02-10 19:48:05,476 Getting time_bars(size=60,symbol=VIX.XO)
2020-02-10 19:48:05,476 Got None for time_bars(size=60,symbol=VIX.XO)
  mask |= (ar1 == a)
2020-02-10 19:48:11,385 Computing time_bars for @NQ#C: {'name': 'time_bars', 'size': 60, 'symbol': 'VIX.XO'}


Reading data in batches:
Batch number: 0


2020-02-10 19:48:19,995 Computing log_ret for @NQ#C: {'name': 'log_ret', 'symbol': {'name': 'time_bars', 'size': 60, 'symbol': 'VIX.XO'}}
2020-02-10 19:48:20,005 Getting ffd(d=0.5,symbol=@NQ#C)
2020-02-10 19:48:20,010 Got (31156, 1) for ffd(d=0.5,symbol=@NQ#C)
2020-02-10 19:48:20,010 Getting close(column=Num Ticks,symbol=@NQ#C)
2020-02-10 19:48:20,015 Got (31156, 1) for close(column=Num Ticks,symbol=@NQ#C)
2020-02-10 19:48:20,015 Getting roll(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,019 Got (31156, 1) for roll(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,020 Getting rollimp(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,024 Got (31156, 1) for rollimp(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,025 Getting amihud(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,030 Got (31156, 1) for amihud(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,030 Getting kyle(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,035 Got (31156, 1) for kyle(symbol=@NQ#C,window=25)
2020-02-10 19:48:20,035 Getting volratio(com=25,

Returning bars 



2020-02-10 19:48:20,062 Got (27209, 1) for log_ret(symbol=time_bars(size=60,symbol=VIX.XO))
2020-02-10 19:48:20,063 Computing stdev for @NQ#C: {'name': 'stdev', 'window': 25, 'symbol': {'name': 'log_ret', 'symbol': {'name': 'time_bars', 'size': 60, 'symbol': 'VIX.XO'}}}
2020-02-10 19:48:20,073 Getting stdev(symbol=log_ret(symbol=@NQ#C),window=25)
2020-02-10 19:48:20,078 Got (31156, 1) for stdev(symbol=log_ret(symbol=@NQ#C),window=25)
2020-02-10 19:48:20,078 Getting stdev(symbol=stdev(symbol=log_ret(symbol=@NQ#C),window=25),window=25)
2020-02-10 19:48:20,083 Got (31156, 1) for stdev(symbol=stdev(symbol=log_ret(symbol=@NQ#C),window=25),window=25)
2020-02-10 19:48:20,084 Getting stdev(symbol=ffd(d=0.5,symbol=@NQ#C),window=25)
2020-02-10 19:48:20,088 Got (31156, 1) for stdev(symbol=ffd(d=0.5,symbol=@NQ#C),window=25)
2020-02-10 19:48:20,088 Getting stdev(symbol=stdev(symbol=ffd(d=0.5,symbol=@NQ#C),window=25),window=25)
2020-02-10 19:48:20,093 Got (31156, 1) for stdev(symbol=stdev(symbol=ffd

In [10]:
# NOTE(review): leftover post-mortem debugging magic — the saved output shows
# there was no traceback to inspect. Remove this cell before sharing.
%debug

2020-02-10 19:48:20,153 No traceback has been produced, nothing to debug.


In [12]:
# NOTE(review): this rebinds `feats` from the list of feature configs to the
# engineered feature DataFrame; re-running the earlier config cell afterwards
# would slice this DataFrame (feats[:20]) instead of the config list.
feats = x[symbol]['feats']

In [13]:
# Column labels are the readable feature names assigned in run_feature_engineering
feats.columns

Index(['close(symbol=VIX.XO)', 'log_ret()',
       'log_ret(symbol=time_bars(size=60,symbol=VIX.XO))', 'ffd(d=0.5)',
       'close(column=Num Ticks)', 'roll(window=25)', 'rollimp(window=25)',
       'amihud(window=25)', 'kyle(window=25)', 'volratio(com=25)',
       'stdev(symbol=time_bars(size=60,symbol=VIX.XO),window=25)',
       'stdev(symbol=log_ret(symbol=time_bars(size=60,symbol=VIX.XO)),window=25)',
       'stdev(symbol=log_ret(),window=25)',
       'stdev(symbol=stdev(symbol=log_ret(),window=25),window=25)',
       'stdev(symbol=ffd(d=0.5),window=25)',
       'stdev(symbol=stdev(symbol=ffd(d=0.5),window=25),window=25)',
       'stdev(symbol=volratio(com=25),window=25)',
       'ema(com=25,symbol=time_bars(size=60,symbol=VIX.XO))',
       'ema(com=25,symbol=log_ret(symbol=time_bars(size=60,symbol=VIX.XO)))',
       'ema(com=25,symbol=stdev(symbol=log_ret(symbol=time_bars(size=60,symbol=VIX.XO)),window=25))'],
      dtype='object')

In [14]:
# Display the assembled feature matrix (one column per engineered feature)
feats

Unnamed: 0_level_0,close(symbol=VIX.XO),log_ret(),"log_ret(symbol=time_bars(size=60,symbol=VIX.XO))",ffd(d=0.5),close(column=Num Ticks),roll(window=25),rollimp(window=25),amihud(window=25),kyle(window=25),volratio(com=25),"stdev(symbol=time_bars(size=60,symbol=VIX.XO),window=25)","stdev(symbol=log_ret(symbol=time_bars(size=60,symbol=VIX.XO)),window=25)","stdev(symbol=log_ret(),window=25)","stdev(symbol=stdev(symbol=log_ret(),window=25),window=25)","stdev(symbol=ffd(d=0.5),window=25)","stdev(symbol=stdev(symbol=ffd(d=0.5),window=25),window=25)","stdev(symbol=volratio(com=25),window=25)","ema(com=25,symbol=time_bars(size=60,symbol=VIX.XO))","ema(com=25,symbol=log_ret(symbol=time_bars(size=60,symbol=VIX.XO)))","ema(com=25,symbol=stdev(symbol=log_ret(symbol=time_bars(size=60,symbol=VIX.XO)),window=25))"
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-04-13 11:08:00,,,,,53,,,,,0.571037,,,,,,,,,,
2006-04-13 13:19:00,,-0.001633,,,131,,,,,0.530809,,,,,,,,,,
2006-04-17 09:32:00,,-0.002266,,,526,,,,,0.509557,,,,,,,,,,
2006-04-17 11:04:00,,0.000882,,,92,,,,,0.523632,,,,,,,,,,
2006-04-17 13:21:00,,-0.008598,,,137,,,,,0.477249,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-16 09:32:00,12.05,-0.000577,-0.004120,0.170628,51,6.565345,44.205613,0.006043,496860.525400,0.498259,0.236095,0.018395,0.000984,0.000053,0.001217,0.000103,0.010502,12.538711,-0.001364,0.016518
2020-01-16 09:37:00,12.04,0.000825,-0.004120,0.171594,5,6.322117,45.824158,0.006241,513274.689318,0.509974,0.236095,0.018395,0.000997,0.000049,0.001228,0.000106,0.009930,12.538711,-0.001364,0.016518
2020-01-16 09:45:00,12.04,0.001483,-0.004120,0.172633,8,7.352182,51.996527,0.006486,533601.210673,0.519911,0.236095,0.018395,0.001028,0.000044,0.001311,0.000113,0.009985,12.538711,-0.001364,0.016518
2020-01-16 09:53:00,12.14,-0.000823,-0.004120,0.170922,8,5.382843,36.806469,0.006522,536712.436035,0.518412,0.236095,0.018395,0.001035,0.000040,0.001313,0.000118,0.010066,12.538711,-0.001364,0.016518


In [15]:
# Small hand-made series for checking feature formulas by eye
series = pd.Series([1,2,34,4,5, 8, 9, 100, 50])
df = pd.DataFrame({'Close': series})
# Difference of log values, the same expression log_ret uses with periods=1.
# NOTE(review): nplog is not used by any later cell visible here
nplog = np.log(series).diff()






In [31]:
# Manual re-computation of the stdev_from_ma expression on the toy frame:
# distance from the 4-row rolling mean, in rolling standard deviations
column = "Close"
window = 4
rolly = df[column].rolling(window)
(df[column] - rolly.mean()) / rolly.std()



0         NaN
1         NaN
2         NaN
3   -0.393518
4   -0.410702
5   -0.332907
6    1.050210
7    1.498992
8    0.189722
Name: Close, dtype: float64