In [1]:
import numpy as np
import pandas as pd
import warnings
import statsmodels.api as sm
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import statsmodels.api as sm
from statsmodels.robust import norms
from sklearn.metrics import mean_squared_error

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [2]:
def get_metrics(file):
    df = pd.read_csv(file,parse_dates=['date'],index_col='date')
    df.dropna(inplace=True)
    df = df[~((df.index.hour==16) & (df.index.minute>0))]
    return df

In [3]:
data = get_metrics('HYG_metrics.csv')

In [4]:
def filter_cols(df,col_strings):
    col_list = []
    for col in col_strings:
        col_list+=list(filter(lambda x: col in x,df.columns))
    return col_list

In [5]:
def split_data(df,end_train,start_test):
    y = filter_cols(df,['rtn'])
    drop_cols = filter_cols(df,y+['iNAV','PRICE','NBB','NBO','NBOqty', 'NBBqty'])
    features = df.drop(columns=drop_cols).columns
    X_train = df[features].loc[:end_train]
    X_test = df[features].loc[start_test:]
    y_train = df[y].loc[:end_train]
    y_test = df[y].loc[start_test:]
    
    return X_train, X_test, y_train, y_test

In [6]:
def metric_cats(df):
    flow_metrics = filter_cols(df,['flow','imbalance'])
    liquidity_metrics = filter_cols(df,['bid_ask','volume','cumulative'])
    nav_metrics = filter_cols(df,['nav'])
    vol_metrics = filter_cols(df,['ewm_vol'])
    return flow_metrics, liquidity_metrics, nav_metrics, vol_metrics

# Group together metrics of similar category and examine the categories one by one

In [7]:
flow_metrics, liquidity_metrics, nav_metrics, vol_metrics = metric_cats(data)

# Use LASSO to see which features in each metric category (liquidity, flow, volatility) may be good regressors

In [8]:
def multi_LASSO(alphas,metrics,x,y,skScaler):
    lasso = Lasso(fit_intercept=True)
    sc = skScaler
    features = sc.fit_transform(x[metrics])
    features = pd.DataFrame(data = features, columns = x[metrics].columns, index=x[metrics].index)
    scale = sc.scale_
    
    coeffs = np.zeros(len(metrics))
    alphas = {'alpha':alphas}
    
    results = pd.DataFrame(index=metrics)
    
    while np.sum(coeffs)==0:
        lasso_reg = GridSearchCV(lasso,alphas,scoring='neg_mean_squared_error',cv=5,n_jobs=-1)
        lasso_reg.fit(features,y)
        alpha_opt = lasso_reg.best_params_
        mse_opt = lasso_reg.best_score_
        coeffs = lasso_reg.best_estimator_.coef_
        alphas['alpha']/=10
        
    results['coeffs'] = coeffs
    results['scale'] = scale
    results['importance'] = np.abs(coeffs)
    results = results[results['coeffs']!=0]
    results.sort_values('importance',ascending=False,inplace=True)
    
    return alpha_opt['alpha'],np.sqrt(-mse_opt), results

In [9]:
x_train, x_test, y_train, y_test = split_data(data,'2020-06','2020-07')

# LASSO on all flow variables

In [10]:
alphas = np.linspace(0.01,1,10)
alpha, rmse, coeffs = multi_LASSO(alphas, flow_metrics, x_train, y_train['fwd_rtn_5min'], StandardScaler())
print('Lambda = {}'.format(alpha))
print('RMSE = {}'.format(rmse))
coeffs

Lambda = 1e-05
RMSE = 0.0015723623085693013


Unnamed: 0,coeffs,scale,importance
flow_5min_EWMA,-6.27664e-05,118456.5,6.27664e-05
flow_15min,3.493509e-05,400779.7,3.493509e-05
flow_2min,-3.042662e-05,144415.9,3.042662e-05
order_imbalance_1min,-2.278753e-05,15388.0,2.278753e-05
flow_5min,-2.124823e-05,227524.3,2.124823e-05
flow_1min_EWMA,1.182483e-05,57885.1,1.182483e-05
dollar_flow_15min,9.007938e-06,32906590.0,9.007938e-06
flow_45min_EWMA,3.146832e-06,351332.2,3.146832e-06
flow_60min_EWMA,8.510093e-07,415554.2,8.510093e-07
flow_4min_EWMA,-2.153062e-07,108584.4,2.153062e-07


# LASSO on all liquidity variables

In [11]:
alpha, rmse, coeffs = multi_LASSO(alphas, liquidity_metrics, x_train, y_train['fwd_rtn_5min'], StandardScaler())
print('Lambda = {}'.format(alpha))
print('RMSE = {}'.format(rmse))
coeffs

Lambda = 1.0000000000000002e-06
RMSE = 0.0015757988010931913


Unnamed: 0,coeffs,scale,importance
bid_ask_price_30min,-0.000271,2.583514e-05,0.000271
volume_4min,0.000235,724062900000.0,0.000235
bid_ask_price_15min,0.000201,2.719063e-05,0.000201
bid_ask_price_10min,0.000194,2.724129e-05,0.000194
bid_ask_price_2min,-0.000135,2.500493e-05,0.000135
cumulative_trade_count,-0.000104,1748446.0,0.000104
bid_ask_price_60min,0.000101,2.562191e-05,0.000101
volume_10min,-9.3e-05,1593594000000.0,9.3e-05
volume_2min,-8.6e-05,418241500000.0,8.6e-05
cumulative_volume,7.7e-05,913008200.0,7.7e-05


# LASSO on vol variables

In [12]:
alpha, rmse, coeffs = multi_LASSO(alphas, vol_metrics, x_train, y_train['fwd_rtn_5min'], StandardScaler())
print('Lambda = {}'.format(alpha))
print('RMSE = {}'.format(rmse))
coeffs

Lambda = 1.0000000000000002e-07
RMSE = 0.0015846873815548983


Unnamed: 0,coeffs,scale,importance
ewm_vol_60s,0.000425,0.267865,0.000425
ewm_vol_120s,-0.000248,0.218746,0.000248
ewm_vol_1500s,0.000209,0.124887,0.000209
ewm_vol_3600s,-0.000195,0.110949,0.000195
ewm_vol_240s,0.000132,0.169128,0.000132
ewm_vol_2700s,-9.1e-05,0.115731,9.1e-05
ewm_vol_1800s,8.5e-05,0.121548,8.5e-05
ewm_vol_1200s,-8.1e-05,0.129672,8.1e-05
ewm_vol_900s,-6.1e-05,0.13355,6.1e-05
ewm_vol_300s,2.6e-05,0.161951,2.6e-05


# LASSO on NAV

In [13]:
alpha, rmse, coeffs = multi_LASSO(alphas, nav_metrics, x_train, y_train['fwd_rtn_5min'], StandardScaler())
print('Lambda = {}'.format(alpha))
print('RMSE = {}'.format(rmse))
coeffs

Lambda = 1.0000000000000003e-10
RMSE = 0.001574350254216579


Unnamed: 0,coeffs,scale,importance
nav_discount_bid,-0.000331,0.008986,0.000331
nav_discount_ask,0.000251,0.008991,0.000251


# LASSO on everything

In [14]:
combined_metrics = flow_metrics+liquidity_metrics+vol_metrics
alpha, rmse, coeffs = multi_LASSO(alphas, combined_metrics, x_train, y_train['fwd_rtn_5min'], StandardScaler())
print('Lambda = {}'.format(alpha))
print('RMSE = {}'.format(rmse))
coeffs

Lambda = 1.0000000000000003e-09
RMSE = 0.0016259509355722407


Unnamed: 0,coeffs,scale,importance
flow_20min,-0.000958,4.626030e+05,0.000958
dollar_flow_20min,0.000908,3.797060e+07,0.000908
flow_15min,0.000856,4.007797e+05,0.000856
bid_ask_price_5min,0.000830,2.706129e-05,0.000830
bid_ask_5min,-0.000820,2.063602e-03,0.000820
...,...,...,...
dollar_flow_60min,-0.000008,6.547583e+07,0.000008
order_imbalance_5min,0.000008,1.220755e+04,0.000008
order_imbalance_15min,0.000006,1.100856e+04,0.000006
volume_5min,0.000005,8.893301e+11,0.000005


# LASSO with best from each category

In [15]:
best_metrics = ['flow_5min_EWMA','bid_ask_price_10min','nav_discount_bid','ewm_vol_60s']
alpha, rmse, coeffs = multi_LASSO(alphas, best_metrics, x_train, y_train['fwd_rtn_5min'], StandardScaler())
print('Lambda = {}'.format(alpha))
print('RMSE = {}'.format(rmse))
coeffs

Lambda = 1.0000000000000003e-10
RMSE = 0.0015867114815708051


Unnamed: 0,coeffs,scale,importance
ewm_vol_60s,0.000253,0.267865,0.000253
nav_discount_bid,-0.000105,0.008986,0.000105
bid_ask_price_10min,0.000104,2.7e-05,0.000104
flow_5min_EWMA,-3.9e-05,118456.531045,3.9e-05


# Try different robust regressions using statsmodels (all default values)

# Train on first half of 2020 and test on back half

In [16]:
def robust_regression(train_x, train_y, test_x, test_y, norms):
    train_x = sm.add_constant(train_x)
    test_x = sm.add_constant(test_x)
    betas = pd.DataFrame(index=train_x.columns,columns=list(robust_norms.keys()))
    tstats = betas.copy()
    pvalues = betas.copy()
    std_error = betas.copy()
    predictions = pd.DataFrame()
    for n in norms:
        model = sm.RLM(train_y,train_x,M=norms[n])
        results = model.fit()
        tstats[n] = results.tvalues.values
        betas[n] = results.params.values
        pvalues[n] = results.pvalues.values
        std_error[n] = results.bse.values
        results = model.fit()
        preds = results.predict(test_x)
        predictions[n] = preds
    predictions['actual'] = test_y
    errors = pd.DataFrame()
    for column in predictions.columns:
        errors[column] = (predictions[column] - predictions['actual'])**2
    errors.drop(columns=['actual'],inplace=True)
    coeff_stats = {'betas':betas, 'stats':tstats, 'pvalues':pvalues, 'std_error':std_error}
    return coeff_stats, predictions, errors

In [17]:
x_train, x_test, y_train, y_test = split_data(data,'2020-06','2020-07')

In [18]:
robust_norms = {'LeastSquares':norms.LeastSquares(),'AndrewWave':norms.AndrewWave(), 
                'Hampel':norms.Hampel(), 'HuberT':norms.HuberT(), 
                 'RamsayE':norms.RamsayE(), 'TrimmedMean':norms.TrimmedMean()}

In [19]:
coeff_stats, preds, resids = \
    robust_regression(
    x_train['flow_5min_EWMA'],
    y_train['fwd_rtn_5min'],
    x_test['flow_5min_EWMA'],
    y_test['fwd_rtn_5min'],
    norms = robust_norms)

In [21]:
pd.DataFrame(np.sqrt(resids.mean()),columns=['RMSE']).sort_values('RMSE')

Unnamed: 0,RMSE
RamsayE,0.00039
Hampel,0.00039
AndrewWave,0.00039
HuberT,0.00039
TrimmedMean,0.00039
LeastSquares,0.000391


In [22]:
signals = pd.DataFrame(index=preds.index, columns=preds.columns[:-1])
for column in preds.columns[:-1]:
    signals[column] = np.where(preds[column]>0, 1, 0)

In [23]:
print('Actual Return',preds['actual'].sum())
signals.multiply(preds['actual'],axis=0).sum().sort_values(ascending=False)

Actual Return -0.009573022999999993


HuberT          0.034233
LeastSquares    0.027445
TrimmedMean    -0.001757
Hampel         -0.005722
AndrewWave     -0.009573
RamsayE        -0.015249
dtype: float64