In [None]:
# Import libraries

import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from statsmodels.api import add_constant, OLS
from xgboost import XGBRegressor
from IPython.display import display
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

___

### Auxiliary functions for data parsing and processing.

In [None]:
def remove_outliers(df):
    """
    Removes outliers based on the median absolute deviation or
    the mean absolute deviation when the former is zero.

    An observation is an outlier when its absolute deviation from the median
    exceeds 3 robust standard deviations. The fallback to the mean absolute
    deviation is decided per column, so a single column with a zero median
    absolute deviation does not change the rule applied to the others.

    :param df: input series or dataframe with numeric values
    :return: series with the outlier rows removed, or dataframe of the
        same shape with the outlier cells replaced by NaN
    """
    # absolute deviations from the median
    term = (df - df.median()).abs()

    # estimates of the standard deviation that are consistent under normality:
    # 1.4826 * median absolute deviation and 1.2533 * mean absolute deviation
    scale = 1.4826 * term.median()
    fallback = 1.2533 * term.mean()

    # use the mean absolute deviation only where the median one is zero
    if isinstance(scale, pd.Series):
        scale = scale.where(scale != 0, fallback)
    elif scale == 0:
        scale = fallback

    # compare instead of dividing, so that constant data (zero scale)
    # is kept rather than discarded through a 0 / 0 division
    return df[term.le(3 * scale)]


def time_histograms(df, unit='hour'):
    """
    Histogram plots of trading distribution per ticker
    for a particular unit of time (e.g. day, hour, etc).

    One subplot is drawn for every ticker found in the data.

    :param df: input dataframe with a 'ticker' column and a `unit` column
    :param unit: time unit, i.e. the label of the column to plot
    :return: None
    """
    tickers = df['ticker'].unique()
    if len(tickers) == 0:
        # nothing to plot
        return

    # number of distinct values of the time unit (24 for hours, else 60)
    nbins = 24 if unit == 'hour' else 60

    # one bin per integer value 0, ..., nbins - 1; this needs nbins + 1
    # edges, otherwise the last value (e.g. hour 23) falls outside the range
    edges = np.arange(0, nbins + 1) - 0.5

    # keep the original proportions of 3 stacked plots on a 7 x 10 figure
    fig, axes = plt.subplots(len(tickers), 1, squeeze=False,
                             figsize=(7, 10 * len(tickers) / 3))
    for ticker, axis in zip(tickers, axes[:, 0]):
        # plot and configure histogram
        N, bins, patches = axis.hist(df.loc[df['ticker'] == ticker, unit],
                                     density=True, bins=edges)
        axis.yaxis.set_major_formatter(PercentFormatter(xmax=1))
        axis.set_xticks(range(0, nbins + 1, 2))
        axis.set_yticks(np.arange(0, 0.175, 0.025))
        axis.tick_params(axis='both', which='major', labelsize=14)
        axis.text(0.8, 0.8, ticker, fontdict={'size': 15},
                  transform=axis.transAxes)
        axis.set_xlabel(unit.capitalize(), fontsize=17)
        axis.set_ylabel('Percentage', fontsize=17)

        # set colormap and bin colors
        norm = colors.Normalize(N.min(), N.max())
        for thisfrac, thispatch in zip(N, patches):
            thispatch.set_facecolor(plt.cm.viridis(norm(thisfrac)))

    axes[0, 0].set_title('Histograms of daily trading activity', fontsize=18)
    fig.tight_layout()
    fig.show()


def trade_direction(df):
    """
    Adds the spread first difference ('dspread') and the direction of trade
    ('direction') using the "tick-test" method of Lee and Ready (1991)
    (see exercise 4). The input dataframe is modified in place.

    :param df: input data with 'ticker' and 'spread' columns
    :return: data updated with direction of trade
    """
    # first difference of the spread within each ticker; the first trade of
    # a ticker has no predecessor, so it receives a small positive value
    # that forces a "buy" label
    spread_change = df.groupby('ticker')['spread'].diff()
    df['dspread'] = spread_change
    df.loc[df['dspread'].isna(), 'dspread'] = 0.0001

    # trade flow indicator: uptick -> buy, downtick -> sell,
    # zero-tick trades stay missing at this stage
    is_uptick = df['dspread'] > 0
    is_downtick = df['dspread'] < 0
    labels = np.select([is_uptick, is_downtick], ['buy', 'sell'], default=None)
    df['direction'] = labels

    # zero-tick trades inherit the direction of the preceding trade
    df['direction'] = df['direction'].ffill()

    return df


def trade_imbalance(df):
    """
    Adds the trade imbalance K = |cumsum(S) - cumsum(B)| as 'imbalance',
    where S = # of sell trades and B = # of buy trades during a day.
    The input dataframe is modified in place.

    :param df: input data with 'ticker', 'date' and 'direction' columns
    :return: data updated with trade imbalance for each ticker
    """
    # signed trade indicator: +1 for a sell, -1 for a buy, 0 otherwise
    is_sell = (df['direction'] == 'sell').astype('int64')
    is_buy = (df['direction'] == 'buy').astype('int64')
    signed = is_sell - is_buy

    # running (sells - buys) within each ticker and day; this equals
    # cumsum(S) - cumsum(B) because the cumulative sum is linear
    net_sells = signed.groupby([df['ticker'], df['date']]).cumsum()

    # define the trade imbalance feature
    df['imbalance'] = net_sells.abs()

    return df

___

### Data parsing, processing and feature engineering

**Trades with identical timestamp for each security**

As already discussed in exercises 3 and 4, duplicate records for a security with identical timestamps are possibly different trades rather than data errors. Trades with the same timestamp imply that the time interval between them was less than a second, but still non-zero because of latency. Trades in the data with a seemingly zero interval would add noise to any model that aims to predict arrival times. Therefore, I keep only the first trade that has the smallest latency and remove the others from the sample, because noise increases with latency.

**Notional value outliers and scale effects**

The raw values for the notional include a heavy left tail. Although values below $10^5$ are likely fat finger trades (see below), their removal doesn't substantially mitigate the tail and scale effects as they constitute less than 1% of all trades in the sample. A log transform is more effective in mitigating left tail skewness and varying scale effects for this feature.

**Fat finger trades**

These are trades where either the notional or the spread (or both) entered the database with the wrong values, possibly because of human error or other sources of noise. Fat finger trades will be unidentifiable when the values recorded are in the vicinity of the sample distribution's mean and median. However, outlier trades are likely fat finger trades, especially if the trades that immediately precede and follow them have less extreme values. Therefore, I remove the outliers based on the Median Absolute Deviation (MAD) criteria (or the Mean Absolute Deviation if the former is zero). Outliers are identified for each ticker separately. Spread variables in this dataset seem to have fewer relatively extreme values per ticker compared to the notional. These values add explanatory power to the models that were tested. Therefore, they are not treated as outliers and are included in the sample.

**Trading activity**

Although OTC derivatives aren't subject to exchange hours, the histograms below show that ~90% of trading activity is concentrated primarily between 8 am - 4 pm for CDXIG5 and 3 am - 12 pm for the two European securities. Trades beyond these intervals will result in biased estimates for trade arrival intervals. Therefore, I restrict the sample to the hours of highest trading activity as specified above for each security. Although there is a small peak across all 3 securities at 10 am, each ticker's distribution can be considered roughly as uniform after the restriction is applied. As a result, there is no need to create separate models across the time of day. Also, there is a mild clustering of trades on the minute mark (0th second) for the European securities (roughly 4% of total trades relative to a uniform ~2% for every other second during a minute). Trading activity across minutes is also uniformly distributed.

**Dependent variables**

The dependent variables are *log(notional)* (explained before) and *log(dt)*, where *dt* is the vector of daily time intervals between successive trades in seconds. The logarithm on dt mitigates scale effects. The time intervals are constructed separately for each ticker and for each day. This is done to prevent spuriously large intervals between the last trade of a day and the first trade in the following day. The goal is to predict time intervals during periods of substantial trading activity.

**Feature engineering**

The list of features includes the following:  
1. Time-related variables such as second, minute, hour and day.  
2. The log-spread *log(spread)* and the nominal spread's first difference *Δspread*.  
3. Daily trade imbalance (absolute difference between sells and buys).  
4. Time-lagged (t-1) versions of the dependent variables.  
5. Ticker dummies are also used in the regressions.

The features *log(spread)*, *Δspread* and *imbalance* are lagged by one period, because the goal of the models is to forecast future values using the information available in the present.

In [None]:
# read dataset
df = pd.read_csv("credit_derivatives_trades.csv").drop(columns=['Unnamed: 0'])

# keep only the first trade per ticker among those with identical timestamps
df = df.drop_duplicates(subset=['ticker', 'timestamp'], keep='first')

# convert timestamp to datetime and create auxiliary date variable
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S")
df['date'] = df['timestamp'].dt.date

# sort by ticker and timestamp
df = df.sort_values(by=['ticker', 'timestamp']).reset_index(drop=True)

# sanity check and log transform the notional
# to mitigate the feature's heavy left tail
# (copy the filtered frame so that new columns are not set on a slice)
df = df[df['notional'] > 0].copy()
df['log(notional)'] = np.log(df['notional'])

# get the daily time intervals in log-second units and log-spread;
# the first trade of each ticker and day has no interval (NaN)
df['logdt'] = np.log(df.groupby(['ticker', 'date'])
                     ['timestamp'].diff().dt.total_seconds())
df['log(spread)'] = np.log(df['spread'])

# feature engineering of time variables
df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df['second'] = df['timestamp'].dt.second

# feature engineering of spread first difference and
# trade imbalance (relies on direction of trade)
df = trade_direction(df)
df = trade_imbalance(df)

# blank out (set to NaN) the outliers of log-notional and logdt per ticker.
# extreme imbalance and spreads seem to improve forecasting.
# group_keys=False keeps the original row index on the result, otherwise
# pandas >= 2.0 prepends the ticker to the index and the assignment
# back to the dataframe no longer aligns
cols = ['log(notional)', 'logdt']
df[cols] = df.groupby('ticker', group_keys=False)[cols].apply(remove_outliers)

# show histogram plots and restrict the sample
# within the hours of highest trading activity
time_histograms(df, unit='hour')
cols = ['ITXEB5', 'ITXES5']
cond1 = (df['ticker'] == 'CDXIG5') & (df['hour'].between(8, 15))
cond2 = (df['ticker'].isin(cols)) & (df['hour'].between(3, 11))
df = df[(cond1) | (cond2)].copy()

# create ticker dummies (the first ticker is the omitted base category)
df['D'] = df['ticker']
df = pd.get_dummies(df, columns=['D'], drop_first=True, dtype=int)

# lag the non-time related features by one trade within each ticker
cols = ['log(spread)', 'dspread', 'imbalance']
df[cols] = df.groupby('ticker')[cols].shift()

# create features by lagging the dependent variables
lag_cols = ['laglog(notional)', 'laglogdt']
df[lag_cols] = df.groupby('ticker')[['log(notional)', 'logdt']].shift()

# clean up
df = df.drop(columns=['direction', 'date', 'spread', 'notional'])

# summary statistics for the new variables
prc = [0.01, 0.25, 0.5, 0.75, 0.99]
cols = ['log(notional)', 'logdt', 'log(spread)', 'dspread', 'imbalance']
print('\033[1m' + 'Summary Statistics' + '\033[0m')
display(df[cols].describe(percentiles=prc))

___

### Auxiliary functions for model estimation

In [None]:
def get_oos_r2(df, yvar):
    """
    Estimates the performance metric (out-of-sample R-squared)
    for a candidate model in the stepwise regression method.

    The model is trained by OLS on the observations before 2017 and
    evaluated on the observations of 2017. Test observations with a
    missing feature get no forecast and are excluded from the metric.

    :param df: dataframe with the 'timestamp' column, the dependent
        variable and the candidate features (every remaining column)
    :param yvar: dependent variable label
    :return: model performance metric
    """
    # configuration of dependent and independent variables
    core = ['timestamp', yvar]
    xcol = [col for col in df.columns if col not in core]

    # sample test/train split
    year = df['timestamp'].dt.year
    df_train = df[year < 2017]
    df_test = df[year == 2017]

    # train the model to estimate the OLS betas; the constant is always
    # added so that the training and testing designs share the same columns
    exog = add_constant(df_train[xcol], has_constant='add')
    betas = OLS(df_train[yvar], exog, missing='drop').fit().params

    # get OOS forecast by multiplying the betas from training
    # with the feature values from the testing sample; skipna=False
    # leaves the forecast missing when a feature is missing, instead of
    # silently treating the missing term as zero
    exog = add_constant(df_test[xcol], has_constant='add')
    yhat = exog.mul(betas).sum(axis=1, skipna=False)

    # estimate R-squared metric by fitting OOS predicted values
    # with the observed ones for the same period (rows with a
    # missing forecast or observation are dropped)
    res = OLS(df_test[yvar], add_constant(yhat), missing='drop').fit()
    oos_r2 = res.rsquared.round(4)

    return oos_r2


def stepwise(df, yvar):
    """
    Runs a customized forward stepwise regression for model estimation
    and feature importance, and prints the selected features (in order
    of selection) together with their out-of-sample R-squared.

    :param df: dataframe with input features and regressors
    :param yvar: dependent variable label
    :return: None
    """
    # minimum OOS R-squared (improvement) required to keep a feature
    threshold = 0.0001

    # timestamp is needed for the train/test split inside get_oos_r2
    base_cols = ['timestamp', yvar]
    ticker_dummies = ['D_ITXEB5', 'D_ITXES5']
    pool = ['day', 'hour', 'minute', 'second', 'log(spread)',
            'dspread', 'imbalance', 'ticker',
            'laglog(notional)', 'laglogdt']

    def columns_of(feature):
        # the ticker feature enters the regressions through its dummies
        return ticker_dummies if feature == 'ticker' else [feature]

    def score(chosen, feature):
        # OOS R-squared of the already chosen columns plus one candidate
        return get_oos_r2(df[base_cols + chosen + columns_of(feature)], yvar)

    # screening: keep the features that reach the threshold on their own
    remaining_features = [feature for feature in pool
                          if score([], feature) >= threshold]

    # forward selection: add the best candidate for as long as it
    # improves the OOS R-squared by more than the threshold
    selected_features = []
    current_oos_r2 = 0.0
    while remaining_features:
        ranked = sorted((score(selected_features, feature), feature)
                        for feature in remaining_features)
        best_oos_r2, best_feature = ranked[-1]
        if not (best_oos_r2 - current_oos_r2 > threshold):
            break
        current_oos_r2 = best_oos_r2
        remaining_features.remove(best_feature)
        selected_features.extend(columns_of(best_feature))

    # get the best-performing basket of features with its OOS R-squared
    if selected_features:
        max_oos_r2 = get_oos_r2(df[base_cols + selected_features], yvar)
    else:
        selected_features = 'None'
        max_oos_r2 = 0.0

    # print results
    print('\033[1m' + 'Stepwise regression results for ' + yvar + '\033[0m')
    print('Model performance metric (OOS R-squared): {}'.format(max_oos_r2))
    print('Feature importance (left-to-right): {}'.format(selected_features))


def xgboost(df):
    """
    Fits an XGBoost regression for each dependent variable separately,
    prints its out-of-sample R-squared and plots the feature importance
    with Shapley values, to benchmark against the stepwise process.

    :param df: dataframe with input features and regressors
    :return: None
    """
    targets = ['logdt', 'log(notional)']
    features = ['day', 'hour', 'minute', 'second', 'log(spread)', 'dspread',
                'imbalance', 'laglog(notional)', 'laglogdt',
                'D_ITXEB5', 'D_ITXES5']

    # train on the years before 2017 and test on 2017,
    # rows with any missing value are discarded
    year = df['timestamp'].dt.year
    train = df[year < 2017].dropna()
    test = df[year == 2017].dropna()

    # boosted tree regressor, re-fitted for each dependent variable
    model = XGBRegressor()

    for yvar in targets:
        # fit on the training set and forecast the testing set
        model.fit(train[features], train[yvar])
        predicted = model.predict(test[features])

        # OOS R-squared from an OLS regression of the observed on the
        # predicted values in the testing set, the intercept
        # enforces the metric within [0, 1]
        fitted = OLS(test[yvar], add_constant(predicted), missing='drop').fit()
        R2 = fitted.rsquared.round(4)

        # print results
        print('\033[1m' + 'XGBoost regression results for ' + yvar + '\033[0m')
        print('Model performance metric (OOS R-squared): {}'.format(R2))
        print('Feature importance plots with Shapley values')

        # visualize feature importance with Shapley values, they are
        # more reliable than the internal importance methods of XGBoost
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(test[features])
        shap.summary_plot(shap_values, test[features], plot_type="bar")
        plt.show()


def naive_benchmark(df):
    """
    Runs an AR(1) regression for each dependent variable and prints its
    R-squared, to gauge the performance improvement of the stepwise and
    boosting processes.

    :param df: dataframe with input features and regressors
    :return: None
    """
    targets = ['logdt', 'log(notional)']
    lagged = ['lag' + target for target in targets]

    # use the testing sample only
    in_test = df['timestamp'].dt.year == 2017
    dfb = df.loc[in_test, targets + lagged].copy()

    # the forecast of each dependent variable is its own one-period lag
    for yvar, yhat in zip(targets, lagged):
        # OOS R-squared from an OLS regression of the observed on the
        # predicted values in the testing set, the intercept
        # enforces the metric within [0, 1]
        fitted = OLS(dfb[yvar], add_constant(dfb[yhat]), missing='drop').fit()
        R2 = fitted.rsquared.round(4)

        # print results
        print('\033[1m' + 'Naive benchmark results for ' + yvar + '\033[0m')
        print('Model performance metric (OOS R-squared): {}'.format(R2))

___

## Model estimation

The task, as it is phrased, is to develop a model that predicts both the arrival timestamp and the notional. A likely model to estimate both dependent variables simultaneously is VARX, i.e. a vector autoregression with $y_t = [dt_t, Q_t]$ across all securities ($Q_t$: notional) and additional exogenous features $X_t$. The intuition is that VARX enforces the coupling of lagged $y_{t-k}$ and $X_{t-k}$ values during the simultaneous estimation of vector $y_t$. This is something that can be justified by the presence of RV trades, where the exogenous features of one security can affect the $y_t$ values of the other security.

However, the VARX solver from statsmodels uses MLE to estimate the model parameters and the solver is too slow for this data even for one regression. I use alternative methods that provide for the decoupling of the dependent variables. This implies that there will be a separate model estimation for the time interval $log(dt_t)$ and notional $log(Q_t)$, and the feature importance ranking will also differ for each dependent variable.

The proposed estimation method is a custom forward stepwise regression. This method relies on a cascade of OLS regressions that provide for linearity, straightforward economic interpretability and feature importance with a parsimonious set of selected features. First, each feature is tested separately to gauge whether it has any explanatory power out-of-sample (OOS). Those that lack any power are removed from the list of candidate features. Those that have explanatory power are ranked in terms of OOS performance. The metric used for model selection is the OOS R-squared.

The single feature with the best performance sets the core of features for the next round of OLS cascade estimations. During the second round, baskets of two features are tested successively by OLS. The basket for each of the regressions includes the core feature from the previous round and one of the remaining features in the candidate feature pool. These models are ranked by performance OOS again, and the pair of features that perform the best become the new core. The process is repeated recursively until the remaining features no longer provide any marginal performance improvement OOS, marking the end of the feature selection process.

This process also provides for a feature importance method, as the first feature that is selected during the first round is the most important, the one selected in the second round is the second most important, etc. Features that are not selected don't provide any marginal performance improvement OOS, implying that they are not important for the estimation of the particular dependent variable.

The stepwise regression method is benchmarked against two alternative methods. The first is a tree-based non-linear regression with XGBoost. The list of candidate features is relatively small, so boosting is preferred compared to random forests (RF). The reason is that RFs ensemble across multiple weak learners and work better with large lists of features. However, boosting methods recursively improve on the residuals of weak learners and can be effective even with smaller feature lists. The tree method is used to test whether potential non-linearity in the data can affect the end results of the stepwise regression and/or improve OOS performance substantially.

XGBoost can also handle the ticker categorical feature, provided that it is encoded as dummies. Unlike OLS-based methods where one dummy must be excluded to prevent estimation failure from perfect collinearity, tree methods can handle that internally without any impact on model performance. However, the order of feature importance can be affected. Therefore, I exclude one of the dummies similarly to the stepwise method to mitigate the concerns of collinearity during the feature importance analysis.

The second benchmark is a simple AR(1) process. The rationale is that the most naive forecast is a martingale, where the best prediction for next period's timestamp and notional are the currently observed values. The performance of the naive benchmark helps gauge the performance of the proposed models. For instance, an R-squared metric of 90% using sophisticated models would mean little if the naive benchmark had similar performance. For the test on the naive benchmark I use an OLS regression of OOS observed $y_t$ values on those predicted by the model $\hat y_t = y_{t-1}$. Given that the predicted values are practically the lagged observed values, the estimation process is an $AR(1)$ rather than a strictly $I(1)$ martingale process, because the OLS beta won't likely be exactly equal to 1. However, this approach makes the test consistent with the estimation of OOS R-squared within the stepwise and boosting methods. Using OLS with an intercept to estimate the R-squared between $y_t$ and $\hat y_t$ has the additional benefit that the metric is guaranteed to range $R^2\in[0, 1]$ which facilitates the economic interpretability.

In [None]:
# stepwise regressions, one per dependent variable
for target in ('logdt', 'log(notional)'):
    stepwise(df, target)

# XGBoost benchmark estimation
xgboost(df)

# naive benchmark, predicted values are the one-period
# lags of observed values in the testing sample
naive_benchmark(df)

________

## Conclusion

Model performance for both the stepwise and XGBoost regressions outperforms the naive benchmark by roughly double the amount for each dependent variable. On the other hand, the OOS R-squared values are relatively low, because of the crude train/test data split and model setup. Trade data have intraday frequency and financial data overall tend to have a low signal-to-noise ratio. I usually test time series models OOS with one-period-ahead forecasting, because it is very hard to get robust multi-period forecasts with financial time series. However, for the purpose of demonstrating a complete ML time series analysis, the proposed data split suffices.

The stepwise and XGBoost regressions have similar feature importance rankings. They both highlight first and foremost the importance of the security's contracting details (ticker/dummy variables). In order of importance, timestamp prediction is most sensitive to the most recently recorded interval, the trade imbalance and the spread respectively. The notional is harder to predict than the timestamp. Other than ticker ID, it is most sensitive to the spread. Compared to the stepwise regression, the XGBoost attributes more importance to the most recently recorded notional at the cost of diminished importance for the trade imbalance. However, the big picture on the feature importance results between the two methods is consistent.