In [None]:
# Import libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

___
### Auxiliary functions

In [None]:
def rv_trade_count(df):
    """
    Identify RV trades among rows that share a timestamp and count them per day.

    A timestamp qualifies when it carries at least two distinct tickers. Within
    each qualifying timestamp, consecutive rows whose log(spread x notional)
    values differ by at most 0.5 in absolute terms are counted as one RV trade.

    :param df: dataframe with candidate RV trades; must hold the columns
               'timestamp', 'ticker', 'date' and 'log_sn'
    :return: dataframe with the daily number of RV trades ('date', 'RV_trades'),
             dataframe with the 'date' and 'timestamp' of each counted pair
    """
    keys = ['timestamp', 'ticker']

    # candidate rows: every row whose timestamp occurs more than once
    shares_timestamp = df.duplicated(subset=['timestamp'], keep=False)
    candidates = df.loc[shares_timestamp]

    # one row per (timestamp, ticker) combination; log_sn is ignored here.
    # Repeated rows of a single ticker collapse into one combination, so a
    # timestamp that still shows up more than once has at least two tickers.
    combos = candidates[keys].groupby('timestamp').value_counts().reset_index()
    has_two_tickers = combos.duplicated(subset=['timestamp'], keep=False)
    valid_keys = combos.loc[has_two_tickers, keys]

    # keep the candidate rows of qualifying timestamps and measure the absolute
    # change of log_sn between consecutive rows of the same timestamp
    pairs = candidates.merge(valid_keys, how='inner', on=keys)
    pairs = pairs.assign(
        Dlog_sn=pairs.groupby('timestamp')['log_sn'].diff().abs()
    )

    # spread/notional similarity criterion: Δlog(SxN) <= 0.5
    # (the first row of each timestamp has no predecessor and drops out here)
    confirmed = pairs.loc[pairs['Dlog_sn'].le(0.5)]

    # daily count of confirmed pairs and their detailed timestamps
    daily = (
        confirmed.groupby('date')['Dlog_sn']
        .count()
        .rename('RV_trades')
        .reset_index()
    )
    return daily, confirmed[['date', 'timestamp']]

___
### Data parsing and processing

**CDXIG5 notional left tail:**  
The notional value for CDXIG5 has a more skewed left tail than the other two securities. Instead of coding a process for outlier identification and removal (e.g. by using the median absolute deviation or other criteria), it is more practical to truncate CDXIG5's notional at the approximate minimum observed for the other two securities. The reason is that we're interested in RV trades with similar notionals and similar spreads. The summary statistics show similar spread distributions across the securities, implying that CDXIG5 notional values below the minima of the other two securities are practically useless.

**Duplicate rows for each security:**  
Although there is latency from physical restrictions within exchanges, the time accuracy within the data is 1 second, which is much larger than the average latency in the real world. Therefore, duplicate records for a security at exactly the same timestamp are possibly different trades rather than data errors. For instance, it is possible that one of the duplicates for ITXEB5 is part of an RV trade with CDXIG5 and the other a different RV trade with ITXES5. On the other hand, we're not interested in RV trades between CDXIG5 and ITXES5. As a result, the duplicates for these securities can be dropped to improve code efficiency.

**Potential scale differences in spreads/notionals:**  
According to the instructions, scale differences between the spreads of two securities within an RV trade are inversely proportional to scale differences in the nominals. Therefore, the product of spread and nominal is relatively invariant to scale differences. The log of this product (to mitigate scale effects from the average nominal values) is the variable that identifies relative value trades.

In [None]:
# read dataset (the unnamed first column is the index written by to_csv)
df = pd.read_csv("credit_derivatives_trades.csv").drop(columns=['Unnamed: 0'])

# convert timestamp to datetime and define date variable
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S")
df['date'] = df['timestamp'].dt.date

# sanity check
df = df[df['notional'] > 0]

# sort by timestamp; use a stable algorithm so that rows sharing a timestamp
# keep their file order (the pairing logic depends on the row order within
# a timestamp, and the default quicksort does not guarantee it)
df = df.sort_values(by='timestamp', kind='mergesort')

# drop the duplicates of CDXIG5 and ITXES5, keep the duplicates of ITXEB5.
# A duplicate is a repeated (timestamp, ticker) combination: testing the
# timestamp alone would also discard a CDXIG5/ITXES5 leg whenever it follows
# an ITXEB5 row at the same second, i.e. exactly the pairs we are looking for.
is_repeat = df.duplicated(subset=['timestamp', 'ticker'])
is_single_leg = df['ticker'].isin(['CDXIG5', 'ITXES5'])
df = df[~(is_repeat & is_single_leg)].copy()

# create auxiliary variable as the product of spread and nominal,
# take the log to remove scale effects
df['log_sn'] = np.log(df['spread'] * df['notional'])

# summary statistics of processed data
data = df.copy()
prc = [0.01, 0.25, 0.5, 0.75, 0.99]
print('\033[1m' + 'Summary Statistics' + '\033[0m')
display(data.groupby('ticker').describe(percentiles=prc).T)

In [None]:
# correlation matrices of daily median values on
# spread and nominal across the three securities.
# Only the two columns that are pivoted below are aggregated: taking the
# median of every column relies on pandas silently dropping non-numeric
# columns, which is deprecated in pandas 1.5 and no longer done in 2.0.
dft = (data.groupby(['ticker', 'date'])[['spread', 'notional']]
           .median()
           .reset_index())
corr_s = dft.pivot(index='date', columns='ticker', values='spread').corr()
corr_n = dft.pivot(index='date', columns='ticker', values='notional').corr()

# plot the correlation heatmaps (one panel per variable, shared colour scale)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5, 6))
heatmap_s = sns.heatmap(corr_s.round(2), vmin=-1, vmax=1, annot=True, ax=ax1, cmap='viridis')
heatmap_s.set_title('Correlations Among Daily Median Spreads', fontdict={'fontsize': 15}, pad=12)
heatmap_n = sns.heatmap(corr_n.round(2), vmin=-1, vmax=1, annot=True, ax=ax2, cmap='viridis')
heatmap_n.set_title('Correlations Among Daily Median Nominals', fontdict={'fontsize': 15}, pad=12)
fig.tight_layout()
# plt.show() is the backend-agnostic way to render the figure
plt.show()

___

### Daily RV trade indicator

The granularity limitations of the timestamp work in our favor. The two trades of an RV strategy that happen within a time lag of less than a second will be recorded in the data with identical timestamps. Identifying the trade pairs with identical timestamps (either CDXIG5/ITXEB5 or ITXES5/ITXEB5) gives the minimum number of confirmed RV trades.

Certain trade pairs of valid RV trades may have time lags greater than a second. The key to identifying them is to search near the timestamps of previously confirmed RV trades (those with identical timestamps). The intuition is that arbitrageurs attempt to exploit profitable opportunities as soon as they arise in the market. Therefore, the data are unlikely to contain an isolated RV trade with a time lag larger than 1 second unless another arbitrageur has identified the same profitable opportunity around the same time.

The definition of "around the same time" is arbitrary. The time window considered for candidate trade pairs with >1sec lags is 15 minutes before and after the timestamps of the initially confirmed RV trades.

The product of spread x nominal (SxN) should be similar for a valid trade pair. Define $\Delta \log(S \times N) \le 0.5$ as the similarity threshold. Visual inspection of the data shows similar nominal values between the two securities when this limit is enforced on the differences in *log_sn*.

In [None]:
# half-width of the search window around each confirmed RV trade
WINDOW = pd.Timedelta(minutes=15)


def _window_rv_trades(trades, anchors, offset, label):
    """
    Count the RV trades found in a time window next to the confirmed trades.

    Each row of `trades` is attached to the closest confirmed timestamp in the
    search direction. Rows that fall inside that trade's window receive a
    common timestamp (the window bound) so that rv_trade_count can pair them.

    :param trades: dataframe of all trades, sorted by timestamp
    :param anchors: dataframe holding the 'timestamp' of confirmed RV trades
    :param offset: signed pd.Timedelta, negative to search before the
                   confirmed trades and positive to search after them
    :param label: name given to the count column of the result
    :return: dataframe with the daily number of RV trades inside the windows
    """
    # one row per confirmed timestamp: repeated timestamps would multiply the
    # rows of `trades` in the left merge below and inflate the counts
    bounds = (anchors[['timestamp']]
              .drop_duplicates()
              .assign(group=lambda d: d['timestamp'] + offset))
    win = pd.merge(trades, bounds, on='timestamp', how='left')

    # propagate the window bound only; filling the whole frame would also
    # overwrite missing trade values with those of neighbouring rows
    if offset < pd.Timedelta(0):
        win['group'] = win['group'].bfill()
        inside = win['timestamp'] >= win['group']
    else:
        win['group'] = win['group'].ffill()
        inside = win['timestamp'] <= win['group']

    # rows without a window have group == NaT, compare False and drop out
    win = win[inside]
    win = win.drop(columns='timestamp').rename(columns={'group': 'timestamp'})

    counts, _ = rv_trade_count(win)
    return counts.rename(columns={'RV_trades': label})


# step 1: get the minimum number of confirmed RV trades from
# trade pairs with identical timestamps in the raw data
conf, tstamps = rv_trade_count(data)
conf = conf.rename(columns={'RV_trades': 'min_RV_trades'})

# step 2: add the RV trades with timestamps within 15 minutes BEFORE the
# confirmed trades of step 1
dfL = _window_rv_trades(data, tstamps, -WINDOW, 'RV_trades_L')
dfL = pd.merge(conf, dfL, on='date', how='outer')

# step 3: add the RV trades with timestamps within 15 minutes AFTER the
# confirmed trades of step 1
dfU = _window_rv_trades(data, tstamps, WINDOW, 'RV_trades_U')

# step 4: add the RV trades to get the total daily count.
# Sum the count columns explicitly: a row-wise sum over the whole frame would
# include the non-numeric 'date' column. Days missing from one of the three
# tables contribute zero trades.
# NOTE(review): the windows of steps 2 and 3 also contain the confirmed pairs
# themselves, so those pairs appear in all three columns being summed; confirm
# that this is the intended definition of the total.
tot = pd.merge(dfL, dfU, on='date', how='outer')
count_cols = ['min_RV_trades', 'RV_trades_L', 'RV_trades_U']
tot[count_cols] = tot[count_cols].fillna(0).astype(int)
tot['RV_trades'] = tot[count_cols].sum(axis=1)
tot = tot[['date', 'min_RV_trades', 'RV_trades']]

# print results
print('\033[1m' + 'Daily RV trade indicator' + '\033[0m')
display(tot)
print('\033[1m' + 'min_RV_trades:' + '\033[0m' +
      ' daily RV trades where the trade pairs have identical timestamps')
print('\033[1m' + 'RV_trades:' + '\033[0m' + ' total daily RV trades')