In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression

from optiver import Directories
from optiver.bench import rmspe
# NOTE(review): the recorded output of this cell is an ImportError for
# `realized_volatility`. Because the import statement fails as a whole,
# `generate_dfs` and `dirs` are never defined either — confirm where the helper
# lives in the current `optiver` package before re-running the notebook.
from optiver.utils import realized_volatility, generate_dfs

# Directory helper built from the relative path "../.."; later cells read the
# processed data files through `dirs.processed`.
dirs = Directories("../..")

ImportError: cannot import name 'realized_volatility' from 'optiver.utils' (/home/gchristensen/repos/optiver-realized-volatility/optiver/utils.py)

Let's look at stocks 21, 62, 92, 110, and 125 to try to find interesting patterns within the order book data.

In [None]:
# Stock currently under inspection (one of the stocks singled out for this exploration).
stock_id = 21

# Each per-stock file is read in full and then narrowed to this stock's rows with .loc[stock_id].
order_book = pd.read_hdf(dirs.processed / "book_train" / f"stock_{stock_id}.h5").loc[stock_id]
trade_book = pd.read_hdf(dirs.processed / "trade_train" / f"stock_{stock_id}.h5").loc[stock_id]
# `all_targets` keeps the targets of every stock (fit_predict_split reads it as a global);
# `targets` is the slice belonging to the selected stock.
all_targets = pd.read_hdf(dirs.processed / "targets_train.h5")
targets = all_targets.loc[stock_id]

In [None]:
# Inspect the order book rows loaded for the selected stock.
order_book

In [None]:
def describe_df(df):
    """Summarise every column of ``df`` with location and spread statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` and the columns ``mean``, ``median``,
        ``mode`` (the first row of ``df.mode()``), ``range`` (max minus min)
        and ``IQR`` (75th minus 25th percentile).
    """
    summary = {
        # Measures of location.
        "mean": df.mean(),
        "median": df.median(),
        "mode": df.mode().iloc[0],
        # Measures of spread.
        "range": df.max() - df.min(),
        "IQR": df.quantile(q=0.75) - df.quantile(q=0.25),
    }
    return pd.DataFrame(summary)

In [None]:
# Per-column summary statistics (mean, median, mode, range, IQR) of the order book.
describe_df(order_book)

There appears to be some positive skew in the bid and ask sizes.

In [None]:
# Order book columns grouped by kind, so prices and sizes can be plotted separately.
price_columns = ["bid_price1", "ask_price1", "bid_price2", "ask_price2"]
size_columns = ["bid_size1", "ask_size1", "bid_size2", "ask_size2"]

In [None]:
# Box plots of the price columns; assigning to `_` keeps the return value out of the cell output.
_ = order_book.boxplot(column=price_columns)

In [None]:
# Box plots of the size columns; assigning to `_` keeps the return value out of the cell output.
_ = order_book.boxplot(column=size_columns)

In [None]:
# Passed as `bins=` to the histogram cells, i.e. this is the number of bins, not a bin width.
bin_size = 100

In [None]:
# Histograms of the price columns, using `bin_size` bins each.
_ = order_book.hist(column=price_columns, bins=bin_size)

In [None]:
# Histograms of the size columns, using `bin_size` bins each.
_ = order_book.hist(column=size_columns, bins=bin_size)

In [None]:
# Start the feature table from the project helper's output.
# NOTE(review): presumably one realized-volatility value per time_id — confirm against optiver.utils.
features = pd.DataFrame({"past_vol1": realized_volatility(order_book)})
# Column assignment aligns `targets` to the existing index of `features`.
features["target"] = targets

In [None]:
# Level-1 weighted average price: each side's best price is weighted by the
# size quoted on the opposite side, then divided by the combined size.
wap1 = (
    order_book["bid_price1"] * order_book["ask_size1"]
    + order_book["ask_price1"] * order_book["bid_size1"]
) / (order_book["bid_size1"] + order_book["ask_size1"])

# Summary statistics of the level-1 WAP within each time_id.
wap1_groupby = wap1.groupby(level="time_id")
features["wap1_mean"] = wap1_groupby.agg("mean")
features["wap1_std"] = wap1_groupby.agg("std")
features["wap1_min"] = wap1_groupby.agg("min")
features["wap1_max"] = wap1_groupby.agg("max")
features["wap1_med"] = wap1_groupby.agg("median")

In [None]:
# Log of the absolute per-time_id mean of (wap1 - 1).
# NOTE(review): "hello" looks like a placeholder column name — consider a descriptive one.
# The value is -inf for any time_id whose mean of (wap1 - 1) is exactly zero.
features["hello"] = np.log(abs((wap1 - 1).groupby(level="time_id").mean()))

In [None]:
# Level-2 weighted average price, built like the level-1 version but from the
# second-best bid/ask prices and sizes.
wap2 = (
    order_book["bid_price2"] * order_book["ask_size2"]
    + order_book["ask_price2"] * order_book["bid_size2"]
) / (order_book["bid_size2"] + order_book["ask_size2"])

# Summary statistics of the level-2 WAP within each time_id.
wap2_groupby = wap2.groupby(level="time_id")
features["wap2_mean"] = wap2_groupby.agg("mean")
features["wap2_std"] = wap2_groupby.agg("std")
features["wap2_min"] = wap2_groupby.agg("min")
features["wap2_max"] = wap2_groupby.agg("max")
features["wap2_med"] = wap2_groupby.agg("median")

In [None]:
# Log returns of the level-1 WAP. Differencing is done inside each time_id, so
# the first row of every bucket is NaN and is dropped.
log_return1 = (
    np.log(wap1)
    .groupby(level="time_id")
    .diff()
    .dropna()
)

# Summary statistics of the level-1 log returns within each time_id.
lr1_groupby = log_return1.groupby(level="time_id")
features["lr1_mean"] = lr1_groupby.agg("mean")
features["lr1_std"] = lr1_groupby.agg("std")
features["lr1_min"] = lr1_groupby.agg("min")
features["lr1_max"] = lr1_groupby.agg("max")
features["lr1_med"] = lr1_groupby.agg("median")

In [None]:
# Log returns of the level-2 WAP. Differencing is done inside each time_id, so
# the first row of every bucket is NaN and is dropped.
log_return2 = (
    np.log(wap2)
    .groupby(level="time_id")
    .diff()
    .dropna()
)

# Summary statistics of the level-2 log returns within each time_id.
lr2_groupby = log_return2.groupby(level="time_id")
features["lr2_mean"] = lr2_groupby.agg("mean")
features["lr2_std"] = lr2_groupby.agg("std")
features["lr2_min"] = lr2_groupby.agg("min")
features["lr2_max"] = lr2_groupby.agg("max")
features["lr2_med"] = lr2_groupby.agg("median")

In [None]:
# Square root of the summed squared level-2 log returns within each time_id.
features["past_vol2"] = np.sqrt((log_return2**2).groupby(level="time_id").sum())

In [None]:
# Copy the index's time_id level into a regular column (presumably so it can be plotted like any other feature).
features["time_id"] = features.index.get_level_values("time_id")

In [None]:
# Relative bid-ask spread at the best level, with its per-time_id statistics.
bid_ask_spread1 = (order_book["ask_price1"] / order_book["bid_price1"]) - 1
bas1_groupby = bid_ask_spread1.groupby(level="time_id")

# Relative bid-ask spread at the second-best level.
bid_ask_spread2 = (order_book["ask_price2"] / order_book["bid_price2"]) - 1
bas2_groupby = bid_ask_spread2.groupby(level="time_id")

features["bas1_mean"] = bas1_groupby.agg("mean")
features["bas1_med"] = bas1_groupby.agg("median")
features["bas1_std"] = bas1_groupby.agg("std")
features["bas1_min"] = bas1_groupby.agg("min")
features["bas1_max"] = bas1_groupby.agg("max")

features["bas2_mean"] = bas2_groupby.agg("mean")
features["bas2_med"] = bas2_groupby.agg("median")
features["bas2_std"] = bas2_groupby.agg("std")
features["bas2_min"] = bas2_groupby.agg("min")
features["bas2_max"] = bas2_groupby.agg("max")

In [None]:
# Value exchanged per trade row (price times size), grouped by time_id.
exchanges = trade_book["price"] * trade_book["size"]
exchanges_groupby = exchanges.groupby(level="time_id")
features["exchange_total"] = exchanges_groupby.sum()

# Debug variant: the same per-row value additionally scaled by the order count.
features["exchange_total_debug"] = (
    (exchanges * trade_book["order_count"])
    .groupby(level="time_id")
    .sum()
)

In [None]:
# Summary statistics of the per-trade exchanged value within each time_id.
features["exchange_mean"] = exchanges_groupby.agg("mean")
features["exchange_std"] = exchanges_groupby.agg("std")
features["exchange_min"] = exchanges_groupby.agg("min")
features["exchange_max"] = exchanges_groupby.agg("max")
features["exchange_med"] = exchanges_groupby.agg("median")

**Order Book Linear Features**: past_vol1, past_vol2, wap1_std, wap2_std, lr1_std, lr1_min, lr1_max, lr2_std, lr2_min, lr2_max, bas1_mean, bas1_std, bas1_med, bas1_max, bas2_mean, bas2_std, bas2_med, bas2_max

**Order Book Potential Linear Features**: wap1_min, wap1_max, wap2_min, wap2_max

**Order Book Decision Tree Features**: bas1_min, bas2_min

**Order Book NN Features**: wap1_mean, wap1_med, wap2_mean, wap2_med

**Trade Book Linear Features**: exchange_total

**Trade Book Decision Tree Features**: exchange_mean, exchange_std, **exchange_min**, exchange_med

**Trade Book NN Features**: exchange_max

NOTE: wap_mean and lr_mean may be more useful after a transformation that takes their absolute value.

In [None]:
# Pick which feature (x axis) is plotted against which column (y axis).
chosen_feature = "exchange_total"
chosen_target = "target"

# graph_df = pd.DataFrame({chosen_feature: features[chosen_feature], "target": targets})

# Scatter plot of the chosen feature against the chosen target.
_ = features.plot.scatter(x=chosen_feature, y=chosen_target)

In [None]:
# Box plot of the target column to show its spread and outliers.
_ = features.boxplot(column="target")

Features from the order book data that are highly correlated with the target seem to be measures of variance (`wap{1/2}_std`, `lr{1/2}_std`), past volatilities for both the best and second-best prices, the minimums and maximums of the log return measures, and all bid-ask-spread features except the minimum, which seems to be more useful for decision trees.

The `exchange_total` feature has a variable but evident positive association with the target. A feature from the trade data that seems really useful is the minimum exchange, but it seems better suited to decision-tree-style logic.

The question is how correlated these variables are with each other.

In [None]:
# Order book features selected for the linear model, in the order they will
# appear as columns.
useful_linear_features = [
    "past_vol1", "past_vol2",
    "wap1_std", "wap2_std",
    "lr1_std", "lr2_std",
    "lr1_min", "lr2_min",
    "lr1_max", "lr2_max",
]
# Same selection extended with the trade book total.
useful_linear_features_trade = [*useful_linear_features, "exchange_total"]

# Pairwise scatter plots to see how the selected features relate to each other.
useful_linear_df = features[useful_linear_features]
_ = pd.plotting.scatter_matrix(useful_linear_df)

In [None]:
# naive_linear = features[["past_vol1", "past_vol2", "lr1_std", "lr2_std", "lr1_min", "lr2_min", "exchange_min"]]
# naive_linear = features[["past_vol1", "lr1_std", "lr1_min"]]
naive_linear = useful_linear_df
# naive_linear = features[useful_linear_features_trade]

# Seeded 20% hold-out for validation; every remaining row is used for training.
naive_linear_val = (
    naive_linear
    .sample(frac=0.2, random_state=10)
    .sort_index()
)
naive_linear_train = naive_linear.drop(index=naive_linear_val.index).sort_index()

# Targets are selected with the same index labels as the feature splits.
targets_train = targets.loc[naive_linear_train.index]
targets_val = targets.loc[naive_linear_val.index]

In [None]:
# Linear model fitted on every selected feature.
regressor = LinearRegression()
regressor.fit(naive_linear_train, targets_train)

# Reference model fitted on past_vol1 alone, to compare against the full feature set.
past_regressor = LinearRegression()
past_regressor.fit(naive_linear_train[["past_vol1"]], targets_train)

In [None]:
# Three validation scores, from simplest to richest predictor:
#   past_baseline        - past_vol1 used directly as the prediction
#   past_linear_baseline - linear model fitted on past_vol1 only
#   linear_baseline      - linear model fitted on all selected features
# NOTE(review): predictions are passed first and targets second — confirm this
# matches the parameter order of optiver.bench.rmspe, because the score would
# change if the function normalises by its first argument.
past_baseline = rmspe(naive_linear_val["past_vol1"], targets_val)
past_linear_baseline = rmspe(past_regressor.predict(naive_linear_val[["past_vol1"]]), targets_val)
linear_baseline = rmspe(regressor.predict(naive_linear_val), targets_val)

past_baseline, past_linear_baseline, linear_baseline

In [None]:
def fit_predict_split(df, stock_id, val_frac=0.2, random_state=10):
    """Fit a per-stock linear model on order book features and predict a held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Order book rows of a single stock. The function reads the columns
        ``bid_price1/2``, ``ask_price1/2``, ``bid_size1/2`` and ``ask_size1/2``
        and groups on an index level named ``time_id``.
    stock_id
        Key used to select this stock's rows from the notebook-level
        ``all_targets``; it is also returned unchanged.
    val_frac : float, default 0.2
        Fraction of the feature rows held out for validation.
    random_state : int or None, default 10
        Seed forwarded to ``DataFrame.sample`` so the split (and therefore the
        score) is reproducible across runs. Pass ``None`` for an unseeded split.

    Returns
    -------
    tuple
        ``(stock_id, predictions, val_targets)``. ``predictions`` is a
        DataFrame with a single ``"target"`` column indexed like the validation
        rows, and ``val_targets`` holds the matching true targets.

    Notes
    -----
    Relies on ``realized_volatility``, ``all_targets`` and ``LinearRegression``
    being available in the notebook namespace.
    """
    df_feature = pd.DataFrame({"past_vol1": realized_volatility(df)})

    # Weighted average prices for the best and second-best book levels.
    wap1 = (
        df["bid_price1"] * df["ask_size1"] + df["ask_price1"] * df["bid_size1"]
    ) / (df["bid_size1"] + df["ask_size1"])
    wap2 = (
        df["bid_price2"] * df["ask_size2"] + df["ask_price2"] * df["bid_size2"]
    ) / (df["bid_size2"] + df["ask_size2"])

    # Log returns are differenced inside each time_id; the leading NaN of every
    # bucket is dropped.
    log_return1 = np.log(wap1).groupby(level="time_id").diff().dropna()
    log_return2 = np.log(wap2).groupby(level="time_id").diff().dropna()

    lr1_groupby = log_return1.groupby(level="time_id")
    lr2_groupby = log_return2.groupby(level="time_id")

    # NOTE(review): a time_id with fewer than two log returns gets a NaN std,
    # which LinearRegression.fit rejects — confirm the data never produces one.
    df_feature["past_vol2"] = np.sqrt((log_return2**2).groupby(level="time_id").sum())
    df_feature["lr1_std"] = lr1_groupby.std()
    df_feature["lr2_std"] = lr2_groupby.std()
    df_feature["lr1_min"] = lr1_groupby.min()
    df_feature["lr2_min"] = lr2_groupby.min()

    # Seeded hold-out so repeated runs evaluate on the same validation rows.
    df_val = df_feature.sample(frac=val_frac, random_state=random_state).sort_index()
    df_train = df_feature.drop(df_val.index).sort_index()

    # Select this stock's targets once and reuse the slice for both splits.
    stock_targets = all_targets.loc[stock_id]
    train_targets = stock_targets.loc[df_train.index]
    val_targets = stock_targets.loc[df_val.index]

    regressor = LinearRegression()
    regressor.fit(df_train, train_targets)

    predictions = pd.DataFrame({"target": regressor.predict(df_val)}, index=df_val.index)
    return stock_id, predictions, val_targets

In [None]:
# from itertools import islice

# preds_actuals = [fit_predict_split(df, stock_id) for stock_id, df in islice(tqdm(generate_dfs(dirs.processed / "book_train"), total=92), 0, 90)]

# stock_ids, preds_list, actuals_list = zip(*preds_actuals)

# preds = pd.concat({stock_id:pred for stock_id, pred in zip(stock_ids, preds_list)}, names=("stock_id", "time_id"))
# actuals = pd.concat({stock_id:actual for stock_id, actual in zip(stock_ids, actuals_list)}, names=("stock_id", "time_id"))

In [None]:
# rmspe(preds["target"], actuals)