In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Source CSVs for the 2024-04-01 session, addressed relative to the notebook.
trades_path = "../data/trade_data_2024_04_01.csv"
quotes_path = "../data/quote_data_2024_04_01.csv"
merged_path = "../data/merged_trade_quote_2024_04_01.csv"

# Read each file and run its timestamp column through pd.to_datetime in a
# single pass, so all three frames leave this cell with datetime timestamps.
trades, quotes, merged = (
    pd.read_csv(csv_path).assign(
        transaction_timestamp=lambda frame: pd.to_datetime(
            frame["transaction_timestamp"]
        )
    )
    for csv_path in (trades_path, quotes_path, merged_path)
)

# Preview the first rows of the merged trade/quote table.
merged.head()

Unnamed: 0,transaction_timestamp,trade_exchange_code,symbol_id,trade_size,trade_price,NBBO_bid_price,NBBO_bid_size,NBBO_ask_price,NBBO_ask_size,price_decimal
0,2024-04-01 16:47:06.517034589,D,7614,400,9002600,9123000,100,9126700,100,4
1,2024-04-01 16:47:06.593876000,D,7614,10,9002599,9123000,100,9126700,100,4
2,2024-04-01 16:47:06.593946000,D,7614,10,9002500,9123000,100,9126700,100,4
3,2024-04-01 16:47:06.593996000,D,7614,12,9002599,9123000,100,9126700,100,4
4,2024-04-01 16:47:06.594017000,D,7614,12,9002500,9123000,100,9126700,100,4


In [3]:
# Numeric symbol IDs as they appear in the data files -> ticker names used as
# the keys of trades_by_symbol below.
symbol_map = {
    2178: "CMG",
    7614: "NVDA",
    9574: "SIRI",
    10407: "TLT",
}

# Distinct symbol IDs present in the trade file; computed once instead of on
# every loop iteration.
ids_in_trades = trades["symbol_id"].unique()

# Per-symbol trade frames keyed by ticker. Each frame gains three columns:
#   trade_px - trade_price scaled by 10 ** -price_decimal
#   log_ret  - row-to-row difference of log(trade_px), NaN on the first row
#   vol_1s   - standard deviation of log_ret over a rolling "1s" time window
trades_by_symbol = {}
for symbol_id, symbol_name in symbol_map.items():
    if symbol_id not in ids_in_trades:
        print(f"Symbol ID {symbol_id} not found in trades, skipping.")
        continue
    sub = trades[trades["symbol_id"] == symbol_id].copy()
    # kind="stable": several trades can carry the same timestamp, and the
    # default sort may reorder such ties arbitrarily. A stable sort keeps them
    # in file order, so diff() and the rolling window below are deterministic.
    sub = sub.sort_values("transaction_timestamp", kind="stable")
    price_scaler = np.power(10.0, -sub["price_decimal"])
    sub["trade_px"] = sub["trade_price"] * price_scaler
    sub["log_ret"] = np.log(sub["trade_px"]).diff()
    # Time-based rolling windows need a datetime index; restore the column
    # layout afterwards.
    sub = sub.set_index("transaction_timestamp")
    sub["vol_1s"] = sub["log_ret"].rolling("1s").std()
    sub = sub.reset_index()

    trades_by_symbol[symbol_name] = sub

# NVDA working frame: add a rolling 1-second mean of trade prices and each
# trade's relative deviation from that mean.
nvda_df = trades_by_symbol["NVDA"].copy()
# kind="stable": trades that share a timestamp must stay in the order in which
# log_ret / vol_1s were computed; the default sort may permute such ties.
nvda_df = nvda_df.sort_values("transaction_timestamp", kind="stable")
nvda_df = nvda_df.set_index("transaction_timestamp")
# Despite the column name, this is a rolling mean of trade prices, not a
# bid/ask midpoint.
nvda_df["rolling_mid"] = nvda_df["trade_px"].rolling("1s").mean()
nvda_df = nvda_df.reset_index()
# Relative deviation of the trade price from its rolling mean (0 == at the mean).
nvda_df["locally_normalized_px"] = nvda_df["trade_px"] / nvda_df["rolling_mid"] - 1

nvda_df


Symbol ID 9574 not found in trades, skipping.


Unnamed: 0,transaction_timestamp,trade_exchange_code,symbol_id,trade_size,trade_price,price_decimal,trade_px,log_ret,vol_1s,rolling_mid,locally_normalized_px
0,2024-04-01 16:47:06.517034589,D,7614,400,9002600,4,900.2600,,,900.260000,0.000000e+00
1,2024-04-01 16:47:06.593876000,D,7614,10,9002599,4,900.2599,-1.110790e-07,,900.259950,-5.553951e-08
2,2024-04-01 16:47:06.593946000,D,7614,10,9002500,4,900.2500,-1.099688e-05,0.000008,900.256633,-7.368269e-06
3,2024-04-01 16:47:06.593996000,D,7614,12,9002599,4,900.2599,1.099688e-05,0.000011,900.257450,2.721444e-06
4,2024-04-01 16:47:06.594017000,D,7614,12,9002500,4,900.2500,-1.099688e-05,0.000011,900.255960,-6.620339e-06
...,...,...,...,...,...,...,...,...,...,...,...
85,2024-04-01 16:47:10.576438000,K,7614,10,9000000,4,900.0000,0.000000e+00,0.000077,900.058479,-6.497197e-05
86,2024-04-01 16:47:10.576539560,Q,7614,8,9000000,4,900.0000,0.000000e+00,0.000076,900.056462,-6.273170e-05
87,2024-04-01 16:47:10.576551249,S,7614,25,9000000,4,900.0000,0.000000e+00,0.000073,900.054580,-6.064077e-05
88,2024-04-01 16:47:10.576551249,S,7614,10,9000000,4,900.0000,0.000000e+00,0.000075,900.052819,-5.868473e-05


### Gradient boosting classifier


In [4]:
# Label every trade with the sign of the NEXT trade's log return, then score a
# gradient-boosting classifier with forward-chaining cross-validation.
#
# NOTE: this cell rebinds nvda_df (adds next_ret / y and drops the rows that
# have no next return). Re-running it without first re-running the cell that
# builds nvda_df removes one more trailing row each time.

# kind="stable": trades sharing a timestamp keep the order in which log_ret was
# computed, so shift(-1) pairs each row with the return that really followed it.
nvda_df = nvda_df.sort_values("transaction_timestamp", kind="stable")
nvda_df = nvda_df.assign(next_ret=nvda_df["log_ret"].shift(-1))
nvda_df = nvda_df.dropna(subset=["next_ret"])
# Binary target: +1 when the next return is strictly positive, -1 otherwise.
# Returns of exactly zero therefore fall into the -1 class.
nvda_df = nvda_df.assign(y=np.where(nvda_df["next_ret"] > 0, 1, -1))
nvda_GBRT = nvda_df.copy()

feature_cols = [
    "trade_size",
    "trade_px",
    "locally_normalized_px",
    "vol_1s",
    "log_ret",
]

X = nvda_GBRT[feature_cols].copy()
y = nvda_GBRT["y"].values

# Keep only rows where every feature is present (log_ret and vol_1s are NaN on
# the earliest rows). The mask is converted to a plain boolean array so that it
# indexes the NumPy target positionally, independent of the frame's index.
mask = X.notna().all(axis=1)
X = X[mask]
y = y[mask.to_numpy()]

# TimeSeriesSplit trains on earlier rows and tests on later ones, so no fold is
# evaluated on data that precedes its training set.
tscv2 = TimeSeriesSplit(n_splits=10)
gb_model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", GradientBoostingClassifier(random_state=42)),
    ]
)
gb_scores = cross_val_score(gb_model, X, y, cv=tscv2, scoring="accuracy")
print("GradientBoosting accuracy:", gb_scores)
print("Mean accuracy:", gb_scores.mean())

GradientBoosting accuracy: [0.57142857 0.57142857 0.71428571 0.57142857 0.42857143 0.57142857
 0.42857143 0.71428571 0.85714286 1.        ]
Mean accuracy: 0.6428571428571428
