In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, names in os.walk('/kaggle/input'):
    # Emit one full path per line for every file found under this folder.
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Overview

This notebook is designed for a Kaggle competition and a hiring assignment where the goal is to predict short-term implied volatility (IV) for ETH using order book data.
The training set contains ETH-specific features and labels, while BTC and SOL order book datasets are provided for cross-asset feature engineering to improve predictive performance.
We focus on building a memory-efficient, feature-rich pipeline that leverages all available ETH data and integrates BTC/SOL features without exceeding Kaggle’s hardware limits.

# Kaggle: Memory-safe ETH IV model with cross-asset features


In [None]:
# ================================
# Kaggle: Memory-safe ETH IV model with cross-asset features
# ================================
import os, gc, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import pearsonr
import xgboost as xgb


# Config


In [None]:
# --------------------------
# Config
# --------------------------
# Root folder of the competition dataset under Kaggle's input directory.
DATA_DIR = "/kaggle/input/gq-implied-volatility-forecasting"
# Per-split folders; the cells below read <split>/ETH.csv, <split>/BTC.csv and <split>/SOL.csv.
TRAIN_DIR = f"{DATA_DIR}/train"
TEST_DIR  = f"{DATA_DIR}/test"
# Number of rows per pandas chunk when streaming the cross-asset CSVs.
CHUNK_SIZE = 1_000_000  # for BTC/SOL
# Order book levels used in column names (bid_price1 .. bid_price5, etc.).
LEVELS = range(1, 6)

# XGBoost hyperparameters (fixed after hyperparameter tuning)

In [None]:
# Fixed XGBoost settings; unpacked into every XGBRegressor built in this notebook.
XGB_PARAMS = {
    "colsample_bytree": 0.9982368956087719,
    "learning_rate": 0.0863352758652528,
    "max_depth": 10,
    "n_estimators": 1180,
    "random_state": 42,
    # CPU histogram builder. For GPU: "gpu_hist" on older XGBoost releases;
    # XGBoost >= 2.0 keeps "hist" and adds device="cuda" instead.
    "tree_method": "hist",
}

# Helper Functions

In [None]:
# --------------------------
# Helpers
# --------------------------
def downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink 64-bit numeric columns of ``df`` in place and return the same frame.

    float64 columns become float32; int64 columns are reduced to the smallest
    integer dtype that holds their values. Other columns are left untouched.
    """
    wide_floats = list(df.select_dtypes(include=["float64"]).columns)
    wide_ints = list(df.select_dtypes(include=["int64"]).columns)

    for name in wide_floats:
        df[name] = df[name].astype(np.float32)
    for name in wide_ints:
        df[name] = pd.to_numeric(df[name], downcast="integer")
    return df

def ensure_mid_price(df: pd.DataFrame) -> pd.DataFrame:
    """Guarantee a ``mid_price`` column on ``df`` (modified in place) and return it.

    An existing ``mid_price`` column is kept as is; otherwise it is set to the
    average of ``bid_price1`` and ``ask_price1``.
    """
    if "mid_price" in df.columns:
        return df
    best_bid, best_ask = df["bid_price1"], df["ask_price1"]
    df["mid_price"] = (best_bid + best_ask) / 2.0
    return df

def clean_financial_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with finite, gap-free numeric columns.

    Steps, applied to numeric columns only:
      1. +/-inf is replaced by NaN.
      2. Each column is clipped to its own 0.1% / 99.9% quantiles.
      3. Remaining NaNs are forward-filled, then back-filled (the back-fill
         covers leading NaNs that have no earlier value to carry forward).

    The input frame is not modified; non-numeric columns pass through unchanged.
    """
    # replace() returns a new frame, so the assignments below do not touch the caller's data.
    df = df.replace([np.inf, -np.inf], np.nan)
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols):
        upper = df[num_cols].quantile(0.999)
        lower = df[num_cols].quantile(0.001)
        df[num_cols] = df[num_cols].clip(lower=lower, upper=upper, axis=1)
        # fillna(method=...) is deprecated in pandas 2.1+; ffill()/bfill() are the
        # equivalent dedicated methods and work on every pandas version.
        df[num_cols] = df[num_cols].ffill().bfill()
    return df

def pearson_corr(y_true, y_pred):
    """Pearson correlation between ``y_true`` and ``y_pred``; 0.0 when it is undefined (NaN)."""
    coefficient = pearsonr(y_true, y_pred)[0]
    if np.isnan(coefficient):
        return 0.0
    return coefficient



# Read ETH (full data, minimal columns)

## Data Sources

**ETH Train/Test Files**

* train/ETH.csv — ETH order book features (bid/ask prices and volumes for levels 1–5) and the target column (label).

* test/ETH.csv — the same ETH order book features, no labels.


**Cross-Asset Order Book Data**

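* train/BTC.csv, test/BTC.csv — BTC order book snapshots, read with the same bid/ask price and volume columns as ETH.

* train/SOL.csv, test/SOL.csv — SOL order book snapshots, read with the same bid/ask price and volume columns as ETH.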


Note: BTC & SOL data contain no labels and are aligned on timestamps with ETH data.

In [None]:
# --------------------------
# Read ETH (full data, minimal columns)
# --------------------------
# Book columns in the order: bid prices, ask prices, bid volumes, ask volumes (levels in LEVELS).
_eth_book_cols = [
    f"{side}_{field}{i}"
    for field in ("price", "volume")
    for side in ("bid", "ask")
    for i in LEVELS
]
eth_usecols_train = ["timestamp", "label"] + _eth_book_cols
eth_usecols_test = ["timestamp"] + _eth_book_cols


def _read_eth_sorted(csv_path, usecols):
    """Read the requested ETH columns, parse timestamps, and return rows in time order with a fresh index."""
    frame = pd.read_csv(csv_path, usecols=usecols)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    return frame.sort_values("timestamp").reset_index(drop=True)


# Load each split, add mid_price where missing, then shrink 64-bit numeric columns.
eth_train = downcast_numeric(ensure_mid_price(_read_eth_sorted(f"{TRAIN_DIR}/ETH.csv", eth_usecols_train)))
eth_test  = downcast_numeric(ensure_mid_price(_read_eth_sorted(f"{TEST_DIR}/ETH.csv", eth_usecols_test)))

print("ETH shapes:", eth_train.shape, eth_test.shape)


# Chunked cross-asset aggregation (BTC, SOL): tiny per-timestamp features

**A. ETH Self-Derived Features**

* Price Spread: best_ask - best_bid

* Mid Price: (best_ask + best_bid) / 2
 
* Bid/Ask Imbalance: (bid_volume - ask_volume) / (bid_volume + ask_volume)

* Rolling Statistics:

        Rolling mean and standard deviation of log mid-price returns over 5, 10, 30, and 60 ticks, and of total book volume over 5, 10, and 30 ticks.

* Order Flow Features:

        Volume delta between consecutive ticks.

        Relative changes in best bid/ask prices.

**B. Cross-Asset Derived Features**

BTC and SOL order book data are reduced to small per-timestamp aggregates and left-merged onto the ETH features by timestamp.
We then create:

* Cross-Asset Spread Difference: eth_spread - btc_spread, eth_spread - sol_spread

* Cross-Asset Mid-Price Ratio: eth_mid / btc_mid, eth_mid / sol_mid

* Cross-Asset Volume Ratio: eth_bid_vol / btc_bid_vol, eth_bid_vol / sol_bid_vol

* Cross-Asset Imbalance Correlation:

        Rolling Pearson correlation between ETH returns and BTC/SOL returns over 60 ticks.

* Lead-Lag Features:

        Shift BTC/SOL mid price by ±5 ticks to capture potential leading signals.

In [None]:
# --------------------------
# --------------------------
def aggregate_cross_asset(csv_path: str, is_train: bool) -> "pd.DataFrame | None":
    """Stream an order book CSV in chunks and reduce it to a few per-row aggregates.

    Parameters
    ----------
    csv_path : str
        Path to a CSV with ``timestamp`` plus bid/ask price and volume columns
        for every level in ``LEVELS``.
    is_train : bool
        Accepted for call-site symmetry; the function body does not use it.

    Returns
    -------
    pd.DataFrame or None
        One row per unique timestamp, sorted by time, with columns
        ``timestamp``, ``mid_price``, ``spread``, ``total_bid_vol``,
        ``total_ask_vol`` and ``price_pressure`` (float32 aggregates).
        ``None`` when the file does not exist or yields no rows.
    """
    if not os.path.exists(csv_path):
        return None
    usecols = ["timestamp"] + \
        [f"bid_price{i}" for i in LEVELS] + [f"ask_price{i}" for i in LEVELS] + \
        [f"bid_volume{i}" for i in LEVELS] + [f"ask_volume{i}" for i in LEVELS]
    out_parts = []
    for chunk in pd.read_csv(csv_path, usecols=usecols, chunksize=CHUNK_SIZE):
        chunk["timestamp"] = pd.to_datetime(chunk["timestamp"])
        # Derive minimal features. Every column in `usecols` is guaranteed present
        # (read_csv raises otherwise), so plain indexing is used.
        mid = (chunk["bid_price1"] + chunk["ask_price1"]) / 2.0
        spread = chunk["ask_price1"] - chunk["bid_price1"]
        total_bid_vol = sum(chunk[f"bid_volume{i}"] for i in LEVELS)
        total_ask_vol = sum(chunk[f"ask_volume{i}"] for i in LEVELS)
        # Notional depth per side: sum over levels of price * volume.
        depth_bid = sum(chunk[f"bid_price{i}"] * chunk[f"bid_volume{i}"] for i in LEVELS)
        depth_ask = sum(chunk[f"ask_price{i}"] * chunk[f"ask_volume{i}"] for i in LEVELS)
        # Signed depth imbalance; the epsilon guards against an empty book.
        price_pressure = (depth_bid - depth_ask) / (depth_bid + depth_ask + 1e-9)
        df_small = pd.DataFrame({
            "timestamp": chunk["timestamp"].values,
            "mid_price": mid.astype(np.float32),
            "spread": spread.astype(np.float32),
            "total_bid_vol": total_bid_vol.astype(np.float32),
            "total_ask_vol": total_ask_vol.astype(np.float32),
            "price_pressure": price_pressure.astype(np.float32),
        })
        out_parts.append(df_small)
        # Release the raw chunk before reading the next one to keep peak memory low.
        del chunk, mid, spread, total_bid_vol, total_ask_vol, depth_bid, depth_ask, price_pressure, df_small
        gc.collect()
    if not out_parts:
        return None
    out = pd.concat(out_parts, axis=0, ignore_index=True)
    # A stable sort keeps rows with equal timestamps in file order, so keep="last"
    # below deterministically selects the last snapshot read for that timestamp
    # (the default quicksort gives no such guarantee).
    out.sort_values("timestamp", kind="mergesort", inplace=True)
    out.drop_duplicates(subset=["timestamp"], keep="last", inplace=True)
    out.reset_index(drop=True, inplace=True)
    out = downcast_numeric(out)
    return out

# Build the small per-timestamp aggregates for each partner asset and split.
# A missing file yields None, which the merge step treats as "asset unavailable".
btc_train_small, sol_train_small = (
    aggregate_cross_asset(f"{TRAIN_DIR}/{symbol}.csv", is_train=True) for symbol in ("BTC", "SOL")
)
btc_test_small, sol_test_small = (
    aggregate_cross_asset(f"{TEST_DIR}/{symbol}.csv", is_train=False) for symbol in ("BTC", "SOL")
)

train_available = [sym for sym, agg in (("BTC", btc_train_small), ("SOL", sol_train_small)) if agg is not None]
test_available = [sym for sym, agg in (("BTC", btc_test_small), ("SOL", sol_test_small)) if agg is not None]
print("Cross-asset (train) available:", train_available)
print("Cross-asset (test)  available:", test_available)



# ETH feature engineering (leakage-safe: shift(1))


In [None]:
# --------------------------
# ETH feature engineering (leakage-safe: shift(1))
# --------------------------
def fe_orderbook_eth(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with order book microstructure features, each lagged by one row.

    Adds the top-of-book spread (absolute and relative to ``mid_price``), level-1
    volume imbalance, per-level notional depth (price * volume) for both sides,
    total depth per side, and the signed depth imbalance ``price_pressure``.
    All added columns are shifted down one row, so row t carries the values
    computed from row t-1; the input columns are left as they are.
    """
    feats = df.copy()

    # Top-of-book spread and level-1 volume imbalance (epsilon avoids division by zero).
    top_spread = feats["ask_price1"] - feats["bid_price1"]
    feats["bid_ask_spread"] = top_spread
    feats["relative_spread"] = top_spread / (feats["mid_price"] + 1e-9)
    bid_v1, ask_v1 = feats["bid_volume1"], feats["ask_volume1"]
    feats["volume_imbalance"] = (bid_v1 - ask_v1) / (bid_v1 + ask_v1 + 1e-9)

    # Notional depth per level; a missing column contributes 0 through .get().
    bid_depth_cols, ask_depth_cols = [], []
    for level in LEVELS:
        bid_name, ask_name = f"bid_depth_{level}", f"ask_depth_{level}"
        feats[bid_name] = feats.get(f"bid_price{level}", 0) * feats.get(f"bid_volume{level}", 0)
        feats[ask_name] = feats.get(f"ask_price{level}", 0) * feats.get(f"ask_volume{level}", 0)
        bid_depth_cols.append(bid_name)
        ask_depth_cols.append(ask_name)

    feats["total_bid_depth"] = feats[bid_depth_cols].sum(axis=1)
    feats["total_ask_depth"] = feats[ask_depth_cols].sum(axis=1)
    depth_gap = feats["total_bid_depth"] - feats["total_ask_depth"]
    depth_sum = feats["total_bid_depth"] + feats["total_ask_depth"]
    feats["price_pressure"] = depth_gap / (depth_sum + 1e-9)

    # Lag every derived microstructure column by one row.
    lagged = [
        "bid_ask_spread", "relative_spread", "volume_imbalance",
        "total_bid_depth", "total_ask_depth", "price_pressure",
    ] + bid_depth_cols + ask_depth_cols
    feats[lagged] = feats[lagged].shift(1)
    return feats

def fe_price_volume_eth(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with return, momentum, lag and volume features, lagged by one row.

    Requires ``mid_price`` and ``bid_ask_spread`` columns. Every column created
    here (anything not already in ``df``) is shifted down one row at the end, so
    for example ``ret_lag_k`` ends up k+1 rows behind the current return. The
    ``spread_lag_k`` columns inherit whatever lag ``bid_ask_spread`` already has
    on input.
    """
    feats = df.copy()
    incoming_cols = set(df.columns)

    # Log returns of the mid price (clipped to keep the log finite).
    feats["log_mid"] = np.log(feats["mid_price"].clip(lower=1e-9))
    feats["ret"] = feats["log_mid"].diff()

    for window in (5, 10, 30, 60):
        ret_window = feats["ret"].rolling(window)
        ret_std = ret_window.std()
        feats[f"ret_mean_{window}"] = ret_window.mean()
        feats[f"ret_std_{window}"] = ret_std
        feats[f"vol_realized_{window}"] = ret_std * np.sqrt(window)
        feats[f"momentum_{window}"] = feats["mid_price"].pct_change(window)

    for lag in (1, 2, 3, 5, 10):
        feats[f"ret_lag_{lag}"] = feats["ret"].shift(lag)
        feats[f"spread_lag_{lag}"] = feats["bid_ask_spread"].shift(lag)

    # Volume aggregates across levels; a missing level column contributes 0.
    feats["total_bid_volume"] = sum(feats.get(f"bid_volume{i}", 0) for i in LEVELS)
    feats["total_ask_volume"] = sum(feats.get(f"ask_volume{i}", 0) for i in LEVELS)
    feats["total_volume"] = feats["total_bid_volume"] + feats["total_ask_volume"]
    feats["volume_ratio"] = feats["total_bid_volume"] / (feats["total_ask_volume"] + 1e-9)

    for window in (5, 10, 30):
        volume_window = feats["total_volume"].rolling(window)
        feats[f"vol_mean_{window}"] = volume_window.mean()
        feats[f"vol_std_{window}"] = volume_window.std()
        feats[f"vratio_mean_{window}"] = feats["volume_ratio"].rolling(window).mean()

    # Lag everything this function added; columns that came in with df are untouched.
    created = [col for col in feats.columns if col not in incoming_cols]
    feats[created] = feats[created].shift(1)
    return feats

def build_eth_features(df: pd.DataFrame) -> pd.DataFrame:
    """Run the ETH feature pipeline: order book features first, then price/volume features."""
    return fe_price_volume_eth(fe_orderbook_eth(df))

# Build the same feature set independently for each split; the raw frames
# eth_train / eth_test are copied inside the pipeline and stay unchanged.
print("Building ETH features...")
eth_train_feat = build_eth_features(eth_train)
eth_test_feat  = build_eth_features(eth_test)



# Cross-asset merge + features (use tiny aggregates)

Kaggle’s free tier provides 16 GB RAM, so we:

* Use categorical dtype for identifiers.

* Use float32 instead of float64 where possible.

* Process BTC/SOL data in chunks before merging with ETH.

* Drop unused columns immediately after feature creation.

* Apply garbage collection between major steps.


In [None]:
# --------------------------
# Cross-asset merge + features (use tiny aggregates)
# --------------------------
def add_cross_asset(base_feat: pd.DataFrame, name: str, other_df: pd.DataFrame) -> pd.DataFrame:
    """Left-merge a partner asset's aggregates onto the ETH features and derive cross-asset columns.

    ``base_feat`` must contain ``timestamp``, ``mid_price`` and ``ret``;
    ``other_df`` carries ``timestamp``, ``mid_price``, ``spread``,
    ``total_bid_vol``, ``total_ask_vol`` and ``price_pressure``. When
    ``other_df`` is None, ``base_feat`` is returned unchanged (same object).

    Added columns, all suffixed with ``name``: the partner mid price and its
    log, a lagged partner log return, the lagged ETH/partner mid ratio and its
    lagged percentage change, a 60-row rolling correlation of ETH and partner
    returns (lagged), and the partner spread / volumes / pressure lagged by one row.
    """
    if other_df is None:
        return base_feat

    mid_col = f"mid_{name}"
    log_col = f"log_mid_{name}"
    ret_col = f"ret_{name}"
    ratio_col = f"ratio_ETH_{name}"
    partner_cols = [f"spread_{name}", f"tbv_{name}", f"tav_{name}", f"pp_{name}"]

    partner = other_df.rename(columns={
        "mid_price": mid_col,
        "spread": f"spread_{name}",
        "total_bid_vol": f"tbv_{name}",
        "total_ask_vol": f"tav_{name}",
        "price_pressure": f"pp_{name}",
    })
    out = base_feat.merge(partner, on="timestamp", how="left")

    # Partner returns and ETH/partner price ratio, each lagged by one row.
    out[log_col] = np.log(out[mid_col].clip(lower=1e-9))
    out[ret_col] = out[log_col].diff().shift(1)
    out[ratio_col] = (out["mid_price"] / out[mid_col]).shift(1)
    out[f"ratio_chg_ETH_{name}"] = out[ratio_col].pct_change().shift(1)

    # Rolling correlation of the two return series over the trailing window, then lagged.
    window = 60
    out[f"corr_ETH_{name}_{window}"] = out["ret"].rolling(window).corr(out[ret_col]).shift(1)

    # Lag the merged partner aggregates by one row as well.
    out[partner_cols] = out[partner_cols].shift(1)
    return out

# Attach BTC then SOL columns to each split. add_cross_asset returns its input
# unchanged when the aggregate is None, so a missing asset is simply skipped.
# NOTE(review): this cell is not idempotent — it rebinds eth_*_feat and deletes
# the aggregates, so re-running it without re-running the cells above will fail.
print("Merging cross-asset features...")
eth_train_feat = add_cross_asset(eth_train_feat, "BTC", btc_train_small)
eth_train_feat = add_cross_asset(eth_train_feat, "SOL", sol_train_small)
eth_test_feat  = add_cross_asset(eth_test_feat,  "BTC", btc_test_small)
eth_test_feat  = add_cross_asset(eth_test_feat,  "SOL", sol_test_small)

# Free memory
# The aggregates are no longer referenced after the merges above.
del btc_train_small, sol_train_small, btc_test_small, sol_test_small
gc.collect()

# Target, feature selection, cleaning


In [None]:
# --------------------------
# Target, feature selection, cleaning
# --------------------------
# Re-attach the target by timestamp if the feature frame does not carry it.
if "label" not in eth_train_feat.columns:
    eth_train_feat = eth_train_feat.merge(eth_train[["timestamp","label"]], on="timestamp", how="left")

# Rows without a target cannot be used for training.
eth_train_feat = eth_train_feat.dropna(subset=["label"]).reset_index(drop=True)

# Every column except the timestamp and the target is a model feature.
drop_cols = {"timestamp","label"}
feature_cols = [col for col in eth_train_feat.columns if not (col in drop_cols)]

# Align test columns: same features in the same order; anything absent becomes NaN.
eth_test_feat = eth_test_feat.reindex(columns=["timestamp", *feature_cols], fill_value=np.nan)

# Downcast & clean (on copies, so the feature frames themselves are left as they are).
X = clean_financial_data(downcast_numeric(eth_train_feat[feature_cols].copy()))
y = eth_train_feat["label"].astype(np.float32).values

X_test = clean_financial_data(downcast_numeric(eth_test_feat[feature_cols].copy()))

print("Final matrices:", X.shape, X_test.shape)



# Time-based CV


In [None]:
# --------------------------
# Time-based CV
# --------------------------
# Expanding-window split: each fold trains on earlier rows and validates on the block that follows.
tscv = TimeSeriesSplit(n_splits=5)
oof = np.zeros(len(X), dtype=np.float32)
# TimeSeriesSplit never uses the earliest rows for validation, so those OOF slots
# keep their initial 0. Track which rows were actually predicted so the overall
# OOF correlation is not computed against placeholder zeros.
oof_filled = np.zeros(len(X), dtype=bool)
scores = []

print("Starting TimeSeriesSplit CV...")
for k, (trn_idx, val_idx) in enumerate(tscv.split(X), 1):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y[trn_idx], y[val_idx]

    model = xgb.XGBRegressor(**XGB_PARAMS)
    # eval_set is passed for evaluation only; XGB_PARAMS configures no early stopping.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    pred = model.predict(X_val).astype(np.float32)
    oof[val_idx] = pred
    oof_filled[val_idx] = True
    r = pearson_corr(y_val, pred)
    scores.append(r)
    print(f"Fold {k}: Pearson r = {r:.6f}")

print(f"CV mean: {np.mean(scores):.6f}  +/- {np.std(scores):.6f}")
# Score only the rows that received an out-of-fold prediction.
print(f"OOF r:  {pearson_corr(y[oof_filled], oof[oof_filled]):.6f}")



# Train on full data & predict test

**We use XGBoost for its:**

* Robustness to heterogeneous features.

* Built-in handling of missing values.

* High performance in tabular competitions.

In [None]:
# --------------------------
# Train on full data & predict test
# --------------------------
# Refit on every labelled row with the same fixed hyperparameters used in CV
# (fit() returns the estimator itself), then score the aligned test matrix.
final_model = xgb.XGBRegressor(**XGB_PARAMS).fit(X, y, verbose=False)
test_pred = final_model.predict(X_test).astype(np.float32)



# Submission


In [None]:
# --------------------------
# Submission
# --------------------------
# Try common id columns; else use index.
# With the column subset loaded for eth_test in this notebook, neither id column
# is present, so the fallback branch below is the one that runs.
sub_cols = [c for c in ["row_id", "id"] if c in eth_test.columns]
if len(sub_cols):
    id_col = sub_cols[0]
    # NOTE(review): the "+ 1" presumably converts 0-based ids to 1-based — confirm against the sample submission.
    submission = pd.DataFrame({id_col: eth_test[id_col] + 1, "labels": test_pred})
else:
    # The "timestamp" column holds 1-based row numbers in the (time-sorted) order of eth_test.
    submission = pd.DataFrame({"timestamp": np.arange(1, len(test_pred) + 1), "labels": test_pred})

out_path = "/kaggle/working/submission.csv"
submission.to_csv(out_path, index=False)
print(f"Saved submission to: {out_path}")

# Quick peek at OOF predictions to ensure sanity (optional).
# The earliest rows are never part of a validation fold and keep their initial 0
# in `oof`, so start the plot at the first row that was actually predicted.
first_val_idx = next(tscv.split(X))[1][0]
plt.figure(figsize=(10,3))
plt.plot(oof[first_val_idx:first_val_idx + 200])
plt.title("OOF predictions (first 200 validated rows)")
plt.tight_layout()
plt.show()


In [None]:
# Display the last 20 submission rows as a quick visual check of ids and predictions.
submission.tail(20)

In [None]:
# Spot-check one row by position. The index is clamped to the last row so the
# cell does not raise IndexError when the submission has fewer than 270548 rows.
print(submission.iloc[min(270547, len(submission) - 1)])