In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_5min.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_1sec.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/ETH_1min.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_1min.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/ETH_5min.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/ADA_5min.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/ETH_1sec.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/ADA_1sec.csv
/kaggle/input/high-frequency-crypto-limit-order-book-data/ADA_1min.csv


In [6]:
import pandas as pd
df = pd.read_csv('/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_1sec.csv')

In [7]:
df.columns

Index(['Unnamed: 0', 'system_time', 'midpoint', 'spread', 'buys', 'sells',
       'bids_distance_0', 'bids_distance_1', 'bids_distance_2',
       'bids_distance_3',
       ...
       'asks_market_notional_5', 'asks_market_notional_6',
       'asks_market_notional_7', 'asks_market_notional_8',
       'asks_market_notional_9', 'asks_market_notional_10',
       'asks_market_notional_11', 'asks_market_notional_12',
       'asks_market_notional_13', 'asks_market_notional_14'],
      dtype='object', length=156)

In [9]:
df.rename(columns={"Unnamed: 0": "index"}, inplace=True)


In [12]:
print(list(df.columns))

['index', 'system_time', 'midpoint', 'spread', 'buys', 'sells', 'bids_distance_0', 'bids_distance_1', 'bids_distance_2', 'bids_distance_3', 'bids_distance_4', 'bids_distance_5', 'bids_distance_6', 'bids_distance_7', 'bids_distance_8', 'bids_distance_9', 'bids_distance_10', 'bids_distance_11', 'bids_distance_12', 'bids_distance_13', 'bids_distance_14', 'bids_notional_0', 'bids_notional_1', 'bids_notional_2', 'bids_notional_3', 'bids_notional_4', 'bids_notional_5', 'bids_notional_6', 'bids_notional_7', 'bids_notional_8', 'bids_notional_9', 'bids_notional_10', 'bids_notional_11', 'bids_notional_12', 'bids_notional_13', 'bids_notional_14', 'bids_cancel_notional_0', 'bids_cancel_notional_1', 'bids_cancel_notional_2', 'bids_cancel_notional_3', 'bids_cancel_notional_4', 'bids_cancel_notional_5', 'bids_cancel_notional_6', 'bids_cancel_notional_7', 'bids_cancel_notional_8', 'bids_cancel_notional_9', 'bids_cancel_notional_10', 'bids_cancel_notional_11', 'bids_cancel_notional_12', 'bids_cancel_no

In [13]:
# Classification: Will price go up (1) or down (0) in next 10 seconds?
df['future_midpoint'] = df['midpoint'].shift(-10)
df['price_direction'] = (df['future_midpoint'] > df['midpoint']).astype(int)

In [21]:
df["future_midpoint"]

0          56035.995
1          56035.995
2          56035.995
3          56035.995
4          56035.995
             ...    
1030723          NaN
1030724          NaN
1030725          NaN
1030726          NaN
1030727          NaN
Name: future_midpoint, Length: 1030728, dtype: float64

In [30]:
# Fix the error in the pipeline - the issue is in the create_targets method

class LOBPipeline:
    """
    Complete Limit Order Book Machine Learning Pipeline - Fixed Version
    Supports both classification and regression with swappable algorithms
    """
    
    def __init__(self, task_type='classification', scaler_type='standard', 
                 prediction_horizon=10, feature_engineering=True):
        """
        Initialize the LOB Pipeline
        
        Parameters:
        - task_type: 'classification' or 'regression'
        - scaler_type: 'standard', 'robust', or None
        - prediction_horizon: seconds ahead to predict
        - feature_engineering: whether to create additional features
        """
        self.task_type = task_type
        self.scaler_type = scaler_type
        self.prediction_horizon = prediction_horizon
        self.feature_engineering = feature_engineering
        self.scaler = None
        self.model = None
        self.feature_columns = None
        
    def _create_engineered_features(self, df):
        """Create financial-specific features from LOB data"""
        df_eng = df.copy()
        
        # Order Imbalance Features
        bid_volume_total = df_eng[[c for c in df_eng.columns if 'bids_notional_' in c]].sum(axis=1)
        ask_volume_total = df_eng[[c for c in df_eng.columns if 'asks_notional_' in c]].sum(axis=1)
        df_eng['order_imbalance'] = (bid_volume_total - ask_volume_total) / (bid_volume_total + ask_volume_total + 1e-8)
        
        # Weighted Mid Price (using top 5 levels)
        bid_prices = []
        ask_prices = []
        bid_volumes = []
        ask_volumes = []
        
        for i in range(5):  # Top 5 levels
            # Calculate actual prices from midpoint and distance
            bid_price = df_eng['midpoint'] * (1 - df_eng[f'bids_distance_{i}'] / 100)
            ask_price = df_eng['midpoint'] * (1 + df_eng[f'asks_distance_{i}'] / 100)
            
            bid_prices.append(bid_price)
            ask_prices.append(ask_price)
            bid_volumes.append(df_eng[f'bids_notional_{i}'])
            ask_volumes.append(df_eng[f'asks_notional_{i}'])
        
        # Volume-weighted average prices
        total_bid_vol = sum(bid_volumes)
        total_ask_vol = sum(ask_volumes)
        
        vwap_bid = sum([p * v for p, v in zip(bid_prices, bid_volumes)]) / (total_bid_vol + 1e-8)
        vwap_ask = sum([p * v for p, v in zip(ask_prices, ask_volumes)]) / (total_ask_vol + 1e-8)
        
        df_eng['vwap_bid'] = vwap_bid
        df_eng['vwap_ask'] = vwap_ask
        df_eng['vwap_midpoint'] = (vwap_bid + vwap_ask) / 2
        
        # Price Impact Features
        df_eng['relative_spread'] = df_eng['spread'] / df_eng['midpoint']
        
        # Market/Limit Order Ratios
        bid_market_total = df_eng[[c for c in df_eng.columns if 'bids_market_notional_' in c]].sum(axis=1)
        bid_limit_total = df_eng[[c for c in df_eng.columns if 'bids_limit_notional_' in c]].sum(axis=1)
        ask_market_total = df_eng[[c for c in df_eng.columns if 'asks_market_notional_' in c]].sum(axis=1)
        ask_limit_total = df_eng[[c for c in df_eng.columns if 'asks_limit_notional_' in c]].sum(axis=1)
        
        df_eng['bid_market_ratio'] = bid_market_total / (bid_market_total + bid_limit_total + 1e-8)
        df_eng['ask_market_ratio'] = ask_market_total / (ask_market_total + ask_limit_total + 1e-8)
        
        # Cancel Order Pressure
        bid_cancel_total = df_eng[[c for c in df_eng.columns if 'bids_cancel_notional_' in c]].sum(axis=1)
        ask_cancel_total = df_eng[[c for c in df_eng.columns if 'asks_cancel_notional_' in c]].sum(axis=1)
        
        df_eng['cancel_pressure'] = (bid_cancel_total - ask_cancel_total) / (bid_cancel_total + ask_cancel_total + 1e-8)
        
        # Price momentum features
        df_eng['midpoint_return_1'] = df_eng['midpoint'].pct_change(1)
        df_eng['midpoint_return_5'] = df_eng['midpoint'].pct_change(5)
        df_eng['midpoint_volatility'] = df_eng['midpoint_return_1'].rolling(window=30).std()
        
        return df_eng
    
    def create_targets(self, df):
        """Create prediction targets based on task type - FIXED VERSION"""
        df_targets = df.copy()
        
        # Create future midpoint
        df_targets['future_midpoint'] = df_targets['midpoint'].shift(-self.prediction_horizon)
        
        if self.task_type == 'classification':
            # Binary classification: price goes up (1) or down/stays (0)
            df_targets['price_direction'] = (df_targets['future_midpoint'] > df_targets['midpoint']).astype(int)
            df_targets['target'] = df_targets['price_direction']
        else:
            # Regression: predict returns
            df_targets['future_return'] = (df_targets['future_midpoint'] - df_targets['midpoint']) / df_targets['midpoint']
            df_targets['target'] = df_targets['future_return']
        
        # Remove rows with NaN targets (end of dataset)
        df_targets = df_targets.dropna(subset=['target'])
        
        return df_targets
    
    def prepare_features(self, df):
        """Prepare feature matrix"""
        if self.feature_engineering:
            df = self._create_engineered_features(df)
        
        # Select feature columns (exclude non-feature columns)
        exclude_cols = ['index', 'system_time', 'future_midpoint', 'price_direction', 
                       'future_return', 'target']
        
        feature_cols = [col for col in df.columns if col not in exclude_cols]
        self.feature_columns = feature_cols
        
        X = df[feature_cols].fillna(0)  # Handle any remaining NaN values
        
        return X
    
    def fit(self, df, model):
        """Fit the pipeline with given model"""
        # Create targets
        df_with_targets = self.create_targets(df)
        
        # Prepare features
        X = self.prepare_features(df_with_targets)
        y = df_with_targets['target']
        
        # Scale features if requested
        if self.scaler_type == 'standard':
            self.scaler = StandardScaler()
            X_scaled = self.scaler.fit_transform(X)
        elif self.scaler_type == 'robust':
            self.scaler = RobustScaler()
            X_scaled = self.scaler.fit_transform(X)
        else:
            X_scaled = X.values
        
        # Fit model
        self.model = model
        self.model.fit(X_scaled, y)
        
        return self
    
    def predict(self, df):
        """Make predictions on new data"""
        # Create targets for consistency (but we won't use them for prediction)
        df_with_targets = self.create_targets(df)
        X = self.prepare_features(df_with_targets)
        
        if self.scaler is not None:
            X_scaled = self.scaler.transform(X)
        else:
            X_scaled = X.values
        
        if self.task_type == 'classification':
            predictions = self.model.predict(X_scaled)
            probabilities = self.model.predict_proba(X_scaled)[:, 1] if hasattr(self.model, 'predict_proba') else None
            return predictions, probabilities
        else:
            predictions = self.model.predict(X_scaled)
            return predictions
    
    def evaluate(self, df):
        """Evaluate model performance"""
        df_with_targets = self.create_targets(df)
        X = self.prepare_features(df_with_targets)
        y_true = df_with_targets['target']
        
        if self.scaler is not None:
            X_scaled = self.scaler.transform(X)
        else:
            X_scaled = X.values
        
        if self.task_type == 'classification':
            y_pred = self.model.predict(X_scaled)
            y_prob = self.model.predict_proba(X_scaled)[:, 1] if hasattr(self.model, 'predict_proba') else None
            
            metrics = {
                'accuracy': accuracy_score(y_true, y_pred),
                'f1_score': f1_score(y_true, y_pred),
                'precision': precision_score(y_true, y_pred),
                'recall': recall_score(y_true, y_pred),
            }
            
            if y_prob is not None:
                metrics['auc_roc'] = roc_auc_score(y_true, y_prob)
            
            # Calculate simple Sharpe ratio simulation
            # Get the corresponding returns for the prediction period
            df_returns = df_with_targets.copy()
            df_returns['actual_return'] = (df_returns['future_midpoint'] - df_returns['midpoint']) / df_returns['midpoint']
            
            # Simple strategy: go long when predicting up, short when predicting down  
            strategy_returns = np.where(y_pred == 1, 
                                      df_returns['actual_return'].fillna(0), 
                                      -df_returns['actual_return'].fillna(0))
            strategy_returns = strategy_returns[~np.isnan(strategy_returns)]
            
            if len(strategy_returns) > 0 and np.std(strategy_returns) > 0:
                metrics['sharpe_ratio'] = np.mean(strategy_returns) / np.std(strategy_returns) * np.sqrt(252 * 24 * 60)  # Annualized
            else:
                metrics['sharpe_ratio'] = 0
                
        else:
            y_pred = self.model.predict(X_scaled)
            
            metrics = {
                'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
                'mae': mean_absolute_error(y_true, y_pred),
                'r2': r2_score(y_true, y_pred)
            }
        
        return metrics

print("✅ Fixed LOB Pipeline created successfully!")

✅ Fixed LOB Pipeline created successfully!


In [26]:
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nBasic statistics:")
print(df.describe())

Dataset shape: (1030728, 158)

Column names:
['index', 'system_time', 'midpoint', 'spread', 'buys', 'sells', 'bids_distance_0', 'bids_distance_1', 'bids_distance_2', 'bids_distance_3', 'bids_distance_4', 'bids_distance_5', 'bids_distance_6', 'bids_distance_7', 'bids_distance_8', 'bids_distance_9', 'bids_distance_10', 'bids_distance_11', 'bids_distance_12', 'bids_distance_13', 'bids_distance_14', 'bids_notional_0', 'bids_notional_1', 'bids_notional_2', 'bids_notional_3', 'bids_notional_4', 'bids_notional_5', 'bids_notional_6', 'bids_notional_7', 'bids_notional_8', 'bids_notional_9', 'bids_notional_10', 'bids_notional_11', 'bids_notional_12', 'bids_notional_13', 'bids_notional_14', 'bids_cancel_notional_0', 'bids_cancel_notional_1', 'bids_cancel_notional_2', 'bids_cancel_notional_3', 'bids_cancel_notional_4', 'bids_cancel_notional_5', 'bids_cancel_notional_6', 'bids_cancel_notional_7', 'bids_cancel_notional_8', 'bids_cancel_notional_9', 'bids_cancel_notional_10', 'bids_cancel_notional_11

In [32]:
df_clean = df.dropna()

In [37]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

# Enable tqdm for pandas - this MUST come before using progress_apply
tqdm.pandas()
def calc_order_imbalance(row):
    bid_vol = sum(row[f'bids_notional_{i}'] for i in range(15))
    ask_vol = sum(row[f'asks_notional_{i}'] for i in range(15))
    return (bid_vol - ask_vol) / (bid_vol + ask_vol)

df['order_imbalance'] = df.progress_apply(calc_order_imbalance, axis=1)


  0%|          | 0/1030728 [00:00<?, ?it/s]

In [39]:
# 3. Temporal split (same as before)
split = int(len(df) * 0.8)
train, test = df.iloc[:split], df.iloc[split:]

# 4. Initialise pipeline
pipeline = LOBPipeline(
    task_type='classification',
    scaler_type='standard',
    prediction_horizon=10,
    feature_engineering=True,
)


In [40]:


# 5. Model with verbose iterations so tqdm can hook in
model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    verbose=0              # keep sklearn quiet; tqdm will handle output
)


In [41]:
# 6. Fit with progress bar on iterations
print("Training model ...")
with tqdm(total=model.n_estimators, desc="GB Boost rounds") as pbar:
    # monkey-patch the model’s internal _fit_stage to update pbar
    orig_fit_stage = model._fit_stage
    def _fit_stage_with_pbar(*args, **kwargs):
        pbar.update(1)
        return orig_fit_stage(*args, **kwargs)
    model._fit_stage = _fit_stage_with_pbar

    pipeline.fit(train, model)
    pbar.close()


Training model ...


GB Boost rounds:   0%|          | 0/300 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# 7. Evaluation with progress on batches
print("Evaluating ...")
batch_size = 50_000
y_true, y_pred = [], []
for start in tqdm(range(0, len(test), batch_size), desc="Scoring batches"):
    end = start + batch_size
    batch = test.iloc[start:end]
    preds = pipeline.predict(batch)
    y_pred.extend(preds)
    y_true.extend(batch['price_direction'].values)

metrics = pipeline.metric_dict(y_true, y_pred)
print("Test metrics →", metrics)

Gpu Usage

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm.auto import tqdm; tqdm.pandas()

In [43]:
PRED_HORIZON = 10
CSV_PATH = '/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_1sec.csv'

df = pd.read_csv(CSV_PATH, low_memory=False)
df['future_midpoint'] = df['midpoint'].shift(-PRED_HORIZON)
df['price_direction'] = (df['future_midpoint'] > df['midpoint']).astype(int)
df = df.dropna(subset=['price_direction'])

feature_cols = [
    c for c in df.columns
    if c not in ['price_direction', 'future_midpoint', 'midpoint', 'system_time', 'index']
       and df[c].dtype != 'object'
]

X = df[feature_cols]
y = df['price_direction']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [44]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test,  label=y_test)

In [45]:
params = {
    'objective'      : 'binary:logistic',
    'eval_metric'    : ['auc', 'logloss'],
    'tree_method'    : 'gpu_hist',      # key line: use GPU for histogram building
    'predictor'      : 'gpu_predictor', # GPU inference
    'gpu_id'         : 0,               # default GPU
    'learning_rate'  : 0.05,
    'max_depth'      : 6,
    'subsample'      : 0.8,
    'colsample_bytree': 0.8,
    'verbosity'      : 1
}


In [48]:
num_rounds = 1000
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, 'train'), (dtest, 'valid')],
    verbose_eval=25     # progress every 25 rounds
)


[0]	train-auc:0.69268	train-logloss:0.67247	valid-auc:0.63623	valid-logloss:0.68724
[25]	train-auc:0.70592	train-logloss:0.61944	valid-auc:0.65420	valid-logloss:0.65287
[50]	train-auc:0.71102	train-logloss:0.60900	valid-auc:0.65721	valid-logloss:0.64724
[75]	train-auc:0.71525	train-logloss:0.60465	valid-auc:0.65911	valid-logloss:0.64547
[100]	train-auc:0.71835	train-logloss:0.60189	valid-auc:0.65979	valid-logloss:0.64502
[125]	train-auc:0.72109	train-logloss:0.59964	valid-auc:0.66067	valid-logloss:0.64449
[150]	train-auc:0.72351	train-logloss:0.59775	valid-auc:0.66098	valid-logloss:0.64435
[175]	train-auc:0.72599	train-logloss:0.59597	valid-auc:0.66108	valid-logloss:0.64428
[200]	train-auc:0.72820	train-logloss:0.59436	valid-auc:0.66091	valid-logloss:0.64435
[225]	train-auc:0.73037	train-logloss:0.59278	valid-auc:0.66072	valid-logloss:0.64451
[250]	train-auc:0.73269	train-logloss:0.59117	valid-auc:0.66034	valid-logloss:0.64480
[275]	train-auc:0.73470	train-logloss:0.58975	valid-auc:0.6

In [49]:
pred_prob = model.predict(dtest)
pred_bin  = (pred_prob > 0.5).astype(int)

print("Accuracy :", accuracy_score(y_test, pred_bin))
print("F1-score :", f1_score(y_test, pred_bin))
print("AUC-ROC  :", roc_auc_score(y_test, pred_prob))


Accuracy : 0.6106739883383622
F1-score : 0.5386251537762856
AUC-ROC  : 0.6586155058661158


Improve Scores

In [50]:
# =============================================================================
# 1. SETUP & IMPORTS
# =============================================================================
import os
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
tqdm.pandas()  # enable DataFrame.progress_apply()

import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# =============================================================================
# 2. CONFIGURATION
# =============================================================================
CSV_PATH        = '/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_1sec.csv'
PRED_HORIZON    = 10                # seconds ahead to predict
TEST_SIZE_RATIO = 0.2               # 20% hold-out
BATCH_SIZE      = 50_000            # for batched inference
XGB_ROUNDS      = 300               # max boosting rounds
EARLY_STOPPING  = 30                # for validation
GPU_ID          = 0                 # GPU index

# =============================================================================
# 3. LOAD & TARGET CREATION
# =============================================================================
print("📥 Loading data …")
df = pd.read_csv(CSV_PATH, low_memory=False)

print("🎯 Creating future midpoint & direction …")
with tqdm(total=2, desc="Target shift") as bar:
    df['future_midpoint'] = df['midpoint'].shift(-PRED_HORIZON); bar.update(1)
    df['price_direction'] = (df['future_midpoint'] > df['midpoint']).astype(int); bar.update(1)
df.dropna(subset=['price_direction'], inplace=True)

# =============================================================================
# 4. FEATURE ENGINEERING
# =============================================================================
print("⚙️ Engineering features …")

def compute_order_imbalance(row):
    bid = sum(row[f'bids_notional_{i}'] for i in range(15))
    ask = sum(row[f'asks_notional_{i}'] for i in range(15))
    return 0 if (bid + ask)==0 else (bid - ask) / (bid + ask)

# row-wise imbalance
df['order_imbalance'] = df.progress_apply(compute_order_imbalance, axis=1)

# VWAP features
for side in ['bids', 'asks']:
    prices, vols = [], []
    for i in range(5):
        dist_col = f'{side[:-1]}s_distance_{i}'  # 'bids_distance_i'
        price = df['midpoint'] * (1 + (df[dist_col] / 100) * (1 if side=='asks' else -1))
        vol   = df[f'{side}_notional_{i}']
        prices.append(price); vols.append(vol)
    df[f'{side}_vwap'] = sum(p*v for p,v in zip(prices, vols)) / (sum(vols)+1e-8)
df['vwap_mid'] = (df['bids_vwap'] + df['asks_vwap']) / 2

# Spread ratio, market/limit ratios, cancel pressure
df['relative_spread'] = df['spread'] / df['midpoint']
df['bid_market_ratio'] = df[[c for c in df if 'bids_market_notional_' in c]].sum(axis=1) / \
                         (df[[c for c in df if 'bids_limit_notional_'  in c]].sum(axis=1)+1e-8)
df['ask_market_ratio'] = df[[c for c in df if 'asks_market_notional_' in c]].sum(axis=1) / \
                         (df[[c for c in df if 'asks_limit_notional_'  in c]].sum(axis=1)+1e-8)
df['cancel_pressure']  = (df[[c for c in df if 'bids_cancel_notional_' in c]].sum(axis=1) - \
                         df[[c for c in df if 'asks_cancel_notional_' in c]].sum(axis=1)) / \
                        (df[[c for c in df if 'bids_cancel_notional_' in c]].sum(axis=1) + \
                         df[[c for c in df if 'asks_cancel_notional_' in c]].sum(axis=1) + 1e-8)

# Momentum
df['mid_ret_1'] = df['midpoint'].pct_change(1).fillna(0)
df['mid_ret_5'] = df['midpoint'].pct_change(5).fillna(0)
df['mid_vol30'] = df['mid_ret_1'].rolling(30).std().fillna(0)

# =============================================================================
# 5. TRAIN/TEST SPLIT
# =============================================================================
split_idx = int(len(df) * (1 - TEST_SIZE_RATIO))
train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]
print(f"🚂 Train rows: {len(train_df):,} | 🧪 Test rows: {len(test_df):,}")

# =============================================================================
# 6. MATRIX PREPARATION
# =============================================================================
exclude = {'index','system_time','future_midpoint','price_direction'}
features = [c for c in df.columns if c not in exclude and df[c].dtype!='object']
dtrain = xgb.DMatrix(train_df[features], label=train_df['price_direction'])
dtest  = xgb.DMatrix(test_df[features],  label=test_df['price_direction'])

# =============================================================================
# 7. XGBOOST GPU TRAINING
# =============================================================================
print("🏋️ Training XGBoost on GPU …")
params = {
    'objective'      :'binary:logistic',
    'eval_metric'    :'auc',
    'tree_method'    :'gpu_hist',
    'predictor'      :'gpu_predictor',
    'gpu_id'         : GPU_ID,
    'learning_rate'  : 0.05,
    'max_depth'      : 6,
    'subsample'      : 0.8,
    'colsample_bytree':0.8,
}
model = xgb.train(
    params,
    dtrain,
    num_boost_round= XGB_ROUNDS,
    evals=[(dtrain,'train'),(dtest,'valid')],
    early_stopping_rounds=EARLY_STOPPING,
    verbose_eval=25
)

# =============================================================================
# 8. BATCHED INFERENCE & METRICS
# =============================================================================
print("🔍 Scoring test set …")
pred_prob, pred_bin = [], []
for start in tqdm(range(0, len(test_df), BATCH_SIZE), desc="Batches"):
    end   = min(start + BATCH_SIZE, len(test_df))
    batch = xgb.DMatrix(test_df[features].iloc[start:end])
    p     = model.predict(batch)
    pred_prob.extend(p)
    pred_bin.extend((p > 0.5).astype(int))

acc = accuracy_score(test_df['price_direction'], pred_bin)
f1  = f1_score(      test_df['price_direction'], pred_bin)
auc = roc_auc_score(test_df['price_direction'], pred_prob)

# =============================================================================
# 9. RESULTS
# =============================================================================
print("\n=== FINAL METRICS ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AUC-ROC  : {auc:.4f}")

📥 Loading data …
🎯 Creating future midpoint & direction …


Target shift:   0%|          | 0/2 [00:00<?, ?it/s]

⚙️ Engineering features …


  0%|          | 0/1030728 [00:00<?, ?it/s]

🚂 Train rows: 824,582 | 🧪 Test rows: 206,146
🏋️ Training XGBoost on GPU …
[0]	train-auc:0.69289	valid-auc:0.64026
[25]	train-auc:0.70994	valid-auc:0.65991
[50]	train-auc:0.71470	valid-auc:0.66218
[75]	train-auc:0.71850	valid-auc:0.66340
[100]	train-auc:0.72170	valid-auc:0.66394
[125]	train-auc:0.72460	valid-auc:0.66377
[150]	train-auc:0.72719	valid-auc:0.66378
[154]	train-auc:0.72758	valid-auc:0.66407
🔍 Scoring test set …


Batches:   0%|          | 0/5 [00:00<?, ?it/s]


=== FINAL METRICS ===
Accuracy : 0.6124
F1-score : 0.5851
AUC-ROC  : 0.6641


In [None]:
# =============================================================================
# End-to-End DeepLOB-Style LOB Direction Model with tqdm
# =============================================================================

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Enable tqdm for pandas
tqdm.pandas()

# --------------------
# 1. CONFIGURATION
# --------------------
CSV_PATH     = '/kaggle/input/high-frequency-crypto-limit-order-book-data/BTC_1sec.csv'
PRED_HORIZON = 10            # seconds ahead to predict
TEST_RATIO   = 0.2           # 20% hold-out
BATCH_SIZE   = 256           # for training/inference
EPOCHS       = 100
PATIENCE     = 10
LR_FACTOR    = 0.5

# --------------------
# 2. LOAD & PREPROCESS
# --------------------
print("📥 Loading CSV")
df = pd.read_csv(CSV_PATH, low_memory=False)

from tqdm.auto import tqdm

print("🎯 Creating targets")
pbar = tqdm(total=2, desc="Target shift")
df['future_midpoint']  = df['midpoint'].shift(-PRED_HORIZON)
pbar.update(1)
df['price_direction']  = (df['future_midpoint'] > df['midpoint']).astype(int)
pbar.update(1)
pbar.close()
df.dropna(subset=['price_direction'], inplace=True)

# --------------------
# 3. FEATURE ENGINEERING
# --------------------
print("⚙️  Feature engineering")

def order_imbalance(row):
    bid = sum(row[f'bids_notional_{i}'] for i in range(15))
    ask = sum(row[f'asks_notional_{i}'] for i in range(15))
    return 0 if (bid+ask)==0 else (bid-ask)/(bid+ask)

df['imbalance'] = df.progress_apply(order_imbalance, axis=1)

# Build tensor input: sliding windows of length PRED_HORIZON+1
LEVELS   = 10  # top N levels
FEATURES = 4   # distance, notional, imbalance, spread
WINDOW   = PRED_HORIZON + 1

# pre-allocate
X, y = [], []
cols_dist   = [f'bids_distance_{i}' for i in range(LEVELS)] + [f'asks_distance_{i}' for i in range(LEVELS)]
cols_notional = [f'bids_notional_{i}' for i in range(LEVELS)] + [f'asks_notional_{i}' for i in range(LEVELS)]

print("🔄 Building time-series windows")
for i in tqdm(range(len(df) - WINDOW)):
    window = df.iloc[i:i+WINDOW]
    # tensor shape: (WINDOW, LEVELS*2 features)
    dist    = window[cols_dist].values
    notional= window[cols_notional].values
    imb     = window['imbalance'].values.reshape(-1,1)
    spr     = window['spread'].values.reshape(-1,1)
    # stack features: [dist, notional, imb, spr]
    frame = np.concatenate([dist, notional, imb, spr], axis=1)
    X.append(frame)
    y.append(window['price_direction'].iloc[-1])

X = np.array(X, dtype=np.float32)  # shape=(samples, WINDOW, FEATURES_TOTAL)
y = np.array(y, dtype=np.int8)

# --------------------
# 4. TRAIN/VAL SPLIT
# --------------------
split = int(len(X) * (1 - TEST_RATIO))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]
print(f"🚂 Train: {len(X_train)} samples | 🧪 Val: {len(X_val)} samples")

# --------------------
# 5. MODEL DEFINITION
# --------------------
def build_deeplob(input_shape):
    inp = layers.Input(shape=input_shape)
    # Convolutional block
    x = layers.Conv1D(64, kernel_size=3, activation='relu', padding='same')(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Conv1D(32, kernel_size=5, activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    # LSTM block
    x = layers.LSTM(128, return_sequences=True, dropout=0.2)(x)
    x = layers.LSTM(64, dropout=0.2)(x)
    # Dense head
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    return models.Model(inp, out)

feat_dim = (LEVELS*2) + (LEVELS*2) + 2  # 42
model = build_deeplob(input_shape=(WINDOW, feat_dim))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(name='auc')])

model.summary()

# --------------------
# 6. TRAINING
# --------------------
es = callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=PATIENCE, restore_best_weights=True)
rlr= callbacks.ReduceLROnPlateau(monitor='val_loss', factor=LR_FACTOR, patience=5, verbose=1)

print("🏋️ Training DeepLOB model")
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[es, rlr],
    verbose=1
)

# --------------------
# 7. EVALUATION
# --------------------
print("🔍 Evaluating on validation set")
y_pred_prob = model.predict(X_val, batch_size=BATCH_SIZE, verbose=1).flatten()
y_pred_bin  = (y_pred_prob > 0.5).astype(int)

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
acc = accuracy_score(y_val, y_pred_bin)
f1  = f1_score(y_val, y_pred_bin)
auc = roc_auc_score(y_val, y_pred_prob)

print(f"\n=== FINAL VALIDATION METRICS ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1-Score : {f1:.4f}")
print(f"AUC-ROC  : {auc:.4f}")


📥 Loading CSV
🎯 Creating targets


Target shift:   0%|          | 0/2 [00:00<?, ?it/s]

⚙️  Feature engineering


  0%|          | 0/1030728 [00:00<?, ?it/s]

🔄 Building time-series windows


  0%|          | 0/1030717 [00:00<?, ?it/s]

🚂 Train: 824573 samples | 🧪 Val: 206144 samples


🏋️ Training DeepLOB model
Epoch 1/100


I0000 00:00:1753270354.808329     175 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 11ms/step - auc: 0.6691 - loss: 0.6325 - val_auc: 0.6406 - val_loss: 0.6544 - learning_rate: 0.0010
Epoch 2/100
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - auc: 0.6895 - loss: 0.6211 - val_auc: 0.6380 - val_loss: 0.6562 - learning_rate: 0.0010
Epoch 3/100
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 10ms/step - auc: 0.6957 - loss: 0.6169 - val_auc: 0.6381 - val_loss: 0.6561 - learning_rate: 0.0010
Epoch 4/100
[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - auc: 0.7017 - loss: 0.6131 - val_auc: 0.6309 - val_loss: 0.6603 - learning_rate: 0.0010
Epoch 5/100
[1m3217/3221[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - auc: 0.7082 - loss: 0.6085