## Import Libraries, Cross Validation Function, Load Data

In [1]:
import os
import random
import zipfile
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
from sklearn.utils import indexable
from sklearn.utils.validation import _deprecate_positional_args

def _num_samples(X):
    try:
        return X.shape[0]
    except AttributeError:
        return len(X)

# Ensures that validation data comes after training data while handling gaps.
class PurgedGroupTimeSeriesSplit(object):
    @_deprecate_positional_args
    def __init__(self, n_splits=5, *, 
                 max_train_group_size=np.inf,
                 max_val_group_size=np.inf,
                 val_group_gap=10,
                 verbose=False):
        self.n_splits = n_splits
        self.max_train_group_size = max_train_group_size
        self.max_val_group_size = max_val_group_size
        self.val_group_gap = val_group_gap
        self.verbose = verbose
        
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)
        
        n_splits = self.n_splits
        group_gap = self.val_group_gap
        max_val_group_size = self.max_val_group_size
        n_folds = n_splits + 1
        
        # Get unique groups in order of appearance
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_groups = _num_samples(unique_groups)
        
        # Determine validation group size & starting indices
        group_val_size = min(n_groups // n_folds, max_val_group_size)
        group_val_starts = range(n_groups - n_splits * group_val_size, n_groups, group_val_size)
        
        # Map each group to its sample indices
        group_dict = {}
        n_samples = _num_samples(X)
        for idx in range(n_samples):
            group = groups[idx]
            group_dict.setdefault(group, []).append(idx)
        
        # Generate train-validation splits
        for group_val_start in group_val_starts:
            train_indices = []
            val_indices = []
            
            # Select training data from earlier groups (leaving a gap)
            for g in unique_groups[:max(0, group_val_start - group_gap)]:
                train_indices.extend(group_dict[g])
            
            # Select validation data from subsequent groups
            for g in unique_groups[group_val_start:group_val_start + group_val_size]:
                val_indices.extend(group_dict[g])
            
            yield train_indices, val_indices

# Data Loading
def load_df_from_zip(zip_filename, csv_filename):
    """Load a CSV file from within a ZIP archive."""
    with zipfile.ZipFile(zip_filename, 'r') as zf:
        with zf.open(csv_filename) as csvfile:
            return pd.read_csv(csvfile)

# Load data from ZIP files
df_part_1 = load_df_from_zip("data_features_part_1.zip", "data_features_part_1.csv")
# df_part_2 = load_df_from_zip("data_features_part_2.zip", "data_features_part_2.csv")
# df_part_3 = load_df_from_zip("data_features_part_3.zip", "data_features_part_3.csv")
# df_part_4 = load_df_from_zip("data_features_part_4.zip", "data_features_part_4.csv")

# Concatenate and sort
df_ = pd.concat([df_part_1], ignore_index=True)
df_ = df_.sort_values(by='time_id').reset_index(drop=True)
df_train = df_.copy()

print(df_.head())
print("Shape:", df_.shape)
print("Target preview:")
print(df_['target'].head())

# Prepare features and target
# df_features = df_.drop(columns=['row_id', 'target'])
df_features = df_[['vwap', 'micro_price', 'seconds_in_bucket', 'wap', 'wap_kurtosis', 'market_urgency_v2', 'market_urgency', 'wap_skewness', 'mid_price', 'date_id', 'ask_price', 'bid_price', 'time_id', 'wap_reference_price_imb', 'wap_near_price_imb', 'seconds_in_bucket_group', 'matched_size_group_expanding_mean100', 'wap_group_first_ratio']]
df_train = df_train.drop(columns=['row_id'])

   stock_id  date_id  seconds_in_bucket  imbalance_size  \
0         0        0                  0      3180602.69   
1       126        0                  0      2983745.15   
2       127        0                  0       623714.79   
3       128        0                  0      1894634.36   
4       129        0                  0            0.00   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                        1         0.999812   13380276.64   1.001713   
1                        1         0.999567   20568138.41   1.001713   
2                        1         0.999563    1666465.54   1.001713   
3                        1         0.999649    3370966.09   1.001713   
4                        0         0.999342     123264.19   1.001713   

   near_price  bid_price  ...  \
0     0.99966   0.999812  ...   
1     0.99966   0.999954  ...   
2     0.99966   0.999981  ...   
3     0.99966   0.999494  ...   
4     0.99966   0.999342  ...   

   reference_

## Setup Data, Models

In [2]:
# Set seeds for reproducibility.
random.seed(42)
np.random.seed(42)

# Create directory for saving models.
os.makedirs('cbt_models', exist_ok=True)

# Prepare feature matrix X & target vector Y
X = df_features.values
Y = df_train['target'].values

# Keep only samples with finite target values
finite_mask = np.isfinite(Y)
X = X[finite_mask]
Y = Y[finite_mask]

# Define groups for time-series CV using the sorted 'time_id' column
groups = df_['time_id'].values[finite_mask]

# Create instance of custom time-series splitter
tscv = PurgedGroupTimeSeriesSplit(n_splits=5, val_group_gap=10)

# Dictionaries to record fold scores & track best scores
fold_scores = { 'cbt': [] }
best_scores = { 'cbt': np.inf }

# Define model dictionary.
model_dict = {
    'cbt': cbt.CatBoostRegressor(objective='MAE', iterations=3000, verbose=0, random_seed=42),
}

## Training Loop

In [3]:
# Loop over folds generated by time-based splitter
for fold, (train_indices, val_indices) in enumerate(tscv.split(X, Y, groups=groups)):
    print(f"Processing fold {fold}")
    
    # Split into training and validation sets
    X_train, Y_train = X[train_indices], Y[train_indices]
    X_val, Y_val = X[val_indices], Y[val_indices]
    model = model_dict['cbt']
    
    # Train the model with early stopping
    model.fit(
        X_train, Y_train, 
        eval_set=(X_val, Y_val), 
        early_stopping_rounds=100,
        verbose=True
    )
    
    # Evaluate model on validation set using MAE
    preds = model.predict(X_val)
    score = np.mean(np.abs(preds - Y_val))
    fold_scores['cbt'].append(score)
    print(f"Fold {fold} score (MAE): {score}")
    
    # Save the model for the current fold.
    joblib.dump(model, f'cbt_models/cbt_fold{fold}.model')
    
    # Save as best model if new best
    if score < best_scores['cbt']:
        best_scores['cbt'] = score
        joblib.dump(model, 'cbt_models/cbt_best.model')

# Load best model and report average validation score.
best_model = joblib.load('cbt_models/cbt_best.model')
avg_score = np.mean(fold_scores['cbt'])
print(f"Average validation score for CatBoost: {avg_score}")


Processing fold 0
0:	learn: 5.2415853	test: 5.3377213	best: 5.3377213 (0)	total: 77.6ms	remaining: 3m 52s
1:	learn: 5.2294815	test: 5.3266563	best: 5.3266563 (1)	total: 92.1ms	remaining: 2m 18s
2:	learn: 5.2172934	test: 5.3152353	best: 5.3152353 (2)	total: 106ms	remaining: 1m 45s
3:	learn: 5.2061916	test: 5.3050675	best: 5.3050675 (3)	total: 119ms	remaining: 1m 29s
4:	learn: 5.1944202	test: 5.2944508	best: 5.2944508 (4)	total: 133ms	remaining: 1m 19s
5:	learn: 5.1842984	test: 5.2851165	best: 5.2851165 (5)	total: 146ms	remaining: 1m 13s
6:	learn: 5.1741347	test: 5.2757876	best: 5.2757876 (6)	total: 160ms	remaining: 1m 8s
7:	learn: 5.1639309	test: 5.2666383	best: 5.2666383 (7)	total: 173ms	remaining: 1m 4s
8:	learn: 5.1542304	test: 5.2578747	best: 5.2578747 (8)	total: 186ms	remaining: 1m 1s
9:	learn: 5.1441626	test: 5.2489264	best: 5.2489264 (9)	total: 200ms	remaining: 59.8s
10:	learn: 5.1353387	test: 5.2406923	best: 5.2406923 (10)	total: 213ms	remaining: 58s
11:	learn: 5.1260609	test: 5

## Feature Importance

In [4]:
feature_names = df_features.columns
importances = model_dict['cbt'].get_feature_importance()

# Create DataFrame
df_importances = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

# Sort most important first
df_importances.sort_values(by='importance', ascending=False, inplace=True)
print(df_importances)

                                 feature  importance
0                                   vwap   33.679160
2                      seconds_in_bucket   15.107620
1                            micro_price   10.752042
3                                    wap   10.696879
4                           wap_kurtosis    7.321572
6                         market_urgency    2.979449
5                      market_urgency_v2    2.576827
7                           wap_skewness    2.572450
9                                date_id    2.317449
12                               time_id    1.987843
8                              mid_price    1.874867
11                             bid_price    1.845523
10                             ask_price    1.511020
14                    wap_near_price_imb    1.245032
15               seconds_in_bucket_group    1.062274
16  matched_size_group_expanding_mean100    0.961635
13               wap_reference_price_imb    0.798465
17                 wap_group_first_ratio    0.

In [5]:
# Cumulative importance
df_importances['cumulative'] = df_importances['importance'].cumsum()
total_importance = df_importances['importance'].sum()
df_importances['cumulative_pct'] = df_importances['cumulative'] / total_importance

# Select features until 95% of total importance is accounted for
selected_features = df_importances[df_importances['cumulative_pct'] <= 0.95]['feature'].tolist()
print("Features covering 95% of total importance:", selected_features)

Features covering 95% of total importance: ['vwap', 'seconds_in_bucket', 'micro_price', 'wap', 'wap_kurtosis', 'market_urgency', 'market_urgency_v2', 'wap_skewness', 'date_id', 'time_id', 'mid_price', 'bid_price']


When running with all features, these cover 95% of importance:

'vwap', 'micro_price', 'seconds_in_bucket', 'wap', 'wap_kurtosis', 'market_urgency_v2', 'market_urgency', 'wap_skewness', 'mid_price', 'date_id', 'ask_price', 'bid_price', 'time_id', 'wap_reference_price_imb', 'wap_near_price_imb', 'seconds_in_bucket_group', 'matched_size_group_expanding_mean100', 'wap_group_first_ratio'