# Optiver Trading at the Close - LightGBM Baseline

基于特征工程的 LightGBM 方案

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
# Install a blanket "ignore" filter so no Python warnings are shown in the
# notebook output from this point on.
warnings.filterwarnings('ignore')

print('Libraries loaded!')

## 1. 特征工程函数

In [None]:
def create_features(df):
    """Return a copy of ``df`` extended with engineered order-book features.

    The input frame is never modified. The columns ``ask_price``,
    ``bid_price``, ``wap``, ``bid_size``, ``ask_size``, ``matched_size``,
    ``imbalance_size``, ``imbalance_buy_sell_flag``, ``reference_price`` and
    ``seconds_in_bucket`` are required; ``far_price`` and ``near_price`` are
    only used (for their ``*_diff_mid`` columns) when they are present.
    """
    out = df.copy()

    # Short aliases for the input columns that are read repeatedly.
    bid_px, ask_px = out["bid_price"], out["ask_price"]
    bid_sz, ask_sz = out["bid_size"], out["ask_size"]
    wap = out["wap"]
    ref_px = out["reference_price"]
    imb_sz = out["imbalance_size"]
    imb_flag = out["imbalance_buy_sell_flag"]
    elapsed = out["seconds_in_bucket"]

    # Basic book features. Size denominators get +1 and price denominators
    # get +1e-8 so that an empty book / zero price does not divide by zero.
    spread = ask_px - bid_px
    mid = (ask_px + bid_px) / 2
    book_tilt = (bid_sz - ask_sz) / (bid_sz + ask_sz + 1)
    out["spread"] = spread
    out["spread_pct"] = spread / (wap + 1e-8)
    out["mid_price"] = mid
    out["liquidity_imbalance"] = book_tilt

    # Market urgency (flagged by the original author as the strongest feature).
    urgency = spread * book_tilt
    size_weighted_px = (bid_px * bid_sz + ask_px * ask_sz) / (bid_sz + ask_sz + 1)
    out["market_urgency"] = urgency
    out["market_urgency_v2"] = mid - size_weighted_px

    # Imbalance features.
    out["price_imbalance"] = spread / (ask_px + bid_px + 1e-8)
    out["size_imbalance"] = (ask_sz - bid_sz) / (ask_sz + bid_sz + 1)
    out["matched_ratio"] = out["matched_size"] / (imb_sz + out["matched_size"] + 1)
    out["imbalance_intensity"] = imb_sz * imb_flag

    # WAP relative to the reference price.
    wap_gap = wap - ref_px
    out["wap_ref_diff"] = wap_gap
    out["wap_ref_pct"] = wap_gap / (ref_px + 1e-8)

    # Distance of each available price column from the mid price.
    price_cols = ("reference_price", "far_price", "near_price", "bid_price", "ask_price", "wap")
    for name in (c for c in price_cols if c in out.columns):
        out[f"{name}_diff_mid"] = out[name] - mid

    # Time features: 60-second bucket index and a flag for seconds >= 540.
    out["seconds_bucket"] = elapsed // 60
    out["is_last_minute"] = (elapsed >= 540).astype(int)

    # Interaction features.
    out["spread_x_imbalance"] = spread * imb_sz
    out["urgency_x_flag"] = urgency * imb_flag

    return out

print('Feature engineering function defined!')

## 2. 加载数据并训练

In [None]:
# Read the training set from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
print(f'Train shape: {train.shape}')

# Append the engineered columns.
train = create_features(train)

# Model inputs, grouped by how they are produced. The flattened order below
# is the column order handed to the model.
feature_groups = {
    'raw': [
        'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price',
        'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size',
        'ask_price', 'ask_size', 'wap',
    ],
    'basic': ['spread', 'spread_pct', 'mid_price', 'liquidity_imbalance'],
    'urgency': ['market_urgency', 'market_urgency_v2'],
    'imbalance': [
        'price_imbalance', 'size_imbalance', 'matched_ratio', 'imbalance_intensity',
    ],
    'wap_vs_reference': ['wap_ref_diff', 'wap_ref_pct'],
    'diff_from_mid': [
        'reference_price_diff_mid', 'far_price_diff_mid', 'near_price_diff_mid',
        'bid_price_diff_mid', 'ask_price_diff_mid', 'wap_diff_mid',
    ],
    'time': ['seconds_bucket', 'is_last_minute'],
    'interaction': ['spread_x_imbalance', 'urgency_x_flag'],
}
feature_cols = [name for group in feature_groups.values() for name in group]

print(f'Number of features: {len(feature_cols)}')

In [None]:
# Clean the feature matrix: missing values and +/-infinity are both set to 0.
train[feature_cols] = train[feature_cols].fillna(0)
train[feature_cols] = train[feature_cols].replace([np.inf, -np.inf], 0)

# Rows with a missing target carry no supervision signal. Keep them out of
# both splits so NaN labels never reach training or the validation metric.
# When every row has a target this mask is all True and changes nothing.
has_target = train['target'].notna()

# Time-ordered split: the first ~80% of the date_id range is used for
# training and the remaining later days for validation (no shuffling).
split_day = int(train['date_id'].max() * 0.8)
train_mask = (train['date_id'] <= split_day) & has_target
valid_mask = (train['date_id'] > split_day) & has_target

X_train = train.loc[train_mask, feature_cols]
y_train = train.loc[train_mask, 'target']
X_valid = train.loc[valid_mask, feature_cols]
y_valid = train.loc[valid_mask, 'target']

print(f'Train: {len(X_train):,}, Valid: {len(X_valid):,}')

In [None]:
# LightGBM hyper-parameters: gradient-boosted trees fitted and evaluated
# with the 'mae' objective/metric.
params = dict(
    objective='mae',
    metric='mae',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=128,
    max_depth=8,
    min_child_samples=100,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=1.0,
    verbosity=-1,
    n_jobs=-1,
    seed=42,
)

# Wrap both splits as LightGBM datasets; the validation set is built with
# the training set as its reference.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Early stopping with stopping_rounds=50, metric logging every 100 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

# Train for at most 1000 boosting rounds, reporting on both splits.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

# Score the hold-out days with mean absolute error.
y_pred = model.predict(X_valid)
abs_errors = np.abs(y_valid - y_pred)
mae = np.mean(abs_errors)
print(f'\nValidation MAE: {mae:.6f}')

## 3. 提交推理

In [None]:
import optiver2023

# Create the competition environment and obtain its test-batch iterator.
env = optiver2023.make_env()
iter_test = env.iter_test()

# `counter` ends up holding the number of batches served; it stays 0 when
# the iterator yields nothing.
counter = 0
for counter, (test, revealed_targets, sample_prediction) in enumerate(iter_test, start=1):
    # Build the same engineered columns that were used for training.
    test = create_features(test)

    # Same clean-up as for the training matrix: NaN and +/-inf become 0.
    batch_features = test[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)

    # Predict the batch and write the result into the submission frame.
    predictions = model.predict(batch_features)
    sample_prediction['target'] = predictions
    env.predict(sample_prediction)

    # Progress message every 100 batches.
    if counter % 100 == 0:
        print(f'Processed {counter} batches')

print(f'Done! Total batches: {counter}')