In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import pyarrow as pa
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed.
pd.options.display.max_columns = None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CONFIG:
    """Static settings: seeds, target column and the feature-column lists."""

    # One seed per fold.
    seeds = [2025, 42, 44, 100]
    target_col = "responder_6"

    # Raw categorical columns and the one-hot columns derived from them.
    categorical_cols = ["feature_09", "feature_10", "feature_11"]
    cat_features = (
        [f"feature_09_cat_{idx}" for idx in range(11)]
        + [f"feature_10_cat_{idx}" for idx in range(9)]
        + [f"feature_11_cat_{idx}" for idx in range(2)]
    )

    corr_cols = [f"feature_{i:02d}" for i in range(21, 32)]

    # symbol id, every raw feature except 09/10/11/61, all nine lagged
    # responders, the cyclic time encodings and the one-hot columns.
    feature_cols = (
        ["symbol_id"]
        + [f"feature_{idx:02d}" for idx in range(79) if idx not in (9, 10, 11, 61)]
        + [f"responder_{idx}_lag_1" for idx in range(9)]
        + ["sin_time_id", "cos_time_id", "sin_time_id_half_day", "cos_time_id_half_day"]
        + cat_features
    )

# Indices of the raw features selected with CatBoost.
selected = [6, 7, 0, 2, 60, 24, 4, 59, 5, 8, 1, 36, 38, 58, 22, 30, 15, 23, 20, 56, 29, 26, 25, 47, 48, 31, 27, 68]
feature_names = ["feature_%02d" % i for i in selected]

# Lag-1 responder columns used as features (responders 3, 6, 7 and 8).
lag_cols = ["responder_%d_lag_1" % idx for idx in (3, 6, 7, 8)]

# Cyclic encodings of time_id.
time_cols = ['sin_time_id', 'cos_time_id', 'sin_time_id_half_day', 'cos_time_id_half_day']

# One-hot columns: 11 for feature_09, 9 for feature_10, 2 for feature_11.
cat_features = [
    f"feature_{source}_cat_{level}"
    for source, n_levels in (("09", 11), ("10", 9), ("11", 2))
    for level in range(n_levels)
]

label_name = 'responder_6'
weight_name = 'weight'

## Load data

In [3]:
# Fold handled by this notebook. It selects the input parquet files here and is
# read again by later cells (per-fold seed lookup, output file name).
# NOTE(review): no visible cell defined `fold_num`; 2 matches the paths that
# were hard-coded here — confirm before running another fold.
fold_num = 2
DATA_DIR = "/root/autodl-tmp/jane-street-2024/train-validate-set"

train = pl.scan_parquet(f"{DATA_DIR}/train_fold_{fold_num}.parquet").collect().to_pandas()
valid = pl.scan_parquet(f"{DATA_DIR}/valid_fold_{fold_num}.parquet").collect().to_pandas()
train.shape, valid.shape

((30370711, 126), (3593216, 126))

In [4]:
# Keep only the identifier, model-input, target and weight columns.
kept_columns = [
    "date_id",
    "symbol_id",
    *feature_names,
    *lag_cols,
    *time_cols,
    *cat_features,
    label_name,
    weight_name,
]
train = train[kept_columns]
valid = valid[kept_columns]

In [5]:
# Store the one-hot indicator columns as integers in both splits.
for split_frame in (train, valid):
    split_frame[cat_features] = split_frame[cat_features].astype(int)

In [6]:
# Preview the first rows of the column-reduced training frame.
train.head()

Unnamed: 0,date_id,symbol_id,feature_06,feature_07,feature_00,feature_02,feature_60,feature_24,feature_04,feature_59,feature_05,feature_08,feature_01,feature_36,feature_38,feature_58,feature_22,feature_30,feature_15,feature_23,feature_20,feature_56,feature_29,feature_26,feature_25,feature_47,feature_48,feature_31,feature_27,feature_68,responder_3_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1,sin_time_id,cos_time_id,sin_time_id_half_day,cos_time_id_half_day,feature_09_cat_0,feature_09_cat_1,feature_09_cat_2,feature_09_cat_3,feature_09_cat_4,feature_09_cat_5,feature_09_cat_6,feature_09_cat_7,feature_09_cat_8,feature_09_cat_9,feature_09_cat_10,feature_10_cat_0,feature_10_cat_1,feature_10_cat_2,feature_10_cat_3,feature_10_cat_4,feature_10_cat_5,feature_10_cat_6,feature_10_cat_7,feature_10_cat_8,feature_11_cat_0,feature_11_cat_1,responder_6,weight
0,399,0,-2.237359,1.097173,-0.453808,-1.123646,3.781964,2.047261,-3.384918,7.715572,4.388564,-1.578253,3.21646,-3.295923,-1.242731,,-0.144492,-0.475814,,-0.255842,1.325152,-0.066772,-0.192106,,1.840004,2.502701,3.112202,,,0.125012,0.155073,0.086133,0.08275,0.177626,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1.352247,1.752883
1,399,1,-2.782829,1.819493,-0.438661,-0.700854,3.702588,1.125364,-3.447633,6.492496,2.559703,-1.229943,3.26947,-1.191989,-1.502919,,1.123336,-0.407621,,1.34741,1.209733,-0.462388,-0.14176,,1.081099,6.204994,2.259765,,,0.461226,-1.000574,0.30136,0.19551,0.857975,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-1.987774,2.768091
2,399,2,-4.270811,1.777802,-0.396425,-0.506801,5.091932,0.808474,-2.614925,8.028045,4.920594,-1.241793,3.005508,-2.982916,-0.311678,,-0.598706,-0.598047,,-1.242307,-0.983549,0.174288,-0.765484,,0.588094,0.400728,-6.433438,,,-0.243479,,,,,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,-5.0,1.366166
3,399,3,-3.133297,1.943953,-0.764966,-0.342314,3.326953,1.279059,-3.321225,9.21697,3.073976,-1.467717,3.015288,2.250811,-1.045066,,-1.182899,-0.562273,,-0.8774,-0.15907,0.539966,-0.590195,,1.392716,1.114253,-2.280511,,,0.513953,1.383015,0.307818,0.17012,0.825546,0.0,1.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,3.309647,0.929881
4,399,7,-1.925597,2.595224,-0.671893,-0.476501,8.399661,0.964512,-2.565749,23.756943,3.605127,-1.397752,3.497678,-2.920491,-2.074677,,-0.362905,1.183362,,2.014356,0.615812,0.091007,2.037769,,1.62647,4.806841,5.269048,,,0.212855,-0.744757,0.198144,0.113393,0.491688,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1.125397,1.582554


In [7]:
# Impute gaps in the raw features and lagged responders: carry the previous
# row's value forward, then zero-fill whatever is still missing.
# NOTE(review): the forward-fill runs over the whole frame in row order (no
# grouping), so a gap can be filled from a row of another symbol — confirm
# that this is intended.
imputed_cols = feature_names + lag_cols
train[imputed_cols] = train[imputed_cols].ffill().fillna(0)
valid[imputed_cols] = valid[imputed_cols].ffill().fillna(0)

In [8]:
# Split each frame into model inputs, target and sample weights.
model_inputs = feature_names + lag_cols + time_cols + cat_features

X_train, y_train, w_train = train[model_inputs], train[label_name], train[weight_name]
X_valid, y_valid, w_valid = valid[model_inputs], valid[label_name], valid[weight_name]

In [9]:
# dtrain = lgb.Dataset(X_train, label=y_train, weight=w_train)
# dvalid = lgb.Dataset(X_valid, label=y_valid, weight=w_valid)

# del X_train, y_train, w_train, X_valid, y_valid, w_valid

## Model training

In [10]:
# def get_model(seed, dtrain):
#     # Define parameters for LightGBM
#     LGB_params = {
#         'device':'gpu',
#         "colsample_bytree": 0.6,
#         "colsample_bynode": 0.6,
#         "gpu_use_dp": True,
#         "objective": 'regression',
#         "metric": 'rmse',
#         'boosting_type': 'gbdt',
#         "random_state": seed,
#         "max_depth": 10,
#         "learning_rate": 0.1,
#         "reg_alpha": 0.2,
#         "reg_lambda": 5,
#         'num_leaves': 64,
#         "max_bin": 255,
#         "early_stopping_rounds": 100
#     }

#     # Train the model using LightGBM
#     model = lgb.train(
#         LGB_params,
#         dtrain,
#         num_boost_round=120,  # Number of boosting rounds
#     )
    
#     return model

def get_model(seed, device="gpu"):
    """Build an unfitted ``LGBMRegressor`` with the notebook's fixed hyperparameters.

    Args:
        seed: Value used as the model's ``random_state``.
        device: LightGBM ``device`` setting. Defaults to ``"gpu"`` (the value
            that used to be hard-coded); pass ``"cpu"`` when the installed
            LightGBM build has no GPU tree learner.

    Returns:
        An ``LGBMRegressor`` configured with the parameters below. It is not
        fitted; the caller calls ``fit``. ``early_stopping_rounds`` is set, so
        ``fit`` should be given an ``eval_set`` to monitor.
    """
    lgb_params = {
        'device': device,
        "colsample_bytree": 0.6,
        "colsample_bynode": 0.6,
        "gpu_use_dp": True,
        "objective": 'regression',
        "metric": 'rmse',
        'boosting_type': 'gbdt',
        "random_state": seed,
        "max_depth": 10,
        "learning_rate": 0.1,
        "reg_alpha": 0.2,
        "reg_lambda": 5,
        'num_leaves': 64,
        "max_bin": 255,
        "early_stopping_rounds": 100
    }

    return LGBMRegressor(**lgb_params)

In [11]:
model = get_model(2025)
# Weighted fit. The validation fold is passed as eval_set because the model is
# configured with `early_stopping_rounds`, which needs a set to monitor.
model.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    eval_set=[(X_valid, y_valid)],
    eval_sample_weight=[w_valid],
)



[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1


LightGBMError: GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1

In [10]:
def evaluate_model_on_valid_sets(model, valid_sets, feature_cols, target_col, weight_col):
    """Compute the sample-weighted R² of ``model`` on each validation frame.

    Args:
        model: Fitted model exposing ``predict(X)``.
        valid_sets: Iterable of DataFrames to score.
        feature_cols: Columns passed to the model; must be the columns the
            model was trained on.
        target_col: Name of the target column.
        weight_col: Name of the sample-weight column.

    Returns:
        List with one weighted R² value per frame, in input order.
    """
    r2_scores = []

    for valid_df in valid_sets:
        # Same imputation as the training features: forward-fill, then zero-fill.
        X_valid = valid_df[feature_cols].ffill().fillna(0)
        y_valid = valid_df[target_col]
        w_valid = valid_df[weight_col]

        # Predict on the raw feature frame: LightGBM's predict() does not
        # accept an lgb.Dataset.
        y_pred_valid = model.predict(X_valid)

        r2_scores.append(r2_score(y_valid, y_pred_valid, sample_weight=w_valid))

    return r2_scores

In [11]:
import optuna
from optuna.samplers import TPESampler
from tqdm.auto import tqdm

def objective(trial):
    """Optuna objective: train one LightGBM booster and return its mean weighted R².

    Samples a hyperparameter set from ``trial``, trains on the training split
    with early stopping monitored on the validation split, and scores the
    booster with ``evaluate_model_on_valid_sets``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean of the R² scores over the validation sets (the study maximises it).
    """
    # Hyperparameter search space
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only applies `subsample` when bagging runs (frequency > 0).
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),

        # Fixed parameters
        'random_state': CONFIG.seeds[int(fold_num)],
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'device': 'gpu',
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 10, 100)
    }

    # Columns the model is trained on; `valid` was reduced to these (plus ids,
    # target and weight), so scoring must use the same list.
    model_features = feature_names + lag_cols + time_cols + cat_features

    # Datasets are built inside the trial: the notebook has no module-level
    # `dtrain` (the cell that created it is commented out).
    dtrain = lgb.Dataset(X_train, label=y_train, weight=w_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid, weight=w_valid, reference=dtrain)

    # Early stopping needs a validation set to monitor.
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=params['n_estimators'],
        valid_sets=[dvalid],
    )

    # Evaluate on all validation sets
    valid_sets = [valid]
    r2_scores = evaluate_model_on_valid_sets(
        model, valid_sets, model_features, CONFIG.target_col, 'weight'
    )
    mean_r2 = sum(r2_scores) / len(r2_scores)

    # Report the result of the current trial
    print(f"\nTrial {trial.number}:")
    print(f"R2 scores: {r2_scores}")
    print(f"Mean R2: {mean_r2}")

    return mean_r2

def run_optuna_optimization(n_trials=100):
    """Run the Optuna hyperparameter search and report the best trial.

    Args:
        n_trials: Number of trials to run.

    Returns:
        The finished ``optuna`` study.
    """
    # Create the study; the sampler is seeded with the fold's seed.
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=CONFIG.seeds[int(fold_num)]),
        study_name="lgbm_optimization"
    )

    # Run the optimisation
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Print the best result
    print("\nBest trial:")
    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Plots are best-effort: a failure is reported but does not discard the study.
    # optuna.visualization returns plotly figures, so each one is shown through
    # its own .show() (matplotlib's plt.show() does not render them).
    plotters = (
        optuna.visualization.plot_param_importances,    # parameter importances
        optuna.visualization.plot_optimization_history,  # optimisation history
        optuna.visualization.plot_parallel_coordinate,   # parameter relationships
    )
    try:
        for make_figure in plotters:
            make_figure(study).show()
    except Exception as exc:
        print("Visualization failed. Make sure you have plotly installed for better visualizations.")
        print(f"  Reason: {exc!r}")

    return study

In [None]:
# # 运行优化
# study = run_optuna_optimization(n_trials=100)  # 可以根据需要调整trials数量

# # 使用最佳参数训练最终模型
# best_params = study.best_params
# best_params.update({
#     'random_state': CONFIG.seeds[int(fold_num)],
#     'tree_method': 'gpu_hist',
#     'device': 'cuda',
#     'n_gpus': 1
# })

# # 训练最终模型
# final_model = LGBMRegressor(**best_params)
# X_train = train[CONFIG.feature_cols].ffill().fillna(0)
# y_train = train[CONFIG.target_col]
# w_train = train['weight']
# final_model.fit(X_train, y_train, sample_weight=w_train)

# # 评估最终模型
# valid_sets = [valid]
# final_scores = evaluate_model_on_valid_sets(
#     final_model, valid_sets, CONFIG.feature_cols, CONFIG.target_col, 'weight'
# )
# print("\nFinal Model Scores:")
# print(f"R2 scores: {final_scores}")
# print(f"Mean R2: {sum(final_scores) / len(final_scores)}")

: 

## Evaluation

In [13]:
# get_model() takes only the seed and returns an unfitted estimator, so the
# fold model is fitted here (weighted, with the validation fold monitored for
# early stopping).
model = get_model(CONFIG.seeds[int(fold_num)])
model.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    eval_set=[(X_valid, y_valid)],
    eval_sample_weight=[w_valid],
)

In [None]:
# Validation frames to score (fill in as needed: valid, valid1, valid2, valid3).
# NOTE(review): `valid2` is not defined in any visible cell — load it with the
# same column selection and preprocessing as `valid` before running this cell.
valid_sets = [valid, valid2]
# Score on the columns the model was trained on; `valid` no longer contains the
# full CONFIG.feature_cols list after the column selection.
model_features = feature_names + lag_cols + time_cols + cat_features
r2_scores = evaluate_model_on_valid_sets(model, valid_sets, model_features, CONFIG.target_col, "weight")
print("R² scores for the validation sets:", r2_scores)
print("R^2 scores mean:", sum(r2_scores) / len(r2_scores))

In [None]:
# Prepare the data of the extra evaluation set, using the columns the model was
# trained on and the same imputation as the training features.
# NOTE(review): `valid2` is not defined in any visible cell — confirm where it
# is loaded.
X_valid1 = valid2[feature_names + lag_cols + time_cols + cat_features].ffill().fillna(0)
y_valid1 = valid2[CONFIG.target_col]

def continue_training(model, d_valid, eval_sets=None, early_stopping_rounds=10):
    """Continue boosting an existing LightGBM model on ``d_valid``.

    The additional rounds use a tenth of the model's learning rate and
    slightly stronger regularisation.

    Args:
        model: Fitted LightGBM model: a ``Booster`` or a scikit-learn wrapper
            exposing ``booster_``.
        d_valid: ``lgb.Dataset`` the additional rounds are trained on.
            NOTE(review): LightGBM must still have access to its raw data to
            attach the initial model — confirm the dataset was not already
            constructed with its raw data freed.
        eval_sets: Optional list of ``(X, y)`` pairs evaluated each round,
            e.g. ``[(X_valid1, y_valid1)]``. Early stopping is applied only
            when at least one pair is given.
        early_stopping_rounds: Rounds without improvement before stopping.

    Returns:
        The ``Booster`` holding the original trees plus the new rounds.
    """
    booster = model if isinstance(model, lgb.Booster) else model.booster_

    # Work on a copy so the original model's parameter dict is not modified.
    params = dict(booster.params)
    original_lr = float(params.get('learning_rate', 0.1))

    # Round-count and early-stopping settings inherited from the first
    # training run would override the values requested here, so drop them.
    inherited_aliases = (
        'num_iterations', 'num_iteration', 'n_iter', 'num_tree', 'num_trees',
        'num_round', 'num_rounds', 'nrounds', 'num_boost_round', 'n_estimators',
        'max_iter',
        'early_stopping_round', 'early_stopping_rounds', 'early_stopping',
        'n_iter_no_change',
    )
    for alias in inherited_aliases:
        params.pop(alias, None)

    # NOTE(review): `subsample` only takes effect when a bagging frequency > 0
    # is also set in the parameters — confirm whether it should be added.
    params.update({
        'learning_rate': original_lr * 0.1,
        'reg_alpha': float(params.get('reg_alpha', 0)) + 0.5,
        'reg_lambda': float(params.get('reg_lambda', 0)) + 0.5,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    })

    # lgb.train takes the evaluation Datasets and their names as two parallel
    # lists (not as (data, name) tuples).
    valid_sets = [d_valid]
    valid_names = ['train']
    for i, (X, y) in enumerate(eval_sets or []):
        valid_sets.append(lgb.Dataset(X, label=y, reference=d_valid))
        valid_names.append(f'valid_{i}')

    # Logging and early stopping are configured through callbacks.
    callbacks = [lgb.log_evaluation(period=1)]
    if len(valid_sets) > 1:
        callbacks.append(lgb.early_stopping(stopping_rounds=early_stopping_rounds))

    # init_model makes the new rounds start from the existing trees.
    return lgb.train(
        params,
        train_set=d_valid,
        num_boost_round=500,
        valid_sets=valid_sets,
        valid_names=valid_names,
        init_model=booster,
        callbacks=callbacks,
    )

# Evaluation pairs monitored while training continues.
eval_sets = [
    (X_valid1, y_valid1)
]

# No active cell creates `dvalid` (its definition is commented out), so build
# the validation Dataset here before continuing training on it.
dvalid = lgb.Dataset(X_valid, label=y_valid, weight=w_valid)

model = continue_training(model, dvalid, eval_sets)

In [None]:
# Re-score on the extra validation set, using the columns the model was trained on.
# NOTE(review): `valid2` is not defined in any visible cell — confirm where it
# is loaded.
new_r2_scores = evaluate_model_on_valid_sets(
    model, [valid2], feature_names + lag_cols + time_cols + cat_features, CONFIG.target_col, 'weight'
)
print("R² scores for the other validation sets:", new_r2_scores)

# new_r2_scores = None

## Save the model

In [15]:
# Bundle the model with both sets of validation scores and pickle the bundle
# to a per-fold file.
result = dict(model=model, cv=[r2_scores, new_r2_scores])

output_path = f"lgbm_fold_{fold_num}.pkl"
with open(output_path, "wb") as fp:
    pickle.dump(result, fp)