In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
from catboost import CatBoostRegressor, Pool
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
from tqdm.auto import tqdm

# Notebook-wide settings: silence every warning and let pandas print all columns.
warnings.simplefilter('ignore')
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CONFIG:
    """Static settings for the run: seeds, target column and feature-column lists."""

    # Use a different seed for each fold.
    seeds = [2025, 42, 44, 100]
    target_col = "responder_6"

    categorical_cols = ["feature_09", "feature_10", "feature_11"]
    corr_cols = [f"feature_{i:02d}" for i in range(21, 32)]

    # Columns named feature_XX_cat_N: 11 for feature_09, 9 for feature_10, 2 for feature_11.
    # (Presumably one-hot expansions of categorical_cols - confirm against the preprocessing step.)
    cat_features = (
        [f"feature_09_cat_{k}" for k in range(11)]
        + [f"feature_10_cat_{k}" for k in range(9)]
        + [f"feature_11_cat_{k}" for k in range(2)]
    )

    # Model inputs: symbol id, the raw features except 09/10/11/61, lagged responders,
    # the cyclic time encodings, and finally the cat_* columns listed above.
    feature_cols = (
        ["symbol_id"]
        + [f"feature_{idx:02d}" for idx in range(79) if idx not in (9, 10, 11, 61)]
        + [f"responder_{idx}_lag_1" for idx in range(9)]
        + ["sin_time_id", "cos_time_id", "sin_time_id_half_day", "cos_time_id_half_day"]
        + cat_features
    )

In [3]:
path = '/root/autodl-tmp/jane-street-2024/train-validate-set/train_fold_3.parquet'
# The fold id is the text between 'fold_' and the following '.'; it is kept as a string.
fold_num = path.partition('fold_')[2].partition('.')[0]
print(int(fold_num))

((35152365, 107), (4413112, 107))

In [None]:
def _read_parquet_as_pandas(parquet_path):
    """Scan a parquet file lazily with polars, collect it and convert it to pandas."""
    return pl.scan_parquet(parquet_path).collect().to_pandas()


train = _read_parquet_as_pandas(path)
valid = _read_parquet_as_pandas(f"/root/autodl-tmp/jane-street-2024/train-validate-set/valid_fold_{fold_num}.parquet")

# valid1 = pl.scan_parquet("/root/autodl-tmp/jane-street-2024/train-validate-set/valid_fold_1.parquet").collect().to_pandas()
# Validation file of the fold numbered one below the current fold.
valid2 = _read_parquet_as_pandas(f"/root/autodl-tmp/jane-street-2024/train-validate-set/valid_fold_{str(int(fold_num) - 1)}.parquet")
train.shape, valid.shape

In [None]:
# Cast the cat_* columns to int on every frame that is later fed to the model.
# valid2 is scored by the same model in the evaluation cell, so it receives the same
# dtype coercion as train/valid instead of keeping whatever dtype the file stored.
for _frame in (train, valid, valid2):
    _frame[CONFIG.cat_features] = _frame[CONFIG.cat_features].astype(int)

In [7]:
# Training split: forward-fill the feature columns down the rows, then zero any remaining gaps.
X_train = train.loc[:, CONFIG.feature_cols].ffill().fillna(0)
y_train = train.loc[:, CONFIG.target_col]
w_train = train.loc[:, "weight"]

# Validation split gets the identical treatment.
X_valid = valid.loc[:, CONFIG.feature_cols].ffill().fillna(0)
y_valid = valid.loc[:, CONFIG.target_col]
w_valid = valid.loc[:, "weight"]

In [None]:
# Wrap features, target and per-row weights into CatBoost Pool objects.
# The weights reuse w_train / w_valid, which were extracted from the same frames above.
train_pool = Pool(data=X_train, label=y_train, weight=w_train)
valid_pool = Pool(data=X_valid, label=y_valid, weight=w_valid)

# Only the pools are used from here on; drop the intermediate names.
del X_train, y_train, w_train, X_valid, y_valid, w_valid

## Model training

In [8]:
def get_model(seed):
    """Fit a CatBoost regressor on the notebook-level pools and return it.

    Args:
        seed: Value handed to CatBoost as ``random_state``.

    Returns:
        The fitted ``CatBoostRegressor``. Training reads the globals ``train_pool``
        (fit data) and ``valid_pool`` (eval set used for early stopping).
    """
    cat_params = {
        'task_type': 'GPU',
        'random_state': seed,
        'eval_metric': 'RMSE',
        'bagging_temperature': 0.50,
        'iterations': 200,
        'learning_rate': 0.1,
        'max_depth': 12,
        'l2_leaf_reg': 1.25,
        # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise/Lossguide
        # grow policies; no grow_policy is set here - confirm it has the intended effect.
        'min_data_in_leaf': 24,
        'random_strength': 0.25,
        'early_stopping_rounds': 50,
        'verbose': 0
    }

    # The original built the model from an undefined `params` name and called a
    # non-existent `.train()`; use the dict defined above and the estimator's `.fit()`.
    model = CatBoostRegressor(**cat_params)
    model.fit(train_pool, eval_set=valid_pool)
    return model


In [9]:
def evaluate_model_on_valid_sets(model, valid_sets, feature_cols, target_col, weight_col):
    """Score a fitted model on each validation frame with sample-weighted R².

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        valid_sets: Iterable of DataFrames holding features, target and weight columns.
        feature_cols: Names of the feature columns to feed to the model.
        target_col: Name of the target column.
        weight_col: Name of the per-row weight column.

    Returns:
        A list with one ``r2_score`` value per frame, in input order.
    """
    r2_scores = []

    for valid_df in valid_sets:
        # Same preprocessing as the training features: forward-fill, then zero-fill.
        X_valid = valid_df[feature_cols].ffill().fillna(0)
        y_valid = valid_df[target_col]
        w_valid = valid_df[weight_col]

        # Predict straight from the frame; the previous lgb.Dataset wrapper relied on
        # a module that is never imported and is not an input type for model.predict.
        y_pred_valid = model.predict(X_valid)

        r2_scores.append(r2_score(y_valid, y_pred_valid, sample_weight=w_valid))

    return r2_scores

<catboost.core.CatBoostRegressor at 0x7f2841c07880>

In [None]:
import optuna
from optuna.samplers import TPESampler
from tqdm.auto import tqdm

def objective(trial):
    """Optuna objective: fit one CatBoost model and return its weighted R² on ``valid``.

    The previous body trained through ``lgb.train`` with ``dtrain``/``dvalid``, none of
    which exist in this notebook. It now trains a ``CatBoostRegressor`` on the existing
    ``train_pool``/``valid_pool``, tuning the same knobs that ``get_model`` sets.
    LightGBM-only options (subsample, colsample_bytree, reg_alpha, num_leaves,
    min_child_samples) were dropped; reg_lambda maps to ``l2_leaf_reg``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean of the R² scores computed for this trial (a single validation set).
    """
    params = {
        # Search space
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.01, 10.0, log=True),

        # Fixed parameters
        'task_type': 'GPU',
        'random_state': CONFIG.seeds[int(fold_num)],
        'eval_metric': 'RMSE',
        'iterations': 400,
        'early_stopping_rounds': 10,
        'verbose': 0,
    }

    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=valid_pool)

    # valid_pool was built from `valid`, so predictions line up row-for-row with its
    # target and weight columns.
    y_pred = model.predict(valid_pool)
    r2_scores = [
        r2_score(valid[CONFIG.target_col], y_pred, sample_weight=valid['weight'])
    ]
    mean_r2 = sum(r2_scores) / len(r2_scores)

    # Report the outcome of the current trial.
    print(f"\nTrial {trial.number}:")
    print(f"R2 scores: {r2_scores}")
    print(f"Mean R2: {mean_r2}")

    return mean_r2

def run_optuna_optimization(n_trials=100):
    """Run an Optuna study over ``objective``, print the best trial and plot diagnostics.

    Args:
        n_trials: Number of trials passed to ``study.optimize``.

    Returns:
        The finished ``optuna`` study.
    """
    # Create the study; the sampler is seeded with this fold's seed.
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=CONFIG.seeds[int(fold_num)]),
        # NOTE(review): this label is stale - nothing in the notebook uses XGBoost.
        study_name="xgboost_optimization"
    )

    # Run the optimisation.
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Print the best result.
    print("\nBest trial:")
    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Visualise the results. optuna.visualization returns plotly figures, which
    # matplotlib's plt.show() never renders, so each figure is shown via its own .show().
    try:
        plotters = (
            optuna.visualization.plot_param_importances,     # parameter importances
            optuna.visualization.plot_optimization_history,  # optimisation history
            optuna.visualization.plot_parallel_coordinate,   # parameter relationships
        )
        for make_figure in plotters:
            make_figure(study).show()
    except Exception as exc:
        # Plotting stays best-effort, but the actual cause is no longer hidden.
        print("Visualization failed. Make sure you have plotly installed for better visualizations.")
        print(f"  Reason: {type(exc).__name__}: {exc}")

    return study

In [None]:
# NOTE(review): this whole cell is disabled. It refers to LGBMRegressor and to
# XGBoost-style options ('tree_method': 'gpu_hist') that nothing else in this
# notebook imports or uses - revise before re-enabling.
# # Run the optimisation
# study = run_optuna_optimization(n_trials=100)  # adjust the number of trials as needed

# # Train the final model with the best parameters
# best_params = study.best_params
# best_params.update({
#     'random_state': CONFIG.seeds[int(fold_num)],
#     'tree_method': 'gpu_hist',
#     'device': 'cuda',
#     'n_gpus': 1
# })

# # Train the final model
# final_model = LGBMRegressor(**best_params)
# X_train = train[CONFIG.feature_cols].ffill().fillna(0)
# y_train = train[CONFIG.target_col]
# w_train = train['weight']
# final_model.fit(X_train, y_train, sample_weight=w_train)

# # Evaluate the final model
# valid_sets = [valid]
# final_scores = evaluate_model_on_valid_sets(
#     final_model, valid_sets, CONFIG.feature_cols, CONFIG.target_col, 'weight'
# )
# print("\nFinal Model Scores:")
# print(f"R2 scores: {final_scores}")
# print(f"Mean R2: {sum(final_scores) / len(final_scores)}")

In [None]:
# Train this fold's model; the fold number selects the seed from CONFIG.seeds.
model = get_model(seed=CONFIG.seeds[int(fold_num)])

In [None]:
# Validation frames to score; list whichever of valid, valid1, valid2, valid3 apply.
valid_sets = [valid, valid2]
r2_scores = evaluate_model_on_valid_sets(
    model=model,
    valid_sets=valid_sets,
    feature_cols=CONFIG.feature_cols,
    target_col=CONFIG.target_col,
    weight_col="weight",
)
print("R² scores for the validation sets:", r2_scores)
print("R^2 scores mean:", sum(r2_scores) / len(r2_scores))

In [None]:
# First prepare the evaluation-set data: features get the same ffill + zero-fill as training.
X_valid1 = valid2.loc[:, CONFIG.feature_cols].ffill().fillna(0)
y_valid1 = valid2.loc[:, CONFIG.target_col]

def continue_training(model, d_valid, eval_sets=None, early_stopping_rounds=10):
    """Continue boosting a fitted CatBoost model on ``d_valid`` with gentler settings.

    The previous body drove LightGBM (``model.params``, ``lgb.train(dtrain=model, ...)``)
    and could not work with the CatBoost model this notebook trains. It now starts a
    new ``CatBoostRegressor`` from the fitted model via ``init_model``, with a 10x
    smaller learning rate and a larger ``l2_leaf_reg``. The LightGBM-only options
    (reg_alpha, subsample, colsample_bytree) have been dropped.

    Args:
        model: Fitted CatBoost model to continue from (as returned by ``get_model``).
        d_valid: CatBoost ``Pool`` the extra iterations are trained on
            (the current fold's validation data).
        eval_sets: Optional list of ``(X, y)`` pairs monitored during training,
            e.g. ``[(X_valid1, y_valid1)]``.
        early_stopping_rounds: Early-stopping patience; only applied when
            ``eval_sets`` is non-empty.

    Returns:
        The newly fitted ``CatBoostRegressor``; the input ``model`` and its parameter
        dict are left unmodified.
    """
    # Work on a copy so the caller's model keeps its own parameters.
    params = dict(model.get_params())
    original_lr = float(params.get('learning_rate', 0.1))
    # 3.0 is CatBoost's documented default for l2_leaf_reg.
    original_l2 = float(params.get('l2_leaf_reg', 3.0))

    params.update({
        'learning_rate': original_lr * 0.1,
        'l2_leaf_reg': original_l2 + 0.5,
        'iterations': 500,  # upper bound on the number of additional trees
        # NOTE(review): CatBoost's docs list init_model (and multiple eval sets) as
        # CPU-only, hence the switch away from GPU - confirm for the installed version.
        'task_type': 'CPU',
    })

    # Evaluation pools are unweighted, matching the previous behaviour.
    eval_pools = [Pool(data=X, label=y) for X, y in (eval_sets or [])]
    if eval_pools:
        params['early_stopping_rounds'] = early_stopping_rounds
    else:
        # Early stopping needs something to monitor.
        params.pop('early_stopping_rounds', None)

    continued = CatBoostRegressor(**params)
    continued.fit(d_valid, eval_set=eval_pools or None, init_model=model)
    return continued

# Monitor the continued training on the extra validation frame.
eval_sets = [
    (X_valid1, y_valid1)
]

# `dvalid` was never defined in this notebook; the current fold's validation data
# lives in `valid_pool`, so that is what the continuation trains on.
model = continue_training(model, valid_pool, eval_sets)

In [None]:
# Score valid2 again with the model returned by continue_training.
new_r2_scores = evaluate_model_on_valid_sets(
    model=model,
    valid_sets=[valid2],
    feature_cols=CONFIG.feature_cols,
    target_col=CONFIG.target_col,
    weight_col='weight',
)
print("R² scores for the other validation sets:", new_r2_scores)

# new_r2_scores = None

In [13]:
# Bundle the model with both rounds of validation scores and pickle the bundle
# to a per-fold file in the current working directory.
result = {"model": model, "cv": [r2_scores, new_r2_scores]}
with open(f"cat_fold_{fold_num}.pkl", "wb") as fp:
    pickle.Pickler(fp).dump(result)