In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import pyarrow as pa
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

from sklearn.metrics import r2_score
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !python --version

In [3]:
class CONFIG:
    seeds = [2025, 42, 44, 100] # 每折用不同的seed
    target_col = "responder_6"
    feature_cols = ["symbol_id"] \
        + [f"feature_{idx:02d}" for idx in range(79) if idx not in (9, 10, 11, 61)] \
        + [f"responder_{idx}_lag_1" for idx in range(9)] \
        + ['sin_time_id', 'cos_time_id','sin_time_id_half_day','cos_time_id_half_day'] \
        + [f'feature_09_cat_{idx}' for idx in range(11)] + [f'feature_10_cat_{idx}' for idx in range(9)] + ['feature_11_cat_0', 'feature_11_cat_1']

    categorical_cols = ["feature_09", "feature_10", "feature_11"]
    corr_cols = [f"feature_{i:02d}" for i in range(21, 32)]
    cat_features = ['feature_09_cat_0', 'feature_09_cat_1', 'feature_09_cat_2', 'feature_09_cat_3', 'feature_09_cat_4', 
        'feature_09_cat_5', 'feature_09_cat_6', 'feature_09_cat_7', 'feature_09_cat_8', 'feature_09_cat_9', 
        'feature_09_cat_10', 'feature_10_cat_0', 'feature_10_cat_1', 'feature_10_cat_2', 'feature_10_cat_3', 
        'feature_10_cat_4', 'feature_10_cat_5', 'feature_10_cat_6', 'feature_10_cat_7', 'feature_10_cat_8', 
        'feature_11_cat_0', 'feature_11_cat_1']

## Load Data

In [4]:
path = '/root/autodl-tmp/jane-street-2024/train-validate-set/train_fold_3.parquet'
fold_num = path.split('fold_')[1].split('.')[0]
print(int(fold_num)) 

3


In [5]:
train = pl.scan_parquet(path).collect().to_pandas()
valid = pl.scan_parquet(f"/root/autodl-tmp/jane-street-2024/train-validate-set/valid_fold_{fold_num}.parquet").collect().to_pandas()

# valid1 = pl.scan_parquet("/root/autodl-tmp/jane-street-2024/train-validate-set/valid_fold_1.parquet").collect().to_pandas()
valid2 = pl.scan_parquet(f"/root/autodl-tmp/jane-street-2024/train-validate-set/valid_fold_{str(int(fold_num) - 1)}.parquet").collect().to_pandas()
train.shape, valid.shape

((28366539, 126), (3678400, 126))

In [6]:
# # Trick of boosting LB score, data leakage on the validation set
# train = pd.concat([train, valid]).reset_index(drop=True)
# train.shape

In [7]:
train[CONFIG.cat_features] = train[CONFIG.cat_features].astype(int)
valid[CONFIG.cat_features] = valid[CONFIG.cat_features].astype(int)

In [8]:
train.head()

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1,sin_time_id,cos_time_id,sin_time_id_half_day,cos_time_id_half_day,feature_09_cat_0,feature_09_cat_1,feature_09_cat_2,feature_09_cat_3,feature_09_cat_4,feature_09_cat_5,feature_09_cat_6,feature_09_cat_7,feature_09_cat_8,feature_09_cat_9,feature_09_cat_10,feature_10_cat_0,feature_10_cat_1,feature_10_cat_2,feature_10_cat_3,feature_10_cat_4,feature_10_cat_5,feature_10_cat_6,feature_10_cat_7,feature_10_cat_8,feature_11_cat_0,feature_11_cat_1
0,4053975,299,0,0,2.127546,-0.880901,1.853225,-1.80249,-2.127882,2.021924,0.082818,-0.035213,-0.308829,0.412338,-1.151062,0.162444,-0.640052,,-0.710322,,-2.446608,-1.333768,1.258962,,0.320832,-0.043842,-0.874775,-0.744612,,,1.304105,-0.476804,-0.22871,,,,-2.508356,-1.725281,-2.096086,-0.073593,0.264961,,-0.813235,,,-1.683302,,-2.014813,0.018643,-2.04724,-1.459839,-0.903734,,0.554139,,,-1.029081,,-1.614851,2.281001,,0.365138,0.202085,-0.140561,0.119507,0.584034,0.329463,-1.484548,-1.706445,-0.990837,-0.359095,-0.671586,-1.036622,0.54107,-0.496476,,,-0.26424,-0.226641,-0.303292,-0.415175,-0.945986,-0.718301,-0.46593,-0.891837,-1.400011,-0.88963,-0.918387,-1.156469,-0.533242,1,0.709144,0.411318,0.829466,-0.220744,-0.1466,1.636756,0.571712,0.211628,0.702972,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,4053976,299,0,1,3.20798,-1.672922,1.735088,-1.492592,-1.833632,2.322099,0.055945,-0.059071,-0.246541,0.301029,-1.219211,0.177601,-0.630067,,-0.070608,,-2.031275,-1.963985,1.29812,,1.32956,0.631334,-0.395234,-0.069487,,,1.582811,-0.568069,-0.425684,,,,-0.094414,0.532491,2.290173,-0.017645,0.017628,,1.588295,,,-0.082799,,-0.576814,1.845289,0.911428,1.901088,1.875338,,1.356675,,,-1.69564,,-1.557775,1.538595,,0.76588,0.4222,-0.140561,-0.22335,0.019048,-0.266962,-1.841778,-1.797549,-0.869093,0.276341,-0.479076,-1.178965,-0.070509,-0.617389,,,0.193938,0.237417,-0.155081,-0.175125,0.000753,-0.164361,0.028683,-0.888872,-0.992065,0.087211,-2.064678,-1.254025,0.105347,1,1.470303,1.444104,1.94703,0.689948,0.566533,1.242728,0.124056,0.073153,0.218433,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2,4053977,299,0,2,1.646734,-1.358929,1.775953,-1.664195,-1.385287,2.457021,0.04118,-0.049191,-0.18945,0.268929,-0.904464,0.13533,-0.561205,,-0.728397,,-1.919673,-1.392008,-0.017061,,-0.219873,-0.716607,-0.212404,0.220898,,,1.204163,-0.763031,-0.320489,,,,0.790255,0.039609,-0.472411,0.058479,0.137372,,-0.818783,,,-0.150273,,-1.903321,1.44121,0.062122,0.163077,-0.378335,,-0.581603,,,0.081836,,-1.907411,1.679703,,-1.090642,-0.412189,-0.140561,2.243897,3.071264,3.692641,-1.670085,-2.123901,-0.948671,0.379411,-0.323449,-1.263949,-0.071063,-0.659664,,,0.602711,0.788557,2.449167,2.120994,0.151334,-0.302043,0.299192,-0.13483,-1.543766,0.260476,-0.377385,-1.043058,0.185319,1,0.604905,0.084139,-0.033296,0.303427,0.110881,0.493181,0.513768,0.204895,0.966184,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0
3,4053978,299,0,3,0.983182,-0.948128,1.600585,-1.535779,-1.52349,1.857884,0.035973,-0.039753,-0.191361,0.262757,-1.117189,0.122068,-0.605168,,-0.501467,,-1.668525,-1.445979,0.183513,,-0.688328,-1.431856,0.201183,0.527071,,,-1.287662,-0.40495,-0.416545,,,,-1.180663,-1.141834,2.164649,0.147309,0.380221,,1.623603,,,1.838844,,,,0.89121,-0.004659,0.006225,,-1.359173,,,-1.351499,,-2.696196,1.054091,,-1.761094,-1.270636,-0.140561,,,,,,-0.897137,0.275753,-0.310824,-1.021779,-0.058799,-0.54089,,,-1.170569,-1.880349,-2.102673,-1.729555,2.026738,0.868594,2.081741,0.598884,-1.536289,1.624629,-0.841361,-2.440849,-0.532792,1,0.486128,-0.681057,-0.313679,0.003267,0.003109,-0.000779,0.060547,0.046916,0.080118,0.0,1.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,4053979,299,0,7,1.058102,-1.764999,1.758211,-2.043462,-1.450967,2.215869,0.06518,-0.059074,-0.211993,0.134314,-1.071263,0.942283,-0.398252,,0.038432,,-2.076025,-1.607107,0.704938,,-0.624456,0.239791,-0.6111,-0.246576,,,-0.223046,0.220124,0.334935,,,,-0.113433,-0.458192,-0.982798,0.116899,0.09087,,2.159572,,,1.768083,,,,-0.281465,3.18138,1.595658,,0.903624,,,-0.451248,,-2.400968,1.77393,,0.812202,0.25109,-0.140561,0.579597,0.72602,0.758708,,,-0.754799,0.246555,-0.389904,-0.723902,1.67952,-0.367824,,,-0.394535,-0.288341,-0.244196,-0.318671,0.396339,0.052121,-0.516689,-0.228245,-0.41421,-0.494643,-0.775926,-0.665538,-0.515261,1,1.727799,0.634517,0.500418,1.668819,0.585581,1.956042,0.896282,0.62976,1.75552,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [9]:
X_train = train[ CONFIG.feature_cols ]
X_train = X_train.ffill().fillna(0)
y_train = train[ CONFIG.target_col ]
w_train = train["weight"]

X_valid = valid[ CONFIG.feature_cols ]
X_valid = X_valid.ffill().fillna(0)
y_valid = valid[ CONFIG.target_col ]
w_valid = valid["weight"]

# X_train.shape, y_train.shape, w_train.shape, X_valid.shape, y_valid.shape, w_valid.shape

## Feature engineering

In [10]:
X_train.head()

Unnamed: 0,symbol_id,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1,sin_time_id,cos_time_id,sin_time_id_half_day,cos_time_id_half_day,feature_09_cat_0,feature_09_cat_1,feature_09_cat_2,feature_09_cat_3,feature_09_cat_4,feature_09_cat_5,feature_09_cat_6,feature_09_cat_7,feature_09_cat_8,feature_09_cat_9,feature_09_cat_10,feature_10_cat_0,feature_10_cat_1,feature_10_cat_2,feature_10_cat_3,feature_10_cat_4,feature_10_cat_5,feature_10_cat_6,feature_10_cat_7,feature_10_cat_8,feature_11_cat_0,feature_11_cat_1
0,0,-0.880901,1.853225,-1.80249,-2.127882,2.021924,0.082818,-0.035213,-0.308829,0.412338,-1.151062,0.162444,-0.640052,0.0,-0.710322,0.0,-2.446608,-1.333768,1.258962,0.0,0.320832,-0.043842,-0.874775,-0.744612,0.0,0.0,1.304105,-0.476804,-0.22871,0.0,0.0,0.0,-2.508356,-1.725281,-2.096086,-0.073593,0.264961,0.0,-0.813235,0.0,0.0,-1.683302,0.0,-2.014813,0.018643,-2.04724,-1.459839,-0.903734,0.0,0.554139,0.0,0.0,-1.029081,0.0,-1.614851,2.281001,0.0,0.365138,0.202085,0.119507,0.584034,0.329463,-1.484548,-1.706445,-0.990837,-0.359095,-0.671586,-1.036622,0.54107,-0.496476,0.0,0.0,-0.26424,-0.226641,-0.303292,-0.415175,0.709144,0.411318,0.829466,-0.220744,-0.1466,1.636756,0.571712,0.211628,0.702972,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,1,-1.672922,1.735088,-1.492592,-1.833632,2.322099,0.055945,-0.059071,-0.246541,0.301029,-1.219211,0.177601,-0.630067,0.0,-0.070608,0.0,-2.031275,-1.963985,1.29812,0.0,1.32956,0.631334,-0.395234,-0.069487,0.0,0.0,1.582811,-0.568069,-0.425684,0.0,0.0,0.0,-0.094414,0.532491,2.290173,-0.017645,0.017628,0.0,1.588295,0.0,0.0,-0.082799,0.0,-0.576814,1.845289,0.911428,1.901088,1.875338,0.0,1.356675,0.0,0.0,-1.69564,0.0,-1.557775,1.538595,0.0,0.76588,0.4222,-0.22335,0.019048,-0.266962,-1.841778,-1.797549,-0.869093,0.276341,-0.479076,-1.178965,-0.070509,-0.617389,0.0,0.0,0.193938,0.237417,-0.155081,-0.175125,1.470303,1.444104,1.94703,0.689948,0.566533,1.242728,0.124056,0.073153,0.218433,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2,2,-1.358929,1.775953,-1.664195,-1.385287,2.457021,0.04118,-0.049191,-0.18945,0.268929,-0.904464,0.13533,-0.561205,0.0,-0.728397,0.0,-1.919673,-1.392008,-0.017061,0.0,-0.219873,-0.716607,-0.212404,0.220898,0.0,0.0,1.204163,-0.763031,-0.320489,0.0,0.0,0.0,0.790255,0.039609,-0.472411,0.058479,0.137372,0.0,-0.818783,0.0,0.0,-0.150273,0.0,-1.903321,1.44121,0.062122,0.163077,-0.378335,0.0,-0.581603,0.0,0.0,0.081836,0.0,-1.907411,1.679703,0.0,-1.090642,-0.412189,2.243897,3.071264,3.692641,-1.670085,-2.123901,-0.948671,0.379411,-0.323449,-1.263949,-0.071063,-0.659664,0.0,0.0,0.602711,0.788557,2.449167,2.120994,0.604905,0.084139,-0.033296,0.303427,0.110881,0.493181,0.513768,0.204895,0.966184,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0
3,3,-0.948128,1.600585,-1.535779,-1.52349,1.857884,0.035973,-0.039753,-0.191361,0.262757,-1.117189,0.122068,-0.605168,0.0,-0.501467,0.0,-1.668525,-1.445979,0.183513,0.0,-0.688328,-1.431856,0.201183,0.527071,0.0,0.0,-1.287662,-0.40495,-0.416545,0.0,0.0,0.0,-1.180663,-1.141834,2.164649,0.147309,0.380221,0.0,1.623603,0.0,0.0,1.838844,0.0,-1.903321,1.44121,0.89121,-0.004659,0.006225,0.0,-1.359173,0.0,0.0,-1.351499,0.0,-2.696196,1.054091,0.0,-1.761094,-1.270636,2.243897,3.071264,3.692641,-1.670085,-2.123901,-0.897137,0.275753,-0.310824,-1.021779,-0.058799,-0.54089,0.0,0.0,-1.170569,-1.880349,-2.102673,-1.729555,0.486128,-0.681057,-0.313679,0.003267,0.003109,-0.000779,0.060547,0.046916,0.080118,0.0,1.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,7,-1.764999,1.758211,-2.043462,-1.450967,2.215869,0.06518,-0.059074,-0.211993,0.134314,-1.071263,0.942283,-0.398252,0.0,0.038432,0.0,-2.076025,-1.607107,0.704938,0.0,-0.624456,0.239791,-0.6111,-0.246576,0.0,0.0,-0.223046,0.220124,0.334935,0.0,0.0,0.0,-0.113433,-0.458192,-0.982798,0.116899,0.09087,0.0,2.159572,0.0,0.0,1.768083,0.0,-1.903321,1.44121,-0.281465,3.18138,1.595658,0.0,0.903624,0.0,0.0,-0.451248,0.0,-2.400968,1.77393,0.0,0.812202,0.25109,0.579597,0.72602,0.758708,-1.670085,-2.123901,-0.754799,0.246555,-0.389904,-0.723902,1.67952,-0.367824,0.0,0.0,-0.394535,-0.288341,-0.244196,-0.318671,1.727799,0.634517,0.500418,1.668819,0.585581,1.956042,0.896282,0.62976,1.75552,0.0,1.0,0.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [11]:
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)

del X_train, y_train, w_train, X_valid, y_valid, w_valid

## Necessary Functions

In [12]:
def get_model(seed):
    # Define parameters for xgboost
    params = {
        'learning_rate': 0.015,
        'max_depth': 10,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.05,
        'reg_lambda': 0.05,
        'random_state': seed,
        'tree_method': 'gpu_hist',
        'objective': 'reg:squarederror',  # Use appropriate objective
    }

    # Train the model using xgboost.train without evals
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=400,  # Number of boosting rounds
        verbose_eval=True  # Verbose output
    )
    
    return model

In [13]:
def evaluate_model_on_valid_sets(model, valid_sets, feature_cols, target_col, weight_col):
    r2_scores = []
    
    for valid_df in valid_sets:
        # 获取特征并进行相同的预处理
        X_valid = valid_df[feature_cols].ffill().fillna(0)  # 添加与之前相同的预处理步骤
        y_valid = valid_df[target_col]
        w_valid = valid_df[weight_col]
        
        # Convert to DMatrix
        dvalid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)
        
        # 进行预测
        y_pred_valid = model.predict(dvalid)
        
        # 计算 R² 值
        valid_score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid)
        r2_scores.append(valid_score)
    
    return r2_scores

In [14]:
import optuna
from optuna.samplers import TPESampler
from tqdm.auto import tqdm

def objective(trial):
    """
    Optuna objective function for hyperparameter optimization
    """
    # Define hyperparameter search space
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        
        # Fixed parameters
        'random_state': CONFIG.seeds[int(fold_num)],
        'tree_method': 'gpu_hist',
        'objective': 'reg:squarederror',  # Use appropriate objective
    }

    # Train the model using xgboost.train
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=400,  # Number of boosting rounds
        evals=[(dvalid, 'validation')],  # Evaluation set
        early_stopping_rounds=10,  # Early stopping
        verbose_eval=False  # Verbose output
    )
    
    # Evaluate on all validation sets
    valid_sets = [valid]
    r2_scores = evaluate_model_on_valid_sets(
        model, valid_sets, CONFIG.feature_cols, CONFIG.target_col, 'weight'
    )
    mean_r2 = sum(r2_scores) / len(r2_scores)
    
    # 打印当前试验的结果
    print(f"\nTrial {trial.number}:")
    print(f"R2 scores: {r2_scores}")
    print(f"Mean R2: {mean_r2}")
    
    return mean_r2

def run_optuna_optimization(n_trials=100):
    """
    运行Optuna优化
    """
    # 创建study对象
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=CONFIG.seeds[int(fold_num)]),
        study_name="xgboost_optimization"
    )
    
    # 运行优化
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    
    # 打印结果
    print("\nBest trial:")
    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
    
    # 可视化结果
    try:
        # 参数重要性
        optuna.visualization.plot_param_importances(study)
        plt.show()
        
        # 优化历史
        optuna.visualization.plot_optimization_history(study)
        plt.show()
        
        # 参数关系
        optuna.visualization.plot_parallel_coordinate(study)
        plt.show()
    except:
        print("Visualization failed. Make sure you have plotly installed for better visualizations.")
    
    return study

In [15]:
# # 运行优化
# study = run_optuna_optimization(n_trials=100)  # 可以根据需要调整trials数量

# # 使用最佳参数训练最终模型
# best_params = study.best_params
# best_params.update({
#     'random_state': CONFIG.seeds[int(fold_num)],
#     'tree_method': 'gpu_hist',
#     'device': 'cuda',
#     'n_gpus': 1
# })

# # 训练最终模型
# final_model = XGBRegressor(**best_params)
# X_train = train[CONFIG.feature_cols].ffill().fillna(0)
# y_train = train[CONFIG.target_col]
# w_train = train['weight']
# final_model.fit(X_train, y_train, sample_weight=w_train)

# # 评估最终模型
# valid_sets = [valid]
# final_scores = evaluate_model_on_valid_sets(
#     final_model, valid_sets, CONFIG.feature_cols, CONFIG.target_col, 'weight'
# )
# print("\nFinal Model Scores:")
# print(f"R2 scores: {final_scores}")
# print(f"Mean R2: {sum(final_scores) / len(final_scores)}")

## Model training

In [16]:
model = get_model(CONFIG.seeds[int(fold_num)])

## Evaluation and Prediction

In [17]:
# # Avoid potential memory issues when making predictions
# y_pred_train1 = model.predict(X_train.iloc[:X_train.shape[0]//2])
# y_pred_train2 = model.predict(X_train.iloc[X_train.shape[0]//2:])
# train_score = r2_score(y_train, np.concatenate([y_pred_train1, y_pred_train2], axis=0), sample_weight=w_train )
# train_score
# # 0.028943955898284912

In [18]:
# y_pred_valid = model.predict(X_valid)
# valid_score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
# valid_score

# 0.01829439401626587 没有fill，没有pca
# 0.010878384113311768 有fill，有pca
# 0.01117163896560669 有fill，没有pca
# 最新dataset with onehot score: 0.006251037120819092

In [19]:
valid_sets = [valid, valid2]  # 填入验证集（valid, valid1, valid2, valid3）
r2_scores = evaluate_model_on_valid_sets(model, valid_sets, CONFIG.feature_cols, CONFIG.target_col, "weight")
print("R² scores for the validation sets:", r2_scores)
print("R^2 scores mean:", sum(r2_scores) / len(r2_scores))

R² scores for the validation sets: [0.014700889587402344, 0.015374958515167236]
R^2 scores mean: 0.01503792405128479


### 用这个CV来检索最佳模型

In [20]:
print(xgb.__version__)

2.1.3


In [21]:
# 首先准备评估集的数据
X_valid1 = valid2[CONFIG.feature_cols].ffill().fillna(0)
y_valid1 = valid2[CONFIG.target_col]

import xgboost as xgb

def continue_training(model, d_valid, eval_sets=None, early_stopping_rounds=10):
    """
    更安全的继续训练方法
    
    Args:
        model: XGBoost模型
        d_valid: 当前验证集
        eval_sets: 包含评估集的列表，格式为 [(X_valid1, y_valid1)]
        early_stopping_rounds: 早停轮数
    """
    # Define original parameters
    params = model.attributes()
    original_lr = float(params.get('learning_rate', 0.1))
    
    # Update parameters
    params.update({
        'learning_rate': original_lr * 0.1,
        'reg_alpha': float(params.get('reg_alpha', 0)) + 0.5,
        'reg_lambda': float(params.get('reg_lambda', 0)) + 0.5,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    })
    
    # Convert data to DMatrix
    if eval_sets:
        evals = [(d_valid, 'train')] + [(xgb.DMatrix(X, label=y), f'valid_{i}') for i, (X, y) in enumerate(eval_sets)]
        early_stopping_rounds = 10
    else:
        evals=None
        early_stopping_rounds = None
    
    # Continue training
    model = xgb.train(
        params,
        d_valid,
        num_boost_round=500,  # Set a high number for boosting rounds
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
        xgb_model=model,  # Use the existing booster
        verbose_eval=True  # Optional: to see the progress
    )
    
    return model

# Use the updated function
eval_sets = [
    (X_valid1, y_valid1)
]

model = continue_training(model, dvalid, eval_sets)

[0]	train-rmse:0.89940	valid_0-rmse:0.84380
[1]	train-rmse:0.89905	valid_0-rmse:0.84379
[2]	train-rmse:0.89879	valid_0-rmse:0.84377
[3]	train-rmse:0.89854	valid_0-rmse:0.84376
[4]	train-rmse:0.89825	valid_0-rmse:0.84375
[5]	train-rmse:0.89798	valid_0-rmse:0.84374
[6]	train-rmse:0.89774	valid_0-rmse:0.84373
[7]	train-rmse:0.89747	valid_0-rmse:0.84372
[8]	train-rmse:0.89722	valid_0-rmse:0.84371
[9]	train-rmse:0.89699	valid_0-rmse:0.84369
[10]	train-rmse:0.89673	valid_0-rmse:0.84370
[11]	train-rmse:0.89650	valid_0-rmse:0.84370
[12]	train-rmse:0.89626	valid_0-rmse:0.84369
[13]	train-rmse:0.89604	valid_0-rmse:0.84368
[14]	train-rmse:0.89582	valid_0-rmse:0.84367
[15]	train-rmse:0.89557	valid_0-rmse:0.84365
[16]	train-rmse:0.89534	valid_0-rmse:0.84364
[17]	train-rmse:0.89511	valid_0-rmse:0.84363
[18]	train-rmse:0.89488	valid_0-rmse:0.84362
[19]	train-rmse:0.89464	valid_0-rmse:0.84361
[20]	train-rmse:0.89438	valid_0-rmse:0.84359
[21]	train-rmse:0.89414	valid_0-rmse:0.84359
[22]	train-rmse:0.89

In [22]:
new_r2_scores = evaluate_model_on_valid_sets(model, [valid, valid2], CONFIG.feature_cols, CONFIG.target_col, 'weight')
print("R² scores for the other validation sets:", new_r2_scores)

# new_r2_scores = None

R² scores for the other validation sets: [0.06117713451385498, 0.016469299793243408]


## Save Outputs

In [23]:
result = {
    "model" : model,
    "cv" : [r2_scores, new_r2_scores]
}
with open(f"xgb_fold_{fold_num}.pkl", "wb") as fp:
    pickle.dump(result, fp)

: 