In [23]:
################################################
# Load data and run feature engineering
################################################

# Execute the two companion notebooks through IPython's %run magic.
# NOTE(review): presumably these notebooks define `combined_data` and
# `n_original`, which the data-preparation cell reads - confirm.
%run load_data.ipynb
%run build_features.ipynb

In [24]:
################################################
# Prepare the data layout
################################################

# Keep only the first `n_original` rows, then split columns:
# every column except the last is a feature, the last column is the target.
# NOTE(review): presumably the first `n_original` rows are the labelled
# original samples - confirm against the notebooks run above.
original_rows = combined_data.iloc[:n_original]
features = original_rows.iloc[:, :-1]
targets = original_rows.iloc[:, -1]

In [25]:
################################################
# Define the custom scorer
################################################

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import mean_absolute_error, make_scorer

def calculate_validation_score(y_true, y_pred, threshold=30):
    """Blend overall MAE with an in-range MAE weighted by in-range F1.

    A value is "in range" when it lies in the closed interval [0, threshold].
    The score is::

        0.5 * (1 - mae / 100) + 0.5 * (1 - range_mae / 100) * f1

    where ``mae`` is the mean absolute error over all samples, ``range_mae``
    is the mean absolute error over the samples whose *prediction* is in
    range (100 when no prediction is in range), and ``f1`` is the F1 score of
    "prediction in range" against "true value in range".

    Args:
        y_true: Ground-truth values; any 1-D array-like (list, ndarray,
            pandas Series).
        y_pred: Predicted values; same length as ``y_true``.
        threshold: Upper bound of the in-range interval. Defaults to 30.

    Returns:
        The blended score as a float; higher is better.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Coerce to plain float arrays so lists are accepted and pandas inputs are
    # compared positionally instead of being aligned on their index labels.
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()
    if true_values.shape != pred_values.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {true_values.size} and {pred_values.size}"
        )

    abs_errors = np.abs(true_values - pred_values)
    mae = np.mean(abs_errors)

    true_in_range = (true_values >= 0) & (true_values <= threshold)
    pred_in_range = (pred_values >= 0) & (pred_values <= threshold)

    n_true_in_range = np.sum(true_in_range)
    n_pred_in_range = np.sum(pred_in_range)
    n_both_in_range = np.sum(true_in_range & pred_in_range)

    # MAE restricted to samples predicted in range; 100 is the penalty value
    # that zeroes the second term when nothing is predicted in range.
    range_mae = np.mean(abs_errors[pred_in_range]) if n_pred_in_range > 0 else 100

    precision = n_both_in_range / n_pred_in_range if n_pred_in_range > 0 else 0
    recall = n_both_in_range / n_true_in_range if n_true_in_range > 0 else 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return score

# Wrap the metric as a scikit-learn scorer; higher scores are better,
# so model selection maximizes it.
custom_scorer = make_scorer(
    calculate_validation_score,
    greater_is_better=True,
)


In [None]:
################################################
# Hyperparameter search
################################################

# Candidate values per LightGBM hyperparameter; the grid search evaluates
# every combination: 3 * 2 * 3 * 2 * 2 * 3 * 2 * 3 = 1,296 candidates.
param_grid = dict(
    max_depth=[7, 9, 11],
    learning_rate=[0.01, 0.02],
    num_leaves=[31, 63, 127],
    feature_fraction=[0.8, 0.9],
    bagging_fraction=[0.8, 0.9],
    bagging_freq=[0, 5, 10],
    n_estimators=[15000, 20000],
    min_child_samples=[20, 30, 50],
)

gbm = lgb.LGBMRegressor(boosting_type="gbdt", objective="regression")

# cv=3 means each candidate is fitted three times (3,888 fits in total),
# each with 15k-20k boosting rounds - expect a very long run.
grid = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=3,
    verbose=1,
)
grid.fit(features, targets)

print(f"Best parameters found by grid search are: {grid.best_params_}")
print(f"Best estimator found by grid search are: {grid.best_estimator_}")


In [16]:

# Bundle the grid-search results under the keys used when they are saved.
# Note: `best_estimator` holds the estimator object itself, not plain data.
combined_params = dict(
    best_params=grid.best_params_,
    best_estimator=grid.best_estimator_,
)

In [17]:
import json


def _json_fallback(obj):
    """Convert a value the standard JSON encoder rejects into serializable data.

    Objects exposing a callable ``get_params`` (scikit-learn style estimators)
    are stored as their parameter dict; anything else is stored as ``str(obj)``.
    """
    get_params = getattr(obj, "get_params", None)
    if callable(get_params):
        return get_params()
    return str(obj)


# `combined_params["best_estimator"]` is an estimator object, which plain
# json.dump rejects with TypeError. Serialize to a string first so that any
# failure happens before the file is opened and cannot leave truncated JSON
# on disk.
serialized_params = json.dumps(combined_params, indent=4, default=_json_fallback)

with open('../data/external_data/best_hyper_parameters.json', 'w', encoding='utf-8') as file:
    file.write(serialized_params)
