In [None]:
# GBRT
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Global plot styling, applied to every figure drawn below.
# Typography: one font family and a uniform 15 pt size for all text elements.
_font_style = {
    'font.family': 'Times New Roman',
    'font.size': 15,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'axes.labelsize': 15,    # axis-label font size
    'legend.fontsize': 15,   # legend font size
    'figure.titlesize': 15,  # figure-title font size
}
# Frame: thicker axes border with tick marks pointing into the plot area.
_frame_style = {
    'axes.linewidth': 1.5,
    'xtick.direction': 'in',
    'ytick.direction': 'in',
}
plt.rcParams.update({**_font_style, **_frame_style})
# Load the data set from the Excel workbook.
data = pd.read_excel('微生物多样性输入变量_带时滞特征特征选择筛选二分类230nornaless0610.xlsx')

# Data preparation: every column except the last is a feature, the last column
# is the label. The code below treats the labels as 0 (negative) / 1 (positive).
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Class weights: the positive class is up-weighted by the negative/positive
# ratio observed in the training split so both classes carry equal total weight.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError('训练集中没有正类样本 (label == 1)，无法计算类权重')
class_weight = {0: 1, 1: n_negative / n_positive}

# GradientBoostingClassifier has no `class_weight` parameter, so the class
# weights are expanded into one weight per training sample and handed to fit().
sample_weight = y_train.map(class_weight).to_numpy()

# GBRT classifier (fixed hyper-parameters, no tuning)
model = GradientBoostingClassifier(
    max_depth=7,            # maximum depth of each tree
    learning_rate=0.1,      # shrinkage applied to each tree's contribution
    n_estimators=100,       # number of boosting stages (trees)
    subsample=0.6,          # fraction of samples drawn for each tree
    min_samples_split=5,    # minimum samples required to split a node
    min_samples_leaf=2,     # minimum samples required at a leaf
    random_state=42         # subsample < 1 makes training stochastic; fix the seed for reproducibility
)

# Train the model with the per-sample class weights
model.fit(X_train, y_train, sample_weight=sample_weight)

# Predictions: hard labels for both splits, positive-class probability for the test split
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
test_probs = model.predict_proba(X_test)[:, 1]

# Metric reporting
def print_metrics(y_true, preds, dataset):
    """Print the classification report and overall accuracy for one data split.

    Args:
        y_true: Ground-truth labels.
        preds: Predicted labels, aligned with ``y_true``.
        dataset: Name of the split, interpolated into the printed headings.
    """
    print(f'=== {dataset} 集评估 ===')
    report = classification_report(y_true, preds)
    print(report)
    accuracy = accuracy_score(y_true, preds)
    print(f'总体准确率 ({dataset}): {accuracy:.3f}\n')

print_metrics(y_true=y_train, preds=train_preds, dataset='训练')
print_metrics(y_true=y_test, preds=test_preds, dataset='测试')

# Confusion-matrix plotting helper
def plot_cm(y_true, preds, title):
    """Show the confusion matrix of ``preds`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        preds: Predicted labels, aligned with ``y_true``.
        title: Title drawn above the heatmap.
    """
    counts = confusion_matrix(y_true, preds)
    _, ax = plt.subplots(figsize=(8, 6))
    # Integer annotations in every cell; the colour bar is omitted.
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(title, fontsize=15, pad=15)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.tick_params(direction='in')
    for border in ax.spines.values():
        border.set_linewidth(1.5)
    plt.show()

plot_cm(y_true=y_train, preds=train_preds, title='Confusion Matrix - Training Set')
plot_cm(y_true=y_test, preds=test_preds, title='Confusion Matrix - Test Set')

# ROC and precision-recall curves
def plot_curve(x, y, xlabel, ylabel, title, score_label, *, show_diagonal=None):
    """Plot a single evaluation curve (ROC or precision-recall) and show it.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        title: Figure title.
        score_label: Legend entry for the curve (e.g. the AUC or AP score).
        show_diagonal: Whether to draw the dashed (0, 0)-(1, 1) reference
            line. ``None`` (the default) keeps the legacy behaviour of drawing
            it only when ``xlabel`` is exactly ``'False Positive Rate (FPR)'``.
    """
    if show_diagonal is None:
        # Legacy fallback so existing callers that rely on the label text
        # keep getting the diagonal on ROC plots.
        show_diagonal = xlabel == 'False Positive Rate (FPR)'

    plt.figure(figsize=(8, 6))
    plt.plot(x, y, color='#4B0082', linewidth=1.0, label=score_label)
    if show_diagonal:
        plt.plot([0, 1], [0, 1], 'k--', linewidth=1.0)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title, pad=15)
    plt.legend()
    plt.tick_params(direction='in')
    for spine in plt.gca().spines.values():
        spine.set_linewidth(1.5)
    plt.show()

# ROC curve on the test split, with the diagonal reference line
fpr, tpr, _ = roc_curve(y_test, test_probs)
auc_score = roc_auc_score(y_test, test_probs)
plot_curve(fpr, tpr, 'False Positive Rate (FPR)', 'True Positive Rate (TPR)', 'ROC Curve - Test Set', f'AUC = {auc_score:.2f}', show_diagonal=True)

# Precision-recall curve on the test split (no diagonal)
precision, recall, _ = precision_recall_curve(y_test, test_probs)
ap_score = average_precision_score(y_test, test_probs)
plot_curve(recall, precision, 'Recall', 'Precision', 'Precision-Recall Curve - Test Set', f'AP = {ap_score:.2f}', show_diagonal=False)

# Threshold analysis: sweep the decision threshold over 0.1 ... 0.9 and keep
# the one with the highest F1 on the test split.
# NOTE(review): the threshold is chosen on the same split used for evaluation.
thresholds = np.arange(0.1, 1.0, 0.1)
f1_scores = []
for thr in thresholds:
    # Binarise the positive-class probabilities at the current threshold.
    thresholded_preds = (test_probs >= thr).astype(int)
    f1_scores.append(f1_score(y_test, thresholded_preds))
optimal_thr = thresholds[np.argmax(f1_scores)]
print(f'最优阈值: {optimal_thr:.2f}, F1: {max(f1_scores):.3f}')

# Feature importance: bar chart of the (at most) ten highest-scoring features
feature_importances = model.feature_importances_
indices = np.argsort(feature_importances)[::-1]  # feature positions, most important first

top_indices = indices[:10]
bar_positions = range(min(10, X.shape[1]))

plt.figure(figsize=(8, 6))
plt.bar(bar_positions, feature_importances[top_indices], align='center', color='#4B0082')
# Label each bar with its feature (column) name, rotated to avoid overlap.
plt.xticks(bar_positions, X.columns[top_indices], rotation=90)
plt.title('Feature Importance', pad=15)
plt.ylabel('Importance Score')
importance_ax = plt.gca()
importance_ax.tick_params(direction='in')
for border in importance_ax.spines.values():
    border.set_linewidth(1.5)
plt.tight_layout()
plt.show()
