# In [None]:
# CatBoost
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Global plot styling, chosen to match the figures in the paper.
plt.rcParams.update({
    'font.family': 'Times New Roman',
    'font.size': 12,
    'axes.linewidth': 1.5,
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

# 1. Load the dataset from the Excel workbook.
file_path = '微生物多样性输入变量_带时滞特征特征选择筛选二分类230less0610.xlsx'
data = pd.read_excel(file_path)

# 2. Prepare the data: every column except the last is a feature, the last
#    column is the label. Hold out 30% of the rows as the test set, using a
#    fixed seed so the split is reproducible.
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 3. Class weights for the (possibly imbalanced) binary target.
# The list is ordered [weight for label 0, weight for label 1]. Each class is
# weighted by the OTHER class's share of the training set, so the rarer class
# receives the larger weight and both classes contribute the same total weight
# (neg_count * w0 == pos_count * w1). Weighting each class by its own share
# would instead amplify the imbalance.
neg_count = int((y_train == 0).sum())
pos_count = int((y_train == 1).sum())
total_count = neg_count + pos_count
class_weights = [pos_count / total_count, neg_count / total_count]

# 4. CatBoost model with a fixed, hand-specified set of hyperparameters.
catboost_params = {
    'depth': 7,
    'learning_rate': 0.1,
    'iterations': 200,
    'subsample': 0.8,
    'l2_leaf_reg': 5,
    'verbose': False,
    'class_weights': class_weights,
}
model = CatBoostClassifier(**catboost_params)
model.fit(X_train, y_train)

# 5. Predictions: hard labels for both splits, plus column 1 of the test-set
#    class-probability matrix (used below as the positive-class score).
train_preds, test_preds = (model.predict(split) for split in (X_train, X_test))
test_probs = model.predict_proba(X_test)[:, 1]

# 6. Metric reporting
def print_metrics(y_true, preds, dataset):
    """Print a classification report and overall accuracy for one data split.

    Args:
        y_true: Ground-truth labels.
        preds: Predicted labels, aligned with ``y_true``.
        dataset: Display name of the split, interpolated into the printed
            header and accuracy line.
    """
    print(f'=== {dataset} 集评估 ===')
    report = classification_report(y_true, preds)
    print(report)
    print(f'总体准确率 ({dataset}): {accuracy_score(y_true, preds):.3f}\n')

# Report the training split first, then the test split.
for split_labels, split_preds, split_name in (
    (y_train, train_preds, '训练'),
    (y_test, test_preds, '测试'),
):
    print_metrics(split_labels, split_preds, split_name)

# 7. Confusion-matrix plotting helper
def plot_cm(y_true, preds, title):
    """Display the confusion matrix of ``preds`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        preds: Predicted labels, aligned with ``y_true``.
        title: Title drawn above the heatmap.
    """
    cm = confusion_matrix(y_true, preds)

    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)

    ax.set_title(title, fontsize=12, pad=15)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

    # Inward ticks and a 1.5 pt frame, consistent with the other figures.
    ax.tick_params(direction='in')
    for border in ax.spines.values():
        border.set_linewidth(1.5)
    plt.show()

# One confusion-matrix figure per split: training first, then test.
for cm_labels, cm_preds, cm_title in (
    (y_train, train_preds, 'Confusion Matrix - Training Set'),
    (y_test, test_preds, 'Confusion Matrix - Test Set'),
):
    plot_cm(cm_labels, cm_preds, cm_title)

# 8. ROC and precision-recall curves (test set). This part: ROC curve.
fpr, tpr, _ = roc_curve(y_test, test_probs)
auc_score = roc_auc_score(y_test, test_probs)

plt.figure(figsize=(8, 6))
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, color='#4B0082', linewidth=1.0, label=f'AUC = {auc_score:.2f}')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1.0)
roc_ax.set_xlabel('False Positive Rate (FPR)')
roc_ax.set_ylabel('True Positive Rate (TPR)')
roc_ax.set_title('ROC Curve - Test Set', pad=15)
roc_ax.legend()
roc_ax.tick_params(direction='in')
for border in roc_ax.spines.values():
    border.set_linewidth(1.5)
plt.show()

# Precision-recall curve on the test set, labelled with the average precision.
precision, recall, _ = precision_recall_curve(y_test, test_probs)
ap_score = average_precision_score(y_test, test_probs)

plt.figure(figsize=(8, 6))
pr_ax = plt.gca()
pr_ax.plot(recall, precision, color='#4B0082', linewidth=1.0, label=f'AP = {ap_score:.2f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve - Test Set', pad=15)
pr_ax.legend()
pr_ax.tick_params(direction='in')
for border in pr_ax.spines.values():
    border.set_linewidth(1.5)
plt.show()

# 9. Threshold analysis: sweep cut-offs from 0.1 in steps of 0.1 (stopping
#    before 1.0), score each with F1 on the test set, and report the best one.
thresholds = np.arange(0.1, 1.0, 0.1)
f1_scores = []
for thr in thresholds:
    # Binarize the probabilities at this cut-off before scoring.
    hard_labels = (test_probs >= thr).astype(int)
    f1_scores.append(f1_score(y_test, hard_labels))
optimal_thr = thresholds[np.argmax(f1_scores)]
print(f'最优阈值: {optimal_thr:.2f}, F1: {max(f1_scores):.3f}')

# 10. Feature importance: horizontal bar chart of the ten highest-scoring features.
feature_importances = model.get_feature_importance()
sorted_indices = np.argsort(feature_importances)[::-1]

# Keep the ten largest, then reverse them so the largest bar is drawn last.
top_indices = sorted_indices[:10][::-1]

plt.figure(figsize=(8, 6))
importance_ax = plt.gca()
importance_ax.barh(
    X.columns[top_indices],
    feature_importances[top_indices],
    color='#4B0082',
)
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Feature Importance', pad=15)
importance_ax.tick_params(direction='in')
for border in importance_ax.spines.values():
    border.set_linewidth(1.5)
plt.tight_layout()
plt.show()
