In [None]:
# KNN
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, f1_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting style, applied to every figure created below:
# Times New Roman at size 15, axes line width 1.5, ticks pointing inward.
plt.rcParams.update({
    'font.family': 'Times New Roman',
    'font.size': 15,
    'axes.linewidth': 1.5,
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
})

# 1. Load the data
file_path = '微生物多样性输入变量_带时滞特征特征选择筛选二分类230nornaless0610.xlsx'
data = pd.read_excel(file_path)

# 2. Prepare the data (standardised, same split ratio as Program 1)
# The last column is used as the target; every other column is a feature.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage) and make the test metrics optimistic. The split depends only
# on the number of rows, test_size and random_state, so the same rows land in
# each partition as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then reuse them.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept so that any later
# code referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# 3. Hyperparameters corresponding to scheme 12 of Program 1
knn_params = {
    'n_neighbors': 7,
    'weights': 'distance',
    'metric': 'manhattan',
}
model = KNeighborsClassifier(**knn_params)
model.fit(X_train, y_train)

# 4. Predictions: hard labels for both splits, plus the test-set probability
# of the class stored in the second predict_proba column.
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
test_proba_all = model.predict_proba(X_test)
test_probs = test_proba_all[:, 1]

# 5. Report metrics: per-class report followed by overall accuracy, first for
# the training split and then for the test split.
# NOTE(review): if the model uses distance weighting, each training point is
# presumably its own zero-distance neighbour, so the training scores would be
# near-perfect and say little about generalisation - confirm.
print('=== 训练集评估 ===')
train_report = classification_report(y_train, train_preds)
print(train_report)
train_accuracy = accuracy_score(y_train, train_preds)
print(f'训练集准确率: {train_accuracy:.3f}\n')

print('=== 测试集评估 ===')
test_report = classification_report(y_test, test_preds)
print(test_report)
test_accuracy = accuracy_score(y_test, test_preds)
print(f'测试集准确率: {test_accuracy:.3f}\n')

# 6. 混淆矩阵绘图函数
def plot_cm(y_true, preds, title, labels=None):
    """Draw a confusion-matrix heatmap in a new figure and show it.

    Args:
        y_true: Ground-truth class labels.
        preds: Predicted class labels, aligned with ``y_true``.
        title: Text placed above the heatmap.
        labels: Optional sequence of class labels fixing the row/column
            order. When omitted, the sorted union of the labels found in
            ``y_true`` and ``preds`` is used, which is the order
            ``confusion_matrix`` applies by default.
    """
    if labels is None:
        observed = np.concatenate(
            [np.asarray(y_true).ravel(), np.asarray(preds).ravel()]
        )
        labels = np.unique(observed)
    tick_labels = list(labels)

    cm = confusion_matrix(y_true, preds, labels=tick_labels)
    plt.figure(figsize=(8, 6))
    # Pass the real class labels as tick labels; by default the heatmap would
    # show positional indices (0, 1, ...), which is misleading whenever the
    # classes are not encoded as 0..k-1.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title(title, fontsize=15, pad=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tick_params(direction='in')
    plt.show()

# Confusion matrices for the training split, then the test split.
plot_cm(
    y_true=y_train,
    preds=train_preds,
    title='Confusion Matrix - Training Set',
)
plot_cm(
    y_true=y_test,
    preds=test_preds,
    title='Confusion Matrix - Test Set',
)

# 7. ROC curve and precision-recall curve plots
# ROC curve on the test split, with the AUC shown in the legend.
fpr, tpr, _ = roc_curve(y_test, test_probs)
auc_score = roc_auc_score(y_test, test_probs)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, linewidth=1.0, label=f'AUC = {auc_score:.2f}')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1.0)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve - Test Set')
roc_ax.legend()
plt.show()

# Precision-recall curve on the test split, with the average precision (AP)
# shown in the legend.
precision, recall, _ = precision_recall_curve(y_test, test_probs)
ap_score = average_precision_score(y_test, test_probs)

pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_ax.plot(recall, precision, linewidth=1.0, label=f'AP = {ap_score:.2f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve - Test Set')
pr_ax.legend()
plt.show()

# 8. Threshold analysis: scan cut-offs 0.1 ... 0.9 and keep the one with the
# highest F1 score.
# NOTE(review): the cut-off is selected on the test split itself, so the
# reported best F1 is optimistic; a separate validation split would be needed
# for an unbiased estimate.
# NOTE(review): the (probs >= thr).astype(int) encoding assumes the labels are
# 0/1 with 1 as the positive class - confirm against the data file.

# Derive the grid from integers so each value is the closest float to its
# decimal; np.arange(0.1, 1.0, 0.1) drifts (e.g. 0.30000000000000004), which
# would misclassify a probability sitting exactly on a threshold.
thresholds = np.arange(1, 10) / 10

# zero_division=0 gives the same score (0.0) the default would return when a
# threshold yields no positive predictions, but without emitting a warning.
f1_scores = [
    f1_score(y_test, (test_probs >= thr).astype(int), zero_division=0)
    for thr in thresholds
]

# np.argmax returns the first maximum, so ties resolve to the lowest threshold.
best_idx = int(np.argmax(f1_scores))
optimal_thr = thresholds[best_idx]
print(f'最优阈值: {optimal_thr:.2f}, 最大F1值: {f1_scores[best_idx]:.3f}')
