In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# 1. Load the Iris dataset bundle returned by scikit-learn.
data = load_iris()

# Feature matrix (X) and target labels (y) taken from the bundle.
X, y = data.data, data.target

# Names of the feature columns and of the target classes.
feature_names, target_names = data.feature_names, data.target_names

In [5]:
# Report the basic shape of the dataset, followed by a separator rule.
for line in (
    "数据集加载完成。",
    f"特征数量: {X.shape[1]}, 样本数量: {X.shape[0]}",
    f"类别名称: {target_names}",
    "-" * 50,
):
    print(line)

数据集加载完成。
特征数量: 4, 样本数量: 150
类别名称: ['setosa' 'versicolor' 'virginica']
--------------------------------------------------


In [6]:
# 2. Define the models to compare, keyed by the display name used in reports.
# Every estimator gets random_state=42 so repeated runs are reproducible.
#
# NOTE: `use_label_encoder` is deliberately NOT passed to XGBClassifier.
# The installed XGBoost reports it as an unused parameter
# ('Parameters: { "use_label_encoder" } are not used.') on every fit,
# which floods the cross-validation output with one warning per fold.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(objective='multi:softmax', num_class=len(target_names),
                             eval_metric='mlogloss', random_state=42)
}

In [7]:
# 3. Cross-validation strategy shared by every model.
# KFold partitions the dataset into k folds; each fold serves as the
# test set exactly once.
cv = KFold(
    n_splits=5,       # k = 5 folds
    shuffle=True,     # shuffle the samples before splitting
    random_state=42,  # fixed seed so the split is reproducible across runs
)

In [8]:
# Per-model result stores, each keyed by model name:
# accuracy score, classification report, and confusion matrix.
accuracy_scores, classification_reports, confusion_matrices = {}, {}, {}

In [9]:
# Announce the start of the evaluation, followed by a separator rule.
print("开始进行模型交叉验证评估...", "-" * 50, sep="\n")

开始进行模型交叉验证评估...
--------------------------------------------------


In [10]:
# 4. Evaluate every model using cross-validated predictions.
for name in models:
    model = models[name]
    print(f"正在评估模型: {name}...")

    # cross_val_predict yields, for every sample in the dataset, the
    # prediction made for it during cross-validation.
    # n_jobs=-1 runs the folds in parallel on all available CPU cores.
    y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    # Compute all metrics from the same set of predictions:
    # overall accuracy, a per-class report (precision, recall, F1, ...)
    # as a dict, and the confusion matrix.
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(
        y, y_pred, target_names=target_names, output_dict=True
    )
    cm = confusion_matrix(y, y_pred)

    # Record the results under the model's display name.
    accuracy_scores[name] = accuracy
    classification_reports[name] = report
    confusion_matrices[name] = cm

    print(f"  {name} 精度: {accuracy:.4f}")
    print("-" * 50)

正在评估模型: Logistic Regression...
  Logistic Regression 精度: 0.9733
--------------------------------------------------
正在评估模型: Decision Tree...
  Decision Tree 精度: 0.9533
--------------------------------------------------
正在评估模型: Random Forest...
  Random Forest 精度: 0.9600
--------------------------------------------------
正在评估模型: XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  XGBoost 精度: 0.9467
--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# 5. Print a summary of the cross-validation results.
print("\n--- 交叉验证结果汇总 ---")

# Accuracy of every model, one line per model.
print("\n模型精度:")
for name, acc in accuracy_scores.items():
    print(f"- {name}: {acc:.4f}")

# Detailed classification report, using Random Forest as the example.
# Transposed so each class / average is a row and each metric a column.
print("\n--- 详细分类报告 (Random Forest 示例) ---")
rf_report = pd.DataFrame(classification_reports["Random Forest"]).T
print(rf_report)


--- 交叉验证结果汇总 ---

模型精度:
- Logistic Regression: 0.9733
- Decision Tree: 0.9533
- Random Forest: 0.9600
- XGBoost: 0.9467

--- 详细分类报告 (Random Forest 示例) ---
              precision  recall  f1-score  support
setosa             1.00    1.00      1.00    50.00
versicolor         0.94    0.94      0.94    50.00
virginica          0.94    0.94      0.94    50.00
accuracy           0.96    0.96      0.96     0.96
macro avg          0.96    0.96      0.96   150.00
weighted avg       0.96    0.96      0.96   150.00
