In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the samples for evaluation. The split is seeded so the
# accuracy printed below is repeatable under Restart & Run All.
# NOTE(review): X and y are not defined in the cells visible above this one —
# confirm an earlier cell provides them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Build and fit the baseline decision tree: Gini impurity as the split
# criterion, tree depth capped at 2.
base_model = DecisionTreeClassifier(max_depth=2, criterion='gini', random_state=1).fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = base_model.predict(X_test)

print(f"决策树的准确率：{accuracy_score(y_test,y_pred):.3f}")

决策树的准确率：0.833


In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

# Load the wine dataset and wrap it in pandas containers so the feature
# names travel with the data.
wine = load_wine()
print(f"所有特征：{wine.feature_names}")
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target)

# Seeded 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

print(f"训练数据量：{len(X_train)}，测试数据量：{len(X_test)}")

# AdaBoost on top of base_model: at most 50 weak learners, learning rate 0.8.
# random_state is fixed so repeated runs build the same ensemble, matching the
# seeded AdaBoostClassifier used in the grid-search cells.
model = AdaBoostClassifier(
    estimator=base_model,
    n_estimators=50,
    learning_rate=0.8,
    random_state=1,
)

# Fit on the training split, then score on the held-out test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)

# ".2f" prints two decimal places; a bare ".2" would mean two significant digits.
print(f"准确率：{acc:.2f}")


所有特征：['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
训练数据量：142，测试数据量：36
准确率：0.97


## 使用GridSearchCV自动调参

In [None]:
# Search grid: even ensemble sizes from 2 to 100, learning rates from 0.1 to 1.
hyperparameter_space = dict(
    n_estimators=list(range(2, 102, 2)),
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
)

# Exhaustive grid search scored by accuracy; the best-scoring parameters are
# printed below. cv=5 selects 5-fold cross-validation and n_jobs=-1 runs as
# many parallel jobs as there are CPUs.
gs = GridSearchCV(
    estimator=AdaBoostClassifier(
        # Original author's note: dropping this argument falls back to the
        # default base learner, i.e. a depth-1 tree.
        estimator=base_model,
        random_state=1,
    ),
    param_grid=hyperparameter_space,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
).fit(X_train, y_train)

print("最优超参数:", gs.best_params_)
print("最佳交叉验证准确率:", gs.best_score_)

最优超参数: {'learning_rate': 0.9, 'n_estimators': 24}
最佳交叉验证准确率: 0.9928571428571429


In [19]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to sweep.
hyperparameter_space = {
    "n_estimators": [*range(2, 102, 2)],
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

# Boosted classifier under search, built on the previously defined base model.
# Author's note: the algorithm='SAMME.R' argument was removed here
# (alternatively it could be set to algorithm='SAMME').
gs = GridSearchCV(
    AdaBoostClassifier(random_state=1, estimator=base_model),
    hyperparameter_space,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
)

# Run the grid search on the training split.
gs.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated accuracy.
print("最优超参数:", gs.best_params_)
print("最佳交叉验证准确率:", gs.best_score_)

最优超参数: {'learning_rate': 0.9, 'n_estimators': 24}
最佳交叉验证准确率: 0.9928571428571429
