# 改错记录
1. 运行时报错：AttributeError: module 'urllib' has no attribute 'urlopen' —— 在python3.x中应该使用urllib.request.urlopen()方法
2. 

In [2]:
import numpy as np
## 加载CSV文件
dataset = np.loadtxt("diabetes.csv", delimiter=",", skiprows=1)
## 区分特征和标签
X = dataset[:,0:7]
y = dataset[:,8]

In [3]:
## 数据标准化
from sklearn import preprocessing
## 标准化
normalized_X = preprocessing.normalize(X)

In [4]:
## 特征选择
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

(trainX, testX, trainy, testy) = train_test_split(normalized_X, y, random_state=3, test_size=0.2)
model = ExtraTreesClassifier()
model.fit(trainX, trainy)
## 特征重要度
print(model.feature_importances_)

[0.15870686 0.17036047 0.18643466 0.102149   0.08290808 0.15171649
 0.14772442]


In [9]:
## 建模与评估
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)
model.fit(trainX, trainy)
print('MODEL')
print(model)
## 预测
expected = testy
predicted = model.predict(testX)
## 输出评估结果
print('RESULT')
print(metrics.classification_report(expected, predicted))
print('CONFUSION MATRIX')
print(metrics.confusion_matrix(expected, predicted))

MODEL
RandomForestClassifier(n_estimators=10)
RESULT
              precision    recall  f1-score   support

         0.0       0.63      0.78      0.70        92
         1.0       0.49      0.31      0.38        62

    accuracy                           0.59       154
   macro avg       0.56      0.54      0.54       154
weighted avg       0.57      0.59      0.57       154

CONFUSION MATRIX
[[72 20]
 [43 19]]


In [6]:
## 超参数调优
from sklearn.model_selection import GridSearchCV
# 定义超参数的搜索空间
param_grid = {
    'n_estimators': [10, 20, 30],  # 可以根据实际情况扩展
    'max_depth': [None, 10, 20],  # 可以根据实际情况扩展
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# 创建GridSearchCV对象
grid_search = GridSearchCV(model, param_grid, cv=5)
# 使用训练数据进行超参数调优
grid_search.fit(X, y)
# 输出最佳参数
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
# 输出最佳模型
best_model = grid_search.best_estimator_

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 10}


In [7]:
best_model.fit(trainX, trainy)
print('MODEL')
print(best_model)
## 预测
expected = testy
predicted = best_model.predict(testX)
## 输出评估结果
print('RESULT')
print(metrics.classification_report(expected, predicted))
print('CONFUSION MATRIX')
print(metrics.confusion_matrix(expected, predicted))

MODEL
RandomForestClassifier(min_samples_leaf=2, min_samples_split=10,
                       n_estimators=10)
RESULT
              precision    recall  f1-score   support

         0.0       0.69      0.85      0.76        92
         1.0       0.66      0.44      0.52        62

    accuracy                           0.68       154
   macro avg       0.67      0.64      0.64       154
weighted avg       0.68      0.68      0.67       154

CONFUSION MATRIX
[[78 14]
 [35 27]]
