### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [57]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

Part1. data: breast_cancer

In [29]:
cancer = datasets.load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.25, random_state=42)
clf=GradientBoostingClassifier()
clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
print(f'accuracy_score: {metrics.accuracy_score(y_test, y_pred):.3f}')

accuracy_score: 0.958


In [49]:
# 設定要訓練的超參數組合
n_estimators = np.arange(50,500,50)
max_depth = np.arange(1,11)
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

grid_search = GridSearchCV(clf, param_grid, scoring="accuracy", n_jobs=-1, verbose=1) #窮舉法
# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)
## scoring選擇 https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter


Fitting 5 folds for each of 90 candidates, totalling 450 fits


In [50]:
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.971792 using {'max_depth': 1, 'n_estimators': 100}


In [54]:
# 使用最佳參數重新建立模型
clf_bestparam = GradientBoostingClassifier(
    max_depth=grid_result.best_params_['max_depth'],
    n_estimators=grid_result.best_params_['n_estimators']
)
clf_bestparam.fit(x_train,y_train)
y_pred=clf_bestparam.predict(x_test)
print(f'accuracy_score: {metrics.accuracy_score(y_test, y_pred):.3f}')

accuracy_score: 0.965


調整超參數後accuracy由0.958提升到0.965

Part2. data: iris

In [67]:
iris=datasets.load_iris()
x_train, x_test, y_train, y_test=train_test_split(iris.data, iris.target, test_size=0.2, random_state=2)
reg=GradientBoostingRegressor(random_state=2)
reg.fit(x_train, y_train)
y_pred=reg.predict(x_test)
print(f'MSE: {metrics.mean_squared_error(y_test, y_pred):.4f} ')

MSE: 0.0572 


In [68]:
# 設定要訓練的超參數組合
n_estimators = np.arange(50,500,50)
max_depth = np.arange(1,11)
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

grid_search = GridSearchCV(reg, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1) #窮舉法
# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)
## scoring選擇 https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

Fitting 5 folds for each of 90 candidates, totalling 450 fits


In [69]:
print(f'Best MSE: {grid_result.best_score_} using {grid_result.best_params_}')

Best MSE: -0.024818725858610412 using {'max_depth': 1, 'n_estimators': 50}


In [71]:
# 使用最佳參數重新建立模型
reg_bestparam=GradientBoostingRegressor(
    n_estimators=grid_result.best_params_['n_estimators'],
    max_depth=grid_result.best_params_['max_depth']
)
reg_bestparam.fit(x_train, y_train)
y_pred=reg_bestparam.predict(x_test)
print(f'MSE: {metrics.mean_squared_error(y_test, y_pred):.4f}')

MSE: 0.0437


調整超參數後MSE由0.0572 降到 0.0437(MSE越小越好)