# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [2]:
from sklearn import datasets,metrics
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import time

In [3]:
boston = datasets.load_boston()
df = pd.DataFrame(boston.data , columns = boston.feature_names)
df2 = (df - df.mean()) / df.std() #z轉換
df2.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.4173,0.284548,-1.286636,-0.272329,-0.144075,0.413263,-0.119895,0.140075,-0.981871,-0.665949,-1.457558,0.440616,-1.074499
1,-0.414859,-0.48724,-0.592794,-0.272329,-0.73953,0.194082,0.366803,0.556609,-0.867024,-0.986353,-0.302794,0.440616,-0.491953
2,-0.414861,-0.48724,-0.592794,-0.272329,-0.73953,1.281446,-0.265549,0.556609,-0.867024,-0.986353,-0.302794,0.396035,-1.207532
3,-0.41427,-0.48724,-1.305586,-0.272329,-0.834458,1.015298,-0.809088,1.076671,-0.752178,-1.105022,0.11292,0.415751,-1.360171
4,-0.410003,-0.48724,-1.305586,-0.272329,-0.834458,1.227362,-0.510674,1.076671,-0.752178,-1.105022,0.11292,0.440616,-1.025487


In [4]:
X = df2.values  #用.values取
Y = boston.target
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.25,random_state=42)

GBR = GradientBoostingRegressor()
GBR.fit(x_train,y_train)
y_pred = GBR.predict(x_test)

print(f'Accuacy= {metrics.r2_score(y_test, y_pred):.5f}')
print(f'MSE= {metrics.mean_squared_error(y_test,y_pred):.5f}')

Accuacy= 0.88335
MSE= 8.16840


In [5]:
tStart = time.time()
# 設定要訓練的超參數組合
learning_rate = [0.03, 0.05, 0.1, 0.15, 0.2] #default=0.1
n_estimators = [50, 100, 200, 300, 400]      #default=100
max_depth = [1, 3, 5, 7]                     #default=3
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, learning_rate = learning_rate)

# 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(GBR, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train,y_train)

tEnd = time.time()
print("It cost %f sec" % (tEnd - tStart))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.1s


It cost 16.227108 sec


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.9s finished


In [6]:
# 印出最佳結果與最佳參數
print('Best Accuracy: %f using %s' % (grid_result.best_score_ , grid_result.best_params_))

Best Accuracy: -12.678452 using {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [7]:
grid_result.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}

In [8]:
#用最佳重新建模
GBR_best = GradientBoostingRegressor(learning_rate = grid_result.best_params_['learning_rate'], 
                                     max_depth = grid_result.best_params_['max_depth'], 
                                     n_estimators = grid_result.best_params_['n_estimators'])
GBR_best.fit(x_train,y_train)
Y_pred = GBR_best.predict(x_test)
print(f'ACC= {metrics.r2_score(y_test,Y_pred):.5f}')
print(f'MSE= {metrics.mean_squared_error(y_test,Y_pred):.5f}')

ACC= 0.88502
MSE= 8.05164
