# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [87]:
import pandas as pd
import numpy as np
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [99]:
# 讀取鳶尾花資料集
wine = datasets.load_wine()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3)

# 建立模型
clf = RandomForestClassifier()

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred = clf.predict(x_test)

In [100]:
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.9629629629629629


In [101]:
R2=r2_score(y_test, y_pred)
R2

0.9385315879339784

In [102]:
MSE=mean_squared_error(y_test, y_pred)
MSE

0.037037037037037035

In [103]:
def evaluate(model, x_test, y_test):
    predictions = model.predict(x_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
base_model = clf
base_accuracy = evaluate(base_model, x_test, y_test)

Model Performance
Average Error: 0.0370 degrees.
Accuracy = nan%.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [105]:
# 設定要訓練的超參數組合
param_grid = {
    'n_estimators': [100, 200,500,1000],
    'max_depth': [10, 25, 50,100],
    'min_samples_split': [2, 6, 8,8],
    'min_samples_leaf': [1, 3, 5,7]}

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(clf, param_grid,  cv=5,n_jobs=-1, verbose=2)


In [106]:
# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1280 out of 1280 | elapsed:  7.8min finished


In [107]:
results=grid_search.best_params_
results

{'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 100}

In [108]:
# 使用最佳參數重新建立模型
clf_bestparam = RandomForestClassifier(n_estimators=100, max_depth=50, min_samples_leaf=1, min_samples_split=8)

# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred_grid = clf_bestparam.predict(x_test)

In [109]:
R2=r2_score(y_test, y_pred_grid)
R2

1.0

In [110]:
MSE_best=mean_squared_error(y_test, y_pred_grid)
MSE_best

0.0

In [111]:
acc_grid = metrics.accuracy_score(y_test, y_pred_grid)
print("Acuuracy: ", acc_grid)

Acuuracy:  1.0


In [112]:
best_model = clf_bestparam
base_accuracy = evaluate(best_model, x_test, y_test)

Model Performance
Average Error: 0.0000 degrees.
Accuracy = nan%.


  after removing the cwd from sys.path.
