## 作業

1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
import warnings
warnings.simplefilter('ignore')

In [2]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

wine = load_wine()
X = wine.data
y = wine.target

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f"Accuracy = {accuracy_score(y_test, y_pred):.3f}")

Accuracy = 0.978


In [3]:
def get_best_model_and_accuracy(model, params, X, y):
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(model, params, cv=5, error_score=0, n_jobs=-1)
    grid.fit(X, y)
    print(f"Best accuracy: {grid.best_score_}")
    print(f"Best params: {grid.best_params_}")
    print(f"Average time to fit (s): {grid.cv_results_['mean_fit_time'].mean():.3f}")
    print(f"Average time to score (s): {grid.cv_results_['mean_score_time'].mean():.3f}")

In [7]:
from sklearn.pipeline import Pipeline

clf = RandomForestClassifier()

pipeline = Pipeline([
    ('clf', clf)
])
params = {
    'clf__n_estimators': [10, 20, 50, 100],
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [2, 5, 10],
}
get_best_model_and_accuracy(pipeline, params, X, y)

Best accuracy: 0.9775280898876404
Best params: {'clf__criterion': 'gini', 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 10, 'clf__n_estimators': 50}
Average time to fit (s): 0.044
Average time to score (s): 0.005


In [8]:
rf = RandomForestClassifier(n_estimators=50, criterion='gini', min_samples_leaf=5, min_samples_split=10)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f"Accuracy = {accuracy_score(y_test, y_pred):.3f}")

Accuracy = 1.000


In [9]:
for depth in [1, 2, 5, 10]:
    rf = RandomForestClassifier(n_estimators=50, criterion='gini', min_samples_leaf=5, min_samples_split=10,
                               max_depth=depth)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print(f"max_depth={depth}, Accuracy = {accuracy_score(y_test, y_pred):.3f}")

max_depth=1, Accuracy = 0.956
max_depth=2, Accuracy = 1.000
max_depth=5, Accuracy = 1.000
max_depth=10, Accuracy = 1.000


In [10]:
rf = RandomForestClassifier(n_estimators=50, criterion='gini', min_samples_leaf=5, min_samples_split=10,
                               max_depth=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f"Accuracy = {accuracy_score(y_test, y_pred):.3f}")

Accuracy = 1.000


In [11]:
rf.feature_importances_

array([0.12047078, 0.01550357, 0.00842629, 0.01335885, 0.01757168,
       0.07905004, 0.09666052, 0.01445104, 0.00963994, 0.13655309,
       0.1077182 , 0.13056996, 0.25002604])