# 训练和显示决策树

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
# 生成dot文件
from sklearn.tree import export_graphviz
import os
def image_path(fig_id):
    return os.path.join('.', "images", "decision_trees", fig_id)

export_graphviz(
    tree_clf,
    out_file=image_path('iris_tree.dot'),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

## [安装graphviz](https://graphviz.gitlab.io/download/)

> dot -Tpng .\images\decision_trees\iris_tree.dot -o .\images\decision_trees\iris_tree.png

![决策树](images/decision_trees/iris_tree.png)

# 预测和评估分类可能性

In [17]:
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [18]:
tree_clf.predict([[5, 1.5]])

array([1])

# 回归

In [19]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

export_graphviz(
    tree_clf,
    out_file=image_path('iris_tree_regression.dot'),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

![回归决策树](images/decision_trees/iris_tree_regression.png)

7

In [23]:
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=10000, noise=0.4)

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
param_grid = {
    'max_depth': np.arange(3, 20),
    'max_leaf_nodes': np.arange(2, 100),
    'min_samples_leaf': np.arange(1, 50)
}

gd_search_cv = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=True)
gd_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 81634 candidates, totalling 244902 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 465 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 9765 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 25265 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 46965 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 74865 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 108965 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 149265 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 195765 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 244902 out of 244902 | elapsed: 12.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 'max_leaf_nodes': array([ 2,  3, ..., 98, 99]), 'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [27]:
gd_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=44,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=15, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [28]:
gd_search_cv.best_score_

0.861

In [29]:
from sklearn.metrics import accuracy_score

y_pred = gd_search_cv.predict(X_test)
accuracy_score(y_test, y_pred)

0.863

8

In [42]:
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone
dfs = [clone(gd_search_cv.best_estimator_) for _ in range(999)]
scores = []
ss = ShuffleSplit(n_splits=1000 , test_size=len(X_train) - 100)
for df , (train_index, test_index) in zip(dfs , ss.split(X_train)):
    df.fit(X_train[train_index], y_train[train_index])
    y_pred = df.predict(X_test)
    scores.append(accuracy_score(y_pred, y_test))
np.mean(scores)

0.7763128128128127

In [43]:
y_pred = np.empty([1000, len(X_test)], dtype=np.float32)

for tree_index, tree in enumerate(dfs):
    y_pred[tree_index] = tree.predict(X_test)

In [44]:
from scipy.stats import mode

y_pred_votes, n_votes = mode(y_pred, axis=0)

In [45]:
accuracy_score(y_test, y_pred_votes.reshape([-1]))

0.798