In [9]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load iris and keep only the feature columns from index 2 onward,
# so the classifier is trained on a two-column input.
iris = load_iris()
X, y = iris.data[:, 2:], iris.target

# Depth-limited decision tree; the trailing fit call is left as the cell's
# last expression so the fitted estimator is displayed.
tree_clf = DecisionTreeClassifier(max_depth=4)
tree_clf.fit(X, y)

In [10]:

from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz .dot file in the working directory.
# Feature names are sliced with [2:] to match the columns used for training.
export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    # labels shown in the rendered nodes
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    # cosmetic options
    filled=True,
    rounded=True,
)

In [8]:
# Per-class probability estimates for a single sample. The two values are
# given in the same column order as the training matrix (iris.data[:, 2:]).
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [12]:
from sklearn.datasets import make_moons


# Generate a noisy two-moons dataset as an (X, y) tuple.
# random_state is fixed so that Restart & Run All reproduces the same data,
# and therefore the same grid-search result and accuracies further down.
dataset = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [13]:
# Inspect the generated data: a tuple of (feature array, label array).
dataset

(array([[ 0.12492247,  1.24488386],
        [-0.19267338,  0.2907312 ],
        [ 1.15758651,  0.07018046],
        ...,
        [ 0.59391331,  0.30609471],
        [ 1.21265331, -0.97116266],
        [ 2.46687408,  0.18249486]]),
 array([0, 1, 1, ..., 1, 1, 1]))

In [26]:
from sklearn.model_selection import train_test_split

# Unpack the (X, y) tuple before splitting. Passing the tuple itself to
# train_test_split treats it as a two-element sequence and splits *that*,
# rather than splitting the samples, so the earlier scratch call was removed.
X_moons, y_moons = dataset

# Hold out 20% of the samples for testing; random_state makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_moons, y_moons, test_size=0.2, random_state=42
)

In [40]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: max_leaf_nodes 4..48 in steps of 4,
# max_depth 2..16 in steps of 2.
param_grid = [
    {
        'max_leaf_nodes': list(range(4, 49, 4)),
        'max_depth': list(range(2, 17, 2)),
    }
]

# Exhaustive 5-fold cross-validated search over the grid; train scores are
# kept alongside the validation scores in the search results.
dt_clf = DecisionTreeClassifier()
gs = GridSearchCV(dt_clf, param_grid, cv=5, return_train_score=True)
gs.fit(X_train, y_train)

In [41]:
# Parameter combination selected by the grid search.
gs.best_params_

{'max_depth': 10, 'max_leaf_nodes': 28}

In [50]:
import numpy as np
# Accuracy of the grid-search model on the held-out test set:
# the fraction of test labels that the prediction matches.
pred = gs.predict(X_test)
np.mean(pred == y_test)

0.85

In [88]:
from sklearn.model_selection import ShuffleSplit

# Draw 1000 random subsets of 100 training samples each. The splitter is
# seeded so the same subsets are drawn when the notebook is re-run.
rs = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)

# Train one tree per subset. Hyperparameters are taken from the grid search
# rather than copied by hand, so they cannot go stale if the search result
# changes. Only the train indices of each split are needed.
forest = []
for train_index, _ in rs.split(X_train):
    dt = DecisionTreeClassifier(**gs.best_params_)
    dt.fit(X_train[train_index], y_train[train_index])
    forest.append(dt)

# Test-set accuracy of every individual tree.
accuracies = [np.mean(tree.predict(X_test) == y_test) for tree in forest]

# Average accuracy of a single small tree (displayed as the cell output).
np.mean(accuracies)

0.781931

In [89]:
from scipy.stats import mode

# Collect every tree's predictions for the whole test set in one call per
# tree (rows = trees, columns = test instances). Predicting one instance at a
# time costs len(X_test) * len(forest) separate predict calls for the same
# result.
votes = np.array([tree.predict(X_test) for tree in forest])

# Majority vote per test instance: the most frequent label in each column.
# keepdims=True yields a (1, n_test) array, hence the [0] to flatten it.
majority = mode(votes, axis=0, keepdims=True)
predictions = majority.mode[0]

# Accuracy of the ensemble's majority vote on the test set.
np.mean(y_test == predictions)

0.8555

array([0])