# Decision Trees

## Training and Visualizing a Decision Tree


In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:,2:]
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X,y)

DecisionTreeClassifier(max_depth=2)

In [2]:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file='tree.dot',
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

## Regresion


In [3]:
from sklearn.tree import DecisionTreeRegressor


tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X,y)

export_graphviz(
    tree_reg,
    out_file='tree_reg.dot',
    #feature_names=iris.feature_names[2:],
    #class_names=iris.target_names,
    rounded=True,
    filled=True
)

## Exercise 7

In [4]:
from sklearn.datasets import make_moons
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

X, y= make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Grid-search

In [5]:
params = [
    {
        #'max_depth':[None] + list(range(2,40)),
        'max_leaf_nodes':list(range(2,100)),
        'min_samples_split':[2,3,4,5]
    }
]
gridsearch_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                             params,
                             n_jobs=-1,
                             cv=3,
                             verbose=1
                            )
gridsearch_cv.fit(X_train,y_train)

Fitting 3 folds for each of 392 candidates, totalling 1176 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                             13, 14, 15, 16, 17, 18, 19, 20, 21,
                                             22, 23, 24, 25, 26, 27, 28, 29, 30,
                                             31, ...],
                          'min_samples_split': [2, 3, 4, 5]}],
             verbose=1)

In [6]:
gridsearch_cv.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=15, random_state=42)

In [7]:
y_pred = gridsearch_cv.predict(X_test)

In [8]:
accuracy_score(y_test,y_pred)

0.8605

## Exercise 8

In [9]:
from sklearn.model_selection import ShuffleSplit


n_trees = 1000
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train,y_mini_train))

In [10]:
from sklearn.base import clone
import numpy as np
forest = [clone(gridsearch_cv.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

for tree, (X_mini_train,y_mini_train) in zip(forest,mini_sets):
    tree.fit(X_mini_train,y_mini_train)
    
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    
np.mean(accuracy_scores)

0.803828

In [14]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [16]:
from scipy.stats import mode

y_pred_majority_votes, n_votes = mode(Y_pred,axis=0)

In [18]:
accuracy_score(y_test,y_pred_majority_votes.reshape([-1]))

0.867