# Decision trees and random forests

In [1]:
import numpy as np
import pandas as pd

from sklearn import model_selection as ms, tree, ensemble

%matplotlib inline

In [2]:
# Location of the white-wine CSV on the UCI archive server.
WHITES_URL = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-white.csv'
)

Read in the Wine Quality dataset.

In [3]:
# The file uses semicolons, not commas, as the field delimiter.
whites = pd.read_csv(WHITES_URL, delimiter=';')

Define a new boolean variable 'good_quality' that is true for whites with quality >= 7.

In [4]:
# Boolean target: True where the quality score is 7 or higher.
whites['good_quality'] = whites['quality'].ge(7)

Prepare the data.

In [5]:
# Predictors are every column except the raw score and the label derived from
# it; the label itself is the classification target.
X = whites.drop(['quality', 'good_quality'], axis='columns')
y = whites.good_quality

## Decision trees

Train a decision tree.

In [6]:
# Fit an unconstrained tree on the full data set. fit() returns the estimator
# itself, so tree1 is the fitted model and is echoed as the cell output.
tree1 = tree.DecisionTreeClassifier().fit(X, y)
tree1

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Export the tree for plotting.

In [7]:
# Write the fitted tree to 'tree1.dot', labelling nodes with the column names
# of X instead of positional feature indices.
tree.export_graphviz(
    tree1,
    out_file='tree1.dot',
    feature_names=X.columns,
)

If you have [Graphviz](http://www.graphviz.org) installed, run `dot -Tpng tree1.dot -o tree1.png`.
Alternatively, use [WebGraphviz](http://www.webgraphviz.com/).

Define stratified folds for cross-validation.

In [8]:
# Fix the shuffle seed so that every cross_val_score call using this splitter
# sees the same ten folds. With shuffle=True and no random_state, each call
# reshuffles the data, so scores are neither reproducible across runs nor
# comparable between models evaluated in different cells.
ten_fold_cv = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

Compute average AUC across folds.

In [9]:
# Seed the tree: scikit-learn permutes the features at every split, so ties
# between equally good splits would otherwise be broken differently on each
# run and the reported AUC would drift.
aucs = ms.cross_val_score(tree.DecisionTreeClassifier(random_state=42), X, y,
                          scoring='roc_auc', cv=ten_fold_cv)
# Mean of the per-fold AUCs; left as the last expression so it is displayed.
np.mean(aucs)

0.76894176206340226

Train a decision tree while constraining:
* the maximum number of questions (depth)
* the minimum number of samples in each leaf

In [10]:
# A deliberately small tree: at most two levels of questions, and no leaf
# holding fewer than 50 training samples. fit() returns the estimator, which
# is then echoed as the cell output.
tree2 = tree.DecisionTreeClassifier(min_samples_leaf=50, max_depth=2).fit(X, y)
tree2

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Export the tree for plotting.

In [11]:
# Write the shallow tree to 'tree2.dot', labelling nodes with the column names
# of X instead of positional feature indices.
tree.export_graphviz(
    tree2,
    out_file='tree2.dot',
    feature_names=X.columns,
)

Investigate importances of predictors (the higher, the more important).

In [12]:
# Importances come back as a bare array with one entry per column of X, in
# the same order as X.columns; a zero means the tree never split on that column.
tree2.feature_importances_

array([ 0.        ,  0.11979176,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.88020824])

## Random forests

Train a random forest with 20 decision trees.

In [13]:
# Fit a 20-tree forest on the full data set. fit() returns the estimator
# itself, so rf1 is the fitted model and is echoed as the cell output.
rf1 = ensemble.RandomForestClassifier(n_estimators=20).fit(X, y)
rf1

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Investigate importances of predictors (the higher, the more important).

In [14]:
# Importances come back as a bare array with one entry per column of X, in
# the same order as X.columns.
rf1.feature_importances_

array([ 0.06067691,  0.08927074,  0.06775139,  0.09002217,  0.08901032,
        0.08546205,  0.08245273,  0.12296708,  0.08165919,  0.07751575,
        0.15321165])

Evaluate performance through cross-validation.

In [15]:
# Seed the forest so the reported AUC is reproducible: the bootstrap samples
# and the per-split feature subsets are otherwise redrawn on every run.
aucs = ms.cross_val_score(
    ensemble.RandomForestClassifier(n_estimators=20, random_state=42),
    X, y, scoring='roc_auc', cv=ten_fold_cv)
# Mean of the per-fold AUCs; left as the last expression so it is displayed.
np.mean(aucs)

0.91325669201171655

What happens when we increase the number of trees?

In [16]:
# Sweep the forest size and report the mean cross-validated AUC for each.
# Every forest is seeded so that re-running the cell reproduces the same
# numbers instead of a fresh random draw per run.
for n_trees in [2, 5, 10, 20, 50, 100]:
    forest = ensemble.RandomForestClassifier(n_estimators=n_trees, random_state=42)
    aucs = ms.cross_val_score(forest, X, y, scoring='roc_auc', cv=ten_fold_cv)
    print('{:>3} trees: mean AUC {:.2%}'.format(n_trees, np.mean(aucs)))

  2 trees: mean AUC 79.81%
  5 trees: mean AUC 86.76%
 10 trees: mean AUC 89.47%
 20 trees: mean AUC 91.44%
 50 trees: mean AUC 92.17%
100 trees: mean AUC 92.55%
