# Demo: Decision Trees

In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fetch the Iris dataset bundled with scikit-learn.
iris = load_iris()

# Use only two of the four features: keep the petal length and petal
# width columns (indices 2 and 3) and drop the sepal length and width.
X, y = iris.data[:, 2:], iris.target

In [None]:
# Limit the tree to two levels of splits. random_state is fixed because
# scikit-learn permutes the features at every split, so equally good
# candidate splits could otherwise be chosen differently from run to run.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

In [None]:
# Let's take a look at the decision tree by writing it out in Graphviz
# .dot format (iris_tree.dot).
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],  # same two petal columns used for X
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

In [None]:
# Convert the tree from a .dot file to a .png with Graphviz's `dot` when
# it is available on PATH. If dot is not installed, skip the conversion
# and display the png that is already in your zip file.
import shutil
import subprocess
from IPython.display import Image

if shutil.which("dot") is not None:
    # Best-effort: a failed conversion does not raise, so any existing
    # iris_tree.png is still displayed below.
    subprocess.run(["dot", "-Tpng", "iris_tree.dot", "-o", "iris_tree.png"])
else:
    print("Graphviz 'dot' not found; showing the existing iris_tree.png.")
Image("iris_tree.png")

In [None]:
# We can also have scikit-learn plot the tree with matplotlib. The
# styling mirrors the Graphviz export: nodes are labelled with the
# predicted species, colour-filled, and drawn with rounded corners.
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(18, 8))
plot_tree(
    tree_clf,
    feature_names=iris.feature_names[2:],
    class_names=list(iris.target_names),  # plain Python list of species names
    filled=True,
    rounded=True,
);

## Let's see how we did...

In [None]:
# Mean accuracy on the same X, y the tree was fitted on, so this is a
# training score rather than an estimate on held-out data.
tree_clf.score(X, y)

In [None]:
# One importance value per input column, in the same order as X
# (petal length first, then petal width).
tree_clf.feature_importances_

## Feature Importances
* Because of the way the model works, we can determine which features were important by computing the total amount of information gained from splitting on a per-feature basis
* The "most important feature" is the one whose splits most increased homogeneity during training

In [None]:
import pandas as pd

# Pair each importance value with its feature name, then draw a
# horizontal bar chart sorted from smallest to largest importance.
feat_importances = pd.Series(
    data=tree_clf.feature_importances_,
    index=iris.feature_names[2:],
)
feat_importances.sort_values().plot.barh(figsize=(6, 6));

## Suppose we encountered a new iris in the wild...
* We measure the petal length at 5.0 cm and the petal width at 1.5 cm
* ...what are the probabilities we would classify it as each of the different species of iris?

In [None]:
# One sample given as [petal length, petal width]; the result holds one
# probability per class, ordered as in tree_clf.classes_.
tree_clf.predict_proba([[5.0, 1.5]])

In [None]:
# Predicted class label for the same flower: petal length 5.0, petal width 1.5.
tree_clf.predict([[5.0, 1.5]])

In [None]:
# Translate the predicted class label into its species name. The label is
# taken from the model rather than hard-coded, so the name always matches
# the prediction for this flower (petal length 5.0, petal width 1.5).
iris.target_names[tree_clf.predict([[5.0, 1.5]])[0]]

In [None]:
# Integer class label for every sample in the dataset; each value is an
# index into iris.target_names.
iris.target