In [1]:
# Decision Tree models help highlight how we can use machine learning to enhance our decision making abilities. 
# We’ve all encountered Decision Trees at one point or another. However, where Decision Tree machine learning 
# models differ is in the fact that they use logic and math to generate rules, rather than selecting them on 
# the basis of intuition and subjectivity.

In [2]:
# Decision of a split
# In order to determine which of the three splits is better, we introduce a concept called impurity. 
# Impurity refers to the fact that none of the leaves are 100% pure, i.e. each leaf still contains a mix of classes. 
# There are several ways to measure impurity (quality of a split), 
# however, the scikit-learn implementation of the DecisionTreeClassifier uses gini by default, 
# therefore, that’s the one we’re going to cover in this article.

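# To calculate the Gini impurity of the left node, 
# we subtract from 1 the squared fraction of samples in the node that belong to each class
# (in the two-class example: the squared fraction that left and the squared fraction that didn't).
# In general: Gini = 1 - sum over classes k of (p_k)^2, where p_k is the fraction of the node's samples in class k.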

In [3]:
# The information gain (with the Gini index as the impurity measure) is written as follows:
# Information Gain = Gini at parent node - [fraction at left child] * Gini at left child - [fraction at right child] * Gini at right child

In [4]:
# We can’t go on splitting indefinitely. Therefore, we need a way of telling the tree when to stop. 
# The scikit-learn implementation of the DecisionTreeClassifier uses the minimum impurity decrease 
# to determine whether a node should be split.

In [5]:
# Import the following libraries
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from IPython.display import Image 
from pydot import graph_from_dot_data
import pandas as pd
import numpy as np

# sklearn.externals.six was deprecated in scikit-learn 0.21 and removed in 0.23.
# Fall back to the standard library, whose StringIO is the same class on Python 3.
try:
    from sklearn.externals.six import StringIO 
except ImportError:
    from io import StringIO



In [6]:
# Load the iris data: a feature table X and the species labels y

iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)

In [7]:
# In the following section, 
# we build a decision tree classifier to determine the kind of flower given its dimensions.

In [8]:
X.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
# Although decision trees can handle categorical targets directly, the species are one-hot encoded here:
# pd.get_dummies turns y into one indicator column per class (setosa, versicolor, virginica).
# The confusion-matrix cell later takes argmax across these columns to recover a digit per class
# (i.e. setosa=0, versicolor=1, virginica=2).
# NOTE(review): a 2-D indicator target presumably makes scikit-learn fit a multi-output tree, so a
# predicted row is not guaranteed to contain exactly one 1 -- confirm before relying on argmax.

y = pd.get_dummies(y)

In [10]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [11]:
# Create and train an instance of the DecisionTreeClassifier class. 
# The y values need to be provided because our model uses a supervised machine learning algorithm.

In [12]:
# Fix the seed: even with splitter='best', scikit-learn permutes the candidate
# features at every split, so ties between equally good splits are otherwise
# broken differently from run to run. Reusing the seed of the train/test split
# keeps the notebook reproducible under Restart & Run All.
dt = DecisionTreeClassifier(random_state=1)

# fit() returns the fitted estimator, so leaving it as the last expression
# displays the model's hyper-parameters as the cell output.
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [21]:
# Build a Graphviz representation of the decision tree produced by our model.

# With out_file=None, export_graphviz returns the DOT source as a string, so no
# in-memory file buffer is needed.
dot_data = export_graphviz(dt, out_file=None, feature_names=iris.feature_names)

# graph_from_dot_data returns a list of graphs; the single-element unpacking
# fails loudly if the DOT source does not describe exactly one graph.
(graph,) = graph_from_dot_data(dot_data)

# Rendering is left disabled, as in the original cell. graph.create_png() calls
# the external Graphviz `dot` executable -- presumably it was unavailable when
# this notebook was written; uncomment once Graphviz is installed.
#Image(graph.create_png())

In [14]:
# Predict the species of the held-out test samples; the predictions are
# compared against y_test in the confusion matrix below.

y_pred = dt.predict(X=X_test)

In [15]:
# Confusion matrix
# Rows are the true species and columns the predicted species. Entries on the
# diagonal are correct identifications; every other entry is a misclassification.

# Collapse each one-hot row back to a single class index (position of the largest entry)
species = np.argmax(np.asarray(y_test), axis=1)
predictions = np.argmax(np.asarray(y_pred), axis=1)
confusion_matrix(y_true=species, y_pred=predictions)

array([[13,  0,  0],
       [ 0, 15,  1],
       [ 0,  0,  9]], dtype=int64)

In [16]:
# Decision tree classifier correctly classified 37/38 plants.