In [56]:
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Load the iris data set and pull out the feature matrix and the label vector.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out a validation split; the fixed random_state keeps the split
# identical from run to run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [57]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, criterion=None, random_state=None):
    """Fit a decision-tree classifier and score its validation predictions.

    Parameters
    ----------
    max_leaf_nodes : int
        Limit on the number of leaves, forwarded to DecisionTreeClassifier.
    train_X, train_y
        Training features and labels used to fit the tree.
    val_X, val_y
        Validation features to predict on and the labels to score against.
    criterion : str, optional
        Split-quality criterion forwarded to the classifier (the notebook uses
        'gini' and 'entropy'). ``None`` is not a criterion the classifier
        accepts, so it is mapped to 'gini' here instead of failing at fit time.
    random_state : int, optional
        Seed forwarded to the classifier. The default ``None`` leaves the tree
        unseeded, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(mae, preds_val)``: the mean absolute error between ``val_y`` and
        the predictions, and the predictions themselves.

    Notes
    -----
    mean_absolute_error treats the labels as numbers, so a wrong prediction
    contributes the numeric distance between the two label codes rather than
    a flat 1; the score is therefore not a plain misclassification rate.
    """
    if criterion is None:
        criterion = 'gini'
    model = tree.DecisionTreeClassifier(
        criterion=criterion,
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae, preds_val

## criterion='gini'

In [58]:
from pandas import DataFrame, Series

# Candidate leaf-node limits; the same list drives the sweep and labels the
# resulting Series, so the two cannot drift apart.
leaf_node_options = [5, 25, 50, 100, 250, 500]

# One column of validation predictions per candidate limit (gini criterion).
predict_vals_gini = DataFrame()

# Validation MAE for each candidate, in the order of leaf_node_options.
gini_mae_values = []
for max_leaf_nodes in leaf_node_options:
    my_mae, preds_val = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, 'gini')
    predict_vals_gini[str(max_leaf_nodes)] = preds_val
    gini_mae_values.append(my_mae)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %f" %(max_leaf_nodes, my_mae))

# Index the errors by leaf-node limit so idxmin reports the limit itself;
# when several limits tie for the minimum, the first one in the list wins.
MAE_gini = Series(gini_mae_values, index=leaf_node_options)
min_index_gini = MAE_gini.idxmin()

print('The number of leaf nodes that minimizes the mean absolute error is: \n', min_index_gini)

Max leaf nodes: 5  		 Mean Absolute Error:  0.026316
Max leaf nodes: 25  		 Mean Absolute Error:  0.026316
Max leaf nodes: 50  		 Mean Absolute Error:  0.026316
Max leaf nodes: 100  		 Mean Absolute Error:  0.026316
Max leaf nodes: 250  		 Mean Absolute Error:  0.026316
Max leaf nodes: 500  		 Mean Absolute Error:  0.026316
The number of leaf nodes that minimizes the mean absolute error is: 
 5


In [64]:
# Display the gini predictions on the validation set: one row per validation
# sample, one column per max_leaf_nodes setting tried in the sweep.
predict_vals_gini

Unnamed: 0,5,25,50,100,250,500
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,1,1,1,1,1,1
3,0,0,0,0,0,0
4,2,2,2,2,2,2
5,1,1,1,1,1,1
6,2,2,2,2,2,2
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,2,2,2,2,2,2


In [66]:
# Wrap the validation labels in a Series purely for display. A separate name
# is used so that val_y keeps the type returned by train_test_split; rebinding
# it here would silently change what every later cell receives.
val_y_series = Series(val_y)
print(val_y_series)


0     0
1     1
2     1
3     0
4     2
5     1
6     2
7     0
8     0
9     2
10    1
11    0
12    2
13    1
14    1
15    0
16    1
17    1
18    0
19    0
20    1
21    1
22    1
23    0
24    2
25    1
26    0
27    0
28    1
29    2
30    1
31    2
32    1
33    2
34    2
35    0
36    1
37    0
dtype: int64


## criterion='entropy'

In [60]:
from pandas import DataFrame, Series

# Candidate leaf-node limits; the same list drives the sweep and labels the
# resulting Series, so the two cannot drift apart.
leaf_node_options = [5, 25, 50, 100, 250, 500]

# One column of validation predictions per candidate limit (entropy criterion).
predict_vals_entropy = DataFrame()

# Validation MAE for each candidate, in the order of leaf_node_options.
entropy_mae_values = []
for max_leaf_nodes in leaf_node_options:
    my_mae, preds_val = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, 'entropy')
    predict_vals_entropy[str(max_leaf_nodes)] = preds_val
    entropy_mae_values.append(my_mae)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %f" %(max_leaf_nodes, my_mae))

# Index the errors by leaf-node limit so idxmin reports the limit itself;
# when several limits tie for the minimum, the first one in the list wins.
MAE_entropy = Series(entropy_mae_values, index=leaf_node_options)
min_index_entropy = MAE_entropy.idxmin()

print('The number of leaf nodes that minimizes the mean absolute error is: \n', min_index_entropy)

Max leaf nodes: 5  		 Mean Absolute Error:  0.026316
Max leaf nodes: 25  		 Mean Absolute Error:  0.026316
Max leaf nodes: 50  		 Mean Absolute Error:  0.026316
Max leaf nodes: 100  		 Mean Absolute Error:  0.026316
Max leaf nodes: 250  		 Mean Absolute Error:  0.026316
Max leaf nodes: 500  		 Mean Absolute Error:  0.026316
The number of leaf nodes that minimizes the mean absolute error is: 
 5


In [61]:
# Display the entropy predictions on the validation set: one row per
# validation sample, one column per max_leaf_nodes setting tried in the sweep.
predict_vals_entropy

Unnamed: 0,5,25,50,100,250,500
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,1,1,1,1,1,1
3,0,0,0,0,0,0
4,2,2,2,2,2,2
5,1,1,1,1,1,1
6,2,2,2,2,2,2
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,2,2,2,2,2,2


# Misclassification Tracking

### The number of leaf nodes does not affect the error. Let us find out why.

Let us first find out which flower in the iris data set is misclassified.

In [29]:
# Neither the leaf-node limit nor the split criterion changed the validation
# error in the sweeps above, so the choice here is arbitrary: 5 leaf nodes
# with the gini criterion.
# Build the classifier and fit it on the training data in one step (fit
# returns the fitted estimator), keeping the result in "model".
model = tree.DecisionTreeClassifier(criterion='gini', max_leaf_nodes=5).fit(train_X, train_y)

# Predicted class for every sample in the validation split.
predict_val = model.predict(val_X)

In [32]:
# Element-wise comparison of predictions against the true validation labels:
# True marks a sample whose predicted class differs from its label.
# NOTE(review): if val_y has been rebound to a pandas Series by an earlier
# cell, this yields a Series rather than the ndarray shown in the recorded
# output; confirm which type is intended.
bool_array = predict_val != val_y

bool_array

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

We see that the model makes one error. Let us find out where the error occurred (the position of the single `True` entry) and what its feature vector is.

In [34]:
# argmax over the boolean mask gives the position of the first True entry,
# i.e. the first misclassified validation sample.
# NOTE(review): if the mask contained no True at all this would still return
# 0, so check bool_array.any() before trusting the index.
error_index = bool_array.argmax()

22