In [9]:
pip install graphviz

Note: you may need to restart the kernel to use updated packages.


In [7]:
from sklearn.datasets import load_breast_cancer
#from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz 
import matplotlib.pyplot as plt

# Load the dataset once; the returned Bunch carries the feature matrix (`data.data`),
# the labels (`data.target`) and the feature/class names used later for plotting.
data = load_breast_cancer()
print(f"Feature names: {data.feature_names}")
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(f"first data point in the training set: {X_train[0]}\nlabel: {y_train[0]}")

# Fit an unpruned decision tree on the training data.
# DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, ..., ccp_alpha=0.0)
# criterion: {"gini", "entropy"}, default="gini"
# random_state is fixed so that repeated runs build the same tree (scikit-learn
# permutes the candidate features at each split, which affects tie-breaking).
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

# Sanity-check the predictor on a single test sample (index 1, i.e. the second one).
q = [X_test[1]]
print("query", q)
print("prediction", clf.predict(q))

# Cost-complexity pruning: compute the sequence of effective alphas for the training
# data, then fit one tree per alpha and collect them in `clfs`.
# clfs[0] -- the original tree without pruning; clfs[-1] -- the last tree with only one single node
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
clfs = []
for ccp_alpha in ccp_alphas:
    # NOTE: this rebinds `clf`; after the loop it refers to the last (most pruned)
    # tree, not to the unpruned tree fitted above.
    clf = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(clfs[-1].tree_.node_count, ccp_alphas[-1]))

# Drop the last tree (and its alpha) so both sequences stay aligned and the
# trivial single-node tree is excluded from the comparison.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Accuracy of every remaining pruned tree on the training and the test split.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]

Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
first data point in the training set: [1.185e+01 1.746e+01 7.554e+01 4.327e+02 8.372e-02 5.642e-02 2.688e-02
 2.280e-02 1.875e-01 5.715e-02 2.070e-01 1.238e+00 1.234e+00 1.388e+01
 7.595e-03 1.500e-02 1.412e-02 8.578e-03 1.792e-02 1.784e-03 1.306e+01
 2.575e+01 8.435e+01 5.178e+02 1.369e-01 1.758e-01 1.316e-01 9.140e-02
 3.101e-01 7.007e-02]
label: 1
query [array([1.321e+01, 2.525e+01, 8.410e+01, 5.379e+02, 8.791e-02, 5.205e-0

In [13]:
import os

# Candidate directories holding the Graphviz executables (`dot`, ...).
# Update the list if Graphviz is installed somewhere else.
GRAPHVIZ_BIN_DIRS = [
    '/usr/local/opt/graphviz/bin',
    # presumably the Homebrew location on Apple-silicon Macs -- TODO confirm for this machine
    '/opt/homebrew/opt/graphviz/bin',
]


def add_to_path(directory):
    """Append `directory` to the PATH environment variable if it is not listed yet.

    Safe to call repeatedly (re-running the cell does not grow PATH) and works
    even when PATH is not set at all.

    Returns:
        True if PATH was modified, False if `directory` was already present.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return False
    os.environ["PATH"] = current + os.pathsep + directory if current else directory
    return True


# Only directories that actually exist can help resolve `dot`.
for bin_dir in GRAPHVIZ_BIN_DIRS:
    if os.path.isdir(bin_dir):
        add_to_path(bin_dir)


In [14]:
# Pick the pruned tree with the highest test accuracy. max() returns the first
# maximal index, so ties go to the earliest tree in `clfs`, exactly like a
# strict ">" comparison in a left-to-right scan.
# NOTE(review): selecting the model on the test split makes the reported accuracy
# optimistic; a separate validation split or cross-validation would avoid that.
clf_select = max(range(len(clfs)), key=lambda idx: test_scores[idx])
max_test_accuracy = test_scores[clf_select]
print("prunned tree, accuracy ", max_test_accuracy)
# Uncomment to also report the alpha of the selected tree
# (`ccp_alphas` is index-aligned with `clfs`):
# print("alpha ", ccp_alphas[clf_select])

# Export the selected tree as DOT source for Graphviz.
dot_data = tree.export_graphviz(
    clfs[clf_select],
    out_file=None,  # return the DOT text instead of writing a file
    feature_names=data.feature_names,
    class_names=data.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Last expression of the cell: displaying it requires the Graphviz `dot`
# executable to be on PATH.
graph

prunned tree, accuracy  0.9370629370629371


ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x3372181a0>

In [6]:
# Entropy: repeat the pruning experiment with criterion="entropy" instead of the default Gini impurity

In [5]:
from sklearn.datasets import load_breast_cancer
#from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz 
import matplotlib.pyplot as plt

# Split criterion used for EVERY tree in this cell. The pruning path and the pruned
# trees must share it; otherwise the alphas do not belong to the trees being scored.
CRITERION = "entropy"

# Load the dataset once; the returned Bunch carries the feature matrix (`data.data`),
# the labels (`data.target`) and the feature/class names used later for plotting.
data = load_breast_cancer()
print(f"Feature names: {data.feature_names}")
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(f"first data point in the training set: {X_train[0]}\nlabel: {y_train[0]}")

# Fit an unpruned decision tree on the training data.
# DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, ..., ccp_alpha=0.0)
# criterion: {"gini", "entropy"}, default="gini"
# random_state is fixed so that repeated runs build the same tree (scikit-learn
# permutes the candidate features at each split, which affects tie-breaking).
clf = tree.DecisionTreeClassifier(criterion=CRITERION, random_state=0)
clf = clf.fit(X_train, y_train)

# Sanity-check the predictor on a single test sample (index 1, i.e. the second one).
q = [X_test[1]]
print("query", q)
print("prediction", clf.predict(q))

# Cost-complexity pruning: compute the sequence of effective alphas for the training
# data, then fit one tree per alpha and collect them in `clfs`.
# clfs[0] -- the original tree without pruning; clfs[-1] -- the last tree with only one single node
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
clfs = []
for ccp_alpha in ccp_alphas:
    # Same criterion as the tree the alphas were computed from (previously this
    # silently fell back to the default "gini").
    # NOTE: this rebinds `clf`; after the loop it refers to the last (most pruned)
    # tree, not to the unpruned tree fitted above.
    clf = tree.DecisionTreeClassifier(criterion=CRITERION, random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(clfs[-1].tree_.node_count, ccp_alphas[-1]))

# Drop the last tree (and its alpha) so both sequences stay aligned and the
# trivial single-node tree is excluded from the comparison.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Accuracy of every remaining pruned tree on the training and the test split.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]

Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
first data point in the training set: [1.185e+01 1.746e+01 7.554e+01 4.327e+02 8.372e-02 5.642e-02 2.688e-02
 2.280e-02 1.875e-01 5.715e-02 2.070e-01 1.238e+00 1.234e+00 1.388e+01
 7.595e-03 1.500e-02 1.412e-02 8.578e-03 1.792e-02 1.784e-03 1.306e+01
 2.575e+01 8.435e+01 5.178e+02 1.369e-01 1.758e-01 1.316e-01 9.140e-02
 3.101e-01 7.007e-02]
label: 1
query [array([1.321e+01, 2.525e+01, 8.410e+01, 5.379e+02, 8.791e-02, 5.205e-0