In [9]:
import numpy as np
from sklearn import datasets, model_selection, metrics
from decision_tree import DecisionTree

# Training on Iris Dataset

In [10]:
# Load the Iris dataset and pull out the feature matrix and the class labels.
iris = datasets.load_iris()

# np.asarray guarantees ndarray inputs without forcing an extra copy.
X = np.asarray(iris.data)
Y = np.asarray(iris.target)

# Hold out 25% of the samples for evaluation; the fixed random_state keeps
# the split (and therefore every number reported below) reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=42
)
print("Train Size:", X_train.shape[0])
# The held-out set is reported under its own label (it was mislabeled "Train Size").
print("Test Size:", X_test.shape[0])

Train Size: 112
Train Size: 38


In [11]:
# Configure a DecisionTree (max_depth=4, min_samples_leaf=1) and fit it
# on the Iris training split.
my_tree = DecisionTree(
    min_samples_leaf=1,
    max_depth=4,
)
my_tree.train(X_train, Y_train)

In [12]:
# Let's see the tree: print_tree() writes a text rendering of the fitted tree.
# Each printed row reports a node's Idx, Val, Labels and Pred Probs fields
# (presumably split feature index, split value, per-class sample counts and
# predicted class probabilities -- TODO confirm against decision_tree.py).
my_tree.print_tree()

                                                ->  Idx=0  Val=4.6 Labels=(array([0.]), array([11])) Pred Probs=[1. 0. 0.]
                                ->  Idx=2  Val=3.0 Labels=(array([0., 1.]), array([11, 12])) Pred Probs=[0.47826087 0.52173913 0.        ]
                                                ->  Idx=0  Val=5.5 Labels=(array([1.]), array([12])) Pred Probs=[0. 1. 0.]
                ->  Idx=1  Val=3.3 Labels=(array([0., 1.]), array([35, 12])) Pred Probs=[0.74468085 0.25531915 0.        ]
                                                ->  Idx=0  Val=5.0 Labels=(array([0.]), array([8])) Pred Probs=[1. 0. 0.]
                                ->  Idx=0  Val=5.1 Labels=(array([0.]), array([24])) Pred Probs=[1. 0. 0.]
                                                ->  Idx=0  Val=5.2 Labels=(array([0.]), array([16])) Pred Probs=[1. 0. 0.]
->  Idx=3  Val=1.3 Labels=(array([0., 1., 2.]), array([35, 39, 38])) Pred Probs=[0.3125     0.34821429 0.33928571]
                         

In [13]:
# Train performance on Iris.
# The predictions are compared to the labels once and the boolean mask is
# counted with NumPy, instead of running Python's builtin sum() over the
# array twice.
train_preds = my_tree.predict(X_set=X_train)
train_hits = np.count_nonzero(train_preds == Y_train)

print("TRAIN PERFORMANCE")
print("Train size", len(Y_train))
print("True preds", train_hits)
print("Train Accuracy", train_hits / len(Y_train))

TRAIN PERFORMANCE
Train size 112
True preds 106
Train Accuracy 0.9464285714285714


In [14]:
# Test performance on Iris (held-out split).
# The predictions are compared to the labels once and the boolean mask is
# counted with NumPy, instead of running Python's builtin sum() over the
# array twice.
test_preds = my_tree.predict(X_set=X_test)
test_hits = np.count_nonzero(test_preds == Y_test)

print("TEST PERFORMANCE")
print("Test size", len(Y_test))
print("True preds", test_hits)
print("Accuracy", test_hits / len(Y_test))

TEST PERFORMANCE
Test size 38
True preds 38
Accuracy 1.0


# Training on Breast Cancer Dataset

In [16]:
# Load the breast cancer dataset and pull out the feature matrix and labels.
data = datasets.load_breast_cancer()
X = data.data
Y = data.target

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible.
# NOTE: this rebinds X, Y and the *_train / *_test names used by the Iris
# cells above, so those cells must not be re-run after this one without
# re-running the Iris split first.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print("Train Size:", X_train.shape[0])
# The held-out set is reported under its own label (it was mislabeled "Train Size").
print("Test Size:", X_test.shape[0])

Train Size: 455
Train Size: 114


In [17]:
# Configure a second DecisionTree (max_depth=4, min_samples_leaf=1) and fit
# it on the breast cancer training split.
my_tree_2 = DecisionTree(
    min_samples_leaf=1,
    max_depth=4,
)
my_tree_2.train(X_train, Y_train)

In [18]:
# Print a text rendering of the tree fitted on the breast cancer data.
# Each printed row reports a node's Idx, Val, Labels and Pred Probs fields
# (presumably split feature index, split value, per-class sample counts and
# predicted class probabilities -- TODO confirm against decision_tree.py).
my_tree_2.print_tree()

                                                ->  Idx=0  Val=10.12 Labels=(array([1.]), array([56])) Pred Probs=[0. 1.]
                                ->  Idx=0  Val=11.46 Labels=(array([1.]), array([113])) Pred Probs=[0. 1.]
                                                ->  Idx=0  Val=12.42 Labels=(array([1.]), array([57])) Pred Probs=[0. 1.]
                ->  Idx=7  Val=0.02 Labels=(array([0., 1.]), array([  6, 221])) Pred Probs=[0.02643172 0.97356828]
                                                ->  Idx=0  Val=12.05 Labels=(array([1.]), array([57])) Pred Probs=[0. 1.]
                                ->  Idx=21  Val=22.96 Labels=(array([0., 1.]), array([  6, 108])) Pred Probs=[0.05263158 0.94736842]
                                                ->  Idx=0  Val=11.85 Labels=(array([0., 1.]), array([ 6, 51])) Pred Probs=[0.10526316 0.89473684]
->  Idx=20  Val=14.97 Labels=(array([0., 1.]), array([169, 286])) Pred Probs=[0.37142857 0.62857143]
                                

In [19]:
# Train performance on the breast cancer data.
# The predictions are compared to the labels once and the boolean mask is
# counted with NumPy, instead of running Python's builtin sum() over the
# array twice.
train_preds = my_tree_2.predict(X_set=X_train)
train_hits = np.count_nonzero(train_preds == Y_train)

print("TRAIN PERFORMANCE")
print("Train size", len(Y_train))
print("True preds", train_hits)
print("Train Accuracy", train_hits / len(Y_train))

TRAIN PERFORMANCE
Train size 455
True preds 425
Train Accuracy 0.9340659340659341


In [20]:
# Test performance on the breast cancer data (held-out split).
# The predictions are compared to the labels once and the boolean mask is
# counted with NumPy, instead of running Python's builtin sum() over the
# array twice.
test_preds = my_tree_2.predict(X_set=X_test)
test_hits = np.count_nonzero(test_preds == Y_test)

print("TEST PERFORMANCE")
print("Test size", len(Y_test))
print("True preds", test_hits)
print("Accuracy", test_hits / len(Y_test))

TEST PERFORMANCE
Test size 114
True preds 107
Accuracy 0.9385964912280702
