In [1]:
import numpy as np
from sklearn import datasets, model_selection, metrics
from decision_tree import DecisionTree

# Training on Iris Dataset

In [2]:
# Load the Iris dataset and hold out 25% of the samples as a test split.
iris = datasets.load_iris()

X = np.array(iris.data)
Y = np.array(iris.target)

# random_state is pinned so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=0
)
print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

Train Shape: (112, 4)
Train Shape: (38, 4)


In [3]:
# Fit the project's DecisionTree on the Iris training split.
# NOTE(review): max_depth / min_samples_leaf presumably bound tree depth and
# leaf size -- confirm against decision_tree.DecisionTree.
iris_tree_params = {"max_depth": 4, "min_samples_leaf": 1}
my_tree = DecisionTree(**iris_tree_params)
my_tree.train(X_train, Y_train)

In [4]:
# Print a text rendering of the tree trained on the Iris split.
# In the recorded output, "Node:" lines show a split test of the form
# "Is X[i] < t", and "LEAF:" lines show the per-label sample counts
# together with the predicted class probabilities.
my_tree.print_tree()

                                                ->  LEAF:  Labels Count=0.0->10 Pred Probs=[1. 0. 0.]
                                ->  Node: Is X[2] < 3.0 
                                                ->  LEAF:  Labels Count=1.0->11 Pred Probs=[0. 1. 0.]
                ->  Node: Is X[1] < 3.2 
                                                ->  LEAF:  Labels Count=0.0->10 Pred Probs=[1. 0. 0.]
                                ->  Node: Is X[0] < 5.1 
                                                ->  LEAF:  Labels Count=0.0->17 Pred Probs=[1. 0. 0.]
->  Node: Is X[3] < 1.3 
                                                ->  LEAF:  Labels Count=1.0->12 Pred Probs=[0. 1. 0.]
                                ->  Node: Is X[2] < 4.5 
                                                ->  LEAF:  Labels Count=1.0->11, 2.0->6 Pred Probs=[0.         0.64705882 0.35294118]
                ->  Node: Is X[2] < 5.1 
                                                ->  LEAF:  Labels Count=2.0->1

In [7]:
# Score the Iris tree on the split it was trained on.
train_preds = my_tree.predict(X_set=X_train)

# Compare predictions with labels once and reuse the result; counting with
# NumPy avoids a Python-level loop over the boolean array.
train_hits = np.asarray(train_preds) == Y_train
# Guard against silent broadcasting if predict() ever returns a different shape.
assert train_hits.shape == Y_train.shape, "predictions and labels differ in shape"
n_train_correct = int(np.count_nonzero(train_hits))

print("TRAIN PERFORMANCE")
print("Train size", len(Y_train))
print("True preds", n_train_correct)
print("Train Accuracy", n_train_correct / len(Y_train))

TRAIN PERFORMANCE
Train size 112
True preds 106
Train Accuracy 0.9464285714285714


In [9]:
# Score the Iris tree on the held-out test split.
test_preds = my_tree.predict(X_set=X_test)

# Compare predictions with labels once and reuse the result; counting with
# NumPy avoids a Python-level loop over the boolean array.
test_hits = np.asarray(test_preds) == Y_test
# Guard against silent broadcasting if predict() ever returns a different shape.
assert test_hits.shape == Y_test.shape, "predictions and labels differ in shape"
n_test_correct = int(np.count_nonzero(test_hits))

print("TEST PERFORMANCE")
print("Test size", len(Y_test))
print("True preds", n_test_correct)
print("Accuracy", n_test_correct / len(Y_test))

TEST PERFORMANCE
Test size 38
True preds 34
Accuracy 0.8947368421052632


# Training on Breast Cancer Dataset

In [15]:
# Load the breast cancer dataset and hold out 20% of the samples as a test split.
# NOTE: this rebinds X, Y and the four split variables that the Iris section
# also uses; re-running the Iris evaluation cells after this cell would feed
# breast-cancer data to the Iris tree.
data = datasets.load_breast_cancer()
X = data.data
Y = data.target

# random_state is pinned so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0
)
print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

Train Shape: (455, 30)
Train Shape: (114, 30)


In [16]:
# Fit a second DecisionTree, this time on the breast cancer training split.
# NOTE(review): max_depth / min_samples_leaf presumably bound tree depth and
# leaf size -- confirm against decision_tree.DecisionTree.
cancer_tree_params = {"max_depth": 4, "min_samples_leaf": 1}
my_tree_2 = DecisionTree(**cancer_tree_params)
my_tree_2.train(X_train, Y_train)

In [17]:
# Print a text rendering of the tree trained on the breast cancer split.
# In the recorded output, "Node:" lines show a split test of the form
# "Is X[i] < t", and "LEAF:" lines show the per-label sample counts
# together with the predicted class probabilities.
my_tree_2.print_tree()

                                                ->  LEAF:  Labels Count=1.0->56 Pred Probs=[0. 1.]
                                ->  Node: Is X[0] < 11.34 
                                                ->  LEAF:  Labels Count=1.0->57 Pred Probs=[0. 1.]
                ->  Node: Is X[7] < 0.0218 
                                                ->  LEAF:  Labels Count=1.0->57 Pred Probs=[0. 1.]
                                ->  Node: Is X[21] < 22.45 
                                                ->  LEAF:  Labels Count=0.0->5, 1.0->52 Pred Probs=[0.0877193 0.9122807]
->  Node: Is X[20] < 14.85 
                                                ->  LEAF:  Labels Count=0.0->4, 1.0->53 Pred Probs=[0.07017544 0.92982456]
                                ->  Node: Is X[26] < 0.2649 
                                                ->  LEAF:  Labels Count=0.0->43, 1.0->14 Pred Probs=[0.75438596 0.24561404]
                ->  Node: Is X[20] < 18.409999999999997 
                          

In [18]:
# Score the breast cancer tree on the split it was trained on.
train_preds = my_tree_2.predict(X_set=X_train)

# Compare predictions with labels once and reuse the result; counting with
# NumPy avoids a Python-level loop over the boolean array.
train_hits = np.asarray(train_preds) == Y_train
# Guard against silent broadcasting if predict() ever returns a different shape.
assert train_hits.shape == Y_train.shape, "predictions and labels differ in shape"
n_train_correct = int(np.count_nonzero(train_hits))

print("TRAIN PERFORMANCE")
print("Train size", len(Y_train))
print("True preds", n_train_correct)
print("Train Accuracy", n_train_correct / len(Y_train))

TRAIN PERFORMANCE
Train size 455
True preds 431
Train Accuracy 0.9472527472527472


In [19]:
# Score the breast cancer tree on the held-out test split.
test_preds = my_tree_2.predict(X_set=X_test)

# Compare predictions with labels once and reuse the result; counting with
# NumPy avoids a Python-level loop over the boolean array.
test_hits = np.asarray(test_preds) == Y_test
# Guard against silent broadcasting if predict() ever returns a different shape.
assert test_hits.shape == Y_test.shape, "predictions and labels differ in shape"
n_test_correct = int(np.count_nonzero(test_hits))

print("TEST PERFORMANCE")
print("Test size", len(Y_test))
print("True preds", n_test_correct)
print("Accuracy", n_test_correct / len(Y_test))

TEST PERFORMANCE
Test size 114
True preds 108
Accuracy 0.9473684210526315
