In [1]:
import numpy as np
from sklearn import datasets, model_selection, metrics
from decision_tree import DecisionTree

# Training on Iris Dataset

In [2]:
# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Feature matrix and class labels as NumPy arrays.
X = np.array(iris.data)
Y = np.array(iris.target)

# Hold out 25% of the samples for evaluation; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.25, random_state=42)
print("Train Size:", X_train.shape[0])
# Label the held-out count as the test set, not a second "Train Size".
print("Test Size:", X_test.shape[0])

Train Size: 112
Train Size: 38


In [3]:
# Building the tree
# Instantiate the project's DecisionTree and fit it on the Iris training split.
# NOTE(review): presumably max_depth caps the depth of the tree and
# min_samples_leaf is the minimum number of samples allowed in a leaf --
# confirm against decision_tree.py.
my_tree = DecisionTree(max_depth=4, min_samples_leaf=1)
my_tree.train(X_train, Y_train)

In [5]:
# Let's see the tree
# Prints a text rendering of the fitted Iris tree: each split node shows the
# feature index and threshold it tests, each leaf shows its label counts and
# predicted class probabilities.
my_tree.print_tree()

                                                ->  LEAF:  Labels Count=0.0->11 Pred Probs=[1. 0. 0.]
                                ->  Node: Is X[2] < 3.0 
                                                ->  LEAF:  Labels Count=1.0->12 Pred Probs=[0. 1. 0.]
                ->  Node: Is X[1] < 3.3 
                                                ->  LEAF:  Labels Count=0.0->8 Pred Probs=[1. 0. 0.]
                                ->  Node: Is X[0] < 5.1 
                                                ->  LEAF:  Labels Count=0.0->16 Pred Probs=[1. 0. 0.]
->  Node: Is X[3] < 1.3 
                                                ->  LEAF:  Labels Count=1.0->13 Pred Probs=[0. 1. 0.]
                                ->  Node: Is X[2] < 4.5 
                                                ->  LEAF:  Labels Count=1.0->13, 2.0->5 Pred Probs=[0.         0.72222222 0.27777778]
                ->  Node: Is X[3] < 1.8 
                                                ->  LEAF:  Labels Count=1.0->1,

In [6]:
# Score the Iris tree on the same samples it was fitted on.
train_preds = my_tree.predict(X_set=X_train)

# Count the element-wise matches once and reuse the result below.
n_train = len(Y_train)
n_train_correct = np.sum(train_preds == Y_train)

print("TRAIN PERFORMANCE")
print("Train size", n_train)
print("True preds", n_train_correct)
print("Train Accuracy", n_train_correct / n_train)

TRAIN PERFORMANCE
Train size 112
True preds 106
Train Accuracy 0.9464285714285714


In [7]:
# Let's see the Test performance
test_preds = my_tree.predict(X_set=X_test)
print("TEST PERFORMANCE")
print("Test size", len(Y_test))
print("True preds", sum(test_preds == Y_test))
print("Accuracy", sum(test_preds == Y_test) / len(Y_test))

TEST PERFORMANCE
Test size 38
True preds 38
Accuracy 1.0


# Training on Breast Cancer Dataset

In [8]:
# Load the breast cancer dataset bundled with scikit-learn.
data = datasets.load_breast_cancer()
X = data.data
Y = data.target

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=42)
print("Train Size:", X_train.shape[0])
# Label the held-out count as the test set, not a second "Train Size".
print("Test Size:", X_test.shape[0])

Train Size: 455
Train Size: 114


In [9]:
# Building the tree
# Fit a second, independent DecisionTree on the breast cancer training split,
# using the same hyper-parameters as the Iris tree.
# NOTE(review): presumably max_depth caps the depth of the tree and
# min_samples_leaf is the minimum number of samples allowed in a leaf --
# confirm against decision_tree.py.
my_tree_2 = DecisionTree(max_depth=4, min_samples_leaf=1)
my_tree_2.train(X_train, Y_train)

In [10]:
# Print a text rendering of the breast cancer tree: split nodes with their
# feature index and threshold, leaves with label counts and class probabilities.
my_tree_2.print_tree()

                                                ->  LEAF:  Labels Count=1.0->56 Pred Probs=[0. 1.]
                                ->  Node: Is X[0] < 11.46 
                                                ->  LEAF:  Labels Count=1.0->57 Pred Probs=[0. 1.]
                ->  Node: Is X[7] < 0.02107 
                                                ->  LEAF:  Labels Count=1.0->57 Pred Probs=[0. 1.]
                                ->  Node: Is X[21] < 22.965 
                                                ->  LEAF:  Labels Count=0.0->6, 1.0->51 Pred Probs=[0.10526316 0.89473684]
->  Node: Is X[20] < 14.97 
                                                ->  LEAF:  Labels Count=0.0->8, 1.0->49 Pred Probs=[0.14035088 0.85964912]
                                ->  Node: Is X[23] < 830.75 
                                                ->  LEAF:  Labels Count=0.0->41, 1.0->16 Pred Probs=[0.71929825 0.28070175]
                ->  Node: Is X[27] < 0.1578 
                                  

In [11]:
# Let's see the Train performance
train_preds = my_tree_2.predict(X_set=X_train)
print("TRAIN PERFORMANCE")
print("Train size", len(Y_train))
print("True preds", sum(train_preds == Y_train))
print("Train Accuracy", sum(train_preds == Y_train) / len(Y_train))

TRAIN PERFORMANCE
Train size 455
True preds 425
Train Accuracy 0.9340659340659341


In [12]:
# Let's see the Test performance
test_preds = my_tree_2.predict(X_set=X_test)
print("TEST PERFORMANCE")
print("Test size", len(Y_test))
print("True preds", sum(test_preds == Y_test))
print("Accuracy", sum(test_preds == Y_test) / len(Y_test))

TEST PERFORMANCE
Test size 114
True preds 107
Accuracy 0.9385964912280702
