# IRIS - Decision Tree with numpy

In this notebook we use the Iris dataset to build a decision-tree classifier from scratch with NumPy (scikit-learn is used only to load the data).


In [5]:
import numpy as np
from sklearn.datasets import load_iris

# Load the iris measurements (features) and species labels as NumPy arrays.
iris = load_iris()
X, y = iris.data.astype(float), iris.target.astype(int)

# Manual 80/20 train/test split: shuffle the row indices with a fixed seed
# so the split is reproducible, then cut the shuffled index array in two.
np.random.seed(42)
idx = np.random.permutation(X.shape[0])
split = int(0.8 * X.shape[0])
train_idx = idx[:split]
test_idx = idx[split:]

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Gini + Best Split (NumPy)


In [6]:
def gini(y, n_classes):
    """Return the Gini impurity of the label array ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` equal to class ``k``.

    Parameters
    ----------
    y : array of non-negative ints
        Class labels; they are counted with ``np.bincount``.
    n_classes : int
        Minimum number of bins passed to ``np.bincount`` (``minlength``).

    Returns
    -------
    float
        ``0.0`` for an empty ``y``; otherwise the impurity of the labels.
    """
    n_labels = len(y)
    if n_labels == 0:
        # No labels: define the impurity as zero instead of dividing by zero.
        return 0.0
    class_counts = np.bincount(y, minlength=n_classes)
    proportions = class_counts / class_counts.sum()
    return 1.0 - np.square(proportions).sum()

def best_split(X, y, n_classes):
    """Exhaustively search for the axis-aligned split with the largest Gini gain.

    For every feature column, the candidate thresholds are the midpoints
    between consecutive sorted unique values. Each candidate sends rows with
    ``value <= threshold`` to the left and the rest to the right; its gain is
    the parent impurity minus the size-weighted impurity of the two sides.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; ``X.shape`` is unpacked as (n_samples, n_features).
    y : array of ints
        Labels, indexed with the same row masks as ``X``.
    n_classes : int
        Forwarded to ``gini``.

    Returns
    -------
    tuple or None
        ``(gain, feature, threshold)`` of the best candidate. On equal gain
        the first candidate encountered wins (strict ``>`` comparison).
        ``None`` when no feature has at least two distinct values. The gain
        is not checked for positivity here.
    """
    n_samples, n_features = X.shape
    impurity_before = gini(y, n_classes)
    winner = None  # (gain, feature, threshold)

    for feature in range(n_features):
        column = X[:, feature]
        distinct = np.unique(column)
        if len(distinct) < 2:
            # A constant column cannot separate anything.
            continue

        # Midpoints between neighbouring sorted unique values.
        for cut in (distinct[:-1] + distinct[1:]) / 2.0:
            goes_left = column <= cut
            labels_left = y[goes_left]
            labels_right = y[~goes_left]

            # Impurity of the two sides, each weighted by its share of samples.
            impurity_after = (
                len(labels_left) / n_samples * gini(labels_left, n_classes)
                + len(labels_right) / n_samples * gini(labels_right, n_classes)
            )
            gain = impurity_before - impurity_after

            if winner is None or gain > winner[0]:
                winner = (gain, feature, cut)

    return winner  # None if no split found

# Build Tree (recursive)


In [7]:
class Node:
    """A single node of the decision tree.

    Attributes (all default to ``None``):
        feature:    column index tested at this node; ``predict_one`` treats
                    a node whose ``feature`` is ``None`` as a leaf.
        threshold:  samples with ``x[feature] <= threshold`` go to ``left``,
                    the others to ``right``.
        left/right: child nodes.
        pred_class: majority class at this node.
    """

    __slots__ = ("feature", "threshold", "left", "right", "pred_class")

    def __init__(self, feature=None, threshold=None, left=None, right=None, pred_class=None):
        # Split rule
        self.feature, self.threshold = feature, threshold
        # Subtrees
        self.left, self.right = left, right
        # Majority class at this node
        self.pred_class = pred_class

def majority_class(y):
    """Return the most frequent label in ``y`` as a Python ``int``.

    Labels are counted with ``np.bincount``; when several labels share the
    highest count, the smallest of them is returned (first maximum wins).
    """
    return int(np.bincount(y).argmax())

def build_tree(X, y, n_classes, depth=0, max_depth=None, min_samples_split=2):
    """Recursively grow a decision tree and return its root.

    Parameters
    ----------
    X, y : arrays
        Feature matrix and integer labels of the samples reaching this node.
    n_classes : int
        Forwarded to ``best_split``.
    depth : int
        Depth of the node being built (callers normally leave it at 0).
    max_depth : int or None
        Stop splitting once ``depth >= max_depth``; ``None`` disables the limit.
    min_samples_split : int
        Stop splitting when fewer than this many samples reach the node.

    Returns
    -------
    Node or None
        ``None`` when ``y`` is empty. Otherwise a leaf ``Node`` carrying the
        majority class, or an internal ``Node`` holding the chosen feature,
        threshold, both subtrees and the majority class at that node.
    """
    if len(y) == 0:
        return None

    majority = majority_class(y)

    # Stop when the node is pure, the depth limit is reached, or the node is
    # too small to split. The checks short-circuit in this order.
    if (
        np.all(y == y[0])
        or (max_depth is not None and depth >= max_depth)
        or len(y) < min_samples_split
    ):
        return Node(pred_class=majority)

    candidate = best_split(X, y, n_classes)
    if candidate is None or candidate[0] <= 0:
        # No split reduces the impurity: keep this node as a leaf.
        return Node(pred_class=majority)

    feature, threshold = candidate[1], candidate[2]
    goes_left = X[:, feature] <= threshold

    # Grow the left subtree first, then the right one.
    subtrees = [
        build_tree(X[mask], y[mask], n_classes, depth + 1, max_depth, min_samples_split)
        for mask in (goes_left, ~goes_left)
    ]

    return Node(
        feature=feature,
        threshold=threshold,
        left=subtrees[0],
        right=subtrees[1],
        pred_class=majority,
    )

# Predict


In [8]:
def predict_one(node, x):
    """Predict the class of one sample by walking the tree from ``node``.

    Parameters
    ----------
    node : Node or None
        Root of the (sub)tree. ``build_tree`` returns ``None`` when it is
        given no training samples, so ``None`` is accepted here.
    x : indexable
        Feature values of the sample; ``x[node.feature]`` is compared with
        ``node.threshold`` at every internal node.

    Returns
    -------
    The ``pred_class`` of the leaf that is reached. If there is no tree at
    all, ``0`` is returned. If the branch to follow is missing, the majority
    class stored on the last node visited is returned (``0`` when that node
    carries none).
    """
    if node is None:
        # Empty tree: keep the historical default class instead of failing
        # with AttributeError on ``node.feature``.
        return 0

    # A node with ``feature is None`` is a leaf.
    while node.feature is not None:
        child = node.left if x[node.feature] <= node.threshold else node.right
        if child is None:
            # Missing branch: the majority class recorded on this internal
            # node is a better estimate than an arbitrary class 0.
            return node.pred_class if node.pred_class is not None else 0
        node = child

    return node.pred_class

def predict(node, X):
    """Predict a class for every row of ``X`` using the tree rooted at ``node``.

    Each row is passed to ``predict_one``; the results are returned as a 1-D
    integer NumPy array with one entry per row.
    """
    labels = []
    for sample in X:
        labels.append(predict_one(node, sample))
    return np.array(labels, dtype=int)

# Train + Evaluate


In [9]:
# Number of distinct labels present in the training split.
n_classes = np.unique(y_train).size

# Fit a depth-limited tree on the training split.
tree = build_tree(X_train, y_train, n_classes, max_depth=4, min_samples_split=2)

# Accuracy = fraction of test samples whose predicted label matches.
y_pred = predict(tree, X_test)
acc = np.mean(y_pred == y_test)
print("Test Accuracy:", acc)

Test Accuracy: 0.9333333333333333


In [10]:
# Sweep the depth limit from 1 to 7 and report the test accuracy for each tree.
for d in range(1, 8):
    tree_d = build_tree(X_train, y_train, n_classes, max_depth=d)
    acc_d = np.mean(predict(tree_d, X_test) == y_test)
    print(f"max_depth={d}  acc={acc_d:.3f}")

max_depth=1  acc=0.600
max_depth=2  acc=0.933
max_depth=3  acc=0.967
max_depth=4  acc=0.933
max_depth=5  acc=0.933
max_depth=6  acc=0.933
max_depth=7  acc=0.933
