# Self-study try-it activity 9.1: Implementing the computer algorithm in Python

A decision tree is a hierarchical model that uses a sequence of decision rules, based on data features, to classify or predict outcomes. It is intuitive, easy to interpret and commonly used in machine learning for its ability to handle both categorical and numerical data effectively.


#### Steps to implement decision trees:

1. Calculate the Gini index.

2. Split the data set.

3. Compute the best split for the data set.

4. Create a recursive function to build the tree.

5. Use the tree to make predictions.

In [None]:
# Import the necessary libraries
import numpy as np
from collections import Counter

In [None]:
# 1. Calculate the Gini index for groups
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Sequence of groups (lists of rows); each row stores its
            class label in the last position.
        classes: Class labels to account for when scoring each group.

    Returns:
        The sum, over non-empty groups, of ``1 - sum(p_k ** 2)`` weighted by
        the group's share of all rows. ``0.0`` means every group is pure.
    """
    total_rows = float(sum(len(members) for members in groups))
    weighted_impurity = 0.0
    for members in groups:
        group_size = float(len(members))
        # An empty group contributes nothing and would divide by zero below.
        if group_size == 0:
            continue
        member_labels = [record[-1] for record in members]
        purity = 0.0
        for label in classes:
            proportion = member_labels.count(label) / group_size
            purity += proportion * proportion
        weighted_impurity += (1.0 - purity) * (group_size / total_rows)
    return weighted_impurity


In [None]:
# 2. Split the data set based on the feature index and split value
def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

In [None]:
# 3. Compute the best split for the data set
def get_best_split(dataset):
    """Find the feature/threshold pair whose split has the lowest Gini index.

    Every distinct value of every feature column (all columns except the
    last, which holds the class label) is tried as a threshold.

    Args:
        dataset: Non-empty list of rows; the last element of each row is
            its class label.

    Returns:
        A dict with keys ``'index'`` (feature column), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` row lists
        produced by ``test_split``). Ties keep the first candidate tried.
    """
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = None, None, float('inf'), None
    for index in range(len(dataset[0]) - 1):
        # A repeated threshold yields the same groups and score, and only a
        # strictly lower score replaces the best, so each distinct value is
        # evaluated once. dict.fromkeys keeps first-occurrence order.
        for value in dict.fromkeys(row[index] for row in dataset):
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < best_score:
                best_index, best_value, best_score, best_groups = index, value, gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

In [None]:
# Create a terminal node value (most common class)
def to_terminal(group):
    """Return the most common class label in ``group`` for use as a leaf.

    The last element of each row is taken as that row's class label.
    """
    label_counts = Counter(row[-1] for row in group)
    majority_label, _count = label_counts.most_common(1)[0]
    return majority_label

# 4. Create a recursive function to build the tree
def split(node, max_depth, min_size, depth):
    """Grow the subtree below ``node`` in place.

    Args:
        node: Dict holding a ``'groups'`` entry with the ``(left, right)``
            row lists of its split. The entry is removed, and ``'left'`` and
            ``'right'`` are set to either a class label (leaf) or a child
            node dict.
        max_depth: Once ``depth`` reaches this value both sides become leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node`` itself.
    """
    left, right = node['groups']
    del node['groups']

    # One side is empty, so nothing was separated: both branches share a leaf.
    if not (left and right):
        node['left'] = node['right'] = to_terminal(left + right)
        return

    # Depth budget used up: stop here and turn both sides into leaves.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    # Left child first, then right; each is either a leaf or a deeper split.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_best_split(rows)
        split(node[side], max_depth, min_size, depth + 1)

In [None]:
# Build a decision tree
def build_tree(train, max_depth, min_size):
    """Build a decision tree from ``train`` and return its root node.

    Args:
        train: Training rows, each ending with its class label.
        max_depth: Depth limit passed on to ``split``.
        min_size: Minimum group size passed on to ``split``.

    Returns:
        The root node dict, with its children filled in by ``split``.
    """
    tree = get_best_split(train)
    # The root counts as depth 1; split() fills in the children in place.
    split(tree, max_depth, min_size, depth=1)
    return tree

In [None]:
# 5. Use the tree to make a prediction
def predict(node, row):
    """Return the class label the tree rooted at ``node`` assigns to ``row``.

    Args:
        node: Node dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` entries; a child that is not a dict is a leaf label.
        row: Sequence of feature values, indexed by each node's ``'index'``.

    Returns:
        The label stored at the leaf the row reaches.
    """
    current = node
    while True:
        # Strictly-less goes left, matching the rule used when splitting.
        if row[current['index']] < current['value']:
            child = current['left']
        else:
            child = current['right']
        if not isinstance(child, dict):
            return child
        current = child

# Example usage with a small data set
# Toy training data: two numeric features followed by a 0/1 class label
dataset = [
    [2.7, 2.5, 0], [1.3, 1.8, 0], [3.6, 2.9, 0], [7.4, 3.1, 1], [9.0, 3.3, 1],
    [7.5, 0.5, 1], [2.0, 2.2, 0], [3.1, 3.0, 0], [8.2, 3.5, 1], [6.8, 2.8, 1],
]

# Fit the from-scratch tree on the toy data
tree = build_tree(dataset, max_depth=3, min_size=1)

# Predict every training row, then report expected vs. predicted labels
predictions = [predict(tree, sample) for sample in dataset]
for row, prediction in zip(dataset, predictions):
    print(f'Expected={row[-1]}, Predicted={prediction}')


Extend these steps to the iris data set. Use the functions defined above to build a tree, and predict the first ten samples of the iris data set.

In [None]:
from collections import Counter
from sklearn.datasets import load_iris
import numpy as np

In [None]:
# Load the iris data set and prepare the data
# Fetch the iris features (X) and class labels (y)
iris = load_iris()
X, y = iris.data, iris.target

# Append each label to its feature row so the label is the last element
dataset = [list(features) + [label] for features, label in zip(X, y)]

# Grow the from-scratch tree with the chosen limits
max_depth, min_size = 3, 5
tree = build_tree(dataset, max_depth, min_size)

# Compare predictions with the true labels for the first ten samples
for i in range(10):
    row = dataset[i]
    prediction = predict(tree, row)
    print(f"Expected: {row[-1]}, Predicted: {prediction}")


## To do:

1. Experiment with different `max_depth` and `min_size` values to find the optimal parameters.

2. Train a `scikit-learn` decision tree on the same data.

3. Use entropy instead of the Gini index and write the code for entropy.

1. Experiment with different `max_depth` and `min_size` values to find the optimal parameters.

In [None]:
# Same toy data as before: two numeric features followed by a 0/1 class label
dataset = [
    [2.7, 2.5, 0], [1.3, 1.8, 0], [3.6, 2.9, 0], [7.4, 3.1, 1], [9.0, 3.3, 1],
    [7.5, 0.5, 1], [2.0, 2.2, 0], [3.1, 3.0, 0], [8.2, 3.5, 1], [6.8, 2.8, 1],
]

# Try a much larger depth limit together with a larger minimum group size
tree = build_tree(dataset, max_depth=15, min_size=2)

# Predict every training row, then report expected vs. predicted labels
predictions = [predict(tree, sample) for sample in dataset]
for row, prediction in zip(dataset, predictions):
    print(f'Expected={row[-1]}, Predicted={prediction}')


2. Train a scikit-learn decision tree on the iris data set with different `max_depth` values.

In [None]:
# Load the iris data set and prepare the data
# Load the iris data set and prepare the data
iris = load_iris()
X = iris.data
y = iris.target

# Combine X and y for processing
dataset = [list(X[i]) + [y[i]] for i in range(len(y))]

# Build the from-scratch decision tree as a baseline for comparison
max_depth = 15
min_size = 1
tree = build_tree(dataset, max_depth, min_size)

# Test the prediction on the first ten samples
for i in range(10):
    row = dataset[i]
    prediction = predict(tree, row)
    print(f"Expected: {row[-1]}, Predicted: {prediction}")

# Train scikit-learn decision trees on the same data with different depth
# limits. The import is local to this cell so the cell runs on its own.
from sklearn.tree import DecisionTreeClassifier

for depth in (1, 2, 3, 5, 15):
    # random_state is fixed so repeated runs give the same tree.
    clf = DecisionTreeClassifier(max_depth=depth, random_state=0)
    clf.fit(X, y)
    predictions = clf.predict(X[:10]).tolist()
    print(
        f"scikit-learn max_depth={depth}: "
        f"training accuracy={clf.score(X, y):.3f}, "
        f"first ten predictions={predictions}"
    )


3. Use entropy instead of the Gini index and write the code for entropy.

In [None]:
from math import log2

def entropy(groups, classes):
    """Return the size-weighted entropy (base-2 logarithm) of a split.

    Args:
        groups: Sequence of groups (lists of rows); each row stores its
            class label in the last position.
        classes: Class labels to account for when scoring each group.

    Returns:
        The sum, over non-empty groups, of ``-sum(p_k * log2(p_k))`` weighted
        by the group's share of all rows.
    """
    total_rows = float(sum(len(members) for members in groups))
    weighted_entropy = 0.0
    for members in groups:
        group_size = float(len(members))
        # An empty group contributes nothing and would divide by zero below.
        if group_size == 0:
            continue
        member_labels = [record[-1] for record in members]
        log_sum = 0.0
        for label in classes:
            proportion = member_labels.count(label) / group_size
            # log2(0) is undefined; a class absent from the group adds nothing.
            if proportion > 0:
                log_sum += proportion * log2(proportion)
        weighted_entropy += -log_sum * (group_size / total_rows)
    return weighted_entropy

In [None]:
def get_best_split_entropy(dataset):
    """Find the feature/threshold pair whose split has the lowest entropy.

    Every distinct value of every feature column (all columns except the
    last, which holds the class label) is tried as a threshold.

    Args:
        dataset: Non-empty list of rows; the last element of each row is
            its class label.

    Returns:
        A dict with keys ``'index'`` (feature column), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` row lists
        produced by ``test_split``). Ties keep the first candidate tried.
    """
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = None, None, float('inf'), None
    for index in range(len(dataset[0]) - 1):
        # A repeated threshold yields the same groups and score, and only a
        # strictly lower score replaces the best, so each distinct value is
        # evaluated once. dict.fromkeys keeps first-occurrence order.
        for value in dict.fromkeys(row[index] for row in dataset):
            groups = test_split(index, value, dataset)
            # The result must not be bound to the name `entropy`: assigning
            # to it here would make it a local variable and the call to the
            # entropy() function would raise UnboundLocalError.
            split_entropy = entropy(groups, class_values)
            if split_entropy < best_score:
                best_index, best_value, best_score, best_groups = index, value, split_entropy, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

In [None]:
# Example usage with a small data set:
# Toy data: two numeric features followed by a 0/1 class label
dataset = [
    [2.7, 2.5, 0], [1.3, 1.8, 0], [3.6, 2.9, 0], [7.4, 3.1, 1], [9.0, 3.3, 1],
    [7.5, 0.5, 1], [2.0, 2.2, 0], [3.1, 3.0, 0], [8.2, 3.5, 1], [6.8, 2.8, 1],
]

# NOTE(review): build_tree() and split() choose splits with get_best_split(),
# which scores by the Gini index, so this tree does not use
# get_best_split_entropy().
tree = build_tree(dataset, max_depth=3, min_size=1)

# Predict every training row, then report expected vs. predicted labels
predictions = [predict(tree, sample) for sample in dataset]
for row, prediction in zip(dataset, predictions):
    print(f'Expected={row[-1]}, Predicted={prediction}')