In [1]:
import pandas as pd
import numpy as np
from graphviz import Digraph

def visualize_tree(tree, features, parent_name='', graph=None, is_root=True):
    if graph is None:
        graph = Digraph()
        graph.node(name='root', label=next(iter(tree)), shape='ellipse')
        parent_name = 'root'

    if isinstance(tree, dict):
        for k, v in tree.items():
            if isinstance(v, dict):
                node_name = f'{k}'
                if is_root:
                    is_root = False
                else:
                    if node_name not in features:
                        graph.node(name=node_name, label=str(k), shape='box')
                    else:
                        graph.node(name=node_name, label=str(k), shape='ellipse')
                graph.edge(parent_name, node_name)
                visualize_tree(v, features, node_name, graph, is_root)
            else:
                if v == 'Yes' or v == 'No':
                    node_name = f'{k}'
                    if node_name not in features:
                        graph.node(name=node_name, label=str(k), shape='box')
                    else:
                        graph.node(name=node_name, label=str(k), shape='ellipse')
                    graph.edge(parent_name, node_name)
                    node_name = f'{k}_{v}'
                    graph.node(name=node_name, label=str(v), shape='diamond')
                    graph.edge(str(k), node_name)
    else:
        if tree == 'Yes' or tree == 'No':
            node_name = f'{parent_name}_{tree}'
            graph.node(name=node_name, label=str(tree), shape='diamond')
            graph.edge(parent_name, node_name)

    return graph

def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy_value = np.sum([(-counts[i]/np.sum(counts)) * np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy_value

def info_gain(data, split_attribute_name, target_name="class"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weighted_entropy = np.sum([(counts[i]/np.sum(counts)) * entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name]) for i in range(len(vals))])
    information_gain = total_entropy - weighted_entropy
    return information_gain

def id3(data, original_data, features, target_attribute_name="class", parent_node_class=None, max_depth=5, min_samples_split=2, depth=0):
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    elif len(data) == 0:
        return np.unique(original_data[target_attribute_name])[np.argmax(np.unique(original_data[target_attribute_name], return_counts=True)[1])]
    elif len(features) == 0 or depth >= max_depth or len(data) < min_samples_split:
        return parent_node_class
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        item_values = [info_gain(data, feature, target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_attribute_name, parent_node_class, max_depth, min_samples_split, depth + 1)
            tree[best_feature][value] = subtree
        return tree

# Examples:
training_data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Target': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
})
# training_data = pd.DataFrame({
#     'Feature1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'],
#     'Feature2': ['X', 'X', 'Y', 'X', 'Y', 'Y', 'X', 'X', 'Y', 'Y'],
#     'Target': ['Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes']
# })
feature_columns = training_data.columns[:-1].to_list()
target_column = 'Target'
decision_tree = id3(training_data, training_data, feature_columns, target_column, max_depth=1, min_samples_split=3)
print("Pre-pruned Tree:", decision_tree)

# Visualize the pre-pruned tree
graph = visualize_tree(decision_tree, feature_columns)
graph.render('pre_pruned_decision_tree', format='png', cleanup=True)

Pre-pruned Tree: {'Outlook': {'Overcast': 'Yes', 'Rain': 'Yes', 'Sunny': 'Yes'}}


'pre_pruned_decision_tree.png'