In [1]:
# Decision trees mirror how people make decisions: at each step we ask a question whose answer narrows down the remaining options.
# A decision tree does the same thing mathematically, choosing at each node the question (a split on one feature) that best separates the outcomes.

In [1]:
import numpy as np
import pandas as pd
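# Given a feature set x and an outcome y, the fundamental question is: "What is the best way to split the data?"
# Below we answer it with entropy: the best split is the feature whose partition of y yields the largest information gain.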


def calculate_entropy(values):
    """Return the Shannon entropy, in bits, of the labels in ``values``.

    Parameters
    ----------
    values : pandas.Series or pandas.DataFrame
        The labels. Anything exposing ``value_counts()`` and ``len()`` works;
        for a DataFrame each distinct row is treated as one label.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed labels. ``0.0`` is returned
        when there are no labels at all, or when every label is missing.

    Notes
    -----
    ``value_counts()`` discards missing labels (NaN/None), so probabilities
    are normalised by the number of labels that were actually counted rather
    than by ``len(values)``. Dividing by ``len(values)`` would make the
    probabilities sum to less than 1 and understate the entropy.
    """
    if len(values) == 0:
        return 0.0

    frequencies = values.value_counts().to_numpy(dtype=float)
    # Categorical data can report categories with a zero count; drop them so
    # that log2(0) is never evaluated.
    frequencies = frequencies[frequencies > 0]

    counted = frequencies.sum()
    if counted == 0:
        # Every label was missing, so there is nothing to be uncertain about.
        return 0.0

    probabilities = frequencies / counted
    entropy = float(-(probabilities * np.log2(probabilities)).sum())
    # A pure set yields -0.0 from the negation; adding 0.0 normalises the sign.
    return entropy + 0.0


def split_and_calculate_entropy(x, y, feature_name):
    """Return the size-weighted entropy of ``y`` after splitting on one feature.

    The rows are partitioned by the distinct values of ``x[feature_name]``;
    the entropy of ``y`` within each partition is weighted by the share of
    rows that fall into it, and the weighted entropies are summed.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; ``x[feature_name]`` must be a single column.
    y : pandas.Series or pandas.DataFrame
        Outcomes, indexed like ``x`` (boolean-mask selection is used).
    feature_name : hashable
        Column of ``x`` to split on.

    Returns
    -------
    float or int
        The weighted entropy; ``0`` when there are no rows.

    Notes
    -----
    Rows whose feature value is missing form their own branch. Comparing with
    ``==`` never matches NaN, so without this those rows would be dropped
    from the sum while still being counted in the total, understating the
    entropy and overstating the feature's information gain.
    """
    feature = x[feature_name]
    total_samples = len(y)

    def branch_entropy(mask):
        # Entropy of the outcomes selected by ``mask``, weighted by branch size.
        subset_output = y[mask]
        weight = len(subset_output) / total_samples
        return weight * calculate_entropy(subset_output)

    weighted_entropy = 0
    for value in feature.dropna().unique():
        weighted_entropy += branch_entropy(feature == value)

    missing = feature.isna()
    if missing.any():
        weighted_entropy += branch_entropy(missing)

    return weighted_entropy


def split_data(x, y, min_gain=0.01):
    """Score every usable feature by information gain and pick the best one.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table; each column is a candidate split.
    y : pandas.Series or pandas.DataFrame
        Outcomes for the rows of ``x``.
    min_gain : float, default 0.01
        A feature is reported only if its gain is strictly greater than this.

    Returns
    -------
    tuple[dict, object]
        ``(information_gains, best_feature)``. ``information_gains`` maps each
        retained feature to its gain. ``best_feature`` is the retained feature
        with the largest gain (the earliest column wins ties), or ``None``
        when no retained feature has a gain above zero.
    """
    parent_entropy = calculate_entropy(y)
    cardinality_limit = len(x) / 2

    information_gains = {}
    for column in x.columns:
        # Features with more distinct values than half the rows are ignored.
        if len(x[column].unique()) > cardinality_limit:
            continue

        reduction = parent_entropy - split_and_calculate_entropy(x, y, column)
        if reduction > min_gain:
            information_gains[column] = reduction

    best_feature = None
    if information_gains:
        # max() returns the first maximal key, i.e. the earliest column.
        candidate = max(information_gains, key=information_gains.get)
        # The winner must also beat a gain of zero, whatever min_gain is.
        if information_gains[candidate] > 0:
            best_feature = candidate

    return information_gains, best_feature




In [2]:
from pandas import DataFrame

# Toy data set: "gender" determines the outcome exactly, whereas each
# "class" value contains one death and one survivor.
x = {
    "gender": ["male", "female", "male", "female"],
    "class": ["A", "A", "B", "B"],
}
y = {"death": [True, False, True, False]}

x_df, y_df = DataFrame(x), DataFrame(y)

# Prints the (information_gains, best_feature) tuple. With the default
# min_gain, only "gender" (gain 1.0) is retained and chosen as best feature.
# NOTE(review): a saved output that also lists "class" would predate the
# current split_data - re-run the cell to refresh it.
print(split_data(x_df, y_df))




{'gender': np.float64(1.0), 'class': np.float64(-1.0)}


# CART - Classification and Regression Trees

In [None]:


class DecisionTree:
    """Skeleton of a decision tree (work in progress).

    Only the constructor and the beginning of a Gini helper are present;
    nothing visible here fits a tree or makes predictions.
    """
    def __init__(self, max_samples, min_samples_split):
        """Store the hyperparameters and start with no fitted tree.

        Parameters
        ----------
        max_samples
            Stored unchanged; not used by any code shown here.
            NOTE(review): meaning unclear - possibly meant as a growth limit
            such as a maximum depth. TODO confirm.
        min_samples_split
            Stored unchanged; not used by any code shown here. Presumably the
            minimum number of samples a node needs before it may be split -
            TODO confirm.
        """
        self.max_samples = max_samples
        self.min_samples_split = min_samples_split
        # No tree yet; nothing visible here assigns this attribute later.
        self.tree = None
    def _gini(self, y):
        """Unfinished helper, presumably meant to compute the Gini impurity of ``y``.

        ``y`` must expose ``value_counts()``. As written, the counts are
        computed but never used and nothing is returned, so the call yields
        ``None``. TODO: finish the implementation.
        """
        counts = y.value_counts()

        

        
