In [1]:
import numpy as np

# Decision Tree (Árvore de Decisão)

In [49]:
X_train = np.array([[1, 1, 1],
[0, 0, 1],
 [0, 1, 0],
 [1, 0, 1],
 [1, 1, 1],
 [1, 1, 0],
 [0, 0, 0],
 [1, 1, 0],
 [0, 1, 0],
 [0, 1, 0]])

y_train = np.array([1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

## Entropy
Entropia:
$$H(p_1) = -p_1 \text{log}_2(p_1) - (1- p_1) \text{log}_2(1- p_1)$$

In [60]:
def entropy(y):
    """
    [EN-US] Computes the entropy
    [PT-BR] Computa a entropia
    
    Args:
       y (ndarray): Numpy array indicating whether each example at a node is
           edible (`1`) or poisonous (`0`)
       
    Returns:
        entropy (float): Entropy at that node
    """
    
    entropy = .0

    if len(y) != 0 or type(y) != numpy.float64:
        p = len(y[y == 1]) / len(y)
    
        if p == 0 or p == 1:
            entropy = .0
        else:
            entropy = -p * np.log2(p) - (1 - p) * np.log2(1 - p)

    return entropy

In [51]:
entropy(np.array([0.5, 1]))

1.0

In [52]:
def split_indices(X, index_feature):
    """
    Splits the data at the given node into
    left and right branches
    
    Args:
        X (ndarray):             Data matrix of shape(n_samples, n_features)
        index_feature (int):           Index of feature to split on
    
    Returns:
        left_indices (list):     Indices with feature value == 1
        right_indices (list):    Indices with feature value == 0
    """

    left_indices = []
    right_indices = []
    for i, x in enumerate(X):
        if x[index_feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices, right_indices

In [65]:
def weighted_entropy(X, y, index_feature):
    """
    This function takes the splitted dataset, the indices we chose to split and returns the weighted entropy.
    """

    left_indices, right_indices = split_indices(X, index_feature)
    
    w_left = len(left_indices) / len(X)
    w_right = len(right_indices) / len(X)
    p_left = sum(y[left_indices]) / len(left_indices)
    p_right = sum(y[right_indices]) / len(right_indices)

    weighted_entropy = w_left * entropy(p_left) + w_right * entropy(p_right)
    return weighted_entropy

In [66]:
left_indices, right_indices = split_indices(X_train, 0)
weighted_entropy(X_train, y_train, 0)

TypeError: object of type 'numpy.float64' has no len()

## Information Gain
$$\text{Information Gain} = H(p_1^\text{node})- (w^{\text{left}}H(p_1^\text{left}) + w^{\text{right}}H(p_1^\text{right}))$$

In [67]:
def information_gain(X, y, index_feature):
    """
    [EN-US] Compute the information of splitting the node on a given feature
    [PT-BR] Computa o information gain da divisão do nó em uma feature    
    """    

    p_node = sum(y) / len(y)
    h_node = entropy(p_node)
    w_entropy = weighted_entropy(X, y, index_feature)
    return h_node - w_entropy

In [17]:
information_gain(X_train, y_train, left_indices, right_indices, entropy, weighted_entropy)

0.2780719051126377

In [70]:
def best_split(X, y):
    """
    Returns the optimal feature and threshold value
    to split the node data 
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.

    Returns:
        best_feature (int):     The index of the best feature to split
    """    
    
    _, features = X.shape[1]
    best_feature = -1

    max_info_gain = 0
    for feature in range(features):
        gain = information_gain(X, y, feature)

        if gain > max_infor_gain:
            max_info_gain = gain
            best_feature = feature

    return best_feature

In [71]:
def tree_recursive(X, y, branch_name, max_depth, current_depth):
    """
    Build a tree using the recursive algorithm that split the dataset into 2 subgroups at each node.
    This function just prints the tree.
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
        branch_name (string):   Name of the branch. ['Root', 'Left', 'Right']
        max_depth (int):        Max depth of the resulting tree. 
        current_depth (int):    Current depth. Parameter used during recursive call.
   
    """

    if current_depth == max_depth:
        formatting = ' ' * current_depth + '-' * current_depth
        print(f'{formatting} {branch_name} leaf node with indices {list(range(X.shape[0]))}')
        return

    best_feature = best_split(X, y)
    formatting = '-' * current_depth
    print(f'{formatting} Depth {current_depth}, {branch_name}: Split on feature: {best_feature}')

    left_indices, right_indices = split_indices(X, best_feature)
    tree.append((left_indices, right_indices, best_feature))

    tree_recursive(X[left_indices], y, 'Left', max_depth, current_depth+1)
    tree_recursive(X[right_indices], y, 'Right', max_depth, current_depth+1)