In [1]:
import numpy as np

# Decision Tree (Árvore de Decisão)

In [2]:
# Training features: 10 samples (rows) by 3 binary features (columns).
X_train = np.array([
    [1, 1, 1],
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 1],
    [1, 1, 1],
    [1, 1, 0],
    [0, 0, 0],
    [1, 1, 0],
    [0, 1, 0],
    [0, 1, 0],
])

# Binary class label for each row of X_train, in the same order.
y_train = np.array([1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

In [3]:
def entropy(p):
    """
    Binary entropy, in bits, of a node whose fraction of positive samples is p.

    Returns 0 when p is exactly 0 or 1; that special case also avoids
    evaluating log2(0). Otherwise returns -p*log2(p) - (1-p)*log2(1-p).
    """
    # Guard clause: a pure node has no uncertainty.
    if p == 0 or p == 1:
        return 0

    q = 1 - p  # fraction of the other class
    return -p * np.log2(p) - q * np.log2(q)

In [4]:
# Sanity check: an evenly mixed node (p = 0.5) should give an entropy of 1.0.
entropy(0.5)

1.0

In [13]:
def split_indices(X, index_feature):
    """
    Partition the row positions of X according to one feature.

    Args:
        X: iterable of samples; each sample is indexed with index_feature.
        index_feature: position of the feature to split on.

    Returns:
        (left_indices, right_indices): two lists of row positions. Rows whose
        feature value equals 1 go to the left list; every other row goes to
        the right list.
    """
    has_feature, lacks_feature = [], []
    for row_number, sample in enumerate(X):
        # Pick the destination list first, then record the row position.
        target = has_feature if sample[index_feature] == 1 else lacks_feature
        target.append(row_number)
    return has_feature, lacks_feature

In [14]:
def weighted_entropy(X, y, left_indices, right_indices, entropy):
    """
    Weighted average entropy of the two child nodes produced by a split.

    Args:
        X: samples in the node being split; only len(X) is used, as the
            denominator of each branch weight.
        y: labels of those samples. It is indexed with a list of positions,
            so it must support that (e.g. a NumPy array). Labels are assumed
            to be 0/1 so that sum/len is the fraction of positives.
        left_indices: row positions sent to the left child.
        right_indices: row positions sent to the right child.
        entropy: callable mapping a fraction of positives to an entropy value.

    Returns:
        w_left * entropy(p_left) + w_right * entropy(p_right), where each
        weight is the branch's share of len(X). An empty branch contributes
        0 instead of raising ZeroDivisionError, so a feature that is constant
        within the node can still be scored.
    """
    n_total = len(X)

    def branch_term(indices):
        n_branch = len(indices)
        # An empty child has weight 0 and no defined class fraction; skip it
        # rather than dividing by zero.
        if n_branch == 0:
            return 0.0
        p_branch = sum(y[indices]) / n_branch
        return (n_branch / n_total) * entropy(p_branch)

    return branch_term(left_indices) + branch_term(right_indices)

In [15]:
# Split the training set on feature 0, then score that split; the last
# expression is displayed as the cell output.
left_indices, right_indices = split_indices(X_train, 0)
weighted_entropy(X_train, y_train, left_indices, right_indices, entropy)

0.7219280948873623

In [16]:
def information_gain(X, y, left_indices, right_indices, entropy, weighted_entropy):
    """
    Reduction in entropy obtained by splitting a node into two children.

    Args:
        X: samples contained in the node.
        y: class labels of those samples; sum(y) / len(y) is used as the
            node's fraction of positives.
        left_indices: row positions sent to the left child.
        right_indices: row positions sent to the right child.
        entropy: callable mapping a fraction of positives to an entropy value.
        weighted_entropy: callable invoked as
            weighted_entropy(X, y, left_indices, right_indices, entropy).

    Returns:
        Entropy of the node minus the weighted entropy of its children.
    """
    parent_entropy = entropy(sum(y) / len(y))
    children_entropy = weighted_entropy(X, y, left_indices, right_indices, entropy)
    return parent_entropy - children_entropy

In [17]:
# Information gain of the feature-0 split, using the left_indices / right_indices
# produced by the earlier split_indices(X_train, 0) cell.
information_gain(X_train, y_train, left_indices, right_indices, entropy, weighted_entropy)

0.2780719051126377