# Decision Tree
https://www.youtube.com/watch?v=NxEHSAfFlK8 <br>
Need to decide on
- Which feature do we split on
- Where to split (eg with numerical values)
- When to stop


In [1]:
import numpy as np
# TODO MUST WRITE THIS OURSELVES
from collections import Counter

class Node:
    """
    A class made for nodes of a decision tree
    """
    # to pass a value to the var 'value', you must call the name (because of *)
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Which feature to split on
        self.feature = feature
        # Which threshold to split on (where)
        self.threshold = threshold
        # Left node we're pointing to
        self.left = left
        # Right node we're pointing to
        self.right = right 
        # Value of the node. Incase it is not a leaf node, value is None
        self.value = None

    def is_leaf_node(self):
        """
        This function checks whether a node is a leaf node or not

        Returns:
        True if the node is a leaf node
        False id the node is NOT a leaf node
        """
        return self.value is not None


class DecisionTree:
    """
    A class for a decision tree
    """
    def __init__(self, min_sample_split=2, max_depth=100, n_features=None):
        # Stopping criteria
        self.min_samples_split = min_sample_split
        self.max_depth = max_depth
        # Add randomness to tree by eg using subset of features
        self.n_features = n_features
        # Root, start of tree
        self.root = None

    def fit(self, X, y):
        """
        ....
        
        """
        # Making sure that parameter n_features does not exceed self.n_features
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        # building tree recursively
        self.root = self._grow_tree(X,y)
    
    # Helper function
    def _grow_tree(self, X, y, depth=0):
        """
        Function that grows the tree recursively, helper function used in fit function

        """
        n_samples, n_featu = X.shape
        # Get all unique labels (values of y)
        n_labels = len(np.unique(y))

        # Check the stopping criteria
        if (depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value = leaf_value)
        
        # we create the randomness of our decision tree here, no duplicates
        feature_index = np.random.choice(n_featu, self.n_features, replace=False)
        # Find the best split
        best_feat, best_thres = self._best_split(X, y, feature_idex)
        # Create child nodes, create new subtrees

        # Depth increase by 1 as we increase child nodes

    # Helper function to select best split
    def _best_split(self, X, y, feature_ids):
        """
        Helper function that calculates the best possible split based on the information gain
        
        Return:
        Split place and split threshold
        """
        # Initial set as -1
        best_gain = -1
        split_id, split_thres = None, None

        for feature_id, in feature_ids:
            # Get column
            X_column = X[:, feature_id]
            # Get only the unique ones
            thresholds = np.unique(X_column)

            for threshold in thresholds:
                # Calculate the information gain
                gain = self.information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_id = feature_id
                    split_thres = threshold
        return split_id, split_thres

    # Helper function for information gain
    def _information_gain(self, y, X_column, threshold):
        """
        Helper function that calculates the information gain

        Return:
        Information gain
        """
        # Parent entropy
        parent_entropy = self._entropy(y)

        # Create children
        left_indices, right_indices = self._split(X_column, threshold)

        # If no more children, IG is 0
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0

        # Caculate the weighted average entropy of the children
        n = len(y)
        n_left, n_right  = len(left_indices), len(right_indices)
        entropy_left, entropy_right = self._entropy(y[left_indices]), self._entropy(y[right_indices])
        child_entropy = (n_left / n) * entropy_left + (n_right / n) * entropy_right\
        
        # Calculate the information gain (IG)
        information_gain = parent_entropy - child_entropy
        return information_gain
    
    def _split(self, X_column, split_thresh):
        """
        Helper function that splits the tree creating children

        Return:
        Left and right indices
        """
        # Gives you one list of values (.flatten())
        left_indices = np.argwhere(X_column <= split_thresh).flatten()
        right_indices = np.argwhere(X_column > split_thresh).flatten()

        return left_indices, right_indices

    def _entropy(self, y):
        """
        Helper function that calculates the entropy

        Return:
        Entropy
        """

        # Count the occurances in bins (histogram)
        hist = np.bincount(y)
        ps = hist / len(y)
        # Entropy formula
        # TODO Is it log2 or log3?
        entropy = - np.sum([p*np.log2(p) for p in ps if p>0])
        return entropy
    
    # Helper function to calculate value of y
    def _most_common_label(self, y):
        """
        Helper function used in the _grow_tree_function to calculate the value of a node
        
        Return:
        Returns the value of y
        """
        # TODO: REWRITE
        counter = Counter(y)
        # Get most common label's tuple and the first info including the value
        value = counter.most_common(1)[0][0]
        return value

    def predict():



SyntaxError: incomplete input (1533995893.py, line 103)

In [13]:
y = ['hallo', 'hallo', 'thuis', 'tv']
dct = {}
for i in y:
    if i in dct:
        dct[i] += 1
    else:
        dct[i] = 1
print(dct)
max(set(y), key=y.count)


{'hallo': 2, 'thuis': 1, 'tv': 1}


'hallo'