# Classification Algorithms Using Trees

## Decision Trees (From Scratch)


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

from collections import Counter
from typing import Union, Optional


# Black code formatter (Optional)
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


## Decision Trees

[![image.png](https://i.postimg.cc/RCDcYtgh/image.png)](https://postimg.cc/5j8YYXdW)

<br>

[Source](https://www.youtube.com/watch?v=NxEHSAfFlK8&t=287)

<br>

[![image.png](https://i.postimg.cc/nVBF8bkm/image.png)](https://postimg.cc/vD8F9KC8)


### Decisions To Be Made

1. **Split feature**: Which feature should be used for the splitting?
2. **Split point**: At what point in a numerical variable should we split?
3. **When to stop splitting**: When should you stop splitting to keep the tree from growing too large?

### Steps

> The following steps are used to build the Decision Tree classifier from scratch.

#### Training
Given the entire dataset:
1. Calculate the information gain (IG) for each possible split, i.e., for every candidate feature and threshold.
2. Divide the data with the feature and the value threshold (if it's numerical) that gives the most information gain.
3. The result from step 2 is used to create the branches.
4. Repeat steps 1 through 3 until a stopping criterion is reached.

#### Making Predictions
Given a data point:

1. Traverse the tree until you reach a leaf node.
2. Return the most common class label at that leaf, i.e., if the leaf node is pure, return its class label; otherwise, return the majority vote.


#### Important Terms

* **Entropy**: This measures the impurity of a node, i.e., how random or unpredictable its labels are. The entropy is largest when all classes are equally represented (e.g., 50% of each class in a binary problem). It ranges between `0` and `1` for a binary problem, and between `0` and $\log_{2}(C)$ in general.

$$
E = -\sum_{i=1}^{C} p_{i} \log_{2}(p_{i})
$$

where:

$p_{i}$ is the probability of randomly picking an element of $class_{i}$ .

$C$ is the total number of classes. For a binary problem, $C = 2$. i.e $C_{unique} = [0, 1]$


* **Information Gain (IG)**: This measures the quality of a split, i.e., how much entropy is removed by splitting on a feature. It is the basic criterion for deciding whether a feature should be used to split a node. At each node of a decision tree, the feature (and threshold) with the highest information gain is used for the split. For a binary problem it ranges between `0` and `1`.

$$
IG = E_{parent} - \sum_{j \in children} \frac{n_{j}}{n_{parent}} E_{j}
$$

### Stopping Criteria

1. **Maximum depth**: This refers to how deep you want the tree to grow.
2. **Minimum number of samples**: Refers to the minimum number of samples a node must have before it can be split.
3. **Minimum impurity decrease**: Refers to the minimum entropy change required for a split to take place.

In [25]:
# Create 2 classes. The 1st class (Node) is used to implement
# the node and all its attributes while 2nd class (DecisionTree)
# contains all the logic for the classifier. The DecisionTree has
# the attributes used as stopping criteria which prevents the tree
# from growing uncontrollably.


class Node:
    """A single node of a Decision Tree classifier.

    An internal (decision) node stores the column index ``feature`` and the
    ``threshold`` it splits on, together with its ``left`` and ``right``
    children. A leaf node stores only ``value``, the predicted class label.

    Args:
        left: Child node for the left branch, or ``None``.
        right: Child node for the right branch, or ``None``.
        feature: Index of the feature used to split at this node.
        threshold: Split point applied to ``feature``.
        value: Class label held by a leaf node (keyword-only). It must stay
            ``None`` for internal nodes, because a non-``None`` value is what
            marks a node as a leaf.
    """

    def __init__(
        self,
        left: Optional["Node"] = None,
        right: Optional["Node"] = None,
        feature: Optional[int] = None,
        threshold: Optional[Union[float, int]] = None,
        *,
        value: Optional[Union[float, int]] = None,
    ):
        self.left = left
        self.right = right
        self.feature = feature
        self.threshold = threshold
        self.value = value

    def __repr__(self) -> str:
        """Return a short description; children are omitted to keep it readable."""
        if self._is_leaf_node():
            return f"Node(value={self.value!r})"
        return f"Node(feature={self.feature!r}, threshold={self.threshold!r})"

    def _is_leaf_node(self) -> bool:
        """Return True if this is a leaf node (it holds a value), else False."""
        # Compare against None explicitly: a class label of 0 is a valid value.
        return self.value is not None


# Training
# Given the entire dataset:
# 1. Calculate the information gain with each possible split.
# i.e using all the possible features, calculate the IG.
# 2. Divide the data with the feature and the value threshold (if it's numerical)
# that gives the most information gain.
# 3. The result from step 2 is used to create the branches (grow the tree)
# a. check the stopping criteria to prevent growing trees uncontrollably.
# b. find the best split using IG.
# c. create child nodes.
# 4. Repeat steps 1 thru 3 until a stopping criteria is reached.


class DecisionTree:
    """Decision Tree classifier that chooses splits by information gain.

    The tree is grown recursively by ``fit``. At every node a random subset of
    ``n_features`` columns is scanned and the (feature, threshold) pair with
    the largest information gain partitions the samples into a left child
    (``feature <= threshold``) and a right child (``feature > threshold``).

    Args:
        max_depth: Depth at which nodes are turned into leaves (the root is
            at depth 0).
        min_samples_split: Minimum number of samples a node must hold for a
            split to be attempted.
        n_features: Number of features sampled (without replacement) at each
            node. ``None`` means every feature; larger values are clipped to
            the number of columns when ``fit`` is called.
    """

    def __init__(
        self, max_depth: int, min_samples_split: int, n_features: Optional[int] = None
    ) -> None:
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.root = None  # root Node, set by fit()

    def fit(self, X: np.ndarray, y: np.ndarray) -> "DecisionTree":
        """Train the model on features ``X`` (2-D) and labels ``y`` (1-D).

        Returns:
            The fitted estimator itself, so calls can be chained.
        """
        X, y = np.asarray(X), np.asarray(y)
        # Never sample more features than the data actually has.
        all_feats = X.shape[1]
        self.n_features = (
            all_feats if self.n_features is None else min(all_feats, self.n_features)
        )
        self.root = self._grow_tree(X, y)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class label for every row of ``X``.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("This DecisionTree is not fitted yet; call fit() first.")
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _traverse_tree(self, x: np.ndarray, node: "Node"):
        """Walk from ``node`` down to a leaf for sample ``x``; return its label."""
        while not node._is_leaf_node():
            # Same rule as used for training: <= threshold goes left.
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def _grow_tree(self, X: np.ndarray, y: np.ndarray, max_depth: int = 0) -> "Node":
        """Create child nodes recursively and return the subtree's root node.

        Note:
            The ``max_depth`` parameter is the depth of the *current* node
            (0 for the root); the limit itself is ``self.max_depth``.
        """
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))  # number of distinct labels in this node

        # Base case: the node is pure, too small to split, or deep enough.
        if (
            n_labels <= 1
            or n_samples < self.min_samples_split
            or max_depth >= self.max_depth
        ):
            return Node(value=DecisionTree._get_most_common_label(input_=y))

        # Randomly select the candidate features (column indices).
        feature_idxs = np.random.choice(n_feats, size=self.n_features, replace=False)

        # Find the best split among the candidates.
        best_feature, best_thresh = self._best_split(X, y, feature_idxs)
        if best_feature is None:
            return Node(value=DecisionTree._get_most_common_label(input_=y))

        left_idxs, right_idxs = DecisionTree._split(X[:, best_feature], best_thresh)
        # A split that leaves one side empty separates nothing (e.g. all the
        # sampled features are constant), so stop here instead of recursing.
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return Node(value=DecisionTree._get_most_common_label(input_=y))

        # Create child nodes.
        left = self._grow_tree(X[left_idxs], y[left_idxs], max_depth + 1)
        right = self._grow_tree(X[right_idxs], y[right_idxs], max_depth + 1)
        return Node(left, right, best_feature, best_thresh)

    def _best_split(
        self, X: np.ndarray, y: np.ndarray, feature_idxs: list[int]
    ) -> tuple[Optional[int], Optional[float]]:
        """Use information gain to find the best split at a node.

        Returns:
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when there are no candidates to evaluate.
        """
        split_idx, split_thresh = None, None
        best_gain = -1

        # Every observed value of every candidate feature is tried as a threshold.
        for feat_idx in feature_idxs:
            feat = X[:, feat_idx]
            thresholds = np.unique(feat)

            for thresh in thresholds:
                info_gain = self._calculate_info_gain(y, feat, thresh)
                # Strict '>' keeps the first candidate among ties.
                if info_gain > best_gain:
                    best_gain = info_gain
                    split_idx, split_thresh = feat_idx, thresh

        return (split_idx, split_thresh)

    def _calculate_info_gain(
        self, y: np.ndarray, feature: np.ndarray, threshold: float
    ) -> float:
        """Return the information gain of splitting ``y`` on ``feature`` at ``threshold``.

        IG = entropy(parent) - weighted average of the children's entropies.
        """
        left_idxs, right_idxs = DecisionTree._split(feature, threshold)
        n_left, n_right = len(left_idxs), len(right_idxs)
        # Nothing is separated when one side is empty, so nothing is gained.
        if n_left == 0 or n_right == 0:
            return 0.0

        n_total = len(y)
        children_entropy = (n_left / n_total) * self._entropy(y[left_idxs]) + (
            n_right / n_total
        ) * self._entropy(y[right_idxs])
        return self._entropy(y) - children_entropy

    @staticmethod
    def _split(feature: np.ndarray, threshold: float) -> tuple[np.ndarray, np.ndarray]:
        """Return the row indices going left (``<= threshold``) and right (``>``)."""
        left_idxs = np.flatnonzero(feature <= threshold)
        right_idxs = np.flatnonzero(feature > threshold)
        return left_idxs, right_idxs

    def _entropy(self, y: np.ndarray) -> float:
        """Return the entropy (in bits) of the labels in ``y``."""
        # Only labels that actually occur are counted, so log2(0) never arises.
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return float(-np.sum(probs * np.log2(probs)))

    @staticmethod
    def _get_most_common_label(input_: np.ndarray) -> int:
        """Return the most common label in ``input_``."""
        counter = Counter(input_)
        return counter.most_common(n=1)[0][0]

$$
IG = E_{parent} - \sum_{j \in children} \frac{n_{j}}{n_{parent}} E_{j}
$$

$$
E = -\sum_{i=1}^{C} p_{i} \log_{2}(p_{i})
$$

In [23]:
# NOTE(review): np.log2 needs an input argument, so this call fails as written;
# the integer array shown as this cell's output looks stale — rerun or remove.
np.log2()

array([ 6, 12, 41, 33, 24, 11, 26, 25, 39, 29])

In [42]:
# Toy binary label vector used to work through the entropy formula by hand.
a = [0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]
entropies = []  # will hold one p * log2(p) term per class
K, total = np.unique(a), len(a)  # distinct labels and number of samples
counts = Counter(a)  # label -> number of occurrences
counts, K  # displayed for inspection

(Counter({0: 8, 1: 5}), array([0, 1]))

In [44]:
# Share of class 0: 8 of the 13 labels in `a`.
8 / 13

0.6153846153846154

In [45]:
# Share of class 1 among the labels in `a`.
prob = counts[1] / len(a)
prob

# Compute each class's p * log2(p) term, print it, and keep it for the sum.
for k_ in K:
    p_k = counts[k_] / total
    entropy = p_k * np.log2(p_k)
    print(f"class: {k_}, prob: {p_k}, entropy: {entropy}\n")
    entropies.append(entropy)
# NOTE(review): `entropies` is created in a different cell, so re-running only
# this cell appends the terms again and inflates E.
E = -np.sum(entropies)

E

class: 0, prob: 0.6153846153846154, entropy: -0.43103982654836437

class: 1, prob: 0.38461538461538464, entropy: -0.5301967781745115



0.9612366047228759

In [48]:
# Entropy of `a` in a single expression: -sum(p * log2(p)) over the classes in K.
-np.sum([((counts[k_] / total) * np.log2(counts[k_] / total)) for k_ in K])

0.9612366047228759

In [47]:
# Class frequencies via np.bincount: index = label, value = number of occurrences.
hist = np.bincount([1, 2, 3, 1, 2])
# NOTE(review): `total` is still len(a) == 13 from the earlier cell, not the
# length of this list (5), so these values do not sum to 1 — confirm intent.
prob = hist / total
prob

array([0.        , 0.15384615, 0.15384615, 0.07692308])