Modify the Decision Tree scratch code from our lecture such that:
- It accepts a hyperparameter <code>max_depth</code> and keeps growing the tree until <code>max_depth</code> is reached.
- Everything is put into a class <code>DecisionTree</code>. It should have at least two methods: <code>fit()</code> and <code>predict()</code>.
- The iris data is loaded and tried with your class.

st122645

In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [13]:
#To help with our implementation, we create a class Node
class Node:
    """A single node of the decision tree.

    A node is a leaf while ``left`` and ``right`` are ``None``.  Once a split
    is recorded on it, ``feature_index`` and ``threshold`` describe the test
    ``x[feature_index] < threshold`` that routes a sample to ``left`` (test
    is true) or to ``right`` (test is false).
    """

    def __init__(self, predicted_class):
        # Majority class of the training samples that reached this node.
        self.predicted_class = predicted_class
        # Split description; only meaningful when the node has children.
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


class DecisionTree:
    """Classification tree grown greedily by minimising weighted Gini impurity.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of split levels below the root.  ``0`` yields a single
        leaf.  ``None`` places no limit: the tree keeps splitting until no
        candidate split lowers the impurity of a node.

    Attributes set by ``fit``
    -------------------------
    classes_ : sorted array of the distinct labels seen in ``y``
    n_classes_ : number of distinct labels
    n_features_ : number of columns of ``X``
    tree_ : root ``Node`` of the fitted tree
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels.  Any values ``np.unique`` can sort are accepted;
            they do not have to be the integers ``0..k-1``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or y.ndim != 1:
            raise ValueError("X must be 2-dimensional and y 1-dimensional")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.size == 0:
            raise ValueError("cannot fit a tree on zero samples")

        # Internally every label is replaced by its position in classes_, so
        # the counting code can use labels directly as list indices.
        self.classes_, y_encoded = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y_encoded)

    def predict(self, X):
        """Return a list with the predicted class label for every row of ``X``."""
        return [self._predict(inputs) for inputs in X]

    def _class_counts(self, y):
        """Return, for each class index ``0..n_classes_-1``, how often it occurs in ``y``."""
        return [np.sum(y == c) for c in range(self.n_classes_)]

    @staticmethod
    def _gini(counts, total):
        """Gini impurity ``1 - sum(p_c ** 2)`` of a node with the given class counts."""
        return 1.0 - sum((n / total) ** 2 for n in counts)

    def find_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        ``y`` must hold integer class indices in ``0..n_classes_-1``.

        Returns
        -------
        (feature_ix, threshold)
            The best split, or ``(None, None)`` when there are fewer than two
            samples or no split is strictly purer than the node itself.
        """
        n_samples = y.size
        if n_samples <= 1:
            return None, None

        class_parent = self._class_counts(y)
        # A split is only accepted if it beats the impurity of the node itself.
        best_gini = self._gini(class_parent, n_samples)
        feature_ix, threshold = None, None

        for feature in range(self.n_features_):
            # One argsort orders both the feature values and their labels.
            sort_idx = np.argsort(X[:, feature])
            sample_sorted = X[sort_idx, feature]
            y_sorted = y[sort_idx]

            # Sweep the split position left to right: position i puts the
            # first i sorted samples on the left and the rest on the right.
            class_left = [0] * self.n_classes_
            class_right = list(class_parent)

            for i in range(1, n_samples):
                # Move sample i-1 from the right side to the left side.
                c = y_sorted[i - 1]
                class_left[c] += 1
                class_right[c] -= 1

                # Equal values must end up on the same side, so no threshold
                # can separate them; skip before doing any impurity work.
                if sample_sorted[i] == sample_sorted[i - 1]:
                    continue

                gini_left = self._gini(class_left, i)
                gini_right = self._gini(class_right, n_samples - i)
                weighted_gini = ((i / n_samples) * gini_left) + ((n_samples - i) / n_samples) * gini_right

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    feature_ix = feature
                    # Midpoint between the two neighbouring values.
                    threshold = (sample_sorted[i] + sample_sorted[i - 1]) / 2
        return feature_ix, threshold

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        ``y`` holds class indices; the node stores the original label taken
        from ``classes_``.  ``depth`` is the number of splits above this node.
        """
        majority_index = np.argmax(self._class_counts(y))
        node = Node(predicted_class=self.classes_[majority_index])

        # max_depth=None means "no limit"; comparing an int with None would raise.
        may_split = self.max_depth is None or depth < self.max_depth
        if not may_split:
            return node

        feature, threshold = self.find_split(X, y)
        if feature is None:
            return node

        # Samples strictly below the threshold go left, all others go right.
        goes_left = X[:, feature] < threshold
        node.feature_index = feature
        node.threshold = threshold
        node.left = self._grow_tree(X[goes_left], y[goes_left], depth + 1)
        node.right = self._grow_tree(X[~goes_left], y[~goes_left], depth + 1)
        return node

    def _predict(self, inputs):
        """Walk one sample from the root down to a leaf and return its class."""
        node = self.tree_
        # Internal nodes always have both children, so checking one suffices.
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class


if __name__ == "__main__":
    # Demo: fit the tree on the iris data set and classify three hand-made
    # samples.  scikit-learn is imported here so that the classes above can
    # be used without it being installed.
    from sklearn.datasets import load_iris

    dataset = load_iris()
    X, y = dataset.data, dataset.target

    clf = DecisionTree(max_depth=10)
    clf.fit(X, y)

    # predict() expects a 2-D collection, so each sample is wrapped in a list
    # and printed on its own line.
    for sample in ([0, 1, 2, 1.5], [1, 3, 5, 15], [0, 0, 5, 1.5]):
        print(clf.predict([sample]))

[0]
[2]
[2]
