# Decision Tree Classifier (CART) Implementation

In [52]:
# Toy dataset: two numeric feature columns followed by the class label (0 or 1).
dataset = [
    [2.771244718, 1.784783929, 0],
    [1.728571309, 1.169761413, 0],
    [3.678319846, 2.81281357, 0],
    [3.961043357, 2.61995032, 0],
    [2.999208922, 2.209014212, 0],
    [7.497545867, 3.162953546, 1],
    [9.00220326, 3.339047188, 1],
    [7.444542326, 0.476683375, 1],
    [10.12493903, 3.234550982, 1],
    [6.642287351, 3.319983761, 1],
]

In [53]:
def test_split(data, index, value):
    left, right = [], []
    
    for row in data:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
            
    return [left, right]

In [90]:
def gini(groups):
    """Return the weighted Gini impurity of a candidate split.

    ``groups`` is a sequence of row lists (normally ``[left, right]``); the
    class label of each row is its last element. For every group the impurity
    is ``1 - sum(p_c ** 2)`` over the classes present in that group, and the
    per-group impurities are combined weighted by each group's share of the
    total number of rows.

    Any hashable class labels are supported, not only 0 and 1. Empty groups
    contribute nothing, and ``0.0`` is returned when there are no rows at all.
    """
    num_total_samples = float(sum(len(group) for group in groups))
    if not num_total_samples:
        # No rows anywhere: nothing to weight, and dividing would fail.
        return 0.0

    weighted_impurity = 0.0
    for group in groups:
        num_samples = float(len(group))
        if not num_samples:
            # An empty group has zero weight in the split.
            continue

        # Count the rows of each class in this group (insertion-ordered).
        class_counts = {}
        for row in group:
            class_counts[row[-1]] = class_counts.get(row[-1], 0) + 1

        # Sum of squared class probabilities within the group.
        squared_prob_total = sum(
            (count / num_samples) ** 2 for count in class_counts.values()
        )

        weighted_impurity += (1 - squared_prob_total) * num_samples / num_total_samples

    return weighted_impurity

In [97]:
def best_split(train, lowest_gini=100.0, gini_score=0.0):
    """Find the single-column threshold split of ``train`` with the lowest Gini.

    Every value found in every column except the last one (the class column)
    is tried as a threshold: the rows are partitioned with ``test_split`` and
    the partition is scored with ``gini``. The first candidate with the
    strictly lowest score wins.

    Args:
        train: list of rows; the last element of each row is its class label.
        lowest_gini: score a candidate must beat (strictly) to be accepted.
        gini_score: ignored; it is overwritten before being read and is kept
            only so existing callers keep working.

    Returns:
        A dict with keys ``"index"`` (column of the split), ``"val"``
        (threshold), ``"gini"`` (score of the split) and ``"sub-tree"``
        (the ``[left, right]`` row groups).

        When no candidate beats ``lowest_gini`` -- which includes an empty
        ``train`` and rows without feature columns -- ``"index"`` and
        ``"val"`` are ``None``, ``"gini"`` is the unchanged ``lowest_gini``
        and ``"sub-tree"`` holds all rows on the left with an empty right.
    """
    # "No split" fallback so the return below never reads unbound names.
    best_index, best_value, best_group = None, None, [list(train), []]

    # Guard the row-width lookup: an empty dataset has no first row.
    num_features = len(train[0]) - 1 if len(train) else 0

    # Try every value of every feature column as a threshold.
    for col in range(num_features):
        for row in train:
            groups = test_split(train, col, row[col])
            gini_score = gini(groups)

            # Strict comparison keeps the earliest of equally good splits.
            if gini_score < lowest_gini:
                lowest_gini = gini_score
                best_index, best_value, best_group = col, row[col], groups

    return {"index": best_index, "val": best_value, "gini": lowest_gini, "sub-tree": best_group}

In [104]:
def leaf_node(group):
    """Return the most frequent class label (last column) among the rows of ``group``."""
    labels = []
    for sample in group:
        labels.append(sample[-1])

    # Rank each distinct label by how often it occurs in the group.
    distinct_labels = set(labels)
    return max(distinct_labels, key=labels.count)

In [112]:
def build_tree(node, max_depth, min_samples, depth=0):
    """Recursively grow the tree below ``node``, modifying it in place.

    ``node`` is a dict shaped like the result of ``best_split``. Its
    ``"sub-tree"`` entry (the ``[left, right]`` row groups of the split) is
    removed and replaced by ``"left"`` and ``"right"`` entries, each being
    either a class label (leaf) or a nested node dict.

    Args:
        node: split node to expand; must contain ``"sub-tree"``.
        max_depth: once ``depth`` reaches this value both children become
            leaves.
        min_samples: a child group with this many rows or fewer becomes a
            leaf instead of being split again.
        depth: depth of ``node`` in the tree; the root defaults to 0.

    Returns:
        None. The result is written into ``node``.
    """
    # The groups were already computed when this node was created, so just
    # take them out of the node; they are not part of the finished tree.
    left, right = node.pop("sub-tree")

    # One side is empty: the split separates nothing, so both children are
    # the same leaf, computed from all the rows.
    if not left or not right:
        node["left"] = node["right"] = leaf_node(left + right)
        return

    # Depth limit reached: stop splitting and close both sides as leaves.
    if depth >= max_depth:
        node["left"] = leaf_node(left)
        node["right"] = leaf_node(right)
        return

    # Otherwise handle each child: small groups become leaves, larger ones
    # are split again and expanded recursively.
    for side, rows in (("left", left), ("right", right)):
        if len(rows) <= min_samples:
            node[side] = leaf_node(rows)
        else:
            node[side] = best_split(rows)
            build_tree(node[side], max_depth, min_samples, depth + 1)

In [57]:
def fit():
    """Stub, not implemented yet: takes no arguments and returns None."""
    return None

In [58]:
def predict():
    """Stub, not implemented yet: takes no arguments and returns None."""
    return None

In [59]:
# Class label (last column) of every row in the dataset.
p = [row[-1] for row in dataset]

In [60]:
p

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [61]:
p.count(0)

5

In [62]:
# Scratch check: the message is printed only when x is not 9.
x = 9
if x != 9:
    print("got it")

In [106]:
# Split the toy dataset on column 0: rows with a value below 6.642287351 go
# to gr[0], all other rows go to gr[1].
gr = test_split(dataset, 0, 6.642287351)

In [107]:
gini(gr)

0.0

In [108]:
gr[0]

[[2.771244718, 1.784783929, 0],
 [1.728571309, 1.169761413, 0],
 [3.678319846, 2.81281357, 0],
 [3.961043357, 2.61995032, 0],
 [2.999208922, 2.209014212, 0]]

In [109]:
gr[1]

[[7.497545867, 3.162953546, 1],
 [9.00220326, 3.339047188, 1],
 [7.444542326, 0.476683375, 1],
 [10.12493903, 3.234550982, 1],
 [6.642287351, 3.319983761, 1]]

In [96]:
best_split(dataset)

{'index': 0,
 'val': 6.642287351,
 'gini': 0.0,
 'sub-tree': [[[2.771244718, 1.784783929, 0],
   [1.728571309, 1.169761413, 0],
   [3.678319846, 2.81281357, 0],
   [3.961043357, 2.61995032, 0],
   [2.999208922, 2.209014212, 0]],
  [[7.497545867, 3.162953546, 1],
   [9.00220326, 3.339047188, 1],
   [7.444542326, 0.476683375, 1],
   [10.12493903, 3.234550982, 1],
   [6.642287351, 3.319983761, 1]]]}

In [98]:
# Last element (the class label) of every row in the dataset.
clss_vl = [row[-1] for row in dataset]

In [101]:
# list.count() needs the value to count; count the class-0 labels.
clss_vl.count(0)

5

In [111]:
leaf_node(gr[0])

0