# Import Dataset and Libraries

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('./iris.csv')
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## Pre-Processing

In [2]:
df['variety'] = df['variety'].map({'Setosa': 0,
                                  'Versicolor': 1,
                                  'Virginica':2},
                                 na_action=None)
df['variety'].unique()

array([0, 1, 2])

# Decision Tree from Scratch

First up is the TreeNode which is exactly what it says, a class for the nodes in the tree, whether they be decision or leaf nodes.

The `left` and `right` parameters are pointers to children. The `feature` is the feature (column) that the decision node will be splitting off of. The `threshold` value is the numeric splitting value according to the corresponding feature; and the `info_gain` parameter displays the information gain for each decision node.

In [3]:
class TreeNode():
    def __init__(self, left=None, right=None, feature=None, threshold=None, info_gain=None, value=None):
        self.threshold=threshold
        self.feature=feature
        self.right=right
        self.left=left
        self.info_gain=info_gain

        # Used for the leaf node
        self.value=value


The `DecisionTree()` class will hold all of the functions necessary to build the Decision Tree. The parameters are `max_depth` and `min_samples_split` which are both hyperparameters to keep the decsion tree from overfitting. The *max_depth* parameters keeps the tree from extending past a certain depth and the *min_samples_split* essentially says that a node must contain at least a certain number of samples in it to make the split. If the amount of sample in an internal node is less than the *min_samples_split*, then that node will become a leaf node.

In [4]:
class DecisionTree():
    def __init__(self, max_depth=None, min_samples_split=None):
        # Decision Tree hyperparameters to prevent from overfitting
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split

        # initialize the root of the tree
        self.root=None        
        
    def dfs_build(self, df, depth=0):
        X = df.iloc[:, :-1].to_numpy()
        Y = df.iloc[:, -1].to_numpy()
        num_samples, _ = np.shape(X)
        print(f"Current Depth {depth}")
        print(f"and dataset {df}")
        if num_samples >= self.min_samples_split and depth <= self.max_depth:
            node_info = self.find_best_split(df)
            print("made it past this at least")
            if "info_gain" in node_info and node_info["info_gain"] > 0:
                print("subtree part")
                left_subtree = self.dfs_build(node_info["left_df"], depth+1)
                right_subtree = self.dfs_build(node_info["right_df"], depth+1)
                return TreeNode(left_subtree, 
                               right_subtree, 
                               node_info["feature"], 
                               node_info["threshold"], 
                               node_info["info_gain"])
        leaf_value = self.leafnode_value(Y)
        return TreeNode(value = leaf_value)
        
    def find_best_split(self, df):
        max_info_gain = -float("inf")
        TreeNode_stats = {}
        for col_index, feature in enumerate(df.columns):
            if col_index == len(df.columns) - 1:
                break
            threshold_vals = df[feature].unique()
            for threshold in threshold_vals:
                left_df = df[df[feature] <= threshold]
                right_df = df[df[feature] > threshold]
                if left_df.empty or right_df.empty:
                    continue
                G = self.gain_from_gini(left_df.iloc[:,-1].to_numpy(), 
                                        right_df.iloc[:,-1].to_numpy(), 
                                        df.iloc[:,-1].to_numpy(), 
                                        threshold)
                if G > max_info_gain:
                    max_info_gain = G
                    TreeNode_stats["info_gain"] = G
                    TreeNode_stats["feature"] = col_index 
                    TreeNode_stats["threshold"] = threshold
                    TreeNode_stats["left_df"] = left_df
                    TreeNode_stats["right_df"] = right_df
                    
        return TreeNode_stats
    
    def gain_from_gini(self, left_data, right_data, labels, threshold):
        
        samples = len(left_data) + len(right_data)
        weight_left = len(left_data) / samples
        weight_right = len(right_data) / samples

        parent_gini = self.get_gini_score(labels)
        leftchild_gini = self.get_gini_score(left_data)
        rightchild_gini = self.get_gini_score(right_data)

        gain = parent_gini - ((weight_left * leftchild_gini) + (weight_right * rightchild_gini))
        return round(gain,2)

    def get_gini_score(self, Y):
        class_labels = np.unique(Y)
        gini_score = 0
        for label in class_labels:
            label_prob = len(Y[Y == label]) / len(Y)
            gini_score +=label_prob ** 2
        return 1. - gini_score
    
    def leafnode_value(self, labels):
        print("calculated some leafnode stufff")
        counts = np.bincount(labels)
        return np.argmax(counts)
    
    def fit(self, dataframe):
        self.root = self.dfs_build(dataframe)
        
    def predict(self, test_set):
        test = test_set.to_numpy()
        results = []
        #2-D array now
        for row in test:
            ptr = self.root
            while ptr.value is None:
                index = ptr.feature
                if row[index] <= ptr.threshold:
                    ptr = ptr.left
                else:
                    ptr = ptr.right
            results.append(ptr.value)
        return results

# Shuffle and Split the Dataset

In [5]:
shuffled = df.sample(frac=1)
shuffled.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
77,6.7,3.0,5.0,1.7,1
71,6.1,2.8,4.0,1.3,1
20,5.4,3.4,1.7,0.2,0
84,5.4,3.0,4.5,1.5,1
12,4.8,3.0,1.4,0.1,0


In [6]:
length, _ = shuffled.shape
part = int(length * 0.2)
test_set = shuffled.iloc[:part, :]
train_set = shuffled.iloc[part:, :]

In [9]:
classifier = DecisionTree(max_depth=3, min_samples_split=3)

In [10]:
classifier.fit(train_set)

Current Depth 0
and dataset      sepal.length  sepal.width  petal.length  petal.width  variety
72            6.3          2.5           4.9          1.5        1
76            6.8          2.8           4.8          1.4        1
95            5.7          3.0           4.2          1.2        1
38            4.4          3.0           1.3          0.2        0
53            5.5          2.3           4.0          1.3        1
..            ...          ...           ...          ...      ...
10            5.4          3.7           1.5          0.2        0
88            5.6          3.0           4.1          1.3        1
73            6.1          2.8           4.7          1.2        1
27            5.2          3.5           1.5          0.2        0
103           6.3          2.9           5.6          1.8        2

[120 rows x 5 columns]
made it past this at least
subtree part
Current Depth 1
and dataset     sepal.length  sepal.width  petal.length  petal.width  variety
38        

made it past this at least
subtree part
Current Depth 4
and dataset      sepal.length  sepal.width  petal.length  petal.width  variety
121           5.6          2.8           4.9          2.0        2
114           5.8          2.8           5.1          2.4        2
106           4.9          2.5           4.5          1.7        2
142           5.8          2.7           5.1          1.9        2
138           6.0          3.0           4.8          1.8        2
101           5.8          2.7           5.1          1.9        2
calculated some leafnode stufff
Current Depth 4
and dataset     sepal.length  sepal.width  petal.length  petal.width  variety
70           5.9          3.2           4.8          1.8        1
calculated some leafnode stufff
Current Depth 3
and dataset      sepal.length  sepal.width  petal.length  petal.width  variety
111           6.4          2.7           5.3          1.9        2
135           7.7          3.0           6.1          2.3        2
146       

In [11]:
preds = classifier.predict(test_set)

In [12]:
targs = test_set["variety"].to_numpy()

In [13]:
def accuracy(preds, targs):
    running_sum = 0
    for prediction, target in zip(preds, targs):
        if prediction == target:
            running_sum += 1
    return float(running_sum) / len(targs)

In [14]:
print(accuracy(preds, targs))

0.9666666666666667
