# Import Libraries and Dataset

In [34]:
import pandas as pd
import numpy as np

df = pd.read_csv('~/machine_learning/Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


Change `Species` column to numeric type and drop a columns to make prediction more difficult

In [35]:
df['Species'] = df['Species'].map({'Iris-setosa': 0,
                                  'Iris-versicolor': 1,
                                  'Iris-virginica':2},
                                 na_action=None)
df.drop(columns=['PetalWidthCm','PetalLengthCm'], inplace=True)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,Species
0,1,5.1,3.5,0
1,2,4.9,3.0,0
2,3,4.7,3.2,0
3,4,4.6,3.1,0
4,5,5.0,3.6,0


# Decision Tree from Scratch

### TreeNode Class



#### Attributes
+ `left`
    + points to the tree that is less than or equal to the threshold value
+ `right`
    + points to the tree that is greater than the threshold value
+ `feature`
    + holds the feature (columns) that the data was split
+ `threshold`
    + holds the specific value of the feature for splitting
+ `info_gain`
    + information gained by the decision node
+ `value`
    + if TreeNode is a leaf than this holds the majority class and prediction of the leaf, otherwise `None`

In [36]:
class TreeNode():
    def __init__(self, left=None, right=None, feature=None, threshold=None, info_gain=None, value=None):
        self.threshold=threshold
        self.feature=feature
        self.right=right
        self.left=left
        self.info_gain=info_gain

        # Used for the leaf node
        self.value=value


### DecisionTree Class



#### Attributes
+ `max_depth`
    + hperparameter to control how deep the tree is allowed to grow
+ `min_samples_split`
    + a node must contain at least a certain number of samples in it to make the split, otherwise the node will become a leaf node
+ `root`
    + points to the root of the DecisionTree
    
#### Methods
+ `dfs_build`
    + recursively builds the decision tree until the tree no longer contains enough samples or becomes too deep
+ `find_best_split`
    + Calculates all possible splits for a given dataframe and selects the feature and threshold with the largest information gain
+ `gain_from_gini`
    + calculates information gain from parent and children gini scores
+ `get_gini_score`
    + self-explanatory
+ `leafnode_value`
    + calculates the majority label of the given dataset and sets the TreeNode's `value` parameter to this majority label
+ `fit`
    + trains the decision tree classifier
+ `predict`
    + predicts the label of a test observation by traversing the DecisionTree until it reaches a leaf node and then outputs the leaf node's `value` attribute

In [44]:
class DecisionTree():
    def __init__(self, max_depth=None, min_samples_split=None):
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split

        self.root=None        
        
    def dfs_build(self, df, depth=0):
        X = df.iloc[:, :-1].to_numpy()
        Y = df.iloc[:, -1].to_numpy()
        num_samples, _ = np.shape(X)
        if num_samples >= self.min_samples_split and depth <= self.max_depth:
            node_info = self.find_best_split(df)
            if "info_gain" in node_info and node_info["info_gain"] > 0:
                left_subtree = self.dfs_build(node_info["left_df"], depth+1)
                right_subtree = self.dfs_build(node_info["right_df"], depth+1)
                return TreeNode(left_subtree, 
                               right_subtree, 
                               node_info["feature"], 
                               node_info["threshold"], 
                               node_info["info_gain"])
        leaf_value = self.leafnode_value(Y)
        return TreeNode(value = leaf_value)
        
    def find_best_split(self, df):
        max_info_gain = -float("inf")
        TreeNode_stats = {}
        for col_index, feature in enumerate(df.columns):
            if col_index == len(df.columns) - 1:
                break
            threshold_vals = df[feature].unique()
            for threshold in threshold_vals:
                left_df = df[df[feature] <= threshold]
                right_df = df[df[feature] > threshold]
                if left_df.empty or right_df.empty:
                    continue
                G = self.gain_from_gini(left_df.iloc[:,-1].to_numpy(), 
                                        right_df.iloc[:,-1].to_numpy(), 
                                        df.iloc[:,-1].to_numpy(), 
                                        threshold)
                if G > max_info_gain:
                    max_info_gain = G
                    TreeNode_stats["info_gain"] = G
                    TreeNode_stats["feature"] = col_index 
                    TreeNode_stats["threshold"] = threshold
                    TreeNode_stats["left_df"] = left_df
                    TreeNode_stats["right_df"] = right_df
                    
        return TreeNode_stats
    
    def gain_from_gini(self, left_data, right_data, labels, threshold):
        
        samples = len(left_data) + len(right_data)
        weight_left = len(left_data) / samples
        weight_right = len(right_data) / samples

        parent_gini = self.get_gini_score(labels)
        leftchild_gini = self.get_gini_score(left_data)
        rightchild_gini = self.get_gini_score(right_data)

        gain = parent_gini - ((weight_left * leftchild_gini) + (weight_right * rightchild_gini))
        return round(gain,2)

    def get_gini_score(self, Y):
        class_labels = np.unique(Y)
        gini_score = 0
        for label in class_labels:
            label_prob = len(Y[Y == label]) / len(Y)
            gini_score +=label_prob ** 2
        return 1. - gini_score
    
    def leafnode_value(self, labels):
        counts = np.bincount(labels)
        return np.argmax(counts)
    
    def fit(self, dataframe):
        self.root = self.dfs_build(dataframe)
        
    def predict(self, test_set):
        test = test_set.to_numpy()
        results = []
        for row in test:
            ptr = self.root
            while ptr.value is None:
                index = ptr.feature
                if row[index] <= ptr.threshold:
                    ptr = ptr.left
                else:
                    ptr = ptr.right
            results.append(ptr.value)
        return results

# Shuffle and Split the Dataset

In [45]:
shuffled = df.sample(frac=1)
shuffled.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,Species
90,91,5.5,2.6,1
41,42,4.5,2.3,0
123,124,6.3,2.7,2
115,116,6.4,3.2,2
132,133,6.4,2.8,2


In [46]:
length, _ = shuffled.shape
part = int(length * 0.2)
test_set = shuffled.iloc[:part, :]
train_set = shuffled.iloc[part:, :]

# Train the Classifier

In [47]:
classifier = DecisionTree(max_depth=2, min_samples_split=3)
classifier.fit(train_set)

# Make Predictions

In [48]:
preds = classifier.predict(test_set)

In [49]:
targs = test_set['Species'].to_numpy()

In [50]:
def accuracy(preds, targs):
    running_sum = 0
    for prediction, target in zip(preds, targs):
        if prediction == target:
            running_sum += 1
    return float(running_sum) / len(targs)

print(accuracy(preds, targs))

0.9666666666666667


# Compare with Scikit Learn Decision Tree Classifier

In [51]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [52]:
features = train_set.iloc[:, :-1]
labels = train_set.iloc[:, -1]

In [53]:
sklearn_classifier = DecisionTreeClassifier(min_samples_leaf=3,max_depth=2)
sklearn_classifier.fit(features.to_numpy(), labels.to_numpy())

In [54]:
y_pred = sklearn_classifier.predict(test_set.iloc[:, :-1].to_numpy())
accuracy_score(test_set.iloc[:, -1].to_numpy(), y_pred)

0.9666666666666667