In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
import random

# Processing Files

In [2]:
# Build `df` in one chained expression straight from the CSV so this cell is
# idempotent: re-running it no longer fails with a KeyError on the second
# attempt to drop the "Id" column from an already-processed frame.
# NOTE(review): `rename` silently ignores keys that are not present, so this is
# a no-op if the CSV header spells the column differently (e.g. "Species") --
# confirm against the file. Later cells only rely on the label being the LAST
# column, not on its name.
df = (
    pd.read_csv('./Iris.csv')
    .drop(columns="Id")
    .rename(columns={"species": "label"})
)

# Train-Test-Split

In [3]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected by index label.
    test_size : int or float
        An int is used directly as the number of test rows. A float is
        multiplied by ``len(df)`` and rounded to get that number.

    Returns
    -------
    (train_df, test_df)
        ``test_df`` holds the sampled rows (in sampled order); ``train_df``
        is ``df`` with those rows dropped.

    Notes
    -----
    Sampling uses the global ``random`` module state, so call
    ``random.seed(...)`` beforehand for a reproducible split.
    """
    n_test = round(test_size * len(df)) if isinstance(test_size, float) else test_size
    held_out = random.sample(population=df.index.tolist(), k=n_test)
    return df.drop(held_out), df.loc[held_out]

In [4]:
# Seed the global RNG first: train_test_split samples with the `random` module,
# so this makes the 20 held-out rows the same on every run.
random.seed(0)
train_df, test_df = train_test_split(df, test_size=20)
# The tree code below works on a plain 2-D array (label in the last column).
data = train_df.values

# TreeNode

In [5]:
class Node(object):
    """One vertex of a binary tree.

    Attributes
    ----------
    label : stored as given; defaults to None.
    attr, splitval : stored as given; set together via ``set_attr``.
    left, right : child references; default to None.
    """

    def __init__(self, label=None, attr=None, splitval=None, left=None, right=None):
        """Store every argument on the instance unchanged."""
        self.label, self.attr, self.splitval = label, attr, splitval
        self.left, self.right = left, right

    def set_label(self, label):
        """Replace this node's ``label``."""
        self.label = label

    def set_attr(self, attr, splitval):
        """Replace this node's ``attr`` and ``splitval`` in one call."""
        self.attr, self.splitval = attr, splitval

# Decision Tree 

In [6]:
class DecisionTree(object):
    """Binary decision-tree classifier using threshold splits chosen by information gain.

    Every ``data`` array handled by this class is indexed as a 2-D array whose
    last column holds the class label and whose other columns are features.
    Trees are built from ``Node`` objects: a node whose ``label`` is not None
    is a leaf; any other node routes a row to ``left`` when
    ``row[attr] <= splitval`` and to ``right`` otherwise.
    """

    def __init__(self, minleaf=2):
        """Store ``minleaf``; ``fit`` turns any subset with fewer rows into a leaf."""
        self.minleaf = minleaf

    def entropy(self, data):
        """Return the base-2 Shannon entropy of the label column of ``data``.

        An array with no rows has entropy 0.
        """
        if len(data) == 0:
            return 0
        _, counts = np.unique(data[:, -1], return_counts=True)
        prob = counts / counts.sum()
        return sum(-prob * np.log2(prob))

    def infomation_gain(self, data, column_index, splitval):
        """Return the entropy reduction from splitting ``data`` at a threshold.

        Rows with ``data[:, column_index] <= splitval`` form the "below" part
        and rows with a value ``> splitval`` form the "above" part. The result
        is ``entropy(data)`` minus the size-weighted entropies of the two parts.
        Returns 0.0 when neither part contains any row.
        """
        split_column_values = data[:, column_index]
        data_below = data[split_column_values <= splitval]
        data_above = data[split_column_values > splitval]

        n_data_points = len(data_below) + len(data_above)
        if n_data_points == 0:
            # Nothing to weigh; avoids a ZeroDivisionError on empty input.
            return 0.0
        p_data_below = len(data_below) / n_data_points
        p_data_above = len(data_above) / n_data_points

        weighted_child_entropy = (p_data_above * self.entropy(data_above)
                                  + p_data_below * self.entropy(data_below))
        return self.entropy(data) - weighted_child_entropy

    def choose_split(self, data):
        """Pick the (column index, threshold) pair with the highest information gain.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature column. When several candidates tie, the one
        examined last wins (``>=`` comparison).

        Returns
        -------
        (bestattr, bestsplitval)
            ``(None, None)`` when no candidate exists, i.e. every feature
            column holds a single distinct value (or ``data`` has no rows).
        """
        # Start below any achievable gain so the first candidate is always
        # accepted, even if floating-point noise makes its gain slightly negative.
        bestgain = -np.inf
        bestattr, bestsplitval = None, None
        n_columns = data.shape[1]
        for column_index in range(n_columns - 1):
            unique_values = np.unique(data[:, column_index])
            for lower, upper in zip(unique_values[:-1], unique_values[1:]):
                splitval = (lower + upper) / 2
                gain = self.infomation_gain(data, column_index, splitval)
                if gain >= bestgain:
                    bestgain = gain
                    bestattr = column_index
                    bestsplitval = splitval
        return bestattr, bestsplitval

    def check_label_column_purity(self, data):
        """Return True when the label column of ``data`` holds exactly one distinct value."""
        return len(np.unique(data[:, -1])) == 1

    def classify_data(self, data):
        """Return the most frequent label in ``data``.

        On a tie the first of the tied labels in ``np.unique`` order is
        returned. ``data`` must contain at least one row.
        """
        unique_classes, counts_unique_classes = np.unique(data[:, -1], return_counts=True)
        return unique_classes[counts_unique_classes.argmax()]

    def _make_leaf(self, data):
        """Build a leaf: the unique majority label, or 'unknown' on a tie or empty ``data``."""
        if len(data) == 0:
            return Node(label='unknown')
        classes, counts = np.unique(data[:, -1], return_counts=True)
        winners = classes[counts == counts.max()]
        return Node(label=winners[0] if len(winners) == 1 else 'unknown')

    def fit(self, data):
        """Recursively build a tree for ``data`` and return its root ``Node``.

        A leaf is produced when ``data`` has fewer than ``minleaf`` rows, when
        all labels agree, when no split candidate exists, or when the chosen
        threshold would leave one side empty. Leaves carry the unique majority
        label, or the string 'unknown' if the top count is shared or there are
        no rows.
        """
        if len(data) < self.minleaf or self.check_label_column_purity(data):
            return self._make_leaf(data)

        column_index, split = self.choose_split(data)
        if column_index is None:
            # All feature columns are constant: the rows cannot be separated.
            return self._make_leaf(data)

        column = data[:, column_index]
        # Use the same `<=` / `>` rule as infomation_gain and get_label so rows
        # equal to the threshold are kept (a strict `<` would drop them).
        rows_left = data[column <= split]
        rows_right = data[column > split]
        if len(rows_left) == 0 or len(rows_right) == 0:
            # A one-sided split makes no progress; stop here to guarantee
            # that every recursive call works on strictly fewer rows.
            return self._make_leaf(data)

        return Node(attr=column_index, splitval=split,
                    left=self.fit(rows_left), right=self.fit(rows_right))

    def get_label(self, each_row, n):
        """Walk from node ``n`` down to a leaf for ``each_row`` and return that leaf's label."""
        while n.label is None:
            n = n.left if each_row[n.attr] <= n.splitval else n.right
        return n.label

    def predict(self, rootTree, testData):
        """Return a list with one predicted label per row of ``testData``."""
        return [self.get_label(row, rootTree) for row in testData]

In [7]:
# Train the from-scratch tree on the training array and predict the held-out rows.
# The instance is deliberately NOT called `tree`: a later cell runs
# `from sklearn import tree`, which would rebind that name to the sklearn module
# and make this cell and that one break each other when re-run out of order.
scratch_tree = DecisionTree(minleaf = 2)
root = scratch_tree.fit(data=data)
decision_tree_from_scratch_result = scratch_tree.predict(rootTree = root, testData = test_df.values)

# Scikit-learn

In [8]:
from sklearn import tree

In [9]:
# Reference model: fit scikit-learn's tree on the same training array
# (all columns but the last are features, the last column is the label).
X = data[:, :-1]
y = data[:, -1]
# Pin random_state: scikit-learn permutes the features at every split, so
# without a fixed seed equally good splits may be resolved differently from
# run to run and the comparison below would not be reproducible.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, y)
scikit_learn_result =  clf.predict(test_df.values[:, :-1])

In [10]:
# Element-wise agreement of the two classifiers on the held-out rows (True = same label).
np.asarray(decision_tree_from_scratch_result) == scikit_learn_result

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])