In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as SKDecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [2]:
def gini(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    return 1 - np.sum(probabilities**2)


In [3]:
def information_gain(y, y_left, y_right):
    p_left = len(y_left) / len(y)
    p_right = len(y_right) / len(y)
    
    return gini(y) - (p_left * gini(y_left) + p_right * gini(y_right))


In [4]:
def split_dataset(X, y, feature_index, threshold):
    left_mask = X[:, feature_index] <= threshold
    right_mask = ~left_mask
    return X[left_mask], y[left_mask], X[right_mask], y[right_mask]


In [5]:
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


In [6]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria
        if (depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        # Greedily select the best split according to information gain
        best_feature, best_threshold = self._best_split(X, y, n_features)
        
        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        # Split the dataset
        X_left, y_left, X_right, y_right = split_dataset(X, y, best_feature, best_threshold)
        left_child = self._grow_tree(X_left, y_left, depth + 1)
        right_child = self._grow_tree(X_right, y_right, depth + 1)
        return Node(best_feature, best_threshold, left_child, right_child)

    def _best_split(self, X, y, n_features):
        best_gain = -1
        split_index, split_threshold = None, None

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gain = information_gain(y, y_left, y_right)
                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold

        return split_index, split_threshold

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)


In [7]:
# Example dataset
data = {
    'feature1': ["red", "green", "blue", "red", "red", "blue", "green"],
    'feature2': [4, 8, 15, 21, 7, 5, 14],
    'label': [0, 0, 1, 1, 1, 0, 1]
}

df = pd.DataFrame(data)

X = df[['feature1', 'feature2']].values
y = df['label'].values

# Create and train the model
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X, y)

# Make predictions
predictions = clf.predict(X)
print("Predictions:", predictions)


Predictions: [0 0 1 1 1 0 1]


In [13]:
X

array([[ 2,  4],
       [ 3,  8],
       [10, 15],
       [19, 21],
       [24,  7],
       [ 6,  5],
       [ 7, 14]], dtype=int64)

In [14]:
y

array([0, 0, 1, 1, 1, 0, 1], dtype=int64)

In [8]:
test = {
    'feature1': ["blue", "green", "blue", "blue", "red", "green", "green"],
    'feature2': [4, 6, 15, 7, 5, 5, 13],
    'label': [0,0,1,1,0,1,0]
}

dftest = pd.DataFrame(test)

X_test = dftest[['feature1', 'feature2']].values

# Make predictions
predictions = clf.predict(X_test)
print("Predictions:", predictions)

Predictions: [0 0 1 0 1 0 1]


In [9]:
# Example dataset
data = {
    'feature1': [2, 3, 10, 19, 24, 6, 7],
    'feature2': [4, 8, 15, 21, 7, 5, 14],
    'label': [0, 0, 1, 1, 1, 0, 1]
}

df = pd.DataFrame(data)

X = df[['feature1', 'feature2']].values
y = df['label'].values

# Create and train the model
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X, y)

skclf = SKDecisionTreeClassifier(max_depth=3, criterion="gini")
skclf.fit(X, y)

# Make predictions
predictions = clf.predict(X)
print("Predictions:", predictions)

skpredictions = clf.predict(X)
print("Predictions:", skpredictions)


Predictions: [0 0 1 1 1 0 1]
Predictions: [0 0 1 1 1 0 1]


In [10]:
accuracy = accuracy_score(y, predictions)
skaccuracy = accuracy_score(y, skpredictions)

print(f"Custom Decision Tree Accuracy: {accuracy}")
print(f"scikit-learn Decision Tree Accuracy: {skaccuracy}")

Custom Decision Tree Accuracy: 1.0
scikit-learn Decision Tree Accuracy: 1.0


In [11]:
test = {
    'feature1': [21, 13, 1, 9, 4, 16, 17],
    'feature2': [14, 18, 5, 2, 17, 15, 4],
    'label': [0, 1,1,0,0,1,0]
}

dftest = pd.DataFrame(test)

X_test = dftest[['feature1', 'feature2']].values
y_test = df['label'].values

# Make predictions
predictions = clf.predict(X_test)
print("Predictions:", predictions)

skpredictions = clf.predict(X_test)
print("Predictions:", skpredictions)

Predictions: [1 1 0 1 0 1 1]
Predictions: [1 1 0 1 0 1 1]


In [12]:
accuracy = accuracy_score(y_test, predictions)
skaccuracy = accuracy_score(y_test, skpredictions)

print(f"Custom Decision Tree Accuracy: {accuracy}")
print(f"scikit-learn Decision Tree Accuracy: {skaccuracy}")

Custom Decision Tree Accuracy: 0.2857142857142857
scikit-learn Decision Tree Accuracy: 0.2857142857142857
