In [8]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Decision Tree Classifier using NumPy

In [2]:
class Node:
    """One node of a binary decision tree.

    A node whose ``value`` is not ``None`` is treated as a leaf by the tree's
    traversal code and ``value`` is returned as the prediction. Otherwise the
    node is a split: a sample goes to ``left`` when
    ``sample[feature] <= threshold`` and to ``right`` when it does not.

    Args:
        feature: Column index tested by a split node.
        threshold: Cut-off value compared against ``sample[feature]``.
        left: Child node for samples that satisfy the test.
        right: Child node for samples that fail the test.
        value: Prediction stored on a leaf; ``None`` for split nodes.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule (unused on leaves).
        self.feature, self.threshold = feature, threshold
        # Sub-trees on either side of the split (unused on leaves).
        self.left, self.right = left, right
        # Leaf payload; stays None on internal nodes.
        self.value = value

class DecisionTreeClassifier:
    """Binary decision tree classifier that splits on weighted Gini impurity.

    Every candidate split has the form ``X[:, feature] <= threshold``, where
    ``threshold`` is one of the values observed in that column. The split with
    the lowest weighted Gini impurity is chosen greedily at each node.

    Args:
        max_depth: Maximum depth of the tree. ``None`` (the default) means the
            depth is not limited.
        min_samples_split: Minimum number of samples a node must hold to be
            split further. The default of 3 keeps the original rule of
            turning any node with two or fewer samples into a leaf.

    Attributes:
        root: Root ``Node`` of the fitted tree, or ``None`` before ``fit``.
    """

    def __init__(self, max_depth=None, min_samples_split=3):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples in ``X`` / ``y``."""
        n_samples, n_features = X.shape

        # Majority vote for this node. np.unique returns the labels sorted and
        # argmax returns the first maximum, so ties go to the smallest label.
        classes, counts = np.unique(y, return_counts=True)
        majority_label = classes[np.argmax(counts)]

        # max_depth=None means "no limit"; comparing an int with None would raise.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(classes) == 1 or n_samples < self.min_samples_split:
            return Node(value=majority_label)

        best_gini = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            column = X[:, feature]
            # The largest value would send every sample left, so it is skipped.
            for threshold in np.unique(column)[:-1]:
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)
                if n_left == 0 or n_left == n_samples:
                    continue

                gini = self._calculate_gini(y[left_mask], y[~left_mask])

                # Strict "<" keeps the first best split in (feature, threshold) order.
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        # Every feature is constant on this subset: nothing can be split.
        if best_feature is None:
            return Node(value=majority_label)

        # The right child is the complement of the left one, so each sample
        # lands in exactly one child, the same way _traverse_tree routes it.
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask

        left = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    @staticmethod
    def _gini_impurity(labels):
        """Return ``1 - sum(p_c ** 2)`` over the classes in ``labels`` (0.0 if empty)."""
        if len(labels) == 0:
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        proportions = counts / counts.sum()
        return 1.0 - float(np.sum(proportions ** 2))

    def _calculate_gini(self, left_labels, right_labels):
        """Return the Gini impurity of a split, weighted by the size of each side."""
        n_left, n_right = len(left_labels), len(right_labels)
        total = n_left + n_right
        if total == 0:
            return 0.0
        weighted_left = (n_left / total) * self._gini_impurity(left_labels)
        weighted_right = (n_right / total) * self._gini_impurity(right_labels)
        return weighted_left + weighted_right

    def predict(self, X):
        """Return a list with the predicted label for each row of ``X``."""
        return [self._traverse_tree(x, self.root) for x in np.asarray(X)]

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # A leaf is any node whose value is set (a label of 0 still counts).
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [3]:
# Load the Iris dataset: X receives the feature matrix, y the class labels.
iris = load_iris()
X, y = iris.data, iris.target

In [4]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [5]:
# Fit the DecisionTreeClassifier class defined earlier in this notebook, limited to depth 3.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = clf.predict(X_test)

# Accuracy: fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9666666666666667


## Decision Tree Regression using NumPy

In [3]:
import warnings
# Suppress every warning raised from this point on.
# NOTE(review): this is a blanket filter, so it also hides NumPy runtime warnings and
# library deprecation notices; consider restricting it to a specific category.
warnings.filterwarnings('ignore')

In [2]:
class Node:
    """Node of the binary regression tree built in this section.

    Has the same fields as the ``Node`` class defined earlier in this
    notebook; running this cell rebinds the name to this definition.

    The traversal code treats a node with a non-``None`` ``value`` as a leaf
    and returns that value. Any other node is a split that routes a sample to
    ``left`` when ``sample[feature] <= threshold`` and to ``right`` otherwise.

    Args:
        feature: Column index tested by a split node.
        threshold: Cut-off value compared against ``sample[feature]``.
        left: Child node for samples that satisfy the test.
        right: Child node for samples that fail the test.
        value: Prediction stored on a leaf; ``None`` for split nodes.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Which column to test and the cut-off to compare against.
        self.feature, self.threshold = feature, threshold
        # Children reached when the test passes (left) or fails (right).
        self.left, self.right = left, right
        # Set only on leaves.
        self.value = value

class DecisionTreeRegressor:
    """Binary decision tree regressor that splits on weighted target variance.

    Every candidate split has the form ``X[:, feature] <= threshold``, where
    ``threshold`` is one of the values observed in that column. The split with
    the lowest size-weighted variance of the two sides is chosen greedily, and
    each leaf predicts the mean target of the samples that reached it.

    Args:
        max_depth: Maximum depth of the tree. ``None`` (the default) means the
            depth is not limited.
        min_samples_split: Minimum number of samples a node must hold to be
            split further. The default of 3 keeps the original rule of
            turning any node with two or fewer samples into a leaf.

    Attributes:
        root: Root ``Node`` of the fitted tree, or ``None`` before ``fit``.
    """

    def __init__(self, max_depth=None, min_samples_split=3):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples numeric targets.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples in ``X`` / ``y``."""
        n_samples, n_features = X.shape

        # max_depth=None means "no limit"; comparing an int with None would raise.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split:
            return Node(value=np.mean(y))

        best_variance = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            column = X[:, feature]
            # The largest value would send every sample left, so it is skipped.
            for threshold in np.unique(column)[:-1]:
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)
                if n_left == 0 or n_left == n_samples:
                    continue

                variance = self._calculate_variance(y[left_mask], y[~left_mask])

                # Strict "<" keeps the first best split in (feature, threshold) order.
                if variance < best_variance:
                    best_variance = variance
                    best_feature = feature
                    best_threshold = threshold

        # Every feature is constant on this subset: nothing can be split.
        if best_feature is None:
            return Node(value=np.mean(y))

        # The right child is the complement of the left one, so each sample
        # lands in exactly one child, the same way _traverse_tree routes it.
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask

        left = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _calculate_variance(self, left_value, right_value):
        """Return the variance of the two sides of a split, weighted by their sizes."""
        n_left, n_right = len(left_value), len(right_value)
        total = n_left + n_right
        if total == 0:
            return 0.0
        # An empty side contributes nothing; np.var of an empty array would be NaN.
        spread_left = n_left * np.var(left_value) if n_left else 0.0
        spread_right = n_right * np.var(right_value) if n_right else 0.0
        return (spread_left + spread_right) / total

    def predict(self, X):
        """Return a list with the predicted target for each row of ``X``."""
        return [self._traverse_tree(x, self.root) for x in np.asarray(X)]

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # A leaf is any node whose value is set (a mean of 0.0 still counts).
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [11]:
from sklearn.datasets import load_diabetes
import numpy as np
from sklearn.metrics import mean_squared_error

In [6]:
# Load the diabetes dataset: X receives the feature matrix, y the numeric target.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

In [9]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [13]:
# Fit the DecisionTreeRegressor class defined earlier in this notebook, limited to depth 3.
clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X_train, y_train)

# Predict targets for the held-out test set.
y_pred = clf.predict(X_test)

# Mean squared error on the test set (lower is better).
# Note: the variable is named `accuracy`, but it holds an error value, not an accuracy.
accuracy = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {accuracy}")

Mean Squared Error: 3656.186930948001


In [14]:
# Repeat the experiment with a deeper tree (max_depth=5) to compare test error.
clf = DecisionTreeRegressor(max_depth=5)
clf.fit(X_train, y_train)

# Predict targets for the held-out test set.
y_pred = clf.predict(X_test)

# Mean squared error on the test set (lower is better).
# Note: the variable is named `accuracy`, but it holds an error value, not an accuracy.
accuracy = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {accuracy}")

Mean Squared Error: 3773.658597184514


In [15]:
# Repeat the experiment with a shallower tree (max_depth=2) to compare test error.
clf = DecisionTreeRegressor(max_depth=2)
clf.fit(X_train, y_train)

# Predict targets for the held-out test set.
y_pred = clf.predict(X_test)

# Mean squared error on the test set (lower is better).
# Note: the variable is named `accuracy`, but it holds an error value, not an accuracy.
accuracy = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {accuracy}")

Mean Squared Error: 3866.038156768628


## scikit-learn Implementation of the Decision Tree Classifier

In [16]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
# Reload the Iris dataset; X and y are rebound from the diabetes data used in the previous section.
iris = load_iris()
X, y = iris.data, iris.target

In [18]:
# Same 80/20 split and random_state as the NumPy classifier section, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [19]:
# DecisionTreeClassifier now refers to scikit-learn's class: the import in this section
# rebinds the name that the NumPy implementation used earlier in the notebook.
# No max_depth is passed here, unlike the max_depth=3 used for the NumPy classifier.
clf = DecisionTreeClassifier(random_state=10)
clf.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = clf.predict(X_test)

# Accuracy: fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


## scikit-learn Implementation of the Decision Tree Regressor

In [20]:
## Sklearn Implementation of DT Regressor
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [21]:
# Load the diabetes dataset: X receives the feature matrix, y the numeric target.
# NOTE(review): the import cell of this section imports load_iris, not load_diabetes;
# this call relies on the load_diabetes import made in the NumPy regression section.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

In [22]:
# Same 80/20 split and random_state as the NumPy regression section, so the errors are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [23]:
# DecisionTreeRegressor now refers to scikit-learn's class: the import in this section
# rebinds the name that the NumPy implementation used earlier in the notebook.
# No max_depth is passed here, unlike the depth-limited NumPy runs above.
clf = DecisionTreeRegressor(random_state=11)
clf.fit(X_train, y_train)

# Predict targets for the held-out test set.
y_pred = clf.predict(X_test)

# Mean squared error on the test set (lower is better).
# Note: the variable is named `accuracy`, but it holds an error value, not an accuracy.
accuracy = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {accuracy}")

Mean Squared Error: 4686.191011235955
