**Decision Tree Regression (from Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import load_iris, load_diabetes, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve

**Implementation: `Node` and `DecisionTreeRegressor`**

In [2]:
class Node:
    """A single node of the regression tree.

    Attributes
    ----------
    predicted_value
        Value stored on the node (keyword-only constructor argument).
    feature_index : int
        Index of the feature tested at this node; starts at 0.
    threshold : float
        Split threshold for ``feature_index``; starts at 0.0.
    left, right : Node or None
        Child nodes; both start as ``None``.
    """

    def __init__(self, *, predicted_value):
        self.predicted_value = predicted_value
        # Placeholder split description, overwritten if the node is split.
        self.feature_index, self.threshold = 0, 0.0
        # No children yet.
        self.left = self.right = None

    def is_leaf_node(self):
        """Return True when the node has neither a left nor a right child."""
        return not (self.left is not None or self.right is not None)

In [3]:
class DecisionTreeRegressor:
    """Regression tree built from scratch on NumPy arrays.

    Every internal node tests ``x[feature_index] < threshold`` (left when
    true, right otherwise); every node stores the mean target of the training
    samples that reached it, and leaves return that mean as the prediction.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits along any root-to-leaf path. ``None`` means
        no limit: the tree grows until no split lowers the weighted MSE.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Sets ``n_features_`` and ``tree_`` (the root ``Node``).
        """
        # Accept lists / DataFrames as well as ndarrays: the tree builder
        # relies on ndarray column slicing and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        """Return a 1-D array with one prediction per row of ``X``."""
        return np.array([self._predict(inputs) for inputs in np.asarray(X)])

    def _mse(self, y):
        """Mean squared deviation of ``y`` from its own mean (0 when empty)."""
        if len(y) == 0:
            return 0
        mean = np.mean(y)
        return np.mean((y - mean) ** 2)

    def _best_split(self, X, y):
        """Find the split with the lowest size-weighted child MSE.

        Returns
        -------
        (feature_index, threshold), or ``(None, None)`` when there are fewer
        than two samples or no candidate split beats the unsplit MSE.
        """
        m = y.size
        if m <= 1:
            return None, None

        best_mse = self._mse(y)
        best_idx, best_thr = None, None

        for idx in range(self.n_features_):
            # Sort by feature value, ties broken by target (last key is the
            # primary one for lexsort).
            order = np.lexsort((y, X[:, idx]))
            thresholds = X[order, idx]
            values = y[order]
            for i in range(1, m):
                lower, upper = thresholds[i - 1], thresholds[i]
                # Equal neighbours cannot be separated by a threshold; check
                # this first so no MSE is computed for an unusable position.
                if lower == upper:
                    continue
                mse_left = self._mse(values[:i])
                mse_right = self._mse(values[i:])
                mse_split = (i * mse_left + (m - i) * mse_right) / m

                if mse_split < best_mse:
                    best_mse = mse_split
                    best_idx = idx
                    thr = (lower + upper) / 2
                    # For neighbouring floats the midpoint can round down to
                    # `lower`; `x < thr` would then send `lower` to the right
                    # child. Using `upper` keeps the evaluated partition.
                    if thr == lower:
                        thr = upper
                    best_thr = thr

        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)`` at ``depth``."""
        node = Node(predicted_value=np.mean(y))
        # `None` means unlimited depth; comparing an int with None would raise.
        depth_allows_split = self.max_depth is None or depth < self.max_depth
        if depth_allows_split and len(y) > 1:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                # Defensive: a mask that leaves one side empty would create a
                # child whose mean is undefined, so keep this node as a leaf.
                if indices_left.any() and not indices_left.all():
                    X_left, y_left = X[indices_left], y[indices_left]
                    X_right, y_right = X[~indices_left], y[~indices_left]
                    node.feature_index = idx
                    node.threshold = thr
                    node.left = self._grow_tree(X_left, y_left, depth + 1)
                    node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        """Walk from the root to a leaf for one sample and return its value."""
        node = self.tree_
        # Split nodes always receive both children, so checking `left` suffices.
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_value

**Load Dataset**

In [4]:
# Load the diabetes regression dataset that ships with scikit-learn.
# (fetch_california_housing() is imported above and can be swapped in here.)
data = load_diabetes()
X = data.data
y = data.target

# Show the array shapes and a small preview of features and targets.
print(X.shape)
print(y.shape)
print(X[:5])
print(y[:5])

(442, 10)
(442,)
[[ 0.03807591  0.05068012  0.06169621  0.02187239 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990749 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632753 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06833155 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567042 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286131 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665608  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02268774 -0.00936191]
 [ 0.00538306 -0.04464164 -0.03638469  0.02187239  0.00393485  0.01559614
   0.00814208 -0.00259226 -0.03198764 -0.04664087]]
[151.  75. 141. 206. 135.]


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((353, 10), (89, 10), (353,), (89,))

In [6]:
# Fit a tree limited to max_depth=3 on the training split
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X=X_train, y=y_train)

In [7]:
# Predict the held-out rows and preview the first five values
y_pred = tree.predict(X=X_test)
print(y_pred[0:5])

[159.57407407 175.8        159.57407407 230.51515152 109.9223301 ]


In [8]:
# Score the held-out predictions with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"MSE: {mse}")

MSE: 3656.186930948001
