# **Decision Tree - Regression (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import load_iris, load_diabetes, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve

**Code**

In [2]:
class Node:
    """One vertex of a decision tree.

    An internal node stores a split rule (``feature`` index and ``threshold``)
    together with its ``left`` and ``right`` subtrees. A leaf node is flagged
    with ``leaf=True`` and stores its prediction in ``value``.
    """

    def __init__(
        self,
        feature=None,
        threshold=None,
        left=None,
        right=None,
        leaf=False,
        value=None,
    ):
        # Split rule: which feature column to test and the cut-off to test against
        self.feature, self.threshold = feature, threshold
        # Child subtrees
        self.left, self.right = left, right
        # Leaf marker and the value predicted when this node is a leaf
        self.leaf, self.value = leaf, value

In [3]:
class DecisionTreeRegressor:
    """Regression tree grown by exhaustive search over (feature, threshold) pairs.

    Every internal node sends samples with ``x[feature] < threshold`` to its left
    child and all remaining samples to its right child. Every leaf predicts the
    mean target of the training samples that reached it.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` means no depth limit; ``0`` produces
        a tree that is a single leaf.
    min_samples_split : int, default=2
        A node holding fewer samples than this is turned into a leaf.
    criterion : {'mse', 'mae'}, default='mse'
        Impurity used to score candidate splits (see ``_mean_squared_error`` and
        ``_mean_absolute_error``).

    Attributes
    ----------
    tree : Node or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    """

    # Split criteria understood by _calculate_split_score
    _CRITERIA = ('mse', 'mae')

    def __init__(self, max_depth=None, min_samples_split=2, criterion='mse'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def _check_criterion(self):
        """Raise ``ValueError`` if ``self.criterion`` is not a supported name."""
        if self.criterion not in self._CRITERIA:
            raise ValueError(
                f"Unknown criterion {self.criterion!r}; expected one of {self._CRITERIA}"
            )

    def fit(self, X, y):
        """Grow the tree from training data and store its root in ``self.tree``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with ``n_samples`` entries along its first axis

        Raises
        ------
        ValueError
            If the criterion is unknown, ``X`` is not 2-D, the sample counts of
            ``X`` and ``y`` differ, or there are no samples.
        """
        self._check_criterion()
        # Accept lists / DataFrames as well as arrays: the tree code indexes with X[:, j]
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim == 0 or X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
        # Compare against None explicitly so that max_depth=0 is honoured
        # (a plain truthiness test would treat 0 like "no limit").
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(y) < self.min_samples_split or depth_exhausted:
            return self._create_leaf_node(y)

        # Find the best split; None means no threshold separates the samples
        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self._create_leaf_node(y)

        # The right side is the complement of the left side, so every sample
        # (including one whose feature value is NaN) ends up in exactly one
        # child, matching the routing used by _predict_sample.
        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices

        left_child = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

    def _find_best_split(self, X, y):
        """Return the ``(feature, threshold)`` pair with the lowest split score.

        Returns ``(None, None)`` when no candidate yields two non-empty sides.
        """
        best_score = np.inf
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            # Candidate thresholds are the observed values. The smallest one is
            # skipped: nothing is strictly below it, so its left side is empty.
            thresholds = np.unique(X[:, feature])[1:]
            for threshold in thresholds:
                score = self._calculate_split_score(X, y, feature, threshold)
                if score < best_score:
                    best_score = score
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _calculate_split_score(self, X, y, feature, threshold):
        """Score the split ``X[:, feature] < threshold`` (lower is better).

        Returns ``np.inf`` when either side of the split would be empty.

        Raises
        ------
        ValueError
            If ``self.criterion`` is not ``'mse'`` or ``'mae'``.
        """
        left_indices = X[:, feature] < threshold
        right_indices = ~left_indices

        if not left_indices.any() or not right_indices.any():
            return np.inf  # Avoid invalid split

        if self.criterion == 'mse':
            return self._mean_squared_error(y, left_indices, right_indices)
        if self.criterion == 'mae':
            return self._mean_absolute_error(y, left_indices, right_indices)
        # Fail loudly instead of returning None, which would only surface later
        # as a TypeError when the score is compared.
        self._check_criterion()
        raise ValueError(f"Unknown criterion {self.criterion!r}")

    def _mean_squared_error(self, y, left_indices, right_indices):
        """Sample-weighted average of the two sides' variance about their own mean."""
        left_y, right_y = y[left_indices], y[right_indices]
        left_mse = np.mean((left_y - np.mean(left_y)) ** 2)
        right_mse = np.mean((right_y - np.mean(right_y)) ** 2)

        # Weighted MSE
        total_samples = len(y)
        return (len(left_y) / total_samples) * left_mse + (len(right_y) / total_samples) * right_mse

    def _mean_absolute_error(self, y, left_indices, right_indices):
        """Sample-weighted average of the two sides' mean absolute deviation.

        The deviation is taken about each side's mean.
        NOTE(review): absolute-error trees are often defined about the median;
        the mean is kept here so existing results do not change.
        """
        left_y, right_y = y[left_indices], y[right_indices]
        left_mae = np.mean(np.abs(left_y - np.mean(left_y)))
        right_mae = np.mean(np.abs(right_y - np.mean(right_y)))

        # Weighted MAE
        total_samples = len(y)
        return (len(left_y) / total_samples) * left_mae + (len(right_y) / total_samples) * right_mae

    def _create_leaf_node(self, y):
        """Create a leaf that predicts the mean of ``y``."""
        value = np.mean(y)  # Use mean value for regression
        return Node(leaf=True, value=value)

    def predict(self, X):
        """Predict one target value per row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted value per row.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("This DecisionTreeRegressor has not been fitted; call fit() first")
        # Iterate over rows of an array; iterating a DataFrame would yield column names
        X = np.asarray(X)
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        """Walk ``sample`` down from node ``tree`` and return the leaf value reached."""
        node = tree
        while not node.leaf:
            node = node.left if sample[node.feature] < node.threshold else node.right
        return node.value

    def print_tree(self, tree=None, indent="  "):
        """Prints the structure of the decision tree.

        Parameters
        ----------
        tree : Node or None
            Subtree to print; defaults to the fitted root.
        indent : str
            Prefix for the current level; two spaces are added per level.

        Raises
        ------
        RuntimeError
            If no subtree is given and the tree has not been fitted yet.
        """
        if tree is None:
            tree = self.tree
        if tree is None:
            raise RuntimeError("This DecisionTreeRegressor has not been fitted; call fit() first")
        if tree.leaf:
            print(f"{indent}Leaf: Value {tree.value:.2f}")
        else:
            # The left branch holds samples strictly below the threshold, so the
            # printed rule uses '<' to match how the tree actually splits.
            print(f"{indent}Feature {tree.feature} < {tree.threshold:.2f}")
            print(f"{indent}Left:")
            self.print_tree(tree.left, indent + "  ")
            print(f"{indent}Right:")
            self.print_tree(tree.right, indent + "  ")

    def accuracy(self, y, y_hat):
        """Return the mean squared error between ``y`` and ``y_hat``.

        Despite its name this is an error measure (lower is better); the name is
        kept for backward compatibility.
        """
        return self.mean_squared_error(y, y_hat)

    def mean_squared_error(self, y_true, y_pred):
        """Return the mean of the squared differences between the two inputs."""
        return np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)

    def mean_absolute_error(self, y_true, y_pred):
        """Return the mean of the absolute differences between the two inputs."""
        return np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))


**Load Dataset**

In [4]:
# Load the diabetes regression dataset (442 samples, 10 features).
# Swap in fetch_california_housing() to experiment with a different dataset.
data = load_diabetes()
X = data.data
y = data.target
# One print per statement: the old `print(a), print(b)` form built a throwaway tuple
print(X.shape)
print(y.shape)
print(X[:5])
print(y[:5])

(442, 10)
(442,)
[[ 0.03807591  0.05068012  0.06169621  0.02187239 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990749 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632753 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06833155 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567042 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286131 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665608  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02268774 -0.00936191]
 [ 0.00538306 -0.04464164 -0.03638469  0.02187239  0.00393485  0.01559614
   0.00814208 -0.00259226 -0.03198764 -0.04664087]]
[151.  75. 141. 206. 135.]


In [5]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the shape of each piece as the cell's output
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((353, 10), (89, 10), (353,), (89,))

In [6]:
# Fit a depth-limited tree on the training split using the 'mae' split criterion
tree = DecisionTreeRegressor(
    criterion='mae',
    max_depth=5,
    min_samples_split=4,
)
tree.fit(X_train, y_train)

In [7]:
# Dump the fitted tree as an indented text outline, starting from the root
print("Decision Tree Structure:")
tree.print_tree(indent="  ")

Decision Tree Structure:
  Feature 2 <= 0.01
  Left:
    Feature 8 <= -0.00
    Left:
      Feature 8 <= -0.03
      Left:
        Feature 3 <= 0.13
        Left:
          Feature 0 <= 0.08
          Left:
            Leaf: Value 83.55
          Right:
            Leaf: Value 199.00
        Right:
          Leaf: Value 216.00
      Right:
        Feature 5 <= 0.10
        Left:
          Feature 4 <= -0.05
          Left:
            Leaf: Value 141.11
          Right:
            Leaf: Value 100.57
        Right:
          Leaf: Value 241.50
    Right:
      Feature 7 <= 0.11
      Left:
        Feature 5 <= -0.04
        Left:
          Feature 0 <= 0.07
          Left:
            Leaf: Value 180.11
          Right:
            Leaf: Value 283.00
        Right:
          Feature 6 <= 0.03
          Left:
            Leaf: Value 158.85
          Right:
            Leaf: Value 107.62
      Right:
        Feature 0 <= -0.03
        Left:
          Leaf: Value 292.00
        Right:
   

In [8]:
# Predict on the held-out split and preview the first five predictions
y_pred = tree.predict(X=X_test)
print(y_pred[0:5])

[158.85185185 145.65384615 107.61538462 257.41666667 100.56603774]


In [9]:
# Score the predictions with the regressor's own metric helpers
mse = tree.mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = tree.mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Report both errors rounded to two decimals, one per line
print(f"MSE: {mse:.2f}", f"MAE: {mae:.2f}", sep="\n")

MSE: 3967.26
MAE: 48.21


In [10]:
# Cross-check with scikit-learn's regression metrics (these are error/fit scores,
# not accuracies); MSE and MAE should match the hand-written helpers.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Plain string labels: the previous f-strings had no placeholders to format
print("Mean Squared Error :", mse)
print("Mean Absolute Error :", mae)
print("Explained Variance Score :", evs)
print("R2 Score :", r2)

Mean Squared Error : 3967.255731354888
Mean Absolute Error : 48.209498060388846
Explained Variance Score : 0.2512521998502245
R2 Score : 0.25120005623363717
