# **Decision Tree - Regression (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Implementation: Node and DecisionTreeRegressor Classes**

In [2]:
class Node:
  """One vertex of the decision tree.

  An internal vertex describes a split through ``feature`` and ``threshold``
  and links to its two subtrees through ``left`` and ``right``. A terminal
  vertex is flagged with ``leaf=True`` and keeps its prediction in ``value``.
  Every field defaults to ``None`` except ``leaf``, which defaults to ``False``.
  """

  def __init__(self, feature=None, threshold=None, left=None, right=None, leaf=False, value=None):
    # Split description
    self.feature, self.threshold = feature, threshold
    # Child subtrees
    self.left, self.right = left, right
    # Terminal flag and the prediction stored on a terminal vertex
    self.leaf, self.value = leaf, value

In [3]:
class DecisionTreeRegressor:
  """Regression tree built from scratch with NumPy.

  NOTE: this class reuses the name of ``sklearn.tree.DecisionTreeRegressor``,
  which the notebook's import cell also brings into scope. Once this cell has
  run, the name refers to this implementation.

  Splitting rule: a sample goes LEFT when ``x[feature] < threshold`` (strict)
  and RIGHT otherwise. A leaf predicts the mean target of the training
  samples that reached it.

  Parameters
  ----------
  criterion : {'mse', 'mae'}, default='mse'
      Impurity measure used to score candidate splits (see ``_loss``).
  max_depth : int or None, default=None
      Maximum depth of the tree. ``None`` means unlimited; ``0`` yields a
      single leaf.
  min_samples_split : int, default=2
      A node holding fewer samples than this is turned into a leaf.

  Attributes
  ----------
  tree : Node or None
      Root of the fitted tree; ``None`` until ``fit`` has been called.
  """

  def __init__(self, criterion='mse', max_depth=None, min_samples_split=2):
    self.criterion = criterion
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.tree = None

  def fit(self, X, y):
    """Grow the tree on training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like whose first dimension has length n_samples

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the number of
        samples, or if there are no samples at all.
    """
    # Accept lists / DataFrames / Series: the tree indexes with X[:, j] and
    # boolean masks, which only NumPy arrays support.
    X = np.asarray(X)
    y = np.asarray(y)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
    if y.ndim == 0 or X.shape[0] != y.shape[0]:
        raise ValueError("X and y must contain the same number of samples")
    if X.shape[0] == 0:
        raise ValueError("Cannot fit a tree on an empty dataset")

    self.tree = self._build_tree(X, y)

  def _build_tree(self, X, y, depth=0):
    """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
    # Stopping criteria. `is not None` (rather than truthiness) so that
    # max_depth=0 is honoured instead of being treated as "unlimited".
    depth_exhausted = self.max_depth is not None and depth >= self.max_depth
    if len(y) < self.min_samples_split or depth_exhausted:
        return self._create_leaf_node(y)

    # Find the best split; None means no split reduces the loss.
    best_feature, best_threshold = self._find_best_split(X, y)
    if best_feature is None:
        return self._create_leaf_node(y)

    # Split the data. The right side is the complement of the left mask so
    # every sample lands in exactly one child, mirroring the else-branch
    # used at prediction time.
    goes_left = X[:, best_feature] < best_threshold
    goes_right = ~goes_left

    left_child = self._build_tree(X[goes_left], y[goes_left], depth + 1)
    right_child = self._build_tree(X[goes_right], y[goes_right], depth + 1)

    return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

  def _find_best_split(self, X, y):
    """Return ``(feature, threshold)`` of the split with the largest gain.

    Returns ``(None, None)`` when no candidate has a strictly positive gain,
    so the caller creates a leaf instead of splitting on an empty child.
    Ties keep the first candidate encountered.
    """
    # Start at 0 rather than -inf: a split that leaves one side empty scores
    # exactly 0 in _information_gain and must never be selected.
    best_gain = 0.0
    best_feature = None
    best_threshold = None
    n_features = X.shape[1]

    for feature in range(n_features):
        # np.unique returns sorted values. With the strict `<` rule the
        # smallest value always produces an empty left child, so skip it.
        thresholds = np.unique(X[:, feature])[1:]
        for threshold in thresholds:
            gain = self._information_gain(X, y, feature, threshold)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold

  def _information_gain(self, X, y, feature, threshold):
    """Loss reduction achieved by splitting on ``X[:, feature] < threshold``.

    Returns the parent loss minus the sample-weighted loss of the two
    children, or 0 when either child would be empty.
    """
    # Split the data
    left_indices = X[:, feature] < threshold
    right_indices = ~left_indices

    n = len(y)
    n_left = np.sum(left_indices)
    n_right = n - n_left

    if n_left == 0 or n_right == 0:
        return 0

    # Calculate the loss before the split
    parent_loss = self._loss(y)

    # Calculate the weighted loss of the children
    child_loss = (n_left / n) * self._loss(y[left_indices]) + (n_right / n) * self._loss(y[right_indices])

    # Calculate information gain (parent loss - child loss)
    return parent_loss - child_loss

  def _loss(self, y):
    """Impurity of the targets ``y`` under ``self.criterion``.

    'mse' is the mean squared deviation from the mean; 'mae' is the mean
    absolute deviation from the mean (the mean, not the median, is used as
    the centre).

    Raises
    ------
    ValueError
        If ``self.criterion`` is neither 'mse' nor 'mae'.
    """
    if self.criterion == 'mse':
        return np.mean((y - np.mean(y)) ** 2)
    elif self.criterion == 'mae':
        return np.mean(np.abs(y - np.mean(y)))
    # Fail loudly instead of returning None, which would only surface later
    # as an obscure arithmetic TypeError.
    raise ValueError(f"Unknown criterion {self.criterion!r}; expected 'mse' or 'mae'")

  def _create_leaf_node(self, y):
    """Create a leaf whose prediction is the mean of ``y``."""
    # Leaf node returns the mean value for regression
    return Node(leaf=True, value=np.mean(y))

  def predict(self, X):
    """Predict one value per row of ``X`` (array-like, n_samples x n_features)."""
    # Convert first so iteration yields rows even for DataFrames / lists.
    X = np.asarray(X)
    return np.array([self._predict_sample(sample, self.tree) for sample in X])

  def _predict_sample(self, sample, tree):
    """Walk from ``tree`` down to a leaf for one sample and return its value."""
    node = tree
    while not node.leaf:
        # Same strict rule as during training: below threshold -> left.
        if sample[node.feature] < node.threshold:
            node = node.left
        else:
            node = node.right
    return node.value

  def print_tree(self, tree=None, indent="  "):
    """Prints the structure of the decision tree"""
    if tree is None:
        tree = self.tree
    if tree.leaf:
        print(f"{indent}Leaf: Value {tree.value:.2f}")
    else:
        # The label uses `<` because the Left branch holds the samples that
        # are strictly below the threshold.
        print(f"{indent}Feature {tree.feature} < {tree.threshold}")
        print(f"{indent}Left:")
        self.print_tree(tree.left, indent + "  ")
        print(f"{indent}Right:")
        self.print_tree(tree.right, indent + "  ")


**Load Dataset**

In [4]:
# Load dataset for regression: scikit-learn's diabetes data.
# `data` becomes the feature matrix X and `target` the values y to regress on.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

In [5]:
# Split the dataset into training and testing sets.
# 20% of the rows are held out for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Last expression of the cell: display the four shapes as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((353, 10), (89, 10), (353,), (89,))

In [6]:
# Train the custom decision tree regressor.
# NOTE: `DecisionTreeRegressor` here is the class defined in this notebook; it shadows the
# sklearn class of the same name imported in the first cell.
# Depth is capped at 5, and nodes holding fewer than 3 samples are turned into leaves.
tree_regressor = DecisionTreeRegressor(criterion='mse', max_depth=5, min_samples_split=3)
tree_regressor.fit(X_train, y_train)

In [7]:
# Print the tree structure.
# "Left" is the subtree for samples whose feature value is below the threshold,
# "Right" is the subtree for all remaining samples.
tree_regressor.print_tree()

  Feature 2 <= 0.005649978676881689
  Left:
    Feature 8 <= 0.007027139682585861
    Left:
      Feature 8 <= -0.042570854118219384
      Left:
        Feature 4 <= -0.038719686991641515
        Left:
          Feature 4 <= -0.04559945128264711
          Left:
            Leaf: Value 84.08
          Right:
            Leaf: Value 158.75
        Right:
          Feature 0 <= 0.04170844488444244
          Left:
            Leaf: Value 55.40
          Right:
            Leaf: Value 79.83
      Right:
        Feature 6 <= 0.026550272625626974
        Left:
          Feature 4 <= -0.051103262715451604
          Left:
            Leaf: Value 157.67
          Right:
            Leaf: Value 113.60
        Right:
          Feature 0 <= 0.005383060374248237
          Left:
            Leaf: Value 75.60
          Right:
            Leaf: Value 105.84
    Right:
      Feature 7 <= 0.10811110062954676
      Left:
        Feature 0 <= 0.001750521923228816
        Left:
          Feature 0 <= -0.052

In [8]:
# Predictions: run the fitted tree on the held-out test set
# and show the first five predicted values.
y_pred = tree_regressor.predict(X_test)
print(y_pred[:5])

[157.80769231 171.67241379 157.80769231 246.62068966 113.59615385]


In [9]:
# Evaluate the regression performance on the held-out test set:
# mean squared error, mean absolute error and the R^2 score.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric rounded to two decimals.
print(f'MSE: {mse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R2: {r2:.2f}')

MSE: 3754.26
MAE: 47.72
R2: 0.29
