**Random Forest - Regression (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

**Code**

In [2]:
class Node:
    def __init__(self, *, value=None):
        self.value = value
        self.feature_index = 0
        self.threshold = 0.0
        self.left = None
        self.right = None

    def is_leaf_node(self):
        return self.left is None and self.right is None

In [3]:
class DecisionTreeRegressor:
    def __init__(self, max_depth=None, max_features=None):
        self.max_depth = max_depth
        self.max_features = max_features

    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._predict(inputs) for inputs in X])

    def _mse(self, y):
        if len(y) == 0:
            return 0
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None

        best_mse = self._mse(y)
        best_idx, best_thr = None, None

        features = np.random.choice(self.n_features_, self.max_features, replace=False) if self.max_features else range(self.n_features_)

        for idx in features:
            thresholds, values = zip(*sorted(zip(X[:, idx], y)))
            for i in range(1, m):
                y_left = values[:i]
                y_right = values[i:]
                mse_left = self._mse(y_left)
                mse_right = self._mse(y_right)
                mse = (i * mse_left + (m - i) * mse_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if mse < best_mse:
                    best_mse = mse
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2

        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        node = Node(value=np.mean(y))
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

In [4]:
class RandomForestRegressor:
    def __init__(self, n_trees=100, max_depth=None, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeRegressor(max_depth=self.max_depth, max_features=self.max_features)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # Collect predictions from each tree
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Average predictions for regression
        return np.mean(tree_preds, axis=0)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def mse(self, y_true, y_pred):
        return np.mean((y_true - y_pred) ** 2)

**Load Dataset**

In [5]:
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
print(X.shape), print(y.shape)
print(X[:5])
print(y[:5])

(442, 10)
(442,)
[[ 0.03807591  0.05068012  0.06169621  0.02187239 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990749 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632753 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06833155 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567042 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286131 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665608  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02268774 -0.00936191]
 [ 0.00538306 -0.04464164 -0.03638469  0.02187239  0.00393485  0.01559614
   0.00814208 -0.00259226 -0.03198764 -0.04664087]]
[151.  75. 141. 206. 135.]


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((353, 10), (89, 10), (353,), (89,))

In [7]:
# Train
forest = RandomForestRegressor(n_trees=10, max_depth=3, max_features=2)
forest.fit(X_train, y_train)

In [8]:
# Predict the test set
y_pred = forest.predict(X_test)
print(y_pred[:5])

[160.72857673 133.53578709 158.36682956 205.84395314 142.35568254]


In [9]:
# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred) / 100
print(f"Random Forest Mean Squared Error: {mse:.2f}")

Random Forest Mean Squared Error: 32.46


In [10]:
# Custom MSE
custom_mse = forest.mse(y_test, y_pred) / 100
print(f"Custom MSE: {custom_mse:.2f}")

Custom MSE: 32.46
