# **Random Forest - Regression (From Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

**Code**

In [2]:
class Node:
  """Internal (decision) node of a tree.

  Stores the split rule and both children. During traversal a sample whose
  value at ``feature_index`` is ``<= threshold`` is routed to ``left``,
  otherwise to ``right``.
  """

  def __init__(self, feature_index, threshold, left, right):
    # Split rule: which column to test and the cut-off value.
    self.feature_index, self.threshold = feature_index, threshold
    # Child subtrees (either further Node objects or leaves).
    self.left, self.right = left, right

In [3]:
class LeafNodeRegression:
  """Terminal node of a regression tree.

  The prediction for every sample that lands here is the mean of the
  target values passed to the constructor.
  """

  def __init__(self, y):
    # The mean is computed once at construction and cached on the leaf.
    leaf_mean = np.mean(y)
    self.value = leaf_mean

  def predicted_value(self):
    """Return the cached mean target value of this leaf."""
    return self.value

In [4]:
class DecisionTreeRegression:
  """Regression tree grown by greedy, exhaustive search over split thresholds.

  Every unique value of every feature is tried as a threshold and the split
  with the largest impurity reduction is kept. Leaves predict the mean of
  the training targets that reached them.

  Args:
    max_depth: Maximum depth of the tree, or None (the default) for no
      depth limit.
    min_samples_split: A node holding fewer samples than this becomes a leaf.
    criterion: "mse" (mean squared deviation from the node mean) or "mae"
      (mean absolute deviation from the node mean). Any other value makes
      ``_impurity`` raise ValueError.
  """

  def __init__(self, max_depth=None, min_samples_split=2, criterion="mse"):
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.criterion = criterion
    self.tree = None  # root of the fitted tree; set by fit()

  def fit(self, X, y):
    """Grow the tree on a 2-D feature matrix X and a target vector y."""
    # Coerce to ndarrays so the positional fancy indexing used while
    # splitting also works for lists and pandas objects.
    X = np.asarray(X)
    y = np.asarray(y)
    self.tree = self._build_tree(X, y)

  def _should_stop(self, depth, num_samples):
    """Return True when a node at `depth` with `num_samples` must be a leaf."""
    # max_depth=None means "unlimited"; comparing an int with None would
    # raise TypeError, so the depth test is skipped in that case.
    depth_exhausted = self.max_depth is not None and depth >= self.max_depth
    return depth_exhausted or num_samples < self.min_samples_split

  def _build_tree(self, X, y, depth=0):
    """Recursively build the subtree for the samples (X, y) at `depth`."""
    num_samples, num_features = X.shape

    # Check stopping criteria
    if self._should_stop(depth, num_samples):
      return LeafNodeRegression(y)

    # Find the best split; None means no threshold separates the samples
    # (e.g. all rows are identical), so the node becomes a leaf.
    best_feature_index, best_threshold, indices_left, indices_right = self._best_split(X, y, num_features)
    if best_feature_index is None:
      return LeafNodeRegression(y)

    # Recursively build left and right subtrees
    left_subtree = self._build_tree(X[indices_left], y[indices_left], depth + 1)
    right_subtree = self._build_tree(X[indices_right], y[indices_right], depth + 1)

    return Node(best_feature_index, best_threshold, left_subtree, right_subtree)

  def _best_split(self, X, y, num_features):
    """Search all features/thresholds for the split with the highest gain.

    Returns:
      (feature_index, threshold, indices_left, indices_right); all four are
      None when no threshold yields two non-empty sides.
    """
    best_gain = -1
    best_feature_index = None
    best_threshold = None
    best_indices_left = None
    best_indices_right = None

    # The parent impurity is identical for every candidate split, so it is
    # computed once instead of once per threshold.
    impurity_before = self._impurity(y)

    for feature_index in range(num_features):
      column = X[:, feature_index]
      for threshold in np.unique(column):
        indices_left = np.where(column <= threshold)[0]
        indices_right = np.where(column > threshold)[0]

        # A split is only valid if both children receive samples (the
        # largest value of a column always leaves the right side empty).
        if len(indices_left) > 0 and len(indices_right) > 0:
          gain = self._information_gain(y, indices_left, indices_right, impurity_before)
          if gain > best_gain:
            best_gain = gain
            best_feature_index = feature_index
            best_threshold = threshold
            best_indices_left = indices_left
            best_indices_right = indices_right

    return best_feature_index, best_threshold, best_indices_left, best_indices_right

  def _information_gain(self, y, left_indices, right_indices, impurity_before=None):
    """Return the impurity reduction achieved by splitting y into two parts.

    Args:
      y: Targets of the parent node.
      left_indices: Positions in y that go to the left child.
      right_indices: Positions in y that go to the right child.
      impurity_before: Optional precomputed impurity of y; computed here
        when omitted.
    """
    if impurity_before is None:
      impurity_before = self._impurity(y)
    impurity_left = self._impurity(y[left_indices])
    impurity_right = self._impurity(y[right_indices])

    # Children are weighted by the fraction of parent samples they hold.
    weighted_impurity = (len(left_indices) / len(y)) * impurity_left + (len(right_indices) / len(y)) * impurity_right
    return impurity_before - weighted_impurity

  def _impurity(self, y):
    """Return the impurity of targets y under the configured criterion.

    Raises:
      ValueError: If ``self.criterion`` is neither "mse" nor "mae".
    """
    if self.criterion == "mse":
      return np.mean((y - np.mean(y)) ** 2)
    elif self.criterion == "mae":
      # Deviation is measured around the mean (not the median).
      return np.mean(np.abs(y - np.mean(y)))
    else:
      raise ValueError(f"Unknown criterion: {self.criterion}")

  def predict(self, X):
    """Return an array with one prediction per row of X."""
    X = np.asarray(X)
    return np.array([self._traverse_tree(x, self.tree) for x in X])

  def _traverse_tree(self, x, tree):
    """Walk from `tree` down to a leaf for sample x and return its value."""
    # Iterative descent: avoids Python's recursion limit on deep trees.
    node = tree
    while not isinstance(node, LeafNodeRegression):
      if x[node.feature_index] <= node.threshold:
        node = node.left
      else:
        node = node.right
    return node.predicted_value()

In [5]:
class RandomForestRegression:
  """Bagged ensemble of DecisionTreeRegression trees.

  Each tree is fitted on a bootstrap sample (same size as the training set,
  drawn with replacement) and predictions are the mean over all trees.
  Every tree sees all feature columns; only the rows are resampled.

  Args:
    n_trees: Number of trees in the ensemble.
    max_depth: Passed to each DecisionTreeRegression.
    min_samples_split: Passed to each DecisionTreeRegression.
    criterion: Passed to each DecisionTreeRegression ("mse" or "mae").
    random_state: Optional seed for the bootstrap sampling. None (the
      default) draws from NumPy's global random state.
  """

  def __init__(self, n_trees=5, max_depth=10, min_samples_split=2, criterion="mse", random_state=None):
    self.n_trees = n_trees
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.criterion = criterion
    self.random_state = random_state
    self.trees = []

  def fit(self, X, y):
    """Fit `n_trees` trees, each on its own bootstrap sample of (X, y)."""
    # Positional indexing below requires ndarrays (not lists / DataFrames).
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    # Seeded runs use a private generator; unseeded runs keep using the
    # global np.random stream.
    rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

    # Build into a fresh list so that refitting replaces the ensemble
    # instead of mixing in trees trained by an earlier fit() call.
    trees = []
    for _ in range(self.n_trees):
      tree = DecisionTreeRegression(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
      # Bootstrap sampling
      sample_indices = rng.choice(n_samples, n_samples, replace=True)
      tree.fit(X[sample_indices], y[sample_indices])
      trees.append(tree)
    self.trees = trees

  def predict(self, X):
    """Return the per-row mean of the predictions of all fitted trees."""
    # Rows of tree_predictions are trees, columns are samples.
    tree_predictions = np.array([tree.predict(X) for tree in self.trees])
    return np.mean(tree_predictions, axis=0)

  def evaluate(self, X, y):
    """Return (mse, r2) of the ensemble's predictions on (X, y)."""
    y = np.asarray(y)
    predictions = self.predict(X)
    squared_errors = (y - predictions) ** 2
    mse = np.mean(squared_errors)
    # R^2 = 1 - SS_res / SS_tot
    r2 = 1 - (np.sum(squared_errors) / np.sum((y - np.mean(y)) ** 2))
    return mse, r2


**Load Dataset**

In [6]:
# Load scikit-learn's diabetes dataset and separate the feature matrix
# from the regression target.
data = load_diabetes()
X = data.data
y = data.target

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Display the shapes of the four resulting arrays.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((353, 10), (89, 10), (353,), (89,))

In [8]:
# Build the from-scratch forest and fit it on the training split.
forest_regressor = RandomForestRegression(
    n_trees=5,
    max_depth=10,
    min_samples_split=2,
    criterion="mse",
)
forest_regressor.fit(X_train, y_train)

In [9]:
# Predict on the held-out test split and preview the first five values.
y_pred = forest_regressor.predict(X_test)
print(y_pred[:5])

[153.34444444 186.03333333 135.4        275.525      140.        ]


In [10]:
# Score the forest on the test split: mean squared error and R^2.
mse, r2 = forest_regressor.evaluate(X_test, y_test)
print(f"MSE: {mse:.2f}", f"R2: {r2:.2f}", sep="\n")

MSE: 3094.28
R2: 0.42
