#**XGBoost Random Forest - Regression (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

**Code**

In [2]:
class Node:
    def __init__(self, feature_index, threshold, left, right):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right

In [3]:
class LeafNodeRegression:
    def __init__(self, y):
        self.value = np.mean(y)

    def predicted_value(self):
        return self.value

In [4]:
class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        # Base case: return the mean if max depth reached or not enough samples
        if len(y) < self.min_samples_split or depth >= self.max_depth:
            return np.mean(y)

        n_samples, n_features = X.shape
        best_feature = None
        best_value = None
        best_left_indices = None
        best_right_indices = None
        best_mse = float('inf')

        for feature in range(n_features):
            values = np.unique(X[:, feature])
            for value in values:
                left_indices = np.where(X[:, feature] < value)[0]
                right_indices = np.where(X[:, feature] >= value)[0]
                if len(left_indices) > 0 and len(right_indices) > 0:
                    # Calculate the mean squared error
                    left_mean = np.mean(y[left_indices])
                    right_mean = np.mean(y[right_indices])
                    mse = np.mean((y[left_indices] - left_mean) ** 2) + np.mean((y[right_indices] - right_mean) ** 2)

                    if mse < best_mse:
                        best_mse = mse
                        best_feature = feature
                        best_value = value
                        best_left_indices = left_indices
                        best_right_indices = right_indices

        if best_feature is None:
            return np.mean(y)

        left_tree = self._build_tree(X[best_left_indices], y[best_left_indices], depth + 1)
        right_tree = self._build_tree(X[best_right_indices], y[best_right_indices], depth + 1)

        return (best_feature, best_value, left_tree, right_tree)

    def print_tree(self, tree=None, depth=0):
        if tree is None:
            tree = self.tree
        if isinstance(tree, tuple):
            feature, _, left_tree, right_tree = tree
            print(f"{'  ' * depth}Feature {feature} <= {tree[1]:.2f} ->")
            print(f"{'  ' * (depth + 1)}Left ->")
            self.print_tree(left_tree, depth + 2)
            print(f"{'  ' * (depth + 1)}Right ->")
            self.print_tree(right_tree, depth + 2)
        else:
            print(f"{'  ' * depth}Predict: {tree:.2f}")

    def predict(self, X):
        return np.array([self._predict_single(sample) for sample in X])

    def _predict_single(self, sample):
        node = self.tree
        while isinstance(node, tuple):
            feature, _, left_tree, right_tree = node
            if sample[feature] < node[1]:  # using the threshold here
                node = left_tree
            else:
                node = right_tree
        return node

In [5]:
class RandomForestWithXGBoost:
    def __init__(self, n_forests=1, n_trees=5, max_depth=10, min_samples_split=2, xgboost_trees=2):
        self.n_forests = n_forests
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.xgboost_trees = xgboost_trees
        self.forests = []

    def fit(self, X, y):
        for forest_index in range(self.n_forests):
            print(f"\nTraining Random Forest {forest_index + 1} with {self.n_trees} decision trees.")
            trees = []
            for tree_index in range(self.n_trees):
                tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
                sample_indices = np.random.choice(len(X), len(X), replace=True)
                X_sample = X[sample_indices]
                y_sample = y[sample_indices]
                tree.fit(X_sample, y_sample)
                trees.append(tree)

                # Print the tree structure
                print(f"\nTree {tree_index + 1} Structure:")
                tree.print_tree()

            self.forests.append(trees)

            # Train XGBoost model
            dtrain = xgb.DMatrix(X, label=y)
            params = {
                'max_depth': self.max_depth,
                'objective': 'reg:squarederror',
                'eval_metric': 'rmse'
            }
            self.xgb_model = xgb.train(params, dtrain, num_boost_round=self.xgboost_trees)

    def predict(self, X):
        predictions = []
        for forest_index, trees in enumerate(self.forests):
            forest_predictions = []
            print(f"\nForest {forest_index + 1} predictions:")
            for tree_index, tree in enumerate(trees):
                tree_pred = tree.predict(X)
                forest_predictions.append(tree_pred)
                print(f"  Tree {tree_index + 1} predictions: {tree_pred}")

            predictions.append(forest_predictions)

            # Print combined predictions for the forest
            combined_forest_preds = np.mean(forest_predictions, axis=0)
            print(f"Combined predictions for Forest {forest_index + 1}: {combined_forest_preds}")

        # XGBoost predictions
        dtest = xgb.DMatrix(X)
        xgb_preds = self.xgb_model.predict(dtest)

        # Print XGBoost predictions
        print(f"XGBoost predictions: {xgb_preds}")

        # Return combined forest predictions and XGBoost predictions
        return combined_forest_preds, xgb_preds

    def evaluate(self, X, y):
        # Get combined predictions from the first forest for evaluation
        combined_forest_preds, xgb_preds = self.predict(X)

        # Calculate and print mean squared error for evaluation
        mse_forest = np.mean((y - combined_forest_preds) ** 2)
        mse_xgb = np.mean((y - xgb_preds) ** 2)

        print(f"\nMean Squared Error for Random Forest: {mse_forest}")
        print(f"Mean Squared Error for XGBoost: {mse_xgb}")

In [6]:
class RandomForestEnsemble:
    def __init__(self, n_forests=3, n_trees=5, max_depth=10, min_samples_split=2, xgboost_trees=2):
        self.n_forests = n_forests
        self.forests = [
            RandomForestWithXGBoost(n_trees=n_trees, max_depth=max_depth, min_samples_split=min_samples_split, xgboost_trees=xgboost_trees)
            for _ in range(n_forests)
        ]

    def fit(self, X, y):
        for i, forest in enumerate(self.forests):
            print(f"Training Random Forest {i + 1} with {forest.n_trees} trees...")
            forest.fit(X, y)

    def predict(self, X):
        all_forest_predictions = np.array([forest.predict(X)[0] for forest in self.forests])
        final_predictions = np.mean(all_forest_predictions, axis=0)
        return final_predictions

    def evaluate(self, X, y):
        predictions = self.predict(X)
        mse = np.mean((y - predictions) ** 2)
        return mse

**Load Dataset**

In [7]:
# Load diabetes dataset
diabetes = load_diabetes()
X = diabetes.data  # Features
y = diabetes.target  # Target values

In [8]:
# Create and train the ensemble model
rf_ensemble = RandomForestEnsemble(n_forests=3, n_trees=5, max_depth=3, xgboost_trees=2)
rf_ensemble.fit(X, y)

Training Random Forest 1 with 5 trees...

Training Random Forest 1 with 5 decision trees.

Tree 1 Structure:
Feature 6 <= -0.10 ->
  Left ->
    Predict: 341.00
  Right ->
    Feature 0 <= 0.11 ->
      Left ->
        Feature 9 <= 0.14 ->
          Left ->
            Predict: 150.33
          Right ->
            Predict: 243.00
      Right ->
        Predict: 277.00

Tree 2 Structure:
Feature 6 <= -0.10 ->
  Left ->
    Predict: 341.00
  Right ->
    Feature 2 <= 0.17 ->
      Left ->
        Feature 2 <= 0.16 ->
          Left ->
            Predict: 143.49
          Right ->
            Predict: 346.00
      Right ->
        Predict: 242.00

Tree 3 Structure:
Feature 6 <= -0.09 ->
  Left ->
    Predict: 341.00
  Right ->
    Feature 5 <= -0.11 ->
      Left ->
        Predict: 40.00
      Right ->
        Feature 3 <= 0.13 ->
          Left ->
            Predict: 160.90
          Right ->
            Predict: 270.00

Tree 4 Structure:
Feature 0 <= 0.11 ->
  Left ->
    Feature 6 

In [9]:
# Evaluate the ensemble model
mse = rf_ensemble.evaluate(X, y)
print(f"\nMean Squared Error of the Ensemble: {mse}")


Forest 1 predictions:
  Tree 1 predictions: [150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 243.         150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 341.         150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.33180778 150.33180778 150.33180778
 150.33180778 150.33180778 150.