# **XGBoost Random Forest - Classification (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

**Code**

In [2]:
class Node:
    """Internal decision node of a binary tree.

    Stores the split rule (which feature column to test and the value it is
    compared against) together with the two child subtrees.  DecisionTree
    sends a sample to ``left`` when ``x[feature_index] <= threshold`` and to
    ``right`` otherwise.
    """

    def __init__(self, feature_index, threshold, left, right):
        # Split rule: column tested and the cut-off value.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees (either further Node objects or leaves).
        self.left, self.right = left, right

In [3]:
class LeafNode:
    """Terminal node that remembers the class distribution of its samples.

    Attributes:
        labels: sorted distinct labels seen in ``y``.
        counts: number of occurrences of each entry of ``labels``.
    """

    def __init__(self, y):
        distinct, tallies = np.unique(y, return_counts=True)
        self.labels = distinct
        self.counts = tallies

    def predicted_class(self):
        """Return the most frequent label stored in this leaf.

        On a tie the label that comes first in the sorted ``labels`` array
        wins, because ``argmax`` reports the first maximum.
        """
        majority_position = self.counts.argmax()
        return self.labels[majority_position]

In [4]:
class DecisionTree:
    """Classification tree grown greedily by impurity reduction.

    Args:
        max_depth: maximum depth of the tree; ``None`` (the default) means
            the depth is not limited.
        min_samples_split: a node holding fewer samples than this becomes
            a leaf.
        criterion: impurity measure, ``"gini"`` or ``"entropy"``.

    Attributes:
        tree: root of the fitted tree (``Node`` or ``LeafNode``), or ``None``
            before ``fit`` has been called.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion="gini"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and label vector ``y``."""
        # Accept lists / DataFrames as well as arrays: the builder relies on
        # ndarray-style indexing (X[:, j], X[indices]).
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y)

    def _is_terminal(self, y, depth, num_samples):
        """Return True when the node described by ``y`` must become a leaf."""
        if len(np.unique(y)) == 1:
            return True
        # max_depth=None means "no limit"; comparing an int against None
        # would raise TypeError, so the check is skipped in that case.
        if self.max_depth is not None and depth >= self.max_depth:
            return True
        return num_samples < self.min_samples_split

    def _build_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples (X, y)."""
        num_samples, num_features = X.shape

        if self._is_terminal(y, depth, num_samples):
            return LeafNode(y)

        best_split = self._best_split(X, y, num_features)
        if best_split is None:
            # No threshold separates the samples (all rows identical).
            return LeafNode(y)

        left_indices = best_split['indices_left']
        right_indices = best_split['indices_right']

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(best_split['feature_index'], best_split['threshold'], left_subtree, right_subtree)

    def _best_split(self, X, y, num_features):
        """Search every (feature, threshold) pair for the highest gain.

        Returns a dict with keys ``feature_index``, ``threshold``,
        ``indices_left`` and ``indices_right``, or ``None`` when no candidate
        leaves samples on both sides.
        """
        best_gain = -1
        best_split = None

        for feature_index in range(num_features):
            column = X[:, feature_index]  # hoisted: invariant for all thresholds
            for threshold in np.unique(column):
                indices_left = np.where(column <= threshold)[0]
                indices_right = np.where(column > threshold)[0]

                # A split that leaves one side empty separates nothing.
                if len(indices_left) == 0 or len(indices_right) == 0:
                    continue

                gain = self._information_gain(y, indices_left, indices_right)
                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'indices_left': indices_left,
                        'indices_right': indices_right
                    }
        return best_split

    def _information_gain(self, y, left_indices, right_indices):
        """Return parent impurity minus the size-weighted child impurity."""
        total = len(y)
        impurity_before = self._impurity(y)
        impurity_left = self._impurity(y[left_indices])
        impurity_right = self._impurity(y[right_indices])

        weighted_impurity = (
            (len(left_indices) / total) * impurity_left
            + (len(right_indices) / total) * impurity_right
        )
        return impurity_before - weighted_impurity

    def _impurity(self, y):
        """Dispatch to the impurity measure selected by ``criterion``.

        Raises:
            ValueError: if ``criterion`` is neither "gini" nor "entropy".
        """
        if self.criterion == "gini":
            return self._gini_impurity(y)
        if self.criterion == "entropy":
            return self._entropy_impurity(y)
        raise ValueError(f"Unknown criterion: {self.criterion}")

    def _gini_impurity(self, y):
        """Return the Gini impurity 1 - sum(p_k ** 2) of labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _entropy_impurity(self, y):
        """Return the Shannon entropy (base 2) of labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 needs no epsilon; adding one skews the
        # result (a pure node would come out slightly negative).
        return -np.sum(probabilities * np.log2(probabilities))

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises:
            RuntimeError: if the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self._traverse_tree(x, self.tree) for x in np.asarray(X)])

    def _traverse_tree(self, x, tree):
        """Walk one sample ``x`` down from ``tree`` and return the leaf label."""
        if isinstance(tree, LeafNode):
            return tree.predicted_class()
        if x[tree.feature_index] <= tree.threshold:
            return self._traverse_tree(x, tree.left)
        return self._traverse_tree(x, tree.right)

    def print_tree(self, tree=None, depth=0):
        """Print the (sub)tree as indented text, three spaces per level.

        ``tree`` defaults to the fitted root.
        """
        if tree is None:
            tree = self.tree
        indent = '   ' * depth
        if isinstance(tree, LeafNode):
            print(f"{indent}Predict {tree.predicted_class()}")
        else:
            print(f"{indent}Feature {tree.feature_index} <= {tree.threshold}")
            self.print_tree(tree.left, depth + 1)
            print(f"{indent}Feature {tree.feature_index} > {tree.threshold}")
            self.print_tree(tree.right, depth + 1)

In [5]:
class RandomForestWithXGBoost:
    """Bagged decision-tree forests trained alongside one XGBoost booster.

    ``fit`` grows ``n_forests`` forests of ``n_trees`` bootstrap-sampled
    ``DecisionTree`` objects and, separately, one XGBoost multi-class
    booster on the full training data.  ``predict`` returns both sets of
    predictions; the two models are not blended.

    Args:
        n_forests: number of random forests to grow.
        n_trees: number of trees in each forest.
        max_depth: depth limit used for the trees and for XGBoost.
        min_samples_split: minimum node size for a tree split.
        criterion: impurity measure passed to ``DecisionTree``.
        xgboost_trees: number of XGBoost boosting rounds.
        verbose: when True (default) print tree structures and per-tree
            predictions, as the class has always done.

    Attributes:
        forests: list (one entry per forest) of lists of fitted trees.
        xgb_model: fitted XGBoost booster, or ``None`` before ``fit``.
        classes_: sorted distinct training labels, or ``None`` before ``fit``.
    """

    def __init__(self, n_forests=1, n_trees=5, max_depth=10, min_samples_split=2, criterion="gini", xgboost_trees=2, verbose=True):
        self.n_forests = n_forests  # Number of random forests
        self.n_trees = n_trees  # Number of trees in each random forest
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.xgboost_trees = xgboost_trees  # Number of XGBoost trees
        self.verbose = verbose
        self.forests = []  # Store the forests
        self.xgb_model = None
        self.classes_ = None

    def _log(self, message):
        """Print ``message`` unless verbose output has been switched off."""
        if self.verbose:
            print(message)

    @staticmethod
    def _majority_vote(votes):
        """Return the per-sample majority label.

        ``votes`` is a sequence of equally long prediction vectors, one per
        voter.  Ties go to the label met first in voter order.
        """
        return np.array([Counter(row).most_common(1)[0][0] for row in np.vstack(votes).T])

    def fit(self, X, y):
        """Train all forests and the XGBoost booster on ``(X, y)``."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)

        # Start from scratch so that calling fit twice does not pile new
        # forests on top of the old ones.
        self.forests = []
        for forest_index in range(self.n_forests):
            self._log(f"\nTraining Random Forest {forest_index + 1} with {self.n_trees} decision trees.")
            trees = []
            for tree_index in range(self.n_trees):
                tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion)
                # Bootstrap sample: n rows drawn with replacement.
                sample_indices = np.random.choice(n_samples, n_samples, replace=True)
                tree.fit(X[sample_indices], y[sample_indices])
                trees.append(tree)

                if self.verbose:
                    # Print the tree structure
                    print(f"\nTree {tree_index + 1} Structure:")
                    tree.print_tree()

            self.forests.append(trees)

        # Train the XGBoost model once: it sees the full data set and does
        # not depend on the forest loop, so retraining it per forest only
        # repeated identical work.
        # multi:softmax expects labels 0..num_class-1, so arbitrary labels
        # are encoded here and decoded again in predict().
        self.classes_, encoded_labels = np.unique(y, return_inverse=True)
        dtrain = xgb.DMatrix(X, label=encoded_labels)
        params = {
            'max_depth': self.max_depth,
            'objective': 'multi:softmax',
            'num_class': len(self.classes_),
            'eval_metric': 'mlogloss'
        }
        self.xgb_model = xgb.train(params, dtrain, num_boost_round=self.xgboost_trees)

    def predict(self, X):
        """Predict labels for ``X`` with the forests and with XGBoost.

        Returns:
            A tuple ``(combined_forest_preds, xgb_preds)``.  The first entry
            is the majority vote over the per-forest votes (each of which is
            itself a majority vote over that forest's trees); the second is
            the XGBoost prediction mapped back to the original labels.

        Raises:
            RuntimeError: if the model has not been fitted or has no forests.
        """
        if self.xgb_model is None or not self.forests:
            raise RuntimeError("predict() requires a fitted model with at least one forest; call fit() first")

        X = np.asarray(X)
        forest_votes = []
        for forest_index, trees in enumerate(self.forests):
            self._log(f"\nForest {forest_index + 1} predictions:")
            tree_predictions = []
            for tree_index, tree in enumerate(trees):
                tree_pred = tree.predict(X)
                tree_predictions.append(tree_pred)
                self._log(f"  Tree {tree_index + 1} predictions: {tree_pred}")

            # Print combined predictions for the forest
            forest_vote = self._majority_vote(tree_predictions)
            forest_votes.append(forest_vote)
            self._log(f"Combined predictions for Forest {forest_index + 1}: {forest_vote}")

        # Every forest gets a say; previously only the last forest's vote
        # survived the loop and was returned.
        combined_forest_preds = self._majority_vote(forest_votes)

        # XGBoost predictions: multi:softmax yields class indices as floats,
        # so cast and translate them back to the training labels.
        dtest = xgb.DMatrix(X)
        class_indices = self.xgb_model.predict(dtest).astype(int)
        xgb_preds = self.classes_[class_indices]

        # Print XGBoost predictions
        self._log(f"XGBoost predictions: {xgb_preds}")

        # Return combined forest predictions and XGBoost predictions
        return combined_forest_preds, xgb_preds

    def evaluate(self, X, y):
        """Print classification reports and confusion matrices for both models."""
        # Combined vote of all forests plus the XGBoost predictions.
        combined_forest_preds, xgb_preds = self.predict(X)

        # Calculate the classification report and confusion matrix
        print("\nClassification Report for Random Forest:")
        print(classification_report(y, combined_forest_preds))
        print("Confusion Matrix for Random Forest:")
        print(confusion_matrix(y, combined_forest_preds))

        print("\nClassification Report for XGBoost:")
        print(classification_report(y, xgb_preds))
        print("Confusion Matrix for XGBoost:")
        print(confusion_matrix(y, xgb_preds))

In [6]:
class RandomForestEnsemble:
    """Majority-vote ensemble of ``RandomForestWithXGBoost`` members.

    Args:
        n_forests: number of member models to build.
        n_trees, max_depth, min_samples_split, criterion, xgboost_trees:
            forwarded unchanged to every member.
    """

    def __init__(self, n_forests=3, n_trees=5, max_depth=10, min_samples_split=2, criterion="gini", xgboost_trees=2):
        self.n_forests = n_forests
        self.forests = [
            RandomForestWithXGBoost(n_trees=n_trees, max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion, xgboost_trees=xgboost_trees)
            for _ in range(n_forests)
        ]

    def fit(self, X, y):
        """Fit every member on the same ``(X, y)``."""
        for i, forest in enumerate(self.forests):
            print(f"Training Random Forest {i+1} with {forest.n_trees} trees...")
            forest.fit(X, y)

    @staticmethod
    def _forest_votes(result):
        """Extract the forest prediction vector from a member's output.

        ``RandomForestWithXGBoost.predict`` returns the tuple
        ``(forest_preds, xgb_preds)``; only the forest part takes part in
        the ensemble vote.  A bare prediction vector is passed through.
        """
        if isinstance(result, tuple):
            result = result[0]
        return np.asarray(result)

    @staticmethod
    def _majority_vote(votes):
        """Return, per sample, the label most members voted for.

        Ties go to the label met first in member order.
        """
        return [Counter(row).most_common(1)[0][0] for row in np.asarray(votes).T]

    def predict(self, X):
        """Return a list with the majority-vote label for every row of ``X``."""
        # Stacking the raw (forest_preds, xgb_preds) tuples produced a 3-D
        # array whose rows were unhashable arrays, so the vote crashed;
        # reduce every member to its forest prediction vector first.
        all_forest_predictions = [self._forest_votes(forest.predict(X)) for forest in self.forests]
        return self._majority_vote(all_forest_predictions)

    def evaluate(self, X, y):
        """Return the fraction of rows of ``X`` predicted as in ``y``."""
        predictions = np.asarray(self.predict(X))
        accuracy = np.mean(predictions == np.asarray(y))
        return accuracy

    def print_forest_details(self):
        """Print the structure of every tree of every member."""
        for i, forest in enumerate(self.forests):
            print(f"\nRandom Forest {i+1}:")
            # Members expose their trees as a list of forests (lists of
            # trees); they have no print_trees() method of their own.
            for trees in forest.forests:
                for tree_index, tree in enumerate(trees):
                    print(f"\nTree {tree_index + 1} Structure:")
                    tree.print_tree()

**Load Dataset**

In [7]:
# Load the Iris dataset bundled with scikit-learn and split the returned
# object into the feature matrix X and the class-label vector y.
data = load_iris()
X, y = data.data, data.target

In [8]:
# Split the dataset into training and testing sets: 20% of the rows are held
# out for testing, and random_state=42 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Create the ensemble of random forests (3 members, 5 trees each).
# NOTE(review): `ensemble` is only constructed here; none of the cells shown
# in this notebook fit or query it.
ensemble = RandomForestEnsemble(n_forests=3, n_trees=5, max_depth=10, min_samples_split=2, criterion="gini", xgboost_trees=2)

In [10]:
# Train RandomForest with XGBoost integration on the training split.
# fit() prints the structure of every tree it grows (see the output below).
n_forests = 3  # Number of random forests
forest = RandomForestWithXGBoost(n_forests=n_forests, n_trees=5, max_depth=10, min_samples_split=2, criterion="gini", xgboost_trees=2)
forest.fit(X_train, y_train)


Training Random Forest 1 with 5 decision trees.

Tree 1 Structure:
Feature 2 <= 1.7
   Predict 0
Feature 2 > 1.7
   Feature 2 <= 4.9
      Feature 3 <= 1.5
         Predict 1
      Feature 3 > 1.5
         Feature 1 <= 2.8
            Predict 2
         Feature 1 > 2.8
            Predict 1
   Feature 2 > 4.9
      Feature 2 <= 5.0
         Feature 0 <= 6.3
            Predict 2
         Feature 0 > 6.3
            Predict 1
      Feature 2 > 5.0
         Predict 2

Tree 2 Structure:
Feature 2 <= 1.9
   Predict 0
Feature 2 > 1.9
   Feature 3 <= 1.7
      Feature 2 <= 5.0
         Feature 0 <= 4.9
            Feature 1 <= 2.4
               Predict 1
            Feature 1 > 2.4
               Predict 2
         Feature 0 > 4.9
            Predict 1
      Feature 2 > 5.0
         Predict 2
   Feature 3 > 1.7
      Predict 2

Tree 3 Structure:
Feature 2 <= 1.9
   Predict 0
Feature 2 > 1.9
   Feature 3 <= 1.7
      Feature 2 <= 4.9
         Feature 3 <= 1.6
            Predict 1
         

In [11]:
# Predictions on the held-out test split.
# NOTE: predict() returns a 2-tuple (combined forest predictions, XGBoost
# predictions), so the value printed below is that tuple rather than one
# array per forest.
predictions = forest.predict(X_test)
print("Predicted classes from each random forest:", predictions)


Forest 1 predictions:
  Tree 1 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]
  Tree 2 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 3 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 4 predictions: [1 0 2 1 2 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 5 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 2 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Combined predictions for Forest 1: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]

Forest 2 predictions:
  Tree 1 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 2 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 3 predictions: [1 0 2 1 2 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 4 predictions: [1 0 2 1 1 0 1 2 2 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 5 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Combined predicti

In [12]:
# Evaluate the model on the test split: evaluate() prints a classification
# report and a confusion matrix for the forest vote and for XGBoost.
# It calls predict() internally, so the per-tree predictions are printed again.
forest.evaluate(X_test, y_test)


Forest 1 predictions:
  Tree 1 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]
  Tree 2 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 3 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 4 predictions: [1 0 2 1 2 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 5 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 2 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Combined predictions for Forest 1: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]

Forest 2 predictions:
  Tree 1 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 2 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 3 predictions: [1 0 2 1 2 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 4 predictions: [1 0 2 1 1 0 1 2 2 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
  Tree 5 predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Combined predicti