# **XGBoost-Style Gradient-Boosted Decision Trees - Regression (From Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb

**Code**

In [2]:
class Node:
    """One vertex of a binary decision tree.

    A node is either an internal split (``feature``/``threshold`` plus the
    ``left`` and ``right`` sub-trees) or a terminal node (``leaf=True`` with a
    prediction stored in ``value``). The class only stores what it is given;
    it performs no validation.

    Parameters
    ----------
    feature : optional
        Identifier of the feature the node splits on.
    threshold : optional
        Cut-off value associated with ``feature``.
    left, right : optional
        Child nodes.
    leaf : bool, default=False
        Flag marking the node as terminal.
    value : optional
        Prediction carried by a terminal node.
    """

    def __init__(
        self,
        feature=None,
        threshold=None,
        left=None,
        right=None,
        leaf=False,
        value=None,
    ):
        # Split rule.
        self.feature, self.threshold = feature, threshold
        # Sub-trees.
        self.left, self.right = left, right
        # Terminal-node payload.
        self.leaf, self.value = leaf, value

In [3]:
class DecisionTreeRegressor:
    """Regression tree grown greedily by recursive binary splitting.

    Every internal node sends samples with ``x[feature] < threshold`` to its
    left child and all remaining samples to its right child. Every leaf
    predicts the mean target of the training samples that reached it.

    Note: this class reuses the name of ``sklearn.tree.DecisionTreeRegressor``;
    once this definition has run, the name refers to this implementation.

    Parameters
    ----------
    criterion : {'mse', 'mae'}, default='mse'
        Impurity measure. ``'mse'`` is the mean squared deviation from the
        mean; ``'mae'`` is the mean absolute deviation from the mean (the
        mean, not the median, is used as the centre).
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` means no limit; ``0`` produces a
        single leaf.
    min_samples_split : int, default=2
        A node holding fewer samples than this becomes a leaf.

    Raises
    ------
    ValueError
        If ``criterion`` is not one of the supported names.
    """

    _CRITERIA = ('mse', 'mae')

    def __init__(self, criterion='mse', max_depth=None, min_samples_split=2):
        if criterion not in self._CRITERIA:
            raise ValueError(
                f"Unknown criterion {criterion!r}; expected one of {self._CRITERIA}"
            )
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Grow the tree on a 2-D feature matrix ``X`` and targets ``y``.

        Both inputs are converted with ``np.asarray`` so lists and other
        array-likes are accepted.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the number of
            samples, or if there are no samples at all.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}")
        if X.shape[0] != len(y):
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {len(y)}"
            )
        if len(y) == 0:
            raise ValueError("Cannot fit a tree on an empty training set")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the sub-tree for the samples ``(X, y)`` at ``depth``."""
        # Compare against None explicitly: a truthiness test would treat
        # max_depth=0 as "no limit" instead of "root-only tree".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(y) < self.min_samples_split or depth_exhausted:
            return self._create_leaf_node(y)

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            # No split reduces the loss, so this node is terminal.
            return self._create_leaf_node(y)

        left_indices = X[:, best_feature] < best_threshold
        right_indices = X[:, best_feature] >= best_threshold

        left_child = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

    def _find_best_split(self, X, y):
        """Return ``(feature, threshold)`` of the split with the largest gain.

        Returns ``(None, None)`` when no candidate split strictly reduces the
        loss. Ties are resolved in favour of the first candidate encountered
        (lowest feature index, then lowest threshold).
        """
        # Start at 0 rather than -inf: a split is only worth making if it
        # strictly improves on the parent. With -inf the degenerate zero-gain
        # split (one empty child) was always accepted, which produced NaN
        # leaves and unbounded recursion on unsplittable nodes.
        best_gain = 0.0
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            # np.unique returns sorted values; no sample is strictly below the
            # smallest one, so that threshold always yields an empty left
            # child and is skipped.
            for threshold in np.unique(X[:, feature])[1:]:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        """Loss reduction achieved by splitting on ``X[:, feature] < threshold``.

        Computed as the parent loss minus the sample-weighted child losses.
        Returns 0 when either child would be empty.
        """
        left_indices = X[:, feature] < threshold
        right_indices = X[:, feature] >= threshold

        n = len(y)
        n_left = np.sum(left_indices)
        n_right = np.sum(right_indices)

        if n_left == 0 or n_right == 0:
            return 0

        child_loss = (n_left / n) * self._loss(y[left_indices]) + (n_right / n) * self._loss(y[right_indices])

        return self._loss(y) - child_loss

    def _loss(self, y):
        """Impurity of the targets ``y`` under the configured criterion."""
        if self.criterion == 'mse':
            return np.mean((y - np.mean(y)) ** 2)
        if self.criterion == 'mae':
            return np.mean(np.abs(y - np.mean(y)))
        # Reachable only if ``criterion`` was reassigned after construction.
        raise ValueError(f"Unknown criterion {self.criterion!r}")

    def _create_leaf_node(self, y):
        """Create a terminal node that predicts the mean of ``y``."""
        return Node(leaf=True, value=np.mean(y))

    def predict(self, X):
        """Return one prediction per row of ``X`` as a NumPy array.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("This DecisionTreeRegressor is not fitted yet; call fit() first")
        return np.array([self._predict_sample(sample, self.tree) for sample in np.asarray(X)])

    def _predict_sample(self, sample, tree):
        """Walk from ``tree`` down to a leaf for one sample and return its value."""
        node = tree
        while not node.leaf:
            # Same routing rule as during training: strictly-less goes left.
            node = node.left if sample[node.feature] < node.threshold else node.right
        return node.value

    def print_tree(self, tree=None, depth=0):
        """Print the tree as indented text, one node per line.

        ``tree`` defaults to the fitted root; ``depth`` controls the
        indentation (four spaces per level).

        Raises
        ------
        RuntimeError
            If no sub-tree is given and the tree has not been fitted yet.
        """
        if tree is None:
            tree = self.tree
        if tree is None:
            raise RuntimeError("This DecisionTreeRegressor is not fitted yet; call fit() first")

        if tree.leaf:
            print(f"{'    ' * depth}Leaf Node: Predict {tree.value:.4f}")
        else:
            print(f"{'    ' * depth}Feature {tree.feature} < {tree.threshold:.4f}")
            self.print_tree(tree.left, depth + 1)
            self.print_tree(tree.right, depth + 1)

In [4]:
class CustomBoostingRegressor:
    """Additive ensemble of regression trees fitted to successive residuals.

    The model starts from the mean of the training targets. Each tree is then
    fitted to the current residuals, and ``learning_rate`` times its
    predictions is subtracted from the residuals before the next tree is
    trained. A prediction is the initial mean plus ``learning_rate`` times the
    sum of all tree predictions.

    Parameters
    ----------
    n_trees : int, default=5
        Number of trees in the ensemble.
    max_depth : int or None, default=5
        Passed to every ``DecisionTreeRegressor``.
    min_samples_split : int, default=3
        Passed to every ``DecisionTreeRegressor``.
    learning_rate : float, default=0.1
        Shrinkage applied to each tree's contribution.
    criterion : str, default='mse'
        Passed to every ``DecisionTreeRegressor``.
    verbose : bool, default=True
        When true, ``fit`` prints each tree's structure and ``predict`` prints
        the first five predictions of each tree. Set to ``False`` to silence
        all output.
    """

    def __init__(self, n_trees=5, max_depth=5, min_samples_split=3, learning_rate=0.1, criterion='mse',
                 verbose=True):
        self.n_trees = n_trees
        self.learning_rate = learning_rate
        self.verbose = verbose
        self.trees = [DecisionTreeRegressor(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split)
                      for _ in range(n_trees)]
        self.initial_prediction = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and targets ``y``.

        Both inputs are converted with ``np.asarray`` so lists and other
        array-likes are accepted. The caller's ``y`` is never modified.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)

        # Initialize with the mean of the target values
        self.initial_prediction = np.mean(y)
        # The subtraction allocates a new array, so the in-place updates below
        # never touch ``y``.
        residuals = y - self.initial_prediction

        for i, tree in enumerate(self.trees):
            # Train each tree on what the ensemble so far has not explained
            tree.fit(X, residuals)
            residuals -= self.learning_rate * tree.predict(X)
            if self.verbose:
                print(f"\nTree {i+1} trained. Printing Tree Structure:")
                tree.print_tree()  # Print the structure of the tree after training

    def predict(self, X):
        """Return the ensemble prediction for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.initial_prediction is None:
            raise RuntimeError("This CustomBoostingRegressor is not fitted yet; call fit() first")

        X = np.asarray(X)
        # Start with the initial prediction (mean of the target values)
        predictions = np.full(X.shape[0], self.initial_prediction, dtype=float)

        # Add the shrunken contribution of each tree
        for i, tree in enumerate(self.trees):
            tree_predictions = tree.predict(X)
            predictions += self.learning_rate * tree_predictions
            if self.verbose:
                print(f"Predictions from Tree {i+1}: {tree_predictions[:5]}")  # Print first 5 predictions from each tree

        return predictions

**Load Dataset**

In [5]:
# Load the diabetes dataset shipped with scikit-learn and pull out the
# feature matrix and the target vector.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

In [6]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build a five-tree ensemble and fit it on the training split.
# fit() prints the structure of every tree as it is trained.
boosting_regressor = CustomBoostingRegressor(
    n_trees=5,
    max_depth=5,
    min_samples_split=3,
    learning_rate=0.1,
)
boosting_regressor.fit(X_train, y_train)


Tree 1 trained. Printing Tree Structure:
Feature 2 < 0.0056
    Feature 8 < 0.0070
        Feature 8 < -0.0426
            Feature 4 < -0.0387
                Feature 4 < -0.0456
                    Leaf Node: Predict -69.6532
                    Leaf Node: Predict 5.0135
                Feature 0 < 0.0417
                    Leaf Node: Predict -98.3365
                    Leaf Node: Predict -73.9032
            Feature 6 < 0.0266
                Feature 4 < -0.0511
                    Leaf Node: Predict 3.9301
                    Leaf Node: Predict -40.1404
                Feature 0 < 0.0054
                    Leaf Node: Predict -78.1365
                    Leaf Node: Predict -47.8944
        Feature 7 < 0.1081
            Feature 0 < 0.0018
                Feature 0 < -0.0527
                    Leaf Node: Predict 41.2635
                    Leaf Node: Predict -28.9588
                Feature 4 < -0.0084
                    Leaf Node: Predict 86.6920
                    Leaf Node: 

In [8]:
# Predict on the held-out split and preview the first five ensemble outputs.
# predict() also prints the first five raw predictions of each tree.
y_pred = boosting_regressor.predict(X_test)
print(y_pred[0:5])

Predictions from Tree 1: [  4.0711484   17.93586988   4.0711484   92.88414575 -40.14039006]
Predictions from Tree 2: [-80.64365875  12.38552333 -80.64365875  83.59573117 -28.64448971]
Predictions from Tree 3: [-49.66282369  78.81808648 -49.66282369  45.16644715 -53.00909552]
Predictions from Tree 4: [-10.23191745  16.87404781  43.61149283  37.1126503  -65.87690395]
Predictions from Tree 5: [-43.08303405  75.86401919 -43.08303405  42.19722919 -26.79071879]
[135.78151536 173.92429858 141.16585638 183.83216427 132.29038411]


In [9]:
# Score the held-out predictions: squared error, absolute error and R^2,
# computed in that order.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [10]:
# Report the first five ensemble predictions followed by the three metrics,
# one item per line.
print(
    f"\nFinal Ensemble Predictions: {y_pred[:5]}",
    f'MSE: {mse:.2f}',
    f'MAE: {mae:.2f}',
    f'R2: {r2:.2f}',
    sep="\n",
)


Final Ensemble Predictions: [135.78151536 173.92429858 141.16585638 183.83216427 132.29038411]
MSE: 3870.17
MAE: 54.07
R2: 0.27
