#**XGBoost Decision Tree - Classification (Scratch)**

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier
import xgboost as xgb

**Code**

In [2]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, leaf=False, class_value=None):
        self.feature = feature          # Feature index for splitting
        self.threshold = threshold      # Threshold value for splitting
        self.left = left                # Left child node
        self.right = right              # Right child node
        self.leaf = leaf                # Whether this node is a leaf
        self.class_value = class_value  # Class label if it's a leaf

In [3]:
class DecisionTree:
    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, loss_function='categorical_cross_entropy'):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.loss_function = loss_function
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        # Stopping criteria
        if len(y) < self.min_samples_split or (self.max_depth and depth >= self.max_depth) or len(set(y)) == 1:
            return self._create_leaf_node(y)

        # Find the best split
        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self._create_leaf_node(y)

        # Split the data
        left_indices = X[:, best_feature] < best_threshold
        right_indices = X[:, best_feature] >= best_threshold

        left_child = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

    def _find_best_split(self, X, y):
        best_gain = -np.inf
        best_feature = None
        best_threshold = None
        n_features = X.shape[1]

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        # Calculate the impurity before the split
        parent_impurity = self._impurity(y)

        # Split the data
        left_indices = X[:, feature] < threshold
        right_indices = X[:, feature] >= threshold

        # Calculate the weighted impurity of the children
        n = len(y)
        n_left = np.sum(left_indices)
        n_right = np.sum(right_indices)

        if n_left == 0 or n_right == 0:
            return 0

        child_impurity = (n_left / n) * self._impurity(y[left_indices]) + (n_right / n) * self._impurity(y[right_indices])

        # Calculate information gain
        return parent_impurity - child_impurity

    def _impurity(self, y):
        if self.criterion == 'gini':
            return self._gini_impurity(y)
        elif self.criterion == 'entropy':
            return self._entropy(y)

    def _gini_impurity(self, y):
        classes, counts = np.unique(y, return_counts=True)
        total = len(y)
        return 1 - sum((count / total) ** 2 for count in counts)

    def _entropy(self, y):
        classes, counts = np.unique(y, return_counts=True)
        total = len(y)
        return -sum((count / total) * np.log2(count / total) for count in counts if count > 0)

    def _create_leaf_node(self, y):
        most_common = np.bincount(y).argmax()
        return Node(leaf=True, class_value=most_common)

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        if tree.leaf:
            return tree.class_value

        if sample[tree.feature] < tree.threshold:
            return self._predict_sample(sample, tree.left)
        else:
            return self._predict_sample(sample, tree.right)

    def print_tree(self, tree=None, indent="  "):
        """Prints the structure of the decision tree"""
        if tree is None:
            tree = self.tree
        if tree.leaf:
            print(f"{indent}Leaf: Class {tree.class_value}")
        else:
            print(f"{indent}Feature {tree.feature} <= {tree.threshold}")
            print(f"{indent}Left:")
            self.print_tree(tree.left, indent + "  ")
            print(f"{indent}Right:")
            self.print_tree(tree.right, indent + "  ")

**Load Dataset**

In [4]:
# Load Dataset
iris = load_iris()
X, y = iris.data, iris.target

In [5]:
print(X.shape), print(y.shape)
print(X[:5])
print(y[:5])

(150, 4)
(150,)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]


In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [7]:
# Train custom decision tree
tree = DecisionTree(criterion='gini', max_depth=3, min_samples_split=2)
tree.fit(X_train, y_train)

In [8]:
# Print the tree structure
print("Decision Tree Structure:")
tree.print_tree()

Decision Tree Structure:
  Feature 2 <= 3.0
  Left:
    Leaf: Class 0
  Right:
    Feature 2 <= 4.8
    Left:
      Feature 3 <= 1.7
      Left:
        Leaf: Class 1
      Right:
        Leaf: Class 2
    Right:
      Feature 3 <= 1.8
      Left:
        Leaf: Class 1
      Right:
        Leaf: Class 2


In [9]:
# Make predictions
predictions = tree.predict(X_test)
print(predictions[:5])

[1 0 2 1 1]


In [10]:
# Evaluate
accuracy = accuracy_score(y_test, predictions)
print(f'Decision Tree Accuracy: {accuracy:.2f}')

Decision Tree Accuracy: 1.00


In [11]:
# XGBoost Implementation
xgboost_model = xgb.XGBClassifier(n_estimators=5, max_depth=3, learning_rate=0.1, objective='multi:softmax')
xgboost_model.fit(X_train, y_train)

In [12]:
# Print each tree in the XGBoost model
print("\nXGBoost Trees:")
for i in range(xgboost_model.n_estimators):
    print(f"\nTree {i + 1}:")
    tree_dump = xgboost_model.get_booster().get_dump()[i]  # Getting the dump of the tree
    print(tree_dump)  # Ensure the tree structure is printed


XGBoost Trees:

Tree 1:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=0.142011836
	2:leaf=-0.0729483366


Tree 2:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=-0.0710059255
	2:[f3<1.79999995] yes=3,no=4,missing=4
		3:[f2<5] yes=5,no=6,missing=6
			5:leaf=0.13636364
			6:leaf=-3.25116267e-09
		4:[f2<4.9000001] yes=7,no=8,missing=8
			7:leaf=-2.55448485e-09
			8:leaf=-0.0700730011


Tree 3:
0:[f2<4.80000019] yes=1,no=2,missing=2
	1:[f3<1.5] yes=3,no=4,missing=4
		3:leaf=-0.0726315901
		4:[f0<5.69999981] yes=7,no=8,missing=8
			7:leaf=-2.55448485e-09
			8:leaf=-0.0517241433
	2:[f3<1.79999995] yes=5,no=6,missing=6
		5:[f2<5.0999999] yes=9,no=10,missing=10
			9:leaf=-0.0120000029
			10:leaf=0.0599999987
		6:[f2<4.9000001] yes=11,no=12,missing=12
			11:leaf=0.0428571403
			12:leaf=0.140145987


Tree 4:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=0.124176256
	2:leaf=-0.0702940747


Tree 5:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=-0.0683617592
	2:[f3<1.79999995] yes=3,no=4,missing=4
		3:[f2<5] yes=5,no=6,missing=6


In [13]:
# Make predictions with XGBoost
xgb_predictions = xgboost_model.predict(X_test)
print(xgb_predictions[:5])

[1 0 2 1 1]


In [14]:
# Evaluate XGBoost
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f'XGBoost Accuracy: {xgb_accuracy:.2f}')

XGBoost Accuracy: 1.00


In [15]:
# Display classification report
print("\nClassification Report for XGBoost:")
print(classification_report(y_test, xgb_predictions))


Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



#**Encapsulation**

In [16]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, leaf=False, class_value=None):
        self.feature = feature          # Feature index for splitting
        self.threshold = threshold      # Threshold value for splitting
        self.left = left                # Left child node
        self.right = right              # Right child node
        self.leaf = leaf                # Whether this node is a leaf
        self.class_value = class_value  # Class label if it's a leaf

In [17]:
class DecisionTree:
    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, loss_function='categorical_cross_entropy'):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.loss_function = loss_function
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        # Stopping criteria
        if len(y) < self.min_samples_split or (self.max_depth and depth >= self.max_depth) or len(set(y)) == 1:
            return self._create_leaf_node(y)

        # Find the best split
        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self._create_leaf_node(y)

        # Split the data
        left_indices = X[:, best_feature] < best_threshold
        right_indices = X[:, best_feature] >= best_threshold

        left_child = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

    def _find_best_split(self, X, y):
        best_gain = -np.inf
        best_feature = None
        best_threshold = None
        n_features = X.shape[1]

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        # Calculate the impurity before the split
        parent_impurity = self._impurity(y)

        # Split the data
        left_indices = X[:, feature] < threshold
        right_indices = X[:, feature] >= threshold

        # Calculate the weighted impurity of the children
        n = len(y)
        n_left = np.sum(left_indices)
        n_right = np.sum(right_indices)

        if n_left == 0 or n_right == 0:
            return 0

        child_impurity = (n_left / n) * self._impurity(y[left_indices]) + (n_right / n) * self._impurity(y[right_indices])

        # Calculate information gain
        return parent_impurity - child_impurity

    def _impurity(self, y):
        if self.criterion == 'gini':
            return self._gini_impurity(y)
        elif self.criterion == 'entropy':
            return self._entropy(y)

    def _gini_impurity(self, y):
        classes, counts = np.unique(y, return_counts=True)
        total = len(y)
        return 1 - sum((count / total) ** 2 for count in counts)

    def _entropy(self, y):
        classes, counts = np.unique(y, return_counts=True)
        total = len(y)
        return -sum((count / total) * np.log2(count / total) for count in counts if count > 0)

    def _create_leaf_node(self, y):
        most_common = np.bincount(y).argmax()
        return Node(leaf=True, class_value=most_common)

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _predict_sample(self, sample, tree):
        if tree.leaf:
            return tree.class_value

        if sample[tree.feature] < tree.threshold:
            return self._predict_sample(sample, tree.left)
        else:
            return self._predict_sample(sample, tree.right)

    def print_tree(self, tree=None, indent="  "):
        """Prints the structure of the decision tree"""
        if tree is None:
            tree = self.tree
        if tree.leaf:
            print(f"{indent}Leaf: Class {tree.class_value}")
        else:
            print(f"{indent}Feature {tree.feature} <= {tree.threshold}")
            print(f"{indent}Left:")
            self.print_tree(tree.left, indent + "  ")
            print(f"{indent}Right:")
            self.print_tree(tree.right, indent + "  ")

In [18]:
def xgboost_ensemble(X_train, y_train, X_test, y_test):
    """Train and evaluate an XGBoost ensemble model."""
    xgboost_model = xgb.XGBClassifier(n_estimators=5, max_depth=3, learning_rate=0.1, objective='multi:softmax')
    xgboost_model.fit(X_train, y_train)

    # Print each tree in the XGBoost model
    print("\nXGBoost Trees:")
    for i in range(xgboost_model.n_estimators):
        print(f"\nTree {i + 1}:")
        tree_dump = xgboost_model.get_booster().get_dump()[i]  # Getting the dump of the tree
        print(tree_dump)  # Ensure the tree structure is printed

    # Make predictions with XGBoost
    xgb_predictions = xgboost_model.predict(X_test)

    # Evaluate XGBoost
    xgb_accuracy = accuracy_score(y_test, xgb_predictions)
    print(f'XGBoost Accuracy: {xgb_accuracy:.2f}')

    # Display classification report
    print("\nClassification Report for XGBoost:")
    print(classification_report(y_test, xgb_predictions))
    return xgb_predictions, xgb_accuracy

**Load Dataset**

In [19]:
# Load Dataset
iris = load_iris()
X, y = iris.data, iris.target

In [20]:
print(X.shape), print(y.shape)
print(X[:5])
print(y[:5])

(150, 4)
(150,)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]


In [21]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [22]:
# Train custom decision tree
tree = DecisionTree(criterion='gini', max_depth=3, min_samples_split=2)
tree.fit(X_train, y_train)

In [23]:
# Print the tree structure
print("Decision Tree Structure:")
tree.print_tree()

Decision Tree Structure:
  Feature 2 <= 3.0
  Left:
    Leaf: Class 0
  Right:
    Feature 2 <= 4.8
    Left:
      Feature 3 <= 1.7
      Left:
        Leaf: Class 1
      Right:
        Leaf: Class 2
    Right:
      Feature 3 <= 1.8
      Left:
        Leaf: Class 1
      Right:
        Leaf: Class 2


In [24]:
# Make predictions
predictions = tree.predict(X_test)
print(predictions[:5])

[1 0 2 1 1]


In [25]:
# Evaluate
accuracy = accuracy_score(y_test, predictions)
print(f'Decision Tree Accuracy: {accuracy:.2f}')

Decision Tree Accuracy: 1.00


In [26]:
# Call the XGBoost ensemble function
xgboost_ensemble(X_train, y_train, X_test, y_test)


XGBoost Trees:

Tree 1:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=0.142011836
	2:leaf=-0.0729483366


Tree 2:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=-0.0710059255
	2:[f3<1.79999995] yes=3,no=4,missing=4
		3:[f2<5] yes=5,no=6,missing=6
			5:leaf=0.13636364
			6:leaf=-3.25116267e-09
		4:[f2<4.9000001] yes=7,no=8,missing=8
			7:leaf=-2.55448485e-09
			8:leaf=-0.0700730011


Tree 3:
0:[f2<4.80000019] yes=1,no=2,missing=2
	1:[f3<1.5] yes=3,no=4,missing=4
		3:leaf=-0.0726315901
		4:[f0<5.69999981] yes=7,no=8,missing=8
			7:leaf=-2.55448485e-09
			8:leaf=-0.0517241433
	2:[f3<1.79999995] yes=5,no=6,missing=6
		5:[f2<5.0999999] yes=9,no=10,missing=10
			9:leaf=-0.0120000029
			10:leaf=0.0599999987
		6:[f2<4.9000001] yes=11,no=12,missing=12
			11:leaf=0.0428571403
			12:leaf=0.140145987


Tree 4:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=0.124176256
	2:leaf=-0.0702940747


Tree 5:
0:[f2<3] yes=1,no=2,missing=2
	1:leaf=-0.0683617592
	2:[f3<1.79999995] yes=3,no=4,missing=4
		3:[f2<5] yes=5,no=6,missing=6


(array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
        0, 2, 2, 2, 2, 2, 0, 0], dtype=int32),
 1.0)