# Assignment 7.2 - XGBoost

Welcome to the assignment for week 7.

Please submit your solution of this notebook in the Whiteboard at the corresponding Assignment entry as .ipynb-file and as .pdf.

#### Please state both names of your group members here:
Jane and John Doe

In [1]:
# Paola Gega, Daniel Thompson

## Task 7.2.1: XGBoost - Regression

* Build an XGBoost classifier using `numpy` only. Train your XGBoost model on the `California Housing` regression task. Report on the performance predicting unseen test samples. **(RESULTS)**

In [2]:
import numpy as np

# Class structure that might help. Feel free to modify as needed.
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTree:
    """Decision tree for XGBoost"""
    def __init__(
            self,
            max_depth,
            eta,
            min_split_loss,
            reg_lambda,
            min_samples_split=1,
            ):
        self.max_depth = max_depth
        self.eta = eta
        self.min_split_loss = min_split_loss
        self.reg_lambda = reg_lambda

        self.min_samples_split = min_samples_split
        self.root = None

    def _find_best_split(self, X, gradients, hessians):
        """Find the best feature and threshold to split on"""
        n_samples, n_features = X.shape
        best_gain = -np.inf
        best_feature_index = None
        best_threshold = None

        # Step 1: Sum all gradients and hessians for parent node
        G = np.sum(gradients)
        H = np.sum(hessians)

        # Step 2: Loop over all features
        for feature_index in range(n_features):
            # Step 3: Loop over all possible thresholds
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                # Sum gradients and hessians for potential left and right split
                left_mask = X[:, feature_index] <= threshold
                right_mask = X[:, feature_index] > threshold

                if np.sum(left_mask) < self.min_samples_split or np.sum(right_mask) < self.min_samples_split:
                    continue

                # Step 4: compute gain
                G_left = np.sum(gradients[left_mask])
                H_left = np.sum(hessians[left_mask])
                G_right = G - G_left
                H_right = H - H_left

                gain = 0.5 * (
                    (G_left ** 2) / (H_left + self.reg_lambda) +
                    (G_right ** 2) / (H_right + self.reg_lambda) -
                    (G ** 2) / (H + self.reg_lambda)
                ) - self.min_split_loss

                # Step 5: store best split based on gain
                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold, best_gain
    
    def _build_tree(self, X, gradients, hessians, depth=0):
        """Recursively build the decision tree."""
        n_samples = X.shape[0]

        if (depth >= self.max_depth or n_samples < self.min_samples_split):
            value = -np.sum(gradients) / (np.sum(hessians) + self.reg_lambda)
            return Node(value=self.eta * value)

        feature_index, threshold, gain = self._find_best_split(X, gradients, hessians)

        if gain <= 0:
            value = -np.sum(gradients) / (np.sum(hessians) + self.reg_lambda)
            return Node(value=self.eta * value)

        left_mask = X[:, feature_index] <= threshold
        right_mask = X[:, feature_index] > threshold

        left_node = self._build_tree(X[left_mask], gradients[left_mask], hessians[left_mask], depth + 1)
        right_node = self._build_tree(X[right_mask], gradients[right_mask], hessians[right_mask], depth + 1)

        return Node(feature_index=feature_index, threshold=threshold, left=left_node, right=right_node)

    def fit(self, X, gradients, hessians):
        """Build the tree"""
        self.root = self._build_tree(X, gradients, hessians)

    def _predict_one(self, x, node):
        """Predict the value for a single sample."""
        if node.value is not None:
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self._predict_one(x, node.left)
        else:
            return self._predict_one(x, node.right)
    
    def predict(self, X):
        """Predict using the tree"""
        return np.array([self._predict_one(x, self.root) for x in X])

In [3]:
class XGBoost:
    """XGBoost implementation"""
    def __init__(
            self,
            n_estimators=100,
            max_depth=3,
            eta=0.1,
            min_split_loss=0,
            reg_lambda=1,
            min_samples_split=1,
            criterion="mse",
            ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.eta = eta
        self.min_split_loss = min_split_loss
        self.reg_lambda = reg_lambda
        self.min_samples_split = min_samples_split
        self.criterion = criterion  # "mse" or "cross_entropy"
        self.trees = []

    def _get_gradients_and_hessians(self, y_true, y_pred):
        """Compute gradients and Hessians based on the loss function."""
        if self.criterion == "mse":
            gradients = y_pred - y_true
            hessians = np.ones_like(y_true)
        elif self.criterion == "cross_entropy":
            # For binary classification using logistic loss
            y_pred = 1 / (1 + np.exp(-y_pred))  # Sigmoid
            gradients = y_pred - y_true
            hessians = y_pred * (1 - y_pred)
        else:
            raise ValueError("Invalid criterion. Choose 'mse' or 'cross_entropy'.")
        return gradients, hessians

    def fit(self, X, y):
        """Train the XGBoost model"""
        # Initialize the prediction
        n_samples = X.shape[0]
        y_pred = np.zeros(n_samples)

        for _ in range(self.n_estimators):
            # Compute gradients and Hessians
            gradients, hessians = self._get_gradients_and_hessians(y, y_pred)
            # Fit a decision tree to the gradients and Hessians
            tree = DecisionTree(
                max_depth=self.max_depth,
                eta=self.eta,
                min_split_loss=self.min_split_loss,
                reg_lambda=self.reg_lambda,
                min_samples_split=self.min_samples_split,
            )
            tree.fit(X, gradients, hessians)
            # Update ensemble
            self.trees.append(tree)
            y_pred += tree.predict(X)

    def predict(self, X):
        """Make predictions"""
        y_pred = np.zeros(X.shape[0])
        for tree in self.trees:
            y_pred += tree.predict(X)
        return y_pred
    
    # Probabilities for classification :) - The Bonus task
    def predict_proba(self, X):
        """Predict probabilities for binary classification"""
        y_pred = self.predict(X)
        return 1 / (1 + np.exp(-y_pred))  # Sigmoid function for binary classification

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing

# Load California Housing data
data = fetch_california_housing()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Train XGBoost model
model = XGBoost(n_estimators=50, max_depth=4, eta=0.1, reg_lambda=1)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
print(f"MSE on training set: {mse_train:.4f}")
print(f"R^2 on training set: {r2_train:.4f}")
print()

y_predict = model.predict(X_test)
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
print(f"MSE on test set: {mse:.4f}")
print(f"R^2 on test set: {r2:.4f}")


MSE on training set: 0.2601
R^2 on training set: 0.8051

MSE on test set: 0.2916
R^2 on test set: 0.7796


**Report:** We fit a random forest regressor to get a baseline metric and got a test MSE of about .25.  Our XGBoost model is not quite as accurate and certainly takes much longer to run, but at least the MSE is fairly close.

## Task 7.2.2: XGBoost - Classification (BONUS)

* Train an XGBoost model on the `Breast Cancer` binary classification task. Report on the performance predicting unseen test samples. **(RESULTS)**

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import load_breast_cancer

# Load the dataset
data = load_breast_cancer()

# Access the features and labels
X = data.data  # Shape: (569, 30)
y = data.target  # Shape: (569,) - 0 for malignant, 1 for benign

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Train XGBoost model
model = XGBoost(n_estimators=50, max_depth=4, eta=0.1, reg_lambda=1)
model.fit(X_train, y_train)
y_train_pred = model.predict_proba(X_train)
y_train_labels = (y_train_pred > 0.5).astype(int)
accuracy_train = accuracy_score(y_train, y_train_labels)
precision_train = precision_score(y_train, y_train_labels)
recall_train = recall_score(y_train, y_train_labels)
f1_train = f1_score(y_train, y_train_labels)
print(f"Accuracy on training set: {accuracy_train:.4f}")
print(f"Precision on training set: {precision_train:.4f}")
print(f"Recall on training set: {recall_train:.4f}")
print(f"F1 score on training set: {f1_train:.4f}")
print()

y_predict = model.predict_proba(X_test)
y_pred_labels = (y_predict > 0.5).astype(int)
accuracy_test = accuracy_score(y_test, y_pred_labels)
precision_test = precision_score(y_test, y_pred_labels)
recall_test = recall_score(y_test, y_pred_labels)
f1_test = f1_score(y_test, y_pred_labels)
print(f"Accuracy on test set: {accuracy_test:.4f}")
print(f"Precision on test set: {precision_test:.4f}")
print(f"Recall on test set: {recall_test:.4f}")
print(f"F1 score on test set: {f1_test:.4f}")

Accuracy on training set: 0.8404
Precision on training set: 0.7976
Recall on training set: 1.0000
F1 score on training set: 0.8874

Accuracy on test set: 0.8601
Precision on test set: 0.8165
Recall on test set: 1.0000
F1 score on test set: 0.8990


**Report:** Seems to work, more or less.

In [6]:
# With gradients and hessians calculated using cross-entropy for the loss function
model = XGBoost(n_estimators=50, max_depth=4, eta=0.1, reg_lambda=1, criterion="cross_entropy")
model.fit(X_train, y_train)
y_train_pred = model.predict_proba(X_train)
y_train_labels = (y_train_pred > 0.5).astype(int)
accuracy_train = accuracy_score(y_train, y_train_labels)
precision_train = precision_score(y_train, y_train_labels)
recall_train = recall_score(y_train, y_train_labels)
f1_train = f1_score(y_train, y_train_labels)
print(f"Accuracy on training set: {accuracy_train:.4f}")
print(f"Precision on training set: {precision_train:.4f}")
print(f"Recall on training set: {recall_train:.4f}")
print(f"F1 score on training set: {f1_train:.4f}")
print()

y_predict = model.predict_proba(X_test)
y_pred_labels = (y_predict > 0.5).astype(int)
accuracy_test = accuracy_score(y_test, y_pred_labels)
precision_test = precision_score(y_test, y_pred_labels)
recall_test = recall_score(y_test, y_pred_labels)
f1_test = f1_score(y_test, y_pred_labels)
print(f"Accuracy on test set: {accuracy_test:.4f}")
print(f"Precision on test set: {precision_test:.4f}")
print(f"Recall on test set: {recall_test:.4f}")
print(f"F1 score on test set: {f1_test:.4f}")

Accuracy on training set: 1.0000
Precision on training set: 1.0000
Recall on training set: 1.0000
F1 score on training set: 1.0000

Accuracy on test set: 0.9580
Precision on test set: 0.9770
Recall on test set: 0.9551
F1 score on test set: 0.9659


**Report:** With cross-entropy selected as our loss function in the XGBoost algorithm, we appear to get pretty decent classification results.

## Congratz, you made it! :)