# Assignment 7.2 - XGBoost

Welcome to the assignment for week 7.

Please submit your solution of this notebook in the Whiteboard at the corresponding Assignment entry as .ipynb-file and as .pdf.

#### Please state both names of your group members here:
Jane and John Doe

In [1]:
# Paola Gega, Daniel Thompson

## Task 7.2.1: XGBoost - Regression

* Build an XGBoost classifier using `numpy` only. Train your XGBoost model on the `California Housing` regression task. Report on the performance predicting unseen test samples. **(RESULTS)**

In [2]:
import numpy as np

# Class structure that might help. Feel free to modify as needed.
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTree:
    """Decision tree for XGBoost"""
    def __init__(
            self,
            max_depth,
            eta,
            min_split_loss,
            reg_lambda,
            min_samples_split=1,
            ):
        self.max_depth = max_depth
        self.eta = eta
        self.min_split_loss = min_split_loss
        self.reg_lambda = reg_lambda

        self.min_samples_split = min_samples_split
        self.root = None

    def _find_best_split(self, X, gradients, hessians):
        """Find the best feature and threshold to split on"""
        if len(y) <= self.min_samples_split:
            return None, None, None

        best_feature_index, best_threshold, best_gain = None, None, 0
        n_features = X.shape[1]
        # Step 1: Sum all gradients and hessians for parent node
        leaf_weight = - np.sum(gradients) / np.sum(hessians) + self.reg_lambda

        # Step 2: Loop over all features
        for feature_index in range(n_features):
            # Sort gradients and hessians by feature values
            sorted_indices = np.argsort(X[:, feature_index])
            X_sorted = X[sorted_indices, feature_index]
            gradients_sorted = gradients[sorted_indices]
            hessians_sorted = hessians[sorted_indices]

            # Step 3: Loop over all possible thresholds
            thresholds = self._get_split_candidates(X[:, feature_index])
            for threshold in thresholds:
                # Sum gradients and hessians for potential left and right split
                left_indices = X_sorted <= threshold
                right_indices = ~left_indices

                if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
                    continue
                # (Check for minimum child weight constraint, if applicable)
                G_left = np.sum(gradients_sorted[left_indices])
                H_left = np.sum(hessians_sorted[left_indices])
                G_right = np.sum(gradients_sorted[right_indices])
                H_right = np.sum(hessians_sorted[right_indices])
                
                # Step 4: compute gain
                left_weight = - G_left / (H_left + self.reg_lambda)
                right_weight = - G_right / (H_right + self.reg_lambda)
                gain = 0.5 * (G_left**2 / (H_left + self.reg_lambda) 
                              + G_right**2 / (H_right + self.reg_lambda) 
                              - (G_left + G_right)**2 / (H_left + H_right + self.reg_lambda)) - self.min_split_loss

                # Step 5: store best split based on gain
                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold
        return best_feature_index, best_threshold, best_gain

    def _get_split_candidates(self, feature_values):
        unique_feature_values = np.unique(feature_values)
        return (unique_feature_values[:-1]+unique_feature_values[1:])/2

    def _build_tree(self, X, gradients, hessians, depth=0):
        """Recursively build the decision tree."""
        n_samples, n_features = X.shape

        if depth >= self.max_depth:
            leaf_weight = - np.sum(gradients) / np.sum(hessians) + self.reg_lambda
            return Node(value=leaf_weight)

        feature_index, threshold, gain = self._find_best_split(X, gradients, hessians)

        if feature_index is None:
            leaf_weight = - np.sum(gradients) / np.sum(hessians) + self.reg_lambda
            return Node(value=leaf_weight)

        left_indices = X[:, feature_index] <= threshold
        right_indices = ~left_indices

        left_subtree = self._build_tree(X[left_indices],
                                        gradients[left_indices],
                                        hessians[left_indices],
                                        depth + 1)
        right_subtree = self._build_tree(X[right_indices],
                                         gradients[right_indices],
                                         hessians[right_indices],
                                         depth + 1)

        return Node(feature_index=feature_index, threshold=threshold, left=left_subtree, right=right_subtree)
    
    def fit(self, X, gradients, hessians):
        """Build the tree"""
        self.root = self._build_tree(X, gradients, hessians)

    def _predict_one(self, x, node):
        """Predict the class for a single sample."""
        if node.value is not None:
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self._predict_one(x, node.left)
        else:
            return self._predict_one(x, node.right)
    
    def predict(self, X):
        """Predict using the tree"""
        return np.array([self._predict_one(x, self.root) for x in X])

In [3]:
class XGBoost:
    """XGBoost implementation"""
    
    def __init__(
        self,
        n_estimators=100,
        max_depth=10,
        eta=0.1,
        min_split_loss=0,
        reg_lambda=1e-5,
        criterion="mse",
    ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.eta = eta
        self.min_split_loss = min_split_loss
        self.reg_lambda = reg_lambda
        self.criterion = criterion  # "mse" or "cross_entropy"
        self.trees = []

    def _get_gradients_and_hessians(self, y_true, y_pred):
        """Compute gradients and Hessians based on the loss function."""
        if self.criterion == "mse":
            gradients = y_pred - y_true
            hessians = np.ones_like(y_true)
        elif self.criterion == "cross_entropy":
            # For binary classification using logistic loss
            y_pred = 1 / (1 + np.exp(-y_pred))  # Sigmoid
            gradients = y_pred - y_true
            hessians = y_pred * (1 - y_pred)
        else:
            raise ValueError("Invalid criterion. Choose 'mse' or 'cross_entropy'.")
        return gradients, hessians
    
    def fit(self, X, y):
        """Train the XGBoost model"""
        # Initialize the prediction
        F_0 = np.mean(y)
        F_m = np.full(y.shape, F_0)

        for m in range(self.n_estimators):
            # Compute gradients and Hessians
            gradients, hessians = self._get_gradients_and_hessians(y, F_m)
            # Fit a decision tree to the gradients and Hessians
            tree = DecisionTree(self.max_depth, self.eta, self.min_split_loss, self.reg_lambda)
            tree.fit(X, gradients, hessians)
            # Update ensemble
            self.trees.append(tree)
            F_m += self.eta * tree.predict(X)
    
    def predict(self, X):
        """Make predictions"""
        F_m = np.zeros(X.shape[0])
        for tree in self.trees:
            F_m += self.eta * tree.predict(X)
        return F_m
    
    # Probabilities for classification :) - The Bonus task
    def predict_proba(self, X):
        """Predict probabilities for binary classification"""
        F_m = np.zeros(X.shape[0])
        for tree in self.trees:
            F_m += self.eta * tree.predict(X)
        probs = 1 / (1 + np.exp(-F_m))  # Sigmoid
        return np.vstack([1 - probs, probs]).T


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing


# Load California Housing data
data = fetch_california_housing()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Don't know if standardizing is necessary or not...
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a XGBoost model
model = XGBoost(n_estimators=10, max_depth=5)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print("MSE on training set:", mean_squared_error(y_train, y_train_pred))
print("R^2 on training set:", r2_score(y_train, y_train_pred))
y_test_pred = model.predict(X_test)
print("MSE on test set:", mean_squared_error(y_test, y_test_pred))
print("R^2 on test set:", r2_score(y_test, y_test_pred))


MSE on training set: 4.80496077296282
R^2 on training set: -2.6010696397706687
MSE on test set: 4.815143760528366
R^2 on test set: -2.638964538536548


## Task 7.2.2: XGBoost - Classification (BONUS)

* Train an XGBoost model on the `Breast Cancer` binary classification task. Report on the performance predicting unseen test samples. **(RESULTS)**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

# Load the dataset
data = load_breast_cancer()

# Access the features and labels
X = data.data  # Shape: (569, 30)
y = data.target  # Shape: (569,) - 0 for malignant, 1 for benign

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Don't know if standardizing is necessary or not...
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a XGBoost model
model = XGBoost(n_estimators=10, max_depth=5)
model.fit(X_train, y_train)
y_train_pred = np.where(model.predict_proba(X_train) >= 0.5, 1, 0)
print("Accuracy on training set:", accuracy_score(y_train, y_train_pred))
y_test_pred = np.where(model.predict_proba(X_test) >= 0.5, 1, 0)
print("Accuracy on test set:", accuracy_score(y_test, y_test_pred))

## Congratz, you made it! :)