# Gradient Boosting
In this document, we implement the gradient boosting algorithm (machine learning model) from scratch in Python.

It is worth mentioning that, since gradient boositng comprises multiple sequential trees (decision trees), we import the DecisionTree class prvided in "decision_tree_scratch.ipynb".

## Import Libraries

In [1]:
import numpy as np
from collections import Counter
import import_ipynb
from decision_tree_scratch import DecisionTree

## Class

In [2]:
class GradientBoosting:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=10, n_features=None,
                learning_rate=0.1, mode='classification'):
        self.n_trees           = n_trees               # number of trees in the forest
        self.min_samples_split = min_samples_split     # stopping criterion: minimum number of samples in a leaf node
        self.max_depth         = max_depth             # stopping criterion: maximum depth of the tree
        self.n_features        = n_features            # number of features
        self.trees             = []                    # list of trees for the forest
        self.learning_rate     = learning_rate         # learning rate
        self.initial_pred      = None                  # initail preds based on the prior probability of the positive class
        self.mode              = mode.lower()          # classification or regression
        
    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(self.min_samples_split, self.max_depth, self.n_features, mode=self.mode)
            X_sample, y_sample = self._bootstrap_samples(X, y)
            tree.fit(X, y)
            self.trees.append(tree)

        if self.mode == 'classification':
            # log-odds for initialization
            p = np.clip(np.mean(y), 1e-10, 1 - 1e-10)  # to avoid divide-by-zero
            self.initial_pred = np.log(p / (1 - p))
        elif self.mode == 'regression':
            self.initial_pred = np.mean(y)

    def predict(self, X):
        # fill with the initial log-odds
        preds = np.full(X.shape[0], self.initial_pred)

        # apply trees
        for tree in self.trees:
            preds += self.learning_rate * tree.predict(X)

        if self.mode == 'regression':
            return preds

        # convert values (log-odds), called preds here, to probabilities using sigmoid function
        probs = 1 / (1 + np.exp(-preds))
        
        predictions = np.where(probs >= 0.5, 1, 0)
        
        return predictions

    ################################### Auxiliary Functions #####################################
    # form bootstrap (smapling) data
    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        
        return X[indices], y[indices]

## Fit and Evaluate Model

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### Classification

In [4]:
def classification():
    dataset = datasets.load_breast_cancer()
    X, y = dataset.data, dataset.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    clf = GradientBoosting()
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    accuracy = np.sum(y_pred == y_test) / len(y_test)
    print(f"Results for Classification - Accuracy: {accuracy*100:.2f}")

### Regression

In [5]:
def regression():
    dataset = datasets.fetch_california_housing()
    X, y = dataset.data, dataset.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    reg = GradientBoosting(mode='regression')
    reg.fit(X_train, y_train)
    
    y_pred = reg.predict(X_test)
    loss = mean_squared_error(y_pred, y_test)
    print(f"Results for Regression - Loss: {loss:.4f}")

In [6]:
if __name__ == '__main__':
    classification()
    regression()

Results for Classification - Accuracy: 62.28
Results for Regression - Loss: 4.7000
