# XGBoost

每次迭代添加一棵新树来拟合上一次迭代产生的残差, 从而使预测值逐步贴近真实值.

### 数学原理

- 目标: $Obj^{(t)} = \sum_{i=1}^nl(y_i, \hat y_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) $ 其中: $\hat y_i^{(t)} = \hat y_i^{(t-1)} + f_t(x_i)$
  - 利用二阶泰勒展开式: $f(x + \Delta x) \approx f(x) + f'(x)\Delta x + \frac12f''(x)\Delta x^2$
  $$
  \begin{align*}
  &令: \\
  &g_i = \partial_{\hat y^{(t-1)}}l(y_i, \hat y_i^{(t-1)}) \\
  &h_i = \partial^2_{\hat y^{(t-1)}}l(y_i, \hat y_i^{(t-1)}) \\
  &则: \\
  &Obj^{(t)} \approx \sum_{i=1}^n\left[l(y_i, \hat y_i^{(t-1)}) + g_i f_t(x_i) + \frac12h_if_t^2(x_i)\right] + \Omega(f_t) \\
  &去除常数项 \\
  &Obj^{(t)} \approx \sum_{i=1}^n\left[ g_i f_t(x_i) + \frac12h_if_t^2(x_i)\right] + \Omega(f_t) \\
  &定义 f_t(x): \\
  &f_t(x) = w_{q(x)} \quad (w \in \mathbb{R}^T: 叶节点的权重, q(x): 数据对应叶节点的 index) \\
  &定义正则项: \\
  &\Omega(f_t) = \gamma T + \frac12\lambda\sum_{j=1}^Tw_j^2 \\
  &令: \\
  &I_j = \{i\;|\;q(x_i) = j\} \\
  &则: \\
  &Obj^{(t)} \approx \sum_{j=1}^T\left[(\sum_{i\in I_j}g_i)w_j + \frac12(\sum_{i\in I_j}h_i + \lambda)w_j^2\right] + \gamma T \\
  &令: \\
  &G_j = \sum_{i \in I_j}g_i \\
  &H_j = \sum_{i \in I_j}h_i \\
  &则: \\
  &Obj^{(t)} \approx \sum_{j=1}^T\left[G_jw_j + \frac12(H_j + \lambda)w_j^2\right] + \gamma T \\
  &求最小值,得: \\
  &w_j^* = -\frac{G_j}{H_j + \lambda} \\
  &Obj^* = -\frac12\sum_{j=1}^T\frac{G_j^2}{H_j + \lambda} + \gamma T \\
  &对于一个节点分裂为两个叶节点时,得: \\
  &Gain = \frac{1}{2} \left[\frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}\right] - \gamma \\
  &结论: 使 Gain 最大的分裂方式即为最佳分裂 \\
  \end{align*}
  $$

特征分裂算法:
1. 每个节点, 列举所有的特征
    1. 每个特征下数据排序
    2. 使用线性扫描的方法,决定最佳分裂

### 定义[决策树](./0-Decision-Tree.ipynb#%E5%86%B3%E7%AD%96%E6%A0%91)

In [1]:
import numpy as np


class Node:
    """Internal (split) node of a binary decision tree.

    Parameters
    ----------
    left, right:
        Child subtrees. During traversal a sample goes to ``left`` when its
        value for ``feature`` is ``<= threshold`` and to ``right`` otherwise.
    rule:
        Indexable pair whose first item is the feature index and whose second
        item is the split threshold.
    """

    def __init__(self, left, right, rule):
        self.left, self.right = left, right
        # Index explicitly (rather than unpack) so any indexable rule works.
        self.feature, self.threshold = rule[0], rule[1]


class Leaf:
    """Terminal node of a decision tree.

    Parameters
    ----------
    value:
        The value returned for every sample that ends up in this leaf.
    """

    def __init__(self, value):
        self.value = value


class XGBoostDecisionTree:
    """Single boosting tree grown from first/second-order loss statistics.

    For every candidate split the tree evaluates the structure score
    ``0.5 * G^2 / (H + reg_lambda)`` of the parent and of both children, where
    ``G`` and ``H`` are the sums of the loss gradient and hessian over the
    samples in a node. Each leaf stores the optimal weight
    ``-G / (H + reg_lambda)``.

    The ``gamma`` complexity penalty from the derivation is not applied; it is
    a constant offset per split and does not change which split is best.

    Parameters
    ----------
    classifier: bool
        If True the logistic loss is used, otherwise the squared-error loss.
    max_depth: int or None
        Maximum number of splits on any root-to-leaf path. A falsy value
        (``None`` or ``0``) means no limit.
    seed: int or None
        Seed for NumPy's global random generator. ``0`` is a valid seed.
    reg_lambda: float
        L2 regularisation strength on the leaf weights.
    """

    def __init__(
        self,
        classifier=True,
        max_depth=None,
        seed=None,
        reg_lambda=1.,
    ):
        # Compare against None so that seed=0 is honoured as well.
        if seed is not None:
            np.random.seed(seed)

        # Deepest level reached by the most recent call to `fit`.
        self.depth = 0
        self.root = None

        self.reg_lambda = reg_lambda
        self.classifier = classifier
        self.max_depth = max_depth if max_depth else np.inf
        self.loss = LogisticRegression() if classifier else LinearSquareLoss()

    def fit(self, X, Y, Y_pred):
        """Grow the tree.

        Parameters
        ----------
        X: 2-D array indexed as ``X[rows, feature]``.
        Y: 1-D array of targets, aligned with the rows of ``X``.
        Y_pred: 1-D array with the current ensemble prediction for each row;
            gradients and hessians of the loss are evaluated at these values.
        """
        self.n_classes = max(Y) + 1 if self.classifier else None
        self.n_feats = X.shape[1]
        self.depth = 0
        self.root = self._grow(X, Y, Y_pred)

    def predict(self, X):
        """Return the leaf weight reached by every row of ``X``."""
        return np.array([self._traverse(x, self.root) for x in X])

    def predict_class_probs(self, X):
        """Return the leaf value for every row of ``X`` (classifier trees only).

        NOTE: leaves store raw weights and `_traverse` ignores its ``prob``
        flag, so the result is currently identical to `predict`.
        """
        assert self.classifier, "`predict_class_probs` undefined for classifier = False"
        return np.array([self._traverse(x, self.root, prob=True) for x in X])

    def _grow(self, X, Y, Y_pred, depth=0):
        """Recursively build the subtree for the given samples.

        ``depth`` is the number of splits between the root and this node. It is
        passed down the recursion so that ``max_depth`` limits every path
        independently instead of being consumed by whichever branch is grown
        first.
        """
        self.depth = max(self.depth, depth)

        # Stop when the node is pure (or empty) or the depth budget is used up.
        if np.unique(Y).size <= 1 or depth >= self.max_depth:
            return Leaf(self._compute_weight(Y, Y_pred))

        n_cols = X.shape[1]
        feat_idxs = np.random.choice(n_cols, self.n_feats, replace=False)

        feat, thresh = self._segment(X, Y, Y_pred, feat_idxs)
        if feat is None:
            # Every feature is constant on this node: nothing left to split on.
            return Leaf(self._compute_weight(Y, Y_pred))

        go_left = X[:, feat] <= thresh
        go_right = X[:, feat] > thresh
        if not go_left.any() or not go_right.any():
            # Degenerate threshold (e.g. midpoint rounded onto a level).
            return Leaf(self._compute_weight(Y, Y_pred))

        left = self._grow(X[go_left, :], Y[go_left], Y_pred[go_left], depth + 1)
        right = self._grow(X[go_right, :], Y[go_right], Y_pred[go_right], depth + 1)
        return Node(left, right, (feat, thresh))

    def _segment(self, X, Y, Y_pred, feat_idxs):
        """Find the split with the largest gain.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature in ``feat_idxs``.

        Returns
        -------
        (feature index, threshold), or ``(None, None)`` when no feature has at
        least two distinct values.
        """
        best_gain = -np.inf
        split_idx, split_thresh = None, None
        for i in feat_idxs:
            vals = X[:, i]
            levels = np.unique(vals)
            if levels.size < 2:
                # A constant feature offers no threshold to evaluate.
                continue

            thresholds = (levels[:-1] + levels[1:]) / 2
            gains = np.array(
                [self._impurity_gain(Y, Y_pred, t, vals) for t in thresholds]
            )

            best = gains.argmax()
            if gains[best] > best_gain:
                split_idx = i
                best_gain = gains[best]
                split_thresh = thresholds[best]

        return split_idx, split_thresh

    def _gain(self, y, y_pred):
        """Structure score ``0.5 * G^2 / (H + reg_lambda)`` of one node."""
        grad_sum = self.loss.gradient(y, y_pred).sum()
        hess_sum = self.loss.hess(y, y_pred).sum()
        # reg_lambda belongs in the denominator, matching `_compute_weight`.
        return 0.5 * grad_sum ** 2 / (hess_sum + self.reg_lambda)

    def _compute_weight(self, y, y_pred):
        """Optimal leaf weight ``-G / (H + reg_lambda)`` for the given samples."""
        grad_sum = self.loss.gradient(y, y_pred).sum()
        hess_sum = self.loss.hess(y, y_pred).sum()
        return -grad_sum / (hess_sum + self.reg_lambda)

    def _impurity_gain(self, y, y_pred, split_thresh, feat_values):
        """Gain of splitting a node at ``split_thresh`` on ``feat_values``.

        Computed as score(left) + score(right) - score(parent). Returns 0 when
        one side of the split would be empty.
        """
        left = feat_values <= split_thresh
        right = feat_values > split_thresh

        if not left.any() or not right.any():
            return 0

        children = self._gain(y[left], y_pred[left]) + self._gain(y[right], y_pred[right])
        return children - self._gain(y, y_pred)

    def _traverse(self, X, node, prob=False):
        """Walk one sample ``X`` down from ``node`` and return the leaf value.

        ``prob`` is accepted for interface compatibility and is not used.
        """
        while not isinstance(node, Leaf):
            node = node.left if X[node.feature] <= node.threshold else node.right
        return node.value

class LinearSquareLoss():
    """Squared-error loss, ``0.5 * (pred - label) ** 2``, via its derivatives."""

    def gradient(self, label, pred):
        """First derivative of the loss with respect to ``pred``."""
        residual = pred - label
        return residual

    def hess(self, label, pred):
        """Second derivative with respect to ``pred``: 1 for every sample."""
        unit_curvature = np.ones_like(pred)
        return unit_curvature
    

class LogisticRegression():
    """Logistic loss on raw scores, exposed through its derivatives.

    ``pred`` is passed through the sigmoid before it is compared with
    ``label``.
    """

    def _sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``."""
        return 1. / (1. + np.exp(-z))

    def gradient(self, label, pred):
        """First derivative with respect to the raw score: ``sigmoid(pred) - label``."""
        return self._sigmoid(pred) - label

    def hess(self, label, pred):
        """Second derivative with respect to the raw score: ``p * (1 - p)``.

        The result is floored at machine epsilon so it is never exactly zero.
        """
        prob = self._sigmoid(pred)
        floor = np.finfo(float).eps
        curvature = prob * (1. - prob)
        return np.maximum(curvature, floor)

### 算法实现

In [4]:
class XGBoost(object):
    """Gradient-boosted ensemble of `XGBoostDecisionTree` learners.

    Reference: http://xgboost.readthedocs.io/en/latest/model.html

    Every tree is fitted against the gradients/hessians of the loss at the
    current ensemble prediction. Its output, scaled by ``learning_rate``, is
    then added to that prediction.

    Parameters:
    -----------
    classifier: bool
        Forwarded to every tree; selects the loss used by the trees.
    n_estimators: int
        The number of trees in the ensemble.
    learning_rate: float
        Shrinkage factor applied to each tree's output, both while fitting
        and while predicting.
    max_depth: int
        The maximum depth of a tree.
    """

    def __init__(
        self,
        classifier=True,
        n_estimators=200,
        learning_rate=0.01,
        max_depth=2
    ):
        self.n_estimators = n_estimators  # Number of trees
        self.learning_rate = learning_rate  # Step size for weight update
        self.max_depth = max_depth  # Maximum depth for tree

        # Create the (still unfitted) trees up front.
        self.trees = [
            XGBoostDecisionTree(classifier=classifier, max_depth=self.max_depth)
            for _ in range(n_estimators)
        ]

    def fit(self, X, y):
        """Fit the trees one after another on ``X`` and targets ``y``."""
        # Accumulate in float: integer labels would otherwise give an integer
        # buffer, and adding the float tree outputs in place would fail.
        y_pred = np.zeros_like(y, dtype=float)
        for tree in self.trees:
            tree.fit(X, y, y_pred)
            y_pred += self.learning_rate * tree.predict(X)

    def predict(self, X):
        """Return the summed, shrunken tree outputs for every row of ``X``.

        The result is the raw additive score of the ensemble. It is all zeros
        when the ensemble contains no trees.
        """
        y_pred = np.zeros(len(X), dtype=float)
        for tree in self.trees:
            # Same shrinkage as in `fit`, so train and test scores agree.
            y_pred += self.learning_rate * tree.predict(X)

        return y_pred
