课程链接:[GYH的快速会议](https://meeting.tencent.com/v2/cloud-record/share?id=14667b16-6001-4298-815d-ff437f044454&from=3)

In [56]:
import numpy as np
import collections

In [54]:
def Gini(y):
    """Return the Gini impurity of the label array ``y``.

    Computed as ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    entries of ``y`` equal to the k-th distinct label.

    Parameters
    ----------
    y : numpy.ndarray
        Labels; ``y.shape[0]`` is used as the sample count.
    """
    n_samples = y.shape[0]
    # The file imports the ``collections`` module, not ``Counter`` itself,
    # so the class has to be referenced through the module.
    label_counts = collections.Counter(y)
    return 1 - sum((count / n_samples) ** 2 for count in label_counts.values())

def MSE(y):
    """Return the mean squared deviation of ``y`` from its own mean.

    Parameters
    ----------
    y : numpy.ndarray
        Target values; ``y.shape[0]`` is used as the sample count.
    """
    n_samples = y.shape[0]
    deviations = y - y.mean()
    squared_error_total = (deviations ** 2).sum()
    return squared_error_total / n_samples

In [3]:
class Node:
    """A single tree node.

    Attributes
    ----------
    depth : depth value supplied by the creator of the node.
    idx : sample mask (or index array) supplied by the creator of the node.
    left, right : child nodes; both start as ``None``.
    feature, pivot : split description; both start as ``None``.
    """

    def __init__(self, depth, idx):
        # Position in the tree and the samples this node owns.
        self.depth, self.idx = depth, idx

        # Children and split description are filled in later by the builder.
        self.left = self.right = None
        self.feature = self.pivot = None

In [44]:
class Tree:
    """Binary decision tree grown greedily with an MSE or a Gini criterion.

    ``X`` (2-D feature array) and ``y`` (1-D target array) are assigned by
    the caller before ``build_mse`` / ``build_gini`` is invoked.  Every node
    carries a mask over the rows of ``X`` marking the samples it owns; a
    split sends the samples with ``X[:, feature] <= pivot`` to the left
    child and the remaining ones to the right child.
    """

    def __init__(self, max_depth):
        self.max_depth = max_depth

        self.X = None
        self.y = None
        self.feature_importances_ = None

    # ------------------------------------------------------------------
    # Split scoring
    # ------------------------------------------------------------------
    def _able_to_split(self, node):
        """Return True when ``node`` is above ``max_depth`` and owns >= 2 samples."""
        return bool(node.depth < self.max_depth and node.idx.sum() >= 2)

    def _weighted_impurity(self, criterion, to_left, to_right):
        """Return the size-weighted sum of ``criterion`` over both children."""
        n_left = to_left.sum()
        n_right = to_right.sum()
        total = n_left + n_right
        left_val = n_left / total * criterion(self.y[to_left])
        right_val = n_right / total * criterion(self.y[to_right])
        return left_val + right_val

    def _get_inner_split_mse(self, to_left, to_right):
        """Weighted MSE of the two children described by the boolean masks."""
        return self._weighted_impurity(MSE, to_left, to_right)

    def _get_inner_split_gini(self, to_left, to_right):
        """Weighted Gini impurity of the two children described by the masks."""
        return self._weighted_impurity(Gini, to_left, to_right)

    # ------------------------------------------------------------------
    # Split search
    # ------------------------------------------------------------------
    def _search_pivots(self, score_split, col, idx):
        """Find the best threshold on feature ``col`` for the samples in ``idx``.

        ``score_split(to_left, to_right)`` returns the value to minimise.

        Returns ``(best_val, to_left, to_right, pivot)``.  When the feature
        cannot separate the node's samples (a single distinct value, or an
        empty node) the result is ``(np.inf, None, None, None)``.
        """
        column = self.X[:, col]
        in_node = (idx == 1)
        best_val = np.inf
        best_to_left = best_to_right = best_pivot = None
        # Candidate thresholds are the distinct values present in this node.
        # The largest one is dropped because "<= max" would send every
        # sample to the left and leave the right child empty.
        for pivot in np.unique(column[in_node])[:-1]:
            to_left = in_node & (column <= pivot)
            to_right = in_node & ~to_left
            val = score_split(to_left, to_right)
            if best_val > val:
                best_val, best_pivot = val, pivot
                best_to_left, best_to_right = to_left, to_right
        return best_val, best_to_left, best_to_right, best_pivot

    def _inner_split_mse(self, col, idx):
        """Best MSE split on feature ``col``; see ``_search_pivots``."""
        return self._search_pivots(self._get_inner_split_mse, col, idx)

    def _inner_split_gini(self, col, idx):
        """Best Gini split on feature ``col``; see ``_search_pivots``."""
        return self._search_pivots(self._get_inner_split_gini, col, idx)

    def _search_features(self, inner_split, idx):
        """Run ``inner_split(col, idx)`` on every column and keep the best.

        Returns ``(best_val, idx_left, idx_right, feature, pivot)``; the last
        four entries are ``None`` (and ``best_val`` is ``np.inf``) when no
        column yields a valid split.
        """
        best_val = np.inf
        idx_left = idx_right = best_feature = best_pivot = None
        for col in range(self.X.shape[1]):
            val, col_left, col_right, pivot = inner_split(col, idx)
            # Strict comparison: on ties the lowest column index wins.
            if best_val > val:
                best_val, idx_left, idx_right, best_feature, best_pivot = \
                    val, col_left, col_right, col, pivot
        return best_val, idx_left, idx_right, best_feature, best_pivot

    def _get_conditional_entropy_mse(self, idx):
        """Best MSE split over all features for the samples in ``idx``."""
        return self._search_features(self._inner_split_mse, idx)

    def _get_conditional_entropy_gini(self, idx):
        """Best Gini split over all features for the samples in ``idx``."""
        return self._search_features(self._inner_split_gini, idx)

    # ------------------------------------------------------------------
    # Node splitting
    # ------------------------------------------------------------------
    def _split(self, node, criterion, find_split):
        """Split ``node`` in place and update the feature importances.

        ``criterion(y)`` scores the node itself; ``find_split(idx)`` returns
        the best split as produced by ``_search_features``.

        Returns ``(idx_left, idx_right, feature, pivot)``, or four ``None``
        values when the node stays a leaf.
        """
        if not self._able_to_split(node):
            return None, None, None, None
        # Impurity of the node before splitting: H(Y)
        parent_impurity = criterion(self.y[node.idx == 1])
        # Weighted impurity of the best pair of children: H(Y|X)
        children_impurity, idx_left, idx_right, feature, pivot = find_split(node.idx)
        if feature is None:
            # No feature separates the samples of this node: keep it a leaf.
            return None, None, None, None
        gain = parent_impurity - children_impurity
        # Weight the gain by the share of all samples that reach this node.
        relative_gain = gain * node.idx.sum() / self.X.shape[0]
        self.feature_importances_[feature] += relative_gain

        node.left = Node(node.depth + 1, idx_left)
        node.right = Node(node.depth + 1, idx_right)
        self.depth = max(node.depth + 1, self.depth)
        return idx_left, idx_right, feature, pivot

    def split_mse(self, node):
        """Split ``node`` using the MSE criterion; see ``_split``."""
        return self._split(node, MSE, self._get_conditional_entropy_mse)

    def split_gini(self, node):
        """Split ``node`` using the Gini criterion; see ``_split``."""
        return self._split(node, Gini, self._get_conditional_entropy_gini)

    # ------------------------------------------------------------------
    # Tree construction
    # ------------------------------------------------------------------
    def build_prepare(self):
        """Reset depth, importances and the root node before growing."""
        self.depth = 0
        self.feature_importances_ = np.zeros(self.X.shape[1])
        # Boolean mask so the root can be used to index ``y`` directly,
        # exactly like the masks stored on its descendants.
        self.root = Node(depth=0, idx=np.ones(self.X.shape[0], dtype=bool))

    def _build_node(self, cur_node, split):
        """Recursively grow the subtree rooted at ``cur_node`` with ``split``."""
        if cur_node is None:
            return
        _, _, feature, pivot = split(cur_node)
        cur_node.feature, cur_node.pivot = feature, pivot
        self._build_node(cur_node.left, split)
        self._build_node(cur_node.right, split)

    def build_node_mse(self, cur_node):
        """Grow the subtree below ``cur_node`` with the MSE criterion."""
        self._build_node(cur_node, self.split_mse)

    def build_mse(self):
        """Grow the whole tree with the MSE criterion."""
        self.build_prepare()
        self.build_node_mse(self.root)

    def build_node_gini(self, cur_node):
        """Grow the subtree below ``cur_node`` with the Gini criterion."""
        self._build_node(cur_node, self.split_gini)

    def build_gini(self):
        """Grow the whole tree with the Gini criterion."""
        self.build_prepare()
        self.build_node_gini(self.root)

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------
    def _find_leaf(self, node, x):
        """Walk down from ``node`` and return the leaf that sample ``x`` reaches."""
        while not (node.left is None and node.right is None):
            if x[node.feature] <= node.pivot:
                node = node.left
            else:
                node = node.right
        return node

    def _search_prediction_mse(self, node, x):
        """Return the mean target of the leaf below ``node`` that ``x`` reaches."""
        leaf = self._find_leaf(node, x)
        # ``== 1`` accepts boolean masks as well as 0/1 numeric masks.
        return self.y[leaf.idx == 1].mean()

    def predict_mse(self, x):
        """Predict the regression value for a single sample ``x``."""
        return self._search_prediction_mse(self.root, x)

    def _search_prediction_gini(self, node, x):
        """Return the most common label of the leaf below ``node`` that ``x`` reaches."""
        leaf = self._find_leaf(node, x)
        # ``== 1`` accepts boolean masks as well as 0/1 numeric masks.
        return collections.Counter(self.y[leaf.idx == 1]).most_common(1)[0][0]

    def predict_gini(self, x):
        """Predict the class label for a single sample ``x``."""
        return self._search_prediction_gini(self.root, x)

In [51]:
class DecisionTreeRegressor:
    """Regression tree that grows a ``Tree`` with the MSE criterion."""

    def __init__(self, max_depth):
        self.tree = Tree(max_depth=max_depth)

    def fit(self, X, y):
        """Grow the tree on ``X`` / ``y`` and store normalised importances.

        Parameters
        ----------
        X : array-like, indexed as ``X[:, col]`` by the tree after conversion.
        y : array-like of targets, one per row of ``X``.

        Returns
        -------
        self
        """
        # The tree indexes with ``X[:, col]`` and boolean masks, so plain
        # sequences are converted; ndarrays pass through unchanged.
        self.tree.X = np.asarray(X)
        self.tree.y = np.asarray(y)
        self.tree.build_mse()
        raw_importances = self.tree.feature_importances_
        total = raw_importances.sum()
        if total != 0:
            self.feature_importances_ = raw_importances / total
        else:
            # No split contributed any gain: report zeros rather than the
            # NaNs a 0 / 0 division would produce.
            self.feature_importances_ = np.zeros_like(raw_importances)
        return self

    def predict(self, X):
        """Return an array with one prediction per row of ``X``."""
        return np.array([self.tree.predict_mse(x) for x in np.asarray(X)])

In [52]:
class DecisionTreeClassifier(Tree):
    """Classification tree that grows a ``Tree`` with the Gini criterion.

    The fitted tree lives in ``self.tree``.  The ``Tree`` base class is kept
    only so existing ``isinstance`` checks keep working; the inherited
    builder methods are not used on the classifier itself.
    """

    def __init__(self, max_depth):
        self.tree = Tree(max_depth=max_depth)

    def fit(self, X, y):
        """Grow the tree on ``X`` / ``y`` and store normalised importances.

        Parameters
        ----------
        X : array-like, indexed as ``X[:, col]`` by the tree after conversion.
        y : array-like of class labels, one per row of ``X``.

        Returns
        -------
        self
        """
        # The tree indexes with ``X[:, col]`` and boolean masks, so plain
        # sequences are converted; ndarrays pass through unchanged.
        self.tree.X = np.asarray(X)
        self.tree.y = np.asarray(y)
        self.tree.build_gini()
        raw_importances = self.tree.feature_importances_
        total = raw_importances.sum()
        if total != 0:
            self.feature_importances_ = raw_importances / total
        else:
            # No split contributed any gain: report zeros rather than the
            # NaNs a 0 / 0 division would produce.
            self.feature_importances_ = np.zeros_like(raw_importances)
        return self

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self.tree.predict_gini(x) for x in np.asarray(X)])

In [57]:
# Test: compare the hand-written trees against scikit-learn
from sklearn.tree import DecisionTreeRegressor as dt
from sklearn.tree import DecisionTreeClassifier as dc
from sklearn.datasets import make_regression,make_classification

# Simulated regression data set
X, y = make_regression(
    n_samples=200, n_features=10, n_informative=5, random_state=0
)

# Hand-written regression tree
my_cart_R = DecisionTreeRegressor(max_depth=2)
my_cart_R.fit(X, y)
res1 = my_cart_R.predict(X)
importance1 = my_cart_R.feature_importances_
# scikit-learn regression tree
sklearn_cart_R = dt(max_depth=2)
sklearn_cart_R.fit(X, y)
res2 = sklearn_cart_R.predict(X)
importance2 = sklearn_cart_R.feature_importances_


# Share of identical predictions.  The absolute difference is required:
# a signed difference is "< 1e-8" whenever the first value is smaller,
# no matter how far apart the two values are.
print("回归树预测一致的比例为：")
print((np.abs(res1 - res2) < 1e-8).mean())
# Share of identical feature importances
print("回归树特征重要性一致的比例为：")
print((np.abs(importance1 - importance2) < 1e-8).mean())

# Simulated classification data set
M, n = make_classification(
    n_samples=200, n_features=10, n_informative=5, random_state=0
)


# Hand-written classification tree
my_cart_C = DecisionTreeClassifier(max_depth=2)
my_cart_C.fit(M, n)
res3 = my_cart_C.predict(M)
importance3 = my_cart_C.feature_importances_
# scikit-learn classification tree
sklearn_cart_C = dc(max_depth=2)
sklearn_cart_C.fit(M, n)
res4 = sklearn_cart_C.predict(M)
importance4 = sklearn_cart_C.feature_importances_

# Share of identical predictions
print("分类树预测一致的比例为：")
print((np.abs(res3 - res4) < 1e-8).mean())
# Share of identical feature importances
print("分类树特征重要性一致的比例为：")
print((np.abs(importance3 - importance4) < 1e-8).mean())

回归树预测一致的比例为：
1.0
回归树特征重要性一致的比例为：
1.0
分类树预测一致的比例为：
1.0
分类树特征重要性一致的比例为：
1.0
