# Tree based regression

### 9.2 Building trees with continuous and discrete features

In [2]:
import numpy as np 

In [3]:
def loadDataSet(fileName):
    '''
    数据集加载函数
    :param fileName: 数据集文件名
    :return: 数据集
    '''
    dataMat = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            fltLine = [float(x) for x in curLine] # 将字符串转换为浮点数
            dataMat.append(fltLine)
    return np.mat(dataMat)

In [4]:
def binSplitDataSet(dataSet, feature, value):
    '''
    二分数据集
    :param dataSet: 数据集
    :param feature: 特征
    :param value: 特征值
    :return: 左子集和右子集
    '''
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

In [5]:
testMat = np.mat(np.eye(4))
mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
print(testMat, "\n", mat0, "\n", mat1)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]] 
 [[0. 1. 0. 0.]] 
 [[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


### 9.3 Using CART for regression

##### 9.3.1 Building the tree

In [6]:
def regLeaf(dataSet):
    '''
    计算叶子节点的值，即聚类中心点来代表这一类数据
    :param dataSet: 数据集
    :return: 叶子节点的值
    '''
    return np.mean(dataSet[:, -1]) # 返回最后一列的均值

def regErr(dataSet):
    '''
    计算数据集的误差
    :param dataSet: 数据集
    :return: 数据集的误差
    '''
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0] # 返回最后一列的方差乘以数据集的大小

In [7]:
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    '''
    选择最佳划分
    :param dataSet: 数据集
    :param leafType: 叶子节点类型
    :param errType: 误差类型
    :param ops: 停止条件
    :return: 最佳划分特征和特征值
    '''
    tolS = ops[0] # 容许的误差下降值
    tolN = ops[1] # 节点的最少样本数
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1: # 如果最后一列的值只有一个，返回None
        return None, leafType(dataSet)
    m, n = np.shape(dataSet)
    S = errType(dataSet) # 当前误差
    bestS = np.inf # 初始化最佳误差为无穷大
    bestIndex = 0 # 初始化最佳划分特征为0
    bestValue = 0 # 初始化最佳划分特征值为0
    for featIndex in range(n - 1):
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]): # 遍历所有特征和特征值
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) # 二分数据集
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): # 如果子集的大小小于最小样本数，跳过
                continue
            newS = errType(mat0) + errType(mat1) # 计算新的误差
            if newS < bestS: # 如果新的误差小于最佳误差
                bestIndex = featIndex # 更新最佳划分特征
                bestValue = splitVal # 更新最佳划分特征值
                bestS = newS # 更新最佳误差
    if (S - bestS) < tolS: # 如果误差下降小于容许的误差下降值，返回None
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) # 二分数据集
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): # 如果子集的大小小于最小样本数，返回None
        return None, leafType(dataSet)
    return bestIndex, bestValue # 返回最佳划分特征和特征值

In [8]:
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    创建回归树
    :param dataSet: 数据集
    :param leafType: 生成叶子节点的函数
    :param errType: 误差计算函数
    :param ops: 用户定义的参数
    :return: 树
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)  # 选择最佳划分
    if feat is None:  # 如果没有最佳划分，返回叶子节点
        return val
    retTree = {} # 创建树的字典
    retTree['spInd'] = feat # 特征索引
    retTree['spVal'] = val # 特征值
    lSet, rSet = binSplitDataSet(dataSet, feat, val) # 二分数据集
    retTree['left'] = createTree(lSet, leafType, errType, ops) # 递归创建左子树
    retTree['right'] = createTree(rSet, leafType, errType, ops) # 递归创建右子树
    return retTree

In [9]:
myDat = loadDataSet('ex00.txt') # 加载数据集
regTree = createTree(myDat,ops=(0,1)) # 创建树
print(regTree) # 打印树

{'spInd': 0, 'spVal': 0.48813, 'left': {'spInd': 0, 'spVal': 0.620599, 'left': {'spInd': 0, 'spVal': 0.625336, 'left': {'spInd': 0, 'spVal': 0.625791, 'left': {'spInd': 0, 'spVal': 0.643601, 'left': {'spInd': 0, 'spVal': 0.651376, 'left': {'spInd': 0, 'spVal': 0.6632, 'left': {'spInd': 0, 'spVal': 0.683921, 'left': {'spInd': 0, 'spVal': 0.819823, 'left': {'spInd': 0, 'spVal': 0.837522, 'left': {'spInd': 0, 'spVal': 0.846455, 'left': {'spInd': 0, 'spVal': 0.919384, 'left': {'spInd': 0, 'spVal': 0.976414, 'left': {'spInd': 0, 'spVal': 0.985425, 'left': {'spInd': 0, 'spVal': 0.989888, 'left': {'spInd': 0, 'spVal': 0.993349, 'left': 1.035533, 'right': 1.077553}, 'right': {'spInd': 0, 'spVal': 0.988852, 'left': 0.744207, 'right': 1.069062}}, 'right': 1.227946}, 'right': {'spInd': 0, 'spVal': 0.953112, 'left': {'spInd': 0, 'spVal': 0.975022, 'left': 0.862911, 'right': 0.673579}, 'right': {'spInd': 0, 'spVal': 0.948268, 'left': {'spInd': 0, 'spVal': 0.951949, 'left': 1.06469, 'right': {'spInd

### 9.4 Tree pruning

##### 9.4.1 Prepruning

In [10]:

myDat = loadDataSet('ex2.txt')  # 加载数据集
regTree = createTree(myDat)  # 创建树
print(regTree)  # 打印树

regTree = createTree(myDat, ops=(10000, 4))  # 创建树
print(regTree)  # 打印树

{'spInd': 0, 'spVal': 0.499171, 'left': {'spInd': 0, 'spVal': 0.729397, 'left': {'spInd': 0, 'spVal': 0.952833, 'left': {'spInd': 0, 'spVal': 0.958512, 'left': 105.24862350000001, 'right': 112.42895575000001}, 'right': {'spInd': 0, 'spVal': 0.759504, 'left': {'spInd': 0, 'spVal': 0.790312, 'left': {'spInd': 0, 'spVal': 0.833026, 'left': {'spInd': 0, 'spVal': 0.944221, 'left': 87.3103875, 'right': {'spInd': 0, 'spVal': 0.85497, 'left': {'spInd': 0, 'spVal': 0.910975, 'left': 96.452867, 'right': {'spInd': 0, 'spVal': 0.892999, 'left': 104.825409, 'right': {'spInd': 0, 'spVal': 0.872883, 'left': 95.181793, 'right': 102.25234449999999}}}, 'right': 95.27584316666666}}, 'right': {'spInd': 0, 'spVal': 0.811602, 'left': 81.110152, 'right': 88.78449880000001}}, 'right': 102.35780185714285}, 'right': 78.08564325}}, 'right': {'spInd': 0, 'spVal': 0.640515, 'left': {'spInd': 0, 'spVal': 0.666452, 'left': {'spInd': 0, 'spVal': 0.706961, 'left': 114.554706, 'right': {'spInd': 0, 'spVal': 0.698472, '

##### 9.4.2 Postpruning

In [11]:
def isTree(obj):
    '''
    判断对象是否为树
    :param obj: 对象
    :return: True or False
    '''
    return (type(obj).__name__ == 'dict') # 如果对象的类型是字典，返回True，否则返回False

def getMean(tree):
    '''
    获取树的均值
    :param tree: 树
    :return: 均值
    '''
    if isTree(tree['right']): # 如果右子树是树，递归获取均值
        return getMean(tree['right'])
    if isTree(tree['left']): # 如果左子树是树，递归获取均值
        return getMean(tree['left'])
    return tree # 返回叶子节点的值

In [12]:
def prune(tree, testData):
    '''
    剪枝函数
    :param tree: 树
    :param testData: 测试数据
    :return: 剪枝后的树
    '''
    if np.shape(testData)[0] == 0: # 如果测试数据为空，返回树
        return getMean(tree)
    if (isTree(tree['left']) or isTree(tree['right'])): # 如果左子树或右子树是树
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal']) # 二分测试数据
        if isTree(tree['left']): # 如果左子树是树，递归剪枝
            tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']): # 如果右子树是树，递归剪枝
            tree['right'] = prune(tree['right'], rSet)
    if not isTree(tree['left']) and not isTree(tree['right']): # 如果左子树和右子树都是叶子节点
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal']) # 二分测试数据
        errorNoMerge = sum(np.power(lSet[:, -1] - tree['left'], 2)) + sum(np.power(rSet[:, -1] - tree['right'], 2)) # 计算不合并的误差
        treeMean = (tree['left'] + tree['right']) / 2.0 # 计算合并后的均值
        errorMerge = sum(np.power(testData[:, -1] - treeMean, 2)) # 计算合并后的误差
        if errorMerge < errorNoMerge: # 如果合并后的误差小于不合并的误差，返回合并后的均值
            print("merging")
            return treeMean
        else:
            return tree # 返回原来的树
    else:
        return tree # 返回原来的树

In [13]:
myTree = createTree(myDat, ops=(0, 1)) # 创建树
myDatTest = loadDataSet('ex2test.txt') # 加载测试数据
prune(myTree, myDatTest) # 剪枝
print(myTree) # 打印剪枝后的树

merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
{'spInd': 0, 'spVal': 0.499171, 'left': {'spInd': 0, 'spVal': 0.729397, 'left': {'spInd': 0, 'spVal': 0.952833, 'left': {'spInd': 0, 'spVal': 0.965969, 'left': 92.5239915, 'right': {'spInd': 0, 'spVal': 0.956951, 'left': {'spInd': 0, 'spVal': 0.958512, 'left': {'spInd': 0, 'spVal': 0.960398, 'left': 112.386764, 'right': 123.559747}, 'right': 135.837013}, 'right': {'spInd': 0, 'spVal': 0.953902, 'left': {'spInd': 0, 'spVal': 0.954711, 'left': 82.016541, 'right': 100.935789}, 'right': 130.92648}}}, 'right': {'spInd': 0, 'spVal': 0.759504, 'left': {'spInd': 0, 'spVal': 0.763328, 'left': {'spInd': 0, 'spVal': 0.769043, 'left': {'spInd': 0, 'spVal': 0.790312, 'left': {

### 9.5 Model trees

In [14]:
def linearSolve(dataSet):
    '''
    线性回归求解函数
    :param dataSet: 数据集
    :returns: 
        ws: 权重向量
        X: 特征矩阵
        Y: 标签矩阵
    '''
    m, n = np.shape(dataSet)
    X = np.mat(np.ones((m, n))) # 创建一个m行n列的矩阵，元素全为1
    Y = np.mat(np.ones((m, 1))) # 创建一个m行1列的矩阵，元素全为1
    X[:, 1:n] = dataSet[:, 0:n - 1] # 将数据集的前n-1列赋值给X的第2到第n列
    Y = dataSet[:, -1] # 获取数据集的最后一列
    xTx = X.T * X # 计算X的转置乘以X
    if np.linalg.det(xTx) == 0.0: # 如果xTx的行列式为0，返回None
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (X.T * Y) # 计算权重向量
    return ws, X , Y

In [15]:
def modelLeaf(dataSet):
    '''
    当数据集是线性回归时，返回权重向量
    :param dataSet: 数据集
    :return: 权重向量
    '''
    ws, X, Y = linearSolve(dataSet) # 线性回归求解
    return ws # 返回权重向量

def modelErr(dataSet):
    '''
    计算线性回归的误差
    :param dataSet: 数据集
    :return: 误差
    '''
    ws, X, Y = linearSolve(dataSet) # 线性回归求解
    yHat = X * ws # 计算预测值
    return sum(np.power(Y - yHat, 2)) # 返回误差

In [16]:
myDat2 = loadDataSet('exp2.txt') # 加载数据集
regTree = createTree(myDat2, modelLeaf, modelErr, ops=(1, 10)) # 创建树
print(regTree) # 打印树

{'spInd': 0, 'spVal': 0.285477, 'left': matrix([[1.69855694e-03],
        [1.19647739e+01]]), 'right': matrix([[3.46877936],
        [1.18521743]])}


### 9.6 Example: comparing tree methods to standard regression

In [17]:
def regTreeEval(model, inDat):
    '''
    计算回归树的预测值
    :param model: 模型
    :param inDat: 输入数据
    :return: 预测值
    '''
    return float(model) # 返回模型的值

def modelTreeEval(model, inDat):
    '''
    计算线性回归树的预测值
    :param model: 模型
    :param inDat: 输入数据
    :return: 预测值
    '''
    n = np.shape(inDat)[1] # 获取输入数据的列数
    X = np.mat(np.ones((1, n + 1))) # 创建一个1行n+1列的矩阵，元素全为1
    X[:, 1:n + 1] = inDat # 将输入数据赋值给X的第2到第n+1列
    return float(X * model) # 返回预测值

In [18]:
def treeForeCast(tree, inData, modelEval=regTreeEval):
    '''
    计算树的预测值
    :param tree: 树
    :param inData: 输入数据
    :param modelEval: 模型评估函数
    :return: 预测值
    '''
    if not isTree(tree): # 如果树不是树，返回模型评估的值
        return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']: # 如果输入数据的特征值大于特征值，递归计算右子树
        if isTree(tree['right']):
            return treeForeCast(tree['right'], inData, modelEval)
        else:
            return modelEval(tree['right'], inData) # 返回模型评估的值
    else: # 如果输入数据的特征值小于等于特征值，递归计算左子树
        if isTree(tree['left']):
            return treeForeCast(tree['left'], inData, modelEval)
        else:
            return modelEval(tree['left'], inData) # 返回模型评估的值

In [19]:
def createForeCast(tree, testData, modelEval=regTreeEval):
    '''
    计算树的预测值
    :param tree: 树
    :param inData: 输入数据
    :param modelEval: 模型评估函数
    :return: 预测值
    '''
    m = len(testData) # 获取测试数据的大小
    yHat = np.mat(np.zeros((m, 1))) # 创建一个m行1列的矩阵，元素全为0
    
    for i in range(m): # 遍历测试数据
        yHat[i, 0] = treeForeCast(tree, np.mat(testData[i]), modelEval)
    return yHat # 返回预测值

In [20]:
trainMat = loadDataSet('bikeSpeedVsIq_train.txt') # 加载训练数据
testMat = loadDataSet('bikeSpeedVsIq_test.txt') # 加载测试数据

# 创建回归树
myTree1 = createTree(trainMat, ops=(1, 20))
print(myTree1)
yHat1 = createForeCast(myTree1, testMat[:, 0])
print("--------------\n")
# print yHat1
# print "ssss==>", testMat[:, 1]
print("回归树:", np.corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1])

# 创建模型树
myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
print(myTree2)
yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
print("--------------\n")
print("模型树:", np.corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]) # 打印相关系数

# 线性回归
ws, X, Y = linearSolve(trainMat) # 线性回归求解
print("线性回归:", ws.T) # 打印权重向量
m = len(testMat[:,0]) # 获取测试数据的大小
yHat3 = np.mat(np.zeros((m, 1))) # 创建一个m行1列的矩阵，元素全为0
for i in range(np.shape(testMat)[0]): # 遍历测试数据
    yHat3[i] = testMat[i, 0] * ws[1, 0] + ws[0, 0] # 计算预测值
print("线性回归：", np.corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]) # 打印相关系数

{'spInd': 0, 'spVal': 10.0, 'left': {'spInd': 0, 'spVal': 17.0, 'left': {'spInd': 0, 'spVal': 20.0, 'left': 168.34161286956524, 'right': 157.0484078846154}, 'right': {'spInd': 0, 'spVal': 14.0, 'left': 141.06067981481482, 'right': 122.90893026923078}}, 'right': {'spInd': 0, 'spVal': 7.0, 'left': 94.7066578125, 'right': {'spInd': 0, 'spVal': 5.0, 'left': 69.02117757692308, 'right': 50.94683665}}}
--------------

回归树: -0.8775461454371768
{'spInd': 0, 'spVal': 4.0, 'left': {'spInd': 0, 'spVal': 12.0, 'left': {'spInd': 0, 'spVal': 16.0, 'left': {'spInd': 0, 'spVal': 20.0, 'left': matrix([[47.58621512],
        [ 5.51066299]]), 'right': matrix([[37.54851927],
        [ 6.23298637]])}, 'right': matrix([[43.41251481],
        [ 6.37966738]])}, 'right': {'spInd': 0, 'spVal': 9.0, 'left': matrix([[-2.87684083],
        [10.20804482]]), 'right': {'spInd': 0, 'spVal': 6.0, 'left': matrix([[-11.84548851],
        [ 12.12382261]]), 'right': matrix([[-17.21714265],
        [ 13.72153115]])}}}, 'righ

### 9.7 Using Tkinter to create a GUI in Python

In [22]:
import tkinter
import numpy as np
import matplotlib
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')

In [26]:
def reDraw(tolS, tolN, chkBtnVar):
    reDraw.f.clf() # 清空画布
    reDraw.a = reDraw.f.add_subplot(111) # 添加子图
    
    # 检查复选框是否被选中
    if chkBtnVar.get() == 1:
        if tolN < 2:
            tolN = 2
        # 如果选中，使用模型树
        myTree = createTree(reDraw.rawDat, modelLeaf, modelErr, ops=(tolS, tolN))
        yHat = createForeCast(myTree, reDraw.testDat, modelTreeEval)
    else:
        # 如果不选中，使用回归树
        myTree = createTree(reDraw.rawDat, ops=(tolS, tolN))
        yHat = createForeCast(myTree, reDraw.testDat)
    print(np.shape(reDraw.rawDat[:, 0]))
    x = np.squeeze(np.asarray(reDraw.rawDat[:, 0])) # 将数据转换为一维数组
    y = np.squeeze(np.asarray(reDraw.rawDat[:, 1])) # 将数据转换为一维数组
    reDraw.a.scatter(x, y, s=5) # 绘制散点图
    reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0, c='red') # 绘制预测值
    reDraw.canvas.draw() # 刷新画布

In [24]:
def getInputs(tolNentry, tolSentry):
    try:
        tolN = int(tolNentry.get()) # 获取最小样本数
    except:
        tolN = 10 # 如果获取失败，设置为10
        print("enter Integer for tolN")
        tolNentry.delete(0, tkinter.END) # 清空输入框
        tolNentry.insert(0, '10')
        
    try:
        tolS = float(tolSentry.get()) # 获取容许的误差下降值
    except:
        tolS = 1.0
        print("enter Float for tolS")
        tolSentry.delete(0, tkinter.END)
        tolSentry.insert(0, '1.0')
    return tolN, tolS # 返回最小样本数和容许的误差下降值

def drawNewTree(tolNentry, tolSentry, chkBtnVar):
    tolN, tolS = getInputs(tolNentry,tolSentry) # 获取最小样本数和容许的误差下降值
    reDraw(tolS, tolN, chkBtnVar) # 绘制新树

In [30]:
root = tkinter.Tk() # 创建窗口
tkinter.Label(root, text="Plot Place Holder").grid(row=3, columnspan=3) # 创建标签, 占据3列

tkinter.Label(root, text="tolN").grid(row=1, column=0) # 创建标签, 占据1行1列
tolNentry = tkinter.Entry(root) # 创建输入框
tolNentry.grid(row=1, column=1) # 创建输入框, 占据1行2列
tolNentry.insert(0, "10") # 设置输入框的默认值

tkinter.Label(root, text="tolS").grid(row=2, column=0) # 创建标签, 占据2行1列
tolSentry = tkinter.Entry(root) # 创建输入框
tolSentry.grid(row=2, column=1) # 创建输入框, 占据2行2列
tolSentry.insert(0, "1.0") # 设置输入框的默认值

chkBtnVar = tkinter.IntVar() # 创建变量
chkBtn = tkinter.Checkbutton(root, text="Model tree", variable=chkBtnVar) # 创建复选框
chkBtn.grid(row=3, column=0, columnspan=2) # 创建复选框, 占据3行1列

tkinter.Button(root, text="ReDraw", command=lambda: drawNewTree(tolNentry, tolSentry, chkBtnVar)).grid(row=1, column=2, rowspan=3) # 创建按钮, 占据1行3列
tkinter.Button(root, text="Quit", fg="black", command=root.quit).grid(row=1, column=2)

reDraw.f = Figure(figsize=(5, 4), dpi=100) # 创建图形
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root) # 创建画布
reDraw.canvas.draw() # 显示画布
reDraw.canvas.get_tk_widget().grid(row=0, rowspan=3) # 创建画布, 占据0行1列

reDraw.rawDat = loadDataSet('sine.txt') # 加载数据集
reDraw.testDat = np.arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01) # 创建测试数据
reDraw(1.0, 10 , chkBtnVar) # 绘制图形

(200, 1)


In [31]:
root.mainloop() # 进入消息循环

(200, 1)
(200, 1)
(200, 1)
(200, 1)
(200, 1)
(200, 1)
(200, 1)
