# ch9 树回归

## CART(classification and regression trees,分类回归树)

CART算法只做二元切分，所以树的结构固定：树包含左右键，

可以存储另一棵子树或者单个值。字典包含特征和特征值这两个键，

给出切分算法所有的特征和特征值。

CART算法实现

In [1]:
import numpy as np        

In [2]:
def loadData(filename):
    """Load a tab-separated numeric text file into a 2-D float array.

    Each non-blank line becomes one row; every tab-separated field on the
    line is converted with ``float``.

    Args:
        filename: path of the text file to read.

    Returns:
        ``np.ndarray`` built from the parsed rows.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    rows = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            # Tolerate blank lines (e.g. an extra newline at end of file)
            # instead of crashing on float('').
            if not line:
                continue
            rows.append([float(field) for field in line.split('\t')])
    return np.array(rows)

In [3]:
def binSplitDataSet(dataArr,dim,value):
    """Split the rows of dataArr in two by thresholding column ``dim``.

    Returns:
        A pair ``(above, notAbove)``: the rows whose entry in column ``dim``
        is strictly greater than ``value``, followed by the rows whose entry
        is less than or equal to ``value``.
    """
    column=dataArr[:,dim]
    # row indices on each side of the threshold
    aboveRows=np.nonzero(column>value)[0]
    notAboveRows=np.nonzero(column<=value)[0]
    return dataArr[aboveRows,:],dataArr[notAboveRows,:]

按照误差计算方式，找到数据集上最佳二元切分方式，

如果无法划分，即到了叶结点，则返回相应的值(这里是返回实际样本值的均值)。

**回归树的切分函数：**

In [4]:
def regLeaf(dataArr):
    """Leaf value for a regression tree: the mean of the last column."""
    targets=dataArr[:,-1]
    return np.mean(targets)

In [5]:
def regErr(dataArr):
    """Error of a node: variance of the last column times the row count."""
    sampleCount=dataArr.shape[0]
    targetVariance=np.var(dataArr[:,-1])
    return targetVariance * sampleCount

In [6]:
def chooseBestSplit(dataArr,leafType=regLeaf,errType=regErr,opt=(1,4)):
    """Search every feature/value pair for the binary split with lowest error.

    Args:
        dataArr: 2-D array; the last column is the target, the others are
            candidate split features.
        leafType: function called on dataArr to produce a leaf value.
        errType: function called on a data set to measure its error.
        opt: ``(tolErr, tolNum)`` stopping parameters - the minimum error
            reduction a split must achieve, and the minimum number of rows
            each side of a split must contain.

    Returns:
        ``(featureIndex, splitValue)`` when a worthwhile split exists,
        otherwise ``(None, leafType(dataArr))`` to signal a leaf.
    """
    minGain,minRows=opt
    # every target identical: nothing left to split
    targets=dataArr[:,-1].T.tolist()
    if len(set(targets))==1:
        print("leaf node")
        return None,leafType(dataArr)
    featureCount=dataArr.shape[1]-1
    unsplitErr=errType(dataArr)
    # best candidate so far: its error, feature column and threshold
    bestErr=np.inf
    bestIndex=0
    bestValue=0
    for col in range(featureCount):
        for threshold in set(dataArr[:,col]):
            upper,lower=binSplitDataSet(dataArr,col,threshold)
            # only consider splits that leave enough rows on both sides
            if len(upper)>=minRows and len(lower)>=minRows:
                splitErr=errType(upper)+errType(lower)
                if splitErr<bestErr:
                    bestErr=splitErr
                    bestIndex=col
                    bestValue=threshold
    # the improvement is too small to justify a split
    if unsplitErr-bestErr <minGain:
        return None,leafType(dataArr)
    print("arr1,arr2")
    upper,lower=binSplitDataSet(dataArr,bestIndex,bestValue)
    # re-check the size constraint on the chosen split
    if len(upper)<minRows or len(lower)<minRows:
        return None,leafType(dataArr)
    print('bestIndex,bestValue')
    return bestIndex,bestValue

**构建回归树**

In [7]:
def createTree(dataArr,leafType=regLeaf,errType=regErr,opt=(1,4)):
    """Recursively build a CART tree stored as nested dicts.

    Args:
        dataArr: 2-D array whose last column is the target.
        leafType: function that turns a data set into a leaf value.
        errType: function that measures the error of a data set.
        opt: ``(tolErr, tolNum)`` stopping parameters forwarded to
            chooseBestSplit.

    Returns:
        A leaf value (whatever ``leafType`` returns) when chooseBestSplit
        reports no split, otherwise a dict with keys ``'idx'`` (feature
        column), ``'val'`` (threshold), ``'left'`` (subtree for rows with
        feature > val) and ``'right'`` (subtree for rows with feature <= val).
    """
    idx,val=chooseBestSplit(dataArr,leafType,errType,opt)
    # chooseBestSplit signals "make a leaf" with idx None; identity is the
    # correct test for the None singleton (feature index 0 is a valid split).
    if idx is None:
        return val
    print('tree')
    # binSplitDataSet returns the "> val" rows first, so they go left
    lArr,rArr=binSplitDataSet(dataArr,idx,val)
    print('branch')
    return {
        'idx':idx,
        'val':val,
        'left':createTree(lArr,leafType,errType,opt),
        'right':createTree(rArr,leafType,errType,opt),
    }

运行CART代码

In [8]:
# Load the ex00.txt sample set and display its first row.
dataArr=loadData('./ex00.txt')
dataArr[:1]

array([[0.036098, 0.155096]])

In [9]:
# Build a tree from dataArr using createTree's default leaf/error functions
# and default opt, then display it.
tree=createTree(dataArr)
tree

arr1,arr2
bestIndex,bestValue
tree
branch


{'idx': 0,
 'val': 0.48813,
 'left': 1.0180967672413792,
 'right': -0.04465028571428572}

In [10]:
# Same experiment on ex0.txt: load it, build a tree with the default
# settings and display the result.
data1=loadData('./ex0.txt')
tree1=createTree(data1)
tree1

arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch


{'idx': 1,
 'val': 0.39435,
 'left': {'idx': 1,
  'val': 0.582002,
  'left': {'idx': 1,
   'val': 0.797583,
   'left': 3.9871632,
   'right': 2.9836209534883724},
  'right': 1.980035071428571},
 'right': {'idx': 1,
  'val': 0.197834,
  'left': 1.0289583666666666,
  'right': -0.023838155555555553}}

## 9-4 树剪枝

一棵树结点过多，可能造成“过拟合”，
通过决策树剪枝，来避免过拟合。

之前的函数chooseBestSplit()参数opt()作为提前终止条件，
在建树的过程中发挥作用，这种方式叫做预剪枝。

建立回归树之后，使用测试集，剪去在测试集上误差变化不大的分支，
这种方式叫做后剪枝。

后剪枝实现过程：
```
基于测试集：
    如果是树或子树：
        计算将两个分支（或结点）合并的误差
        计算将两个分支（或结点）不合并的误差
        如果合并能降低误差：
            则合并这两个分支或结点
```
代码

In [11]:
def isTree(obj):
    """Return True when obj is a (sub)tree rather than a leaf value.

    Trees are stored as dicts, so any dict instance counts as a tree.
    ``isinstance`` is used instead of comparing the type's name so that dict
    subclasses are recognised and unrelated classes that merely happen to be
    named ``dict`` are not.
    """
    return isinstance(obj, dict)

In [12]:
def getMean(tree):
    """Collapse a tree to one number: the average of its two branch values.

    Any branch that is itself a subtree is first collapsed recursively, and
    the collapsed value is written back into ``tree`` in place.
    """
    for side in ('left','right'):
        if isTree(tree[side]):
            tree[side]=getMean(tree[side])
    return (tree['left']+tree['right'])/2.0

In [13]:
def prune(tree,testArr):
    """Post-prune a tree against a test set, merging leaves bottom-up.

    Args:
        tree: tree dict as produced by createTree; its 'left'/'right'
            entries are updated in place while pruning.
        testArr: 2-D test array whose last column is the target.

    Returns:
        The collapsed mean (via getMean) when no test rows reach this node;
        the mean of the two leaves when merging them gives a lower squared
        error on the test rows; otherwise the tree itself.
    """
    # no test data reaches this node: collapse the whole subtree
    if testArr.shape[0]==0:
        return getMean(tree)
    # route the test rows down the two branches of this node
    lArr,rArr=binSplitDataSet(testArr,tree['idx'],tree['val'])
    if isTree(tree['left']):
        tree['left']= prune(tree['left'],lArr)
    if isTree(tree['right']):
        tree['right']=prune(tree['right'],rArr)
    # a branch is still a subtree after pruning: nothing to merge here
    if isTree(tree['left']) or isTree(tree['right']):
        return tree
    # both branches are leaves: squared error with the split kept ...
    leftSq=np.power(lArr[:,-1] - tree['left'],2)
    rightSq=np.power(rArr[:,-1] - tree['right'],2)
    errNoMerge=np.sum(leftSq) + np.sum(rightSq)
    # ... versus squared error when both leaves are replaced by their mean
    treeMean=(tree['left']+tree['right'])/2.0
    errMerge=np.sum(np.power(testArr[:,-1] - treeMean,2))
    if errMerge<errNoMerge:
        print('merge')
        return treeMean
    return tree

In [14]:
# Build a tree from ex2.txt with the default settings and display it.
trainArr=loadData('./ex2.txt')
tree2=createTree(trainArr)
tree2

arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree

{'idx': 0,
 'val': 0.499171,
 'left': {'idx': 0,
  'val': 0.729397,
  'left': {'idx': 0,
   'val': 0.952833,
   'left': {'idx': 0,
    'val': 0.958512,
    'left': 105.24862350000001,
    'right': 112.42895575000001},
   'right': {'idx': 0,
    'val': 0.759504,
    'left': {'idx': 0,
     'val': 0.790312,
     'left': {'idx': 0,
      'val': 0.833026,
      'left': {'idx': 0,
       'val': 0.944221,
       'left': 87.3103875,
       'right': {'idx': 0,
        'val': 0.85497,
        'left': {'idx': 0,
         'val': 0.910975,
         'left': 96.452867,
         'right': {'idx': 0,
          'val': 0.892999,
          'left': 104.825409,
          'right': {'idx': 0,
           'val': 0.872883,
           'left': 95.181793,
           'right': 102.25234449999999}}},
        'right': 95.27584316666666}},
      'right': {'idx': 0,
       'val': 0.811602,
       'left': 81.110152,
       'right': 88.78449880000001}},
     'right': 102.35780185714285},
    'right': 78.08564325}},
  'righ

In [15]:
# Post-prune tree2 against the ex2test.txt test set. prune reassigns
# 'left'/'right' entries of the tree it is given, so tree2 is modified.
testArr=loadData('./ex2test.txt')
prune(tree2,testArr)

merge
merge
merge
merge
merge
merge
merge
merge
merge


{'idx': 0,
 'val': 0.499171,
 'left': {'idx': 0,
  'val': 0.729397,
  'left': {'idx': 0,
   'val': 0.952833,
   'left': {'idx': 0,
    'val': 0.958512,
    'left': 105.24862350000001,
    'right': 112.42895575000001},
   'right': {'idx': 0,
    'val': 0.759504,
    'left': {'idx': 0,
     'val': 0.790312,
     'left': {'idx': 0,
      'val': 0.833026,
      'left': {'idx': 0,
       'val': 0.944221,
       'left': 87.3103875,
       'right': {'idx': 0,
        'val': 0.85497,
        'left': {'idx': 0,
         'val': 0.910975,
         'left': 96.452867,
         'right': {'idx': 0,
          'val': 0.892999,
          'left': 104.825409,
          'right': {'idx': 0,
           'val': 0.872883,
           'left': 95.181793,
           'right': 102.25234449999999}}},
        'right': 95.27584316666666}},
      'right': {'idx': 0,
       'val': 0.811602,
       'left': 81.110152,
       'right': 88.78449880000001}},
     'right': 102.35780185714285},
    'right': 78.08564325}},
  'righ

## 9-5 模型树

用树对数据建模，需要把叶结点设定为分段线性函数。

对于给定数据集，先用线性模型对其进行拟合，然后计算真实值与模型预测值之间
的差值，差值平方求和就是需要的误差。


In [24]:
def linear(dataArr):
    """Fit an ordinary-least-squares linear model to dataArr.

    Args:
        dataArr: 2-D array with m rows and n columns; the last column is the
            target, the first n-1 columns are the features.

    Returns:
        ``(ws, X, Y)`` where ``X`` is the m x n design matrix (a column of
        ones followed by the features), ``Y`` is the target reshaped to
        (m, 1), and ``ws`` holds the fitted weights (intercept first).
    """
    m,n=dataArr.shape
    # design matrix: column 0 stays 1 (intercept), the rest are the features
    X=np.ones((m,n))
    X[:,1:n]=dataArr[:,0:n-1]
    # np.asmatrix rather than np.mat: the np.mat alias no longer exists in
    # NumPy 2.x, while np.asmatrix is available in both 1.x and 2.x
    X=np.asmatrix(X)
    Y=dataArr[:,-1].reshape(m,1)
    xTx=np.dot(X.T,X)
    if np.linalg.det(xTx) == 0.0:
        # singular normal matrix: report it and fall back to the
        # pseudo-inverse instead of failing
        print("This matrix cannot do inverse")
        xTx_I=np.linalg.pinv(xTx)
    else:
        xTx_I=xTx.I
    # normal equations: ws = (X^T X)^-1 X^T Y
    ws=np.dot(xTx_I,np.dot(X.T,Y))
    return ws,X,Y

In [25]:
def modelLeaf(dataArr):
    """Leaf value for a model tree: the weights of a linear fit to dataArr."""
    weights,_,_=linear(dataArr)
    return weights

In [26]:
def modelErr(dataArr):
    """Error of a model-tree node: squared residuals of a linear fit, summed.

    Fits a linear model to dataArr with ``linear`` and returns the sum of
    squared differences between the targets and the model's predictions.
    """
    weights,design,targets=linear(dataArr)
    residuals=targets-design*weights
    return np.sum(np.power(residuals,2))

In [28]:
# Load exp2.txt and fit one linear model to the whole set.
data3=loadData('./exp2.txt')
ws,x,y=linear(data3)

# Build a model tree: leaves hold linear-model weights (modelLeaf), node
# error comes from modelErr, and opt is (1,10).
tree3=createTree(data3,modelLeaf,modelErr,(1,10))
tree3

arr1,arr2
bestIndex,bestValue
tree
branch


{'idx': 0,
 'val': 0.285477,
 'left': matrix([[1.69855694e-03],
         [1.19647739e+01]]),
 'right': matrix([[3.46877936],
         [1.18521743]])}

## 9-6 示例

使用corrcoef函数分析模型

In [54]:
def regTreeEval(model, inDat):
    """Predict from a regression-tree leaf.

    The leaf already holds the predicted value, so it is simply converted
    to ``float``; ``inDat`` is accepted for signature compatibility with
    the other evaluators and is not used.
    """
    prediction = float(model)
    return prediction

In [55]:
def modelTreeEval(model, inDat):
    """Predict from a model-tree leaf by applying its linear model.

    Args:
        model: column of n+1 weights, intercept first.
        inDat: one sample as a 1 x n row of feature values.

    Returns:
        The prediction ``[1, inDat] * model`` as a Python float.
    """
    n = np.shape(inDat)[1]
    # np.asmatrix rather than np.mat: the np.mat alias no longer exists in
    # NumPy 2.x
    X = np.asmatrix(np.ones((1,n+1)))
    # column 0 stays 1 for the intercept; the features fill the rest
    X[:,1:n+1]=inDat
    # .item() extracts the single entry of the 1x1 product; calling float()
    # directly on a non-0-d array is deprecated in recent NumPy
    return float((X*model).item())

In [60]:
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict one sample by walking the tree down to a leaf.

    Args:
        tree: tree dict (keys 'idx', 'val', 'left', 'right') or a bare leaf.
        inData: one sample's feature values (1 x n matrix, 1-D array, ...).
        modelEval: function ``(leaf, inData)`` that turns a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample falls into.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1 x n matrix, inData[idx] selects a ROW,
    # which only worked for a single feature with idx 0. Indexing the
    # flattened sample picks the idx-th feature for any n.
    features = np.asarray(inData).ravel()
    # same convention as binSplitDataSet: "> val" goes left, "<= val" right
    if features[tree['idx']] > tree['val']:
        branch = tree['left']
    else:
        branch = tree['right']
    # the recursive call handles both subtrees and leaves
    return treeForeCast(branch, inData, modelEval)

In [61]:
def createForeCast(tree, testData, model=regTreeEval):
    """Predict every sample in testData with the given tree.

    Args:
        tree: tree dict or leaf, as accepted by treeForeCast.
        testData: sequence of samples; ``testData[i]`` is the i-th sample.
        model: leaf evaluator forwarded to treeForeCast.

    Returns:
        An (m, 1) matrix holding one prediction per sample.
    """
    m=len(testData)
    # np.asmatrix rather than np.mat: the np.mat alias no longer exists in
    # NumPy 2.x
    yHat = np.asmatrix(np.zeros((m,1)))
    for i in range(m):
        # each sample is handed over as a matrix row
        yHat[i,0] = treeForeCast(tree, np.asmatrix(testData[i]), model)
    return yHat

**回归树模型**

In [70]:
def modelTest(trainArr,testArr,model):
    """Train a tree on trainArr and score its predictions on testArr.

    Args:
        trainArr: training array passed to createTree.
        testArr: test array; column 0 is used as the feature and column 1
            as the true target.
        model: ``'regTree'`` builds a regression tree; any other value
            builds a model tree (linear-model leaves).

    Returns:
        The correlation coefficient (from ``np.corrcoef``) between the
        predictions and the true targets. This is R, not R^2.
    """
    if model=='regTree':
        tree=createTree(trainArr,opt=(1,20))
        # predict with the tree that was just built; the previous code
        # passed an undefined name here instead
        yhat=createForeCast(tree,testArr[:,0])
    else:
        tree=createTree(trainArr,modelLeaf,modelErr,(1,20))
        yhat=createForeCast(tree,testArr[:,0],modelTreeEval)
    return np.corrcoef(yhat,testArr[:,1],rowvar=0)[0,1]

In [73]:
# Load the bikeSpeedVsIq training and test sets used by the comparison below.
trainArr=loadData('./bikeSpeedVsIq_train.txt')
testArr=loadData('./bikeSpeedVsIq_test.txt')

**回归树**

In [74]:
# Score the regression-tree variant on the train/test split loaded above.
modelTest(trainArr,testArr,'regTree')

arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch


0.9640852318222141

**模型树**

In [75]:
# Score the model-tree variant (any model value other than 'regTree').
modelTest(trainArr,testArr,'modelTree')

arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch
arr1,arr2
bestIndex,bestValue
tree
branch


0.9760412191380593

相关系数 R（np.corrcoef 返回的是 R，而不是 R^2）越接近1，则效果越好。