In [1]:
import numpy as np
import random
# 一个是随机选取样本，一个是随机选取特征
# 决策树的个数、特征属性的个数、递归次数（即决策树的深度）
# https://github.com/sfeng-m/tree-model/blob/master/RandomForest/randomForest.py

In [2]:
# 决策树部分

# 计算GINI，gini表示不纯度，越小越纯，越大越不纯
def calGini(dataSet):
    """Return the Gini impurity of ``dataSet``.

    Labels are counted with ``calculateDiffCount`` (which reads the last
    element of every row). The result is ``1 - sum(p_k ** 2)`` over the
    label frequencies ``p_k``: smaller means purer, larger means more mixed.
    An empty ``dataSet`` yields 1.0 because there is nothing to subtract.
    """
    label_counts = calculateDiffCount(dataSet)
    total = len(dataSet)
    impurity = 1.0
    # Subtract each squared class frequency in turn.
    for count in label_counts.values():
        impurity -= (count / total) ** 2
    return impurity

# 对数据集dataSet，对于第col列特征，根据value划分为两个数据集
def splitData(dataSet,col,value):
    """Partition the rows of ``dataSet`` on a threshold for one column.

    Returns a pair ``(at_or_above, below)``: the first list holds every row
    whose ``row[col] >= value``, the second holds all remaining rows. Row
    order within each list follows the order in ``dataSet``.
    """
    at_or_above, below = [], []
    for row in dataSet:
        target = at_or_above if row[col] >= value else below
        target.append(row)
    return at_or_above, below

# 数标签
def calculateDiffCount(datas):
    """Count how often each label occurs in ``datas``.

    The label of a row is its last element. Returns a dict mapping
    label -> number of rows carrying it, with keys in first-seen order.
    """
    tally = {}
    for row in datas:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally


# 递归调用，选取最佳的特征和最佳特征当中的最佳分割值
def BuildCartDecisionTree(dataSet,features,maxDepth,depth):
    """Recursively grow a CART classification tree using Gini impurity.

    Parameters:
        dataSet:  list of rows; the last element of each row is its label.
        features: column indices that may be used for splitting.
        maxDepth: recursion stops once ``depth >= maxDepth``.
        depth:    depth of the node being built.

    Returns:
        A leaf, i.e. the dict produced by ``calculateDiffCount`` (label ->
        count), or an internal node, i.e. the tuple
        ``(col, value, falseBranch, trueBranch)`` where ``trueBranch`` was
        built from the rows with ``row[col] >= value``.
    """
    # Leaf when the depth budget is used up or there is nothing to split.
    # (An empty dataSet used to raise IndexError through an unused
    # ``len(dataSet[0])`` lookup.)
    if depth >= maxDepth or len(dataSet) == 0:
        return calculateDiffCount(dataSet)

    depth += 1

    # Impurity of the node before splitting.
    currentgini = calGini(dataSet)
    # Number of samples in this node.
    rows_length = len(dataSet)

    # Best impurity reduction found so far; a split must beat 0 to be used.
    best_gini_gain = 0.0
    best_value = None
    best_set = None

    for col in features:
        # Every distinct value seen in this column is a candidate threshold.
        values = set(x[col] for x in dataSet)
        for value in values:
            data1, data2 = splitData(dataSet, col, value)
            p = len(data1) / rows_length
            # Size-weighted impurity of the two children.
            gini = p * calGini(data1) + (1 - p) * calGini(data2)
            gain = currentgini - gini
            if gain > best_gini_gain:
                best_gini_gain = gain
                best_value = (col, value)
                best_set = (data1, data2)

    if best_gini_gain > 0.0:
        trueBranch = BuildCartDecisionTree(best_set[0], features, maxDepth, depth)
        falseBranch = BuildCartDecisionTree(best_set[1], features, maxDepth, depth)
        return (best_value[0],
                best_value[1],
                falseBranch,
                trueBranch)
    # No split reduces impurity: emit a leaf.
    return calculateDiffCount(dataSet)


In [3]:
def getFeatures(dataSet,n_features):
    """Pick ``n_features`` distinct column indices at random.

    Candidates are ``0 .. len(dataSet[0]) - 2``, i.e. every column of the
    first row except the last one. ``random.sample`` raises ValueError when
    ``n_features`` exceeds the number of candidates.
    """
    feature_count = len(dataSet[0]) - 1
    candidate_columns = range(feature_count)
    return random.sample(candidate_columns, n_features)

In [4]:
def loadData(path="../Pima.csv"):
    """Read a comma-separated data file into a list of rows.

    Parameters:
        path: file to read; defaults to the original hard-coded
              ``"../Pima.csv"``.

    Returns:
        A list with one list per non-blank line. Every field is converted
        to ``float`` except the last one, which is converted to ``int``.

    Raises:
        ValueError: if a field cannot be parsed as a number, or the last
                    field is not an integer literal.
    """
    alldataMat = []
    # The context manager closes the file even if parsing fails.
    with open(path) as pima:
        for line in pima:
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of failing on float('').
            if not stripped:
                continue
            lineArr = stripped.split(',')
            float_list = [float(field) for field in lineArr]
            # Store the last column as an int rather than a float.
            float_list[-1] = int(lineArr[-1])
            alldataMat.append(float_list)
    return alldataMat

In [5]:
# 切分数据集，以便交叉验证
# input(数据集，个数)
# def spiltDataSet(dataSet,labelMat,bag_nums):
#     spiltedDataSet = []
#     spiltedLabelsSet = []
#     oneBagLength = int(len(dataSet)/bag_nums)
#     for i in range(bag_nums):
#         data = []
#         label = []
#         for index in range(i*oneBagLength,(i+1)*oneBagLength):
#             data.append(dataSet[index])
#             label.append(labelMat[index])
#         spiltedDataSet.append(data)
#         spiltedLabelsSet.append(label)
#     return spiltedDataSet,spiltedLabelsSet

def spiltDataSet(dataSet,bag_nums):
    """Randomly partition ``dataSet`` into ``bag_nums`` equally sized bags.

    Parameters:
        dataSet:  sequence of rows.
        bag_nums: number of bags to produce (positive integer).

    Returns:
        A list of ``bag_nums`` lists, each holding
        ``len(dataSet) // bag_nums`` rows. No row appears in more than one
        bag; up to ``bag_nums - 1`` leftover rows are not used.

    Raises:
        ValueError: if ``bag_nums`` is smaller than 1.
    """
    if bag_nums < 1:
        raise ValueError("bag_nums must be a positive integer")
    oneBagLength = len(dataSet) // bag_nums
    # A random permutation of the row indices.
    shuffled = random.sample(range(len(dataSet)), len(dataSet))
    # Slice the permutation into consecutive chunks. The previous loop
    # dropped one row each time a bag filled up and never emitted the last
    # bag, so it returned fewer than bag_nums bags.
    return [
        [dataSet[index] for index in shuffled[i * oneBagLength:(i + 1) * oneBagLength]]
        for i in range(bag_nums)
    ]

In [11]:
# 树的个数，随机选取的特征数，树的最大深度，随机分成的数据集的个数
def buildRandomForest(n_trees,n_features,max_tree_depth,n_dataSets):
    """Train a forest of CART trees on the data returned by ``loadData``.

    Parameters:
        n_trees:        number of trees to grow.
        n_features:     number of randomly chosen feature columns per tree.
        max_tree_depth: depth limit passed to ``BuildCartDecisionTree``.
        n_dataSets:     number of bags requested from ``spiltDataSet``.

    Returns:
        A list of ``n_trees`` trees as produced by ``BuildCartDecisionTree``.
    """
    samples = loadData()
    bags = spiltDataSet(samples, n_dataSets)
    forest = []
    for _ in range(n_trees):
        # Each tree gets its own feature subset and one randomly picked bag.
        chosen_features = getFeatures(samples, n_features)
        bag_index = random.randint(0, len(bags) - 1)
        grown = BuildCartDecisionTree(bags[bag_index], chosen_features, max_tree_depth, 1)
        forest.append(grown)
    return forest

In [12]:
# 基于决策树分类数据data
def classifyByTree(tree,data):
    """Classify one sample with a single decision tree.

    Parameters:
        tree: either a leaf (dict mapping label -> count) or an internal
              node ``(feature, value, falseBranch, trueBranch)``.
        data: the sample; ``data[feature]`` is compared with ``value``.

    Returns:
        The label with the highest count in the leaf that is reached.
    """
    # Leaves are recognised by type. Testing ``len(tree) == 4`` mistook a
    # leaf holding exactly four distinct labels for an internal node.
    while not isinstance(tree, dict):
        feature, value, falseBranch, trueBranch = tree
        # Same rule as used for splitting: ">= value" goes to the true branch.
        tree = trueBranch if data[feature] >= value else falseBranch
    return max(tree, key=tree.get)

In [13]:
def classifyByForest(forest,data):
    """Classify ``data`` by majority vote over every tree in ``forest``.

    Each tree votes with the label returned by ``classifyByTree``; the
    label with the most votes is returned. On a tie, the label that
    received its first vote earliest wins.
    """
    votes = {}
    for tree in forest:
        predicted = classifyByTree(tree, data)
        votes[predicted] = votes.get(predicted, 0) + 1
    return max(votes, key=votes.get)

In [49]:
def acc(n_trees=200,n_features=3,max_tree_depth=10,n_dataSets=5):
    """Train a random forest and return its accuracy on the loaded data.

    Parameters mirror ``buildRandomForest``; the defaults are the values
    that were previously hard-coded.

    Returns:
        The fraction of rows from ``loadData`` whose last element (the
        label) equals the forest's prediction.

    Note: the forest is scored on the same file it was trained from, so
    this is a training-set accuracy, not a held-out estimate.
    """
    forest = buildRandomForest(n_trees, n_features, max_tree_depth, n_dataSets)
    dataMat = loadData()
    # Score every row. The previous version evaluated only len - 1 randomly
    # chosen rows but still divided by len, which understated the accuracy.
    correct = sum(1 for row in dataMat if classifyByForest(forest, row) == row[-1])
    return correct / len(dataMat)

In [50]:
# Train a forest and display its accuracy. The value changes from run to run
# because no random seed is set in the visible cells.
acc()

0.8619791666666666