In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Load the raw table from data.csv (path is relative to the working directory).
dataset = pd.read_csv('data.csv')
# Print a summary of the frame: columns, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
age       15 non-null object
work      15 non-null object
hourse    15 non-null object
loan      15 non-null object
class     15 non-null object
dtypes: object(5)
memory usage: 680.0+ bytes


In [6]:
# Number of rows in the dataset
n_data = len(dataset.index)
print("n_data : ", n_data)
# Column labels, materialised as a plain Python list
cols = list(dataset.columns)
print("cols: ", cols)

n_data :  15
cols:  ['age', 'work', 'hourse', 'loan', 'class']


In [13]:
# Collect the names of the descriptive variables, i.e. every column in `cols`
# whose dtype compares equal to "object".
obj_vars = [name for name in cols if dataset[name].dtype == "object"]
print(obj_vars)

object
object
object
object
object
['age', 'work', 'hourse', 'loan', 'class']


In [14]:
# Encode every descriptive (object-dtype) column as integer labels and append
# the result to the dataset as a new column named 'num_<column>'.
# All encoded columns are built first and attached with a single concat
# (instead of one concat per column). Columns that already have a 'num_'
# counterpart are skipped, so re-running this cell does not add duplicates.
le = preprocessing.LabelEncoder()
encoded = {
    'num_' + col: le.fit_transform(dataset[col].tolist())
    for col in obj_vars
    if 'num_' + col not in dataset.columns
}
if encoded:
    # Reuse the dataset's own index so the new columns align row-for-row.
    dataset = pd.concat([dataset, pd.DataFrame(encoded, index=dataset.index)], axis=1)

In [82]:
#coding:utf-8
import pandas as pd
import numpy as np
from math import log

def calcEmpiricalEntropy(dataset):
    '''
    Compute the empirical entropy H(D) of a dataset (formula (5-7)).

    @parameter
    dataset: DataFrame whose last column holds the class label of each row
    output: entropy of the class-label distribution, using log base 2
    '''
    total = dataset.shape[0]
    # The class labels live in the last column.
    labels = dataset[dataset.columns.tolist()[-1]].tolist()

    # Tally how many rows carry each class label.
    tally = {}
    for item in labels:
        tally[item] = tally.get(item, 0) + 1

    # H(D) = - sum_k p_k * log2(p_k), with p_k = |C_k| / |D|
    entropy = 0.0
    for count in tally.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy

def splitDataSet(dataset, axis, value):
    '''
    Return the rows whose feature `axis` equals `value`, with the `axis`
    column removed.

    input:
        dataset: DataFrame to split
        axis: label of the feature column to split on
        value: feature value that selects the rows to keep
    output: a new DataFrame holding the matching rows without the `axis`
        column, re-indexed 0..n-1. The input frame is not modified.
    raises: KeyError if `axis` is not a column of `dataset`.
    '''
    # Select rows with a boolean mask instead of passing positional counters
    # to drop() as index labels; the mask is correct for any row index
    # (e.g. a frame that was filtered without reset_index).
    keepRows = dataset[axis] == value
    # drop(columns=...) also works when `axis` is the only column, where
    # concatenating the remaining columns would have nothing to concatenate.
    newDataSet = dataset.loc[keepRows].drop(columns=[axis])
    return newDataSet.reset_index(drop=True)


def chooseMaxInfoGainFeature(dataset):
    '''
    Pick the feature with the largest information gain
    (formulas (5-7), (5-8)).

    input: dataset - DataFrame whose last column is the class label and
        whose other columns are the candidate features
    output: tuple (bestFeature, bestInfoGain). bestFeature is the column
        label of the winning feature, or -1 (with gain 0.0) when no feature
        has a strictly positive information gain.
    '''
    totalRows = dataset.shape[0]          # |D|
    baseEntropy = calcEmpiricalEntropy(dataset)  # H(D)
    winner, winnerGain = -1, 0.0

    # Every column except the last one (the class label) is a candidate.
    for name in dataset.columns.tolist()[:-1]:
        # H(D|A) = sum_i |D_i| / |D| * H(D_i) over the distinct values of A
        condEntropy = 0.0
        for level in set(dataset[name].tolist()):
            part = splitDataSet(dataset, name, level)
            condEntropy += part.shape[0] / totalRows * calcEmpiricalEntropy(part)

        gain = baseEntropy - condEntropy
        # Strict comparison: on equal gain the earlier column is kept.
        if gain > winnerGain:
            winner, winnerGain = name, gain

    return winner, winnerGain

def majorityVote(classList):
    '''
    Return the most frequent class label in classList.

    Used when every feature has been consumed but the remaining rows still
    carry more than one class label: the node is labelled by majority vote.

    input: classList - non-empty list of class labels
    output: the label with the highest count; on a tie, the label that
        appears first in classList
    raises: ValueError if classList is empty
    '''
    if len(classList) == 0:
        raise ValueError("classList must not be empty")

    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1

    # max() returns the first maximal key in insertion order, which is the
    # same winner a stable descending sort by count would put first.
    return max(classCount, key=classCount.get)

def createTree(dataset):
    '''
    Recursively build a decision tree, choosing each split by information gain.

    input: dataset - DataFrame whose last column is the class label and whose
        other columns are the features still available for splitting
    output: either a class label (leaf), or a nested dict of the form
        {feature: {featureValue: subtree}} where each subtree is again a
        class label or a nested dict
    '''
    cols = dataset.columns.tolist()[:-1]
    classList = dataset[dataset.columns.tolist()[-1]].tolist()

    # Stop: all rows belong to the same class -> leaf with that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Stop: no features left -> leaf with the majority class.
    if len(cols) == 0:
        return majorityVote(classList)

    print('特征集和类别:',dataset.columns.tolist())
    bestFeature, bestInfoGain = chooseMaxInfoGainFeature(dataset)
    print('bestFeture:',bestFeature)

    # Stop: no feature yields a positive information gain. In that case the
    # chooser reports gain 0.0 with the placeholder feature -1, which is not
    # a column, so fall back to the majority class instead of indexing it.
    if bestInfoGain <= 0.0:
        return majorityVote(classList)

    decisionTree = {bestFeature:{}}

    # One branch per distinct value of the chosen feature; each branch is
    # built from the rows with that value, with the feature column removed.
    featValues = set(dataset[bestFeature])
    for value in featValues:
        decisionTree[bestFeature][value] = createTree( splitDataSet(dataset, bestFeature, value) )
    return decisionTree



In [84]:
if __name__ == '__main__':
    # Reload the raw table from data.csv and build the decision tree from it.
    dataset = pd.read_csv("data.csv")
    DeciTree = createTree(dataset)
    # createTree returns a class label or a nested dict {feature: {value: subtree}}.
    print(DeciTree)


特征集和类别: ['age', 'work', 'house', 'loan', 'class']
bestFeture: house
特征集和类别: ['age', 'work', 'loan', 'class']
bestFeture: work
{'house': {'是': '是', '否': {'work': {'是': '是', '否': '否'}}}}


NameError: name 'dtModel' is not defined

In [None]:
特征集和类别: ['age', 'work', 'hourse', 'loan', 'class']
bestFeture: hourse
hourse
特征集和类别: ['age', 'work', 'loan', 'class']
bestFeture: work
work
{'hourse': {'是': '是', '否': {'work': {'是': '是', '否': '否'}}}}

