In [1]:
import numpy as np
from math import log
from functools import reduce
import operator
import matplotlib.pyplot as plt

In [2]:
with open('./lenses.txt', 'r') as f:
    lines = f.readlines()
    x_train = [ line.strip().split('\t') for line in lines ]

# 计算一个样本集合的信息熵
def calc_ent(data_set):
    class_     = list(set([ sample[-1] for sample in data_set ]))
    sample_num = len(data_set)
    class_count = {}
    # 计算 k 个类
    for sample in data_set:
        class_name              = sample[-1]
        class_count[class_name] = class_count.get(class_name, 0) + 1
    ent = 0.0
    for class_name, count in class_count.items():
        p    = count * 1.0 / sample_num
        ent -= p * log(p, 2)
    return ent

# 根据属性和属性对应的一个属性值划分
def split_set_by_feat(data_set, feat, feat_val):
    after_split = []
    for sample in data_set:
        if sample[feat] == feat_val:
            reduced_set = sample[:feat] # extend 改变源数据，不返回拷贝
            reduced_set.extend(sample[feat + 1:])
            after_split.append(reduced_set)
    return after_split

# 根据现有样本集找到最好的划分
def find_best_feat(data_set):
    feat_num = len(data_set[0]) - 1 # 每个样本的属性个数
    sample_num = len(data_set)
    base_ent = calc_ent(data_set)
    ent      = 0.0
    max_gain = 0.0 # 最大信息增益
    best_feat = -1 # 最好的属性划分
    for feat in range(feat_num):
        feat_values = set([ sample[feat] for sample in data_set ])
        ent = 0.0
        for feat_val in feat_values:
            sub_set = split_set_by_feat(data_set, feat, feat_val)
            ent    += len(sub_set) * 1.0 / sample_num * calc_ent(sub_set)
        # 最大增益
        if base_ent - ent > max_gain:
            max_gain  = base_ent - ent
            best_feat = feat
    return best_feat

In [3]:
test = [[1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
       ]
# labels = ['no surface', 'flippers']
# calc_ent(x_train)
# split_set_by_feat(test, 1, 1)
find_best_feat(x_train)

3

已经有划分属性和计算信息熵的函数，可以考虑构建决策树

In [9]:
def clac_majority_class(data_set):
    count = {}
    for sample in data_set:
        class_name        = sample[-1]
        count[class_name] = count.get(class_name, 0) + 1
    sorted_count = sorted(count.items(), key = operator.itemgetter(1), reverse = True)
    return sorted_count[0][0]

def create_tree(data_set, feat_names):
    # 递归造树，但是终止条件核心就是，无法再划分
    
    # 当前样本集和为同一类别 返回这个类别 (no lenses/soft/hard)
    reduced_class = [ sample[-1] for sample in data_set if data_set[0][-1] == sample[-1] ]
    if len(reduced_class) == len(data_set):
        return reduced_class[0]
    # 属性集合已经空了 判定为数量最多的类型
#     if len(data_set[0]) == 1:
    if len(feat_names) == 0:
        return clac_majority_class(data_set)
    # 不然的话，欢迎来到划分属性的世界
    best_feat      = find_best_feat(data_set)
    best_feat_vals = set([ sample[best_feat] for sample in data_set ])
    best_feat_name = feat_names[best_feat]
#     print(best_feat)
#     print(best_feat_vals)
#     print(best_feat_name)
    cur_node       = { best_feat_name : {} } # 当前的属性划分，和划分属性值，对应可能是结果，也可能又是一个节点
    del(feat_names[best_feat])
    for feat_val in best_feat_vals:
        cur_node[best_feat_name][feat_val] = create_tree(split_set_by_feat(data_set, best_feat, feat_val), feat_names)
    return cur_node

# clac_majority_class(x_train)
tree = create_tree(x_train, ['age', 'prescript', 'astigmatic', 'tear_rate'])
tree

{'tear_rate': {'reduced': 'no lenses',
  'normal': {'astigmatic': {'no': {'age': {'pre': 'soft',
      'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}},
      'young': 'soft'}},
    'yes': 'hard'}}}}