In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math
from math import log

import pprint

In [146]:
# table 5.1 from book
def create_data():
    """Build the sample table used throughout the notebook (table 5.1 from the book).

    Returns
    -------
    tuple
        ``(rows, column_names)`` where ``rows`` is a list of 15 five-element
        lists (four categorical feature values followed by the class value)
        and ``column_names`` is the list of the five column headers.
    """
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']

    # One tuple per sample; the last field is the class.
    table = (
        ('青年', '否', '否', '一般', '否'),
        ('青年', '否', '否', '好', '否'),
        ('青年', '是', '否', '好', '是'),
        ('青年', '是', '是', '一般', '是'),
        ('青年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '好', '否'),
        ('中年', '是', '是', '好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '好', '是'),
        ('老年', '是', '否', '好', '是'),
        ('老年', '是', '否', '非常好', '是'),
        ('老年', '否', '否', '一般', '否'),
    )

    # Callers receive mutable lists, exactly as before.
    rows = [list(record) for record in table]
    return rows, column_names

# Materialise the sample rows as a DataFrame; the last column holds the class.
datasets, labels = create_data()
train_data = pd.DataFrame(data=datasets, columns=labels)

In [147]:
class Node:
    """One node of the decision tree grown by ``DTree``.

    Parameters
    ----------
    root : bool
        ``True`` marks a terminal node: ``predict`` returns ``label`` without
        looking at the features. ``False`` marks an internal split node.
    label : object, optional
        Class returned by a terminal node.
    feature_name : object, optional
        Column name of the feature this node splits on.
    feature : int, optional
        Positional index of the split feature *among the features still
        available at this node* (``DTree.train`` drops the split column before
        recursing, so the index is relative to the reduced column set).

    Attributes
    ----------
    tree : dict
        Maps each value of the split feature to the child ``Node``.
    result : dict
        Snapshot used by ``__repr__``; it shares the ``tree`` dict, so children
        added later show up in the representation.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}
        self.result = {'label' : self.label, 'feature' : self.feature, 'tree' : self.tree}

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child reached when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the class for one sample.

        Parameters
        ----------
        features : sequence
            Feature values in the same order as the columns this node was
            trained on (class column excluded). The input is not modified.

        Raises
        ------
        KeyError
            If the sample's value for the split feature has no child.
        """
        if self.root is True:
            return self.label

        # The child was trained without this node's split column, so its
        # feature index refers to the remaining features only. Remove the
        # consumed value before descending; otherwise a child index at or past
        # this node's index would read the wrong feature.
        remaining = list(features)
        value = remaining.pop(self.feature)
        return self.tree[value].predict(remaining)

class DTree:
    """ID3-style decision tree classifier that splits on information gain.

    Training data is a DataFrame whose last column is the class and whose
    other columns are categorical features.

    Parameters
    ----------
    epsilon : float
        Minimum information gain required to split a node; below it the node
        becomes a terminal node labelled with the majority class.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        # Placeholder until fit() stores the root Node here.
        self._tree = {}

    @staticmethod
    def cal_entropy(datasets):
        """Empirical entropy (base 2) of the class distribution.

        ``datasets`` is a sequence of rows; the last element of each row is
        taken as the class.
        """
        n = len(datasets)
        # Distribution of the class column, e.g. {'否': 6, '是': 9}.
        label_count = Counter(row[-1] for row in datasets)
        return -sum((count / n) * log(count / n, 2) for count in label_count.values())

    def cal_conditional_entropy(self, datasets, axis=0):
        """Empirical conditional entropy of the class given the feature at ``axis``."""
        n = len(datasets)
        # Partition the rows by the value they take on the chosen feature.
        feature_sets = {}
        for row in datasets:
            feature_sets.setdefault(row[axis], []).append(row)

        return sum((len(subset) / n) * self.cal_entropy(subset)
                   for subset in feature_sets.values())

    @staticmethod
    def info_gain(entropy, con_entropy):
        """Information gain: entropy minus conditional entropy."""
        return entropy - con_entropy

    def get_info_gain(self, datasets):
        """Return ``(feature_index, gain)`` for the feature with the largest gain.

        ``datasets`` is a non-empty sequence of rows whose last element is the
        class. On ties the lowest feature index wins.
        """
        feature_count = len(datasets[0]) - 1
        empirical_entropy = self.cal_entropy(datasets)

        gains = [
            (c, self.info_gain(empirical_entropy,
                               self.cal_conditional_entropy(datasets, axis=c)))
            for c in range(feature_count)
        ]
        return max(gains, key=lambda pair: pair[-1])

    @staticmethod
    def _majority_label(y_train):
        """Most frequent value of the class Series ``y_train``."""
        return y_train.value_counts().sort_values(ascending=False).index[0]

    def train(self, train_data):
        '''
        Recursively build the (sub)tree for ``train_data``.

        Input  : DataFrame whose last column is the class; the threshold is
                 ``self.epsilon``
        Output : Node that is the root of the built (sub)tree
        Raises : ValueError if ``train_data`` has no rows

        The ``feature`` index stored on each split Node is relative to the
        feature columns present in ``train_data`` at that level, because the
        split column is dropped before recursing.
        '''
        if len(train_data) == 0:
            raise ValueError('train_data must contain at least one row')

        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        # 1. If every sample belongs to the same class C, return a terminal
        #    node labelled C.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # 2. If no features are left, return a terminal node labelled with the
        #    majority class.
        if len(features) == 0:
            return Node(root=True, label=self._majority_label(y_train))

        # 3. Pick the feature Ag with the largest information gain.
        max_feature_id, max_info_gain = self.get_info_gain(np.array(train_data))
        max_feature_name = features[max_feature_id]

        # 4. If that gain is below the threshold, return a terminal node
        #    labelled with the majority class.
        if max_info_gain < self.epsilon:
            return Node(root=True, label=self._majority_label(y_train))

        # 5. Split the data on every observed value of Ag.
        node_tree = Node(root=False, feature_name=max_feature_name,
                         feature=max_feature_id)

        for value in train_data[max_feature_name].value_counts().index:
            matching = train_data.loc[train_data[max_feature_name] == value]
            sub_train_df = matching.drop([max_feature_name], axis=1)

            # 6. Build the child subtree recursively.
            node_tree.add_node(value, self.train(sub_train_df))

        # Trace of the children built at this level (kept from the original).
        pprint.pprint(node_tree.tree)

        return node_tree

    def fit(self, train_data):
        """Train on ``train_data``, store the resulting root Node and return it."""
        self._tree = self.train(train_data)

        return self._tree

    def predict(self, x_test):
        """Classify one sample by delegating to the fitted root Node's ``predict``."""
        return self._tree.predict(x_test)

In [148]:
# Rebuild the sample frame and grow a tree with the default epsilon.
datasets, labels = create_data()
data_df = pd.DataFrame(data=datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)

{'否': {'label': '否', 'tree': {}, 'feature': None},
 '是': {'label': '是', 'tree': {}, 'feature': None}}
{'否': {'label': None, 'tree': {'是': {'label': '是', 'tree': {}, 'feature': None}, '否': {'label': '否', 'tree': {}, 'feature': None}}, 'feature': 1},
 '是': {'label': '是', 'tree': {}, 'feature': None}}


In [149]:
# Show the fitted root node; Node.__repr__ prints its nested label/feature/tree dict.
tree

{'label': None, 'tree': {'是': {'label': '是', 'tree': {}, 'feature': None}, '否': {'label': None, 'tree': {'是': {'label': '是', 'tree': {}, 'feature': None}, '否': {'label': '否', 'tree': {}, 'feature': None}}, 'feature': 1}}, 'feature': 2}

In [150]:
# Classify one sample; values are given in feature-column order (class column omitted).
tree.predict(['老年', '否', '否', '一般'])

'否'