In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math
from math import log

import sys
import pprint

In [142]:
# table 5.1 from book
def create_data():
    """Return the table 5.1 sample set together with its column names.

    Returns
    -------
    tuple[list[list[str]], list[str]]
        ``(samples, column_names)``: 15 samples of five string fields each,
        and the five column names. The last column (``'type'``) holds the
        class label. Fresh lists are built on every call.
    """
    column_names = ['age', 'have job', 'own house', 'credit situation', 'type']

    # One sample per line, fields separated by whitespace. No field value
    # contains a space, so str.split() recovers the five columns exactly.
    table = """
        teen        no   no   intermediate  no
        teen        no   no   good          no
        teen        yes  no   good          yes
        teen        yes  yes  intermediate  yes
        teen        no   no   intermediate  no
        middle-age  no   no   intermediate  no
        middle-age  no   no   good          no
        middle-age  yes  yes  good          yes
        middle-age  no   yes  very-good     yes
        middle-age  no   yes  very-good     yes
        old         no   yes  very-good     yes
        old         no   yes  good          yes
        old         yes  no   good          yes
        old         yes  no   very-good     yes
        old         no   no   intermediate  no
    """
    samples = [line.split() for line in table.strip().splitlines()]

    return samples, column_names

In [143]:
class Node:
    """One node of a decision tree: either a leaf or an internal split.

    Parameters
    ----------
    root : bool
        Despite the name, ``True`` marks a node that answers directly with
        ``label`` (a leaf); ``False`` marks a split node that dispatches on
        ``feature``.
    label : object, optional
        Class label returned by a leaf.
    feature_name : object, optional
        Name of the column this node splits on (informational only; it is
        not used by ``predict``).
    feature : int, optional
        Position of the split column *within the feature vector that reaches
        this node*, i.e. after the values consumed by its ancestors have
        been removed.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        # Maps a value of the split feature to the child Node for that value.
        self.tree = {}
        # Snapshot used by __repr__. `label` and `feature` are captured as of
        # construction; `tree` is the same dict object as self.tree, so
        # children added later show up in the repr.
        self.result = {'label' : self.label, 'feature' : self.feature, 'tree' : self.tree}

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child followed when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, test_features):
        """Return the label for one sample by walking down the tree.

        Parameters
        ----------
        test_features : sequence
            Feature values ordered like the columns reaching this node. The
            caller's sequence is never modified.

        Raises
        ------
        KeyError
            If the sample's value for the split feature has no child.
        IndexError
            If ``test_features`` is too short for this node's ``feature``.
        """
        if self.root is True:
            return self.label
        # `feature` is a position in the reduced column set seen at this
        # level, so the consumed value must be removed before descending.
        # Forwarding the full vector unchanged makes every descendant whose
        # column came after an ancestor's split column read one slot too
        # early.
        remaining = list(test_features)
        branch_value = remaining.pop(self.feature)
        return self.tree[branch_value].predict(remaining)

class DTree:
    """ID3-style decision-tree learner that splits on maximum information gain.

    Parameters
    ----------
    epsilon : float, default 0.1
        Minimum information gain required to split; below it the current
        subset becomes a single node labelled with its most frequent class.
    verbose : bool, default True
        When true, ``train`` prints a trace of every split and attached
        branch. Pass ``False`` to build the tree silently.
    """

    def __init__(self, epsilon=0.1, verbose=True):
        self.epsilon = epsilon
        self.verbose = verbose
        # None until fit() stores the root Node.
        self._tree = None

    def _trace(self, *parts):
        """Print a progress line when verbose tracing is enabled."""
        if self.verbose:
            print(*parts)

    @staticmethod
    def _majority_label(y):
        """Return the most frequent value of the label Series ``y``."""
        return y.value_counts().sort_values(ascending=False).index[0]

    def cal_entropy(self, datasets):
        """Return the empirical entropy (base 2) of the class labels.

        ``datasets`` is a sequence of rows whose last element is the label.
        """
        n = len(datasets)
        label_count = Counter(row[-1] for row in datasets)
        return -sum((count / n) * log(count / n, 2)
                    for count in label_count.values())

    # empirical conditional entropy
    def cal_conditional_entropy(self, datasets, axis=0):
        """Return the entropy of the labels conditioned on column ``axis``.

        Rows are grouped by their value in column ``axis``; the result is the
        size-weighted sum of each group's entropy.
        """
        n = len(datasets)
        partitions = {}
        for row in datasets:
            partitions.setdefault(row[axis], []).append(row)

        return sum((len(subset) / n) * self.cal_entropy(subset)
                   for subset in partitions.values())

    # information gain
    def info_gain(self, entropy, con_entropy):
        """Return ``entropy - con_entropy``."""
        return entropy - con_entropy

    def get_info_gain(self, datasets):
        """Return ``(feature_id, info_gain)`` for the highest-gain column.

        Every column except the last (the label) is a candidate. On ties the
        lowest column index wins, since ``max`` keeps the first maximum.
        """
        feature_count = len(datasets[0]) - 1
        empirical_entropy = self.cal_entropy(datasets)
        gains = [
            (c, self.info_gain(empirical_entropy,
                               self.cal_conditional_entropy(datasets, axis=c)))
            for c in range(feature_count)
        ]
        # best : (feature_id, feature_info_gain)
        return max(gains, key=lambda pair: pair[-1])

    def train(self, train_data):
        """Recursively build the subtree for ``train_data`` and return it.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Feature columns followed by the label in the last column.

        Returns
        -------
        Node
            A single node when the subset is pure, has no feature columns
            left, or its best information gain is below ``epsilon``;
            otherwise a split node with one child per observed value of the
            chosen feature.

        Notes
        -----
        The chosen column is dropped before recursing, so the ``feature``
        index stored on each split node is the column's position in the
        frame passed to *that* call, not in the original frame.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        # 1. all samples share one class -> single node with that class
        if len(y_train.value_counts()) == 1:
            return Node(root = True, label = y_train.iloc[0])

        # 2. no features left -> single node with the most frequent class
        if len(features) == 0:
            return Node(root = True, label = self._majority_label(y_train))

        # 3. pick the feature with the largest information gain
        max_feature_id, max_info_gain = self.get_info_gain(np.array(train_data))
        max_feature_name = features[max_feature_id]

        # 4. gain below the threshold -> single node with the most frequent class
        if max_info_gain < self.epsilon:
            return Node(root = True, label = self._majority_label(y_train))

        # 5. split on every observed value of the chosen feature
        node_tree = Node(root = False, feature_name = max_feature_name,
                         feature = max_feature_id)
        self._trace(node_tree.feature_name, node_tree.feature)

        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            # Rows matching this value, with the consumed column removed.
            sub_train_df = train_data[train_data[max_feature_name] == f].drop(
                [max_feature_name], axis=1)

            self._trace(max_feature_name, f)
            # 6. build the child subtree recursively
            sub_tree = self.train(sub_train_df)
            self._trace('add_node', max_feature_name, f)
            node_tree.add_node(f, sub_tree)

        return node_tree

    def fit(self, train_data):
        """Build the tree from ``train_data``, store it, and return its root."""
        self._tree = self.train(train_data)

        return self._tree

    def predict(self, x_test):
        """Return the predicted label for the single sample ``x_test``.

        Raises
        ------
        AttributeError
            If called before ``fit``. The exception type matches what the
            unfitted object raised before; only the message is now explicit.
        """
        if self._tree is None:
            raise AttributeError(
                'DTree has not been fitted yet; call fit() before predict()')
        return self._tree.predict(x_test)

In [144]:
# Build the sample table and wrap it in a DataFrame whose last column
# ('type') is the class label that DTree.train reads via iloc[:, -1].
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns = labels)
# epsilon is left at its default (0.1).
dt = DTree()
# fit() returns the root Node; `tree` is reused by the cells below.
tree = dt.fit(data_df)

own house 2
own house no
have job 1
have job no
add_node have job no
have job yes
add_node have job yes
add_node own house no
own house yes
add_node own house yes


In [145]:
# Display the fitted tree; the text shown comes from Node.__repr__.
tree

{'tree': {'yes': {'tree': {}, 'label': 'yes', 'feature': None}, 'no': {'tree': {'yes': {'tree': {}, 'label': 'yes', 'feature': None}, 'no': {'tree': {}, 'label': 'no', 'feature': None}}, 'label': None, 'feature': 1}}, 'label': None, 'feature': 2}

In [152]:
# Classify one sample, given as feature values in training-column order
# (age, have job, own house, credit situation) without the label.
tree.predict(['old', 'yes', 'no', 'intermediate'])

'yes'