In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math
from math import log

import sys
import pprint


# table 5.1 from book
def create_data():
    datasets = [['teen', 'no', 'no', 'intermediate', 'no'],
               ['teen', 'no', 'no', 'good', 'no'],
               ['teen', 'yes', 'no', 'good', 'yes'],
               ['teen', 'yes', 'yes', 'intermediate', 'yes'],
               ['teen', 'no', 'no', 'intermediate', 'no'],
               ['middle-age', 'no', 'no', 'intermediate', 'no'],
               ['middle-age', 'no', 'no', 'good', 'no'],
               ['middle-age', 'yes', 'yes', 'good', 'yes'],
               ['middle-age', 'no', 'yes', 'very-good', 'yes'],
               ['middle-age', 'no', 'yes', 'very-good', 'yes'],
               ['old', 'no', 'yes', 'very-good', 'yes'],
               ['old', 'no', 'yes', 'good', 'yes'],
               ['old', 'yes', 'no', 'good', 'yes'],
               ['old', 'yes', 'no', 'very-good', 'yes'],
               ['old', 'no', 'no', 'intermediate', 'no'],
               ['teen', 'no', 'no', 'intermediate', 'yes']
               ]
    labels = ['age', 'have job', 'own house', 'credit situation', 'type']
    
    return datasets, labels

In [2]:
def print_node(node, depth=0):  
    if node.splitting_feature is None:
        print(depth, (node.splitting_feature, node.splitting_point, 
                      node.class_label, len(node.label_data)))
    else:
        print(depth, (node.splitting_feature, node.splitting_point))
        for c in node.child:
            print_node(c, depth+1)

In [3]:
class Node:
    def __init__(self, splitting_feature=None, splitting_point=None, class_label=None, label_data=None):
        self.splitting_feature = splitting_feature
        self.splitting_point = splitting_point
        self.child = []
        self.class_label = class_label
        self.label_data = label_data  # store the labels of the samples
        
    def add_child(self, node):
        self.child.append(node)
        
    def predict(self, test_data):
        pass

In [4]:
class CTree:
    def __init__(self):
        # initialize the root node
        self.tree = Node()
    
    def gini_index(self, data, A, a):
        '''
        @ A: splitting vairable (feature)
        @ a: splitting point (possible value of A)
        '''
        # splitting the data set accroding to whether A == a
        D1 = data.loc[data[A] == a]
        D2 = data.loc[data[A] != a]
        
        # count the value of |C_k| respectively
        D1_label_count = {}
        for i in range(D1.shape[0]):
            label = D1.iloc[i, -1]
            if label not in D1_label_count:
                D1_label_count[label] = 0
            D1_label_count[label] += 1 
        
        D2_label_count = {}
        for i in range(D2.shape[0]):
            label = D2.iloc[i, -1]
            if label not in D2_label_count:
                D2_label_count[label] = 0
            D2_label_count[label] += 1 
        
        # calculate the gini index
        gini_D1 = (D1.shape[0] / data.shape[0]) * sum([1 - c_k/D1.shape[0] for c_k in D1_label_count.values()])
        gini_D2 = (D2.shape[0] / data.shape[0]) * sum([1 - c_k/D2.shape[0] for c_k in D2_label_count.values()])
        
        return gini_D1 + gini_D2
    
    def fit(self, data, threshold):
        self.train(data, self.tree, threshold)
    
    def train(self, data, node, threshold):
        '''
        leaf nodes: splitting_feature --> None
                    splitting_point --> None
                    child --> None
                    class_label --> not None
                    label_data --> not None
        ----------------------------------------
        others :    splitting_feature --> not None
                    splitting_point --> not None
                    child --> not None
                    class_label --> None
                    label_data --> None
        '''
        labels = data.iloc[:, -1]
        train_data = data.iloc[:, 0:-1]
        features_list = train_data.columns.values
        
        # if the number of samples less than the threshold, 
        # setting it as the leaf node and return 
        if len(data) < threshold:
            # use the most class among samples as the class label
            node.class_label = labels.value_counts().sort_values(ascending=False).index[0]
            node.label_data = labels
            return 
        
        # if all the samples are belong to the same class,
        # setting it as the leaf node and return
        if len(labels.value_counts()) == 1:
            # use the label of samples as the class label
            node.class_label = labels.iloc[0]
            node.label_data = labels
            return 
        
        # if there' no data in the data set, just return
        if train_data.empty:
            node.class_label = labels.value_counts().sort_values(ascending=False).index[0]
            node.label_data = labels
            return
        
        # initialize the gini index as positive infinity
        gini = float("inf")
        for A in features_list:
            for a in train_data[A]:
                gini_c = self.gini_index(data, A, a)
                if gini_c < gini:
                    feature = A
                    point = a
                    gini = gini_c

        node.splitting_feature = feature
        node.splitting_point = point
        node.add_child(Node())
        node.add_child(Node())
        
        D1 = data.loc[data[node.splitting_feature] == node.splitting_point].drop(node.splitting_feature, axis=1)
        D2 = data.loc[data[node.splitting_feature] != node.splitting_point].drop(node.splitting_feature, axis=1)
    
        self.train(D1, node.child[0], threshold)
        self.train(D2, node.child[1], threshold)
        
        return 
    
    
    def find_leaf(self, node, leaf):    # find all leaf nodes
        for t in node.child:
            if t.class_label is not None:
                leaf.append(t.label_data)
            else:
                for c in node.child:
                    self.find_leaf(c, leaf)
    
    
    def gini_pruning(self, leaf_nodes):
        gini = 0
        for node in leaf_nodes:
            label_count = pd.value_counts(node)
            gini_curr = 0
            for i in range(len(label_count)):
                gini_curr += math.pow((label_count[i]/len(node)), 2)
            gini += 1 - gini_curr
            
        return gini
    
    
    def g_t(self, node):
        leaf_nodes = []
        # find all the leaf nodes
        self.find_leaf(node, leaf_nodes)
        # |T_t|
        T_t = len(leaf_nodes)
        # C(T_t)
        C_T_t = self.gini_pruning(leaf_nodes)
        
        # collect data labels from child nodes
        labels = []
        for n in leaf_nodes:
            for l in n:
                labels.append(l)
        # C(t)    
        C_t = self.gini_pruning(labels)
        
        gt = (C_t - C_T_t) / (T_t - 1)
        
        return gt
        
    
    def pruning(self):
        self.alpha = float('inf')
        alpha_set = set()
        self.sub_tree = set()
        self.sub_tree.add(self.tree)
        
        while self.tree.child[0].splitting_feature != None or self.tree.child[1].splitting_feature != None:
            print(1)
            self.cut_brunch(self.tree, False)
            self.cut_brunch(self.tree, True)
            alpha_list.add(self.alpha)
            self.alpha = float('inf')
            
        self.sub_tree.add(self.tree)
        
    
    def cut_brunch(self, node, pruning):
        '''
        leaf nodes: splitting_feature --> None
                    splitting_point --> None
                    child --> None
                    class_label --> not None
                    label_data --> not None
        ----------------------------------------
        others :    splitting_feature --> not None
                    splitting_point --> not None
                    child --> not None
                    class_label --> None
                    label_data --> None
        '''
        # if is leaf node
        if node.splitting_feature is None:
            return
        # if is not leaf node
        else:     
            self.cut_brunch(node.child[0], pruning)
            self.cut_brunch(node.child[1], pruning)
            gt = self.g_t(node)
            print(2)
            
            if pruning:
                if gt == self.alpha:
                    # collect the labels from leaf nodes
                    leaf_label = []
                    self.find_leaf(node, leaf_label)
                    for node in leaf_label:
                        for label in node:
                            leaf_label.append(label)
                    label_count = Counter(leaf_label)
                    
                    # pruning
                    node.splitting_feature = None
                    node.splitting_point = None
                    node.child[0] = None
                    node.child[1] = None
                    node.label_data = leaf_label
                    node.class_label = leaf_count.most_common(1)[0][0]
                    
                    self.sub_tree.add(self.tree)
            else:
                # alpha = min(alpha, g(t))
                if gt < self.alpha:
                    self.alpha = gt   
                print(self.alpha)
        
            return 

In [5]:
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns = labels)
threshold = 0
ct = CTree()
ct.fit(data_df, threshold)
print_node(ct.tree)

0 ('own house', 'no')
1 ('have job', 'no')
2 ('age', 'teen')
3 ('credit situation', 'intermediate')
4 (None, None, 'no', 3)
4 (None, None, 'no', 1)
3 (None, None, 'no', 3)
2 (None, None, 'yes', 3)
1 (None, None, 'yes', 6)


In [6]:
ct.pruning()

1
2
-0.4444444444444444
2
-0.4444444444444444
2
-0.4444444444444444
2
-0.4444444444444444
2


KeyboardInterrupt: 

In [105]:
a = ['y','n', 'y']
pd.value_counts(a)

2

In [97]:
len(3)

TypeError: object of type 'int' has no len()