# Decision Tree
Implementation for ID3, C4.5 and CART alogrithm:

ID3 is based on entropy

C4.5 is based on informantion gain ratio

CART is based on gini index


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math
from math import log

import pprint

## Prepare test data

In [2]:
# Example:
def create_data():
    datasets = [['Young', 'No', 'No', 'Bad', 'No'],
               ['Young', 'No', 'No', 'Good', 'No'],
               ['Young', 'Yes', 'No', 'Good', 'Yes'],
               ['Young', 'Yes', 'Yes', 'Bad', 'Yes'],
               ['Young', 'No', 'No', 'Bad', 'No'],
               ['Elder', 'No', 'No', 'Bad', 'No'],
               ['Elder', 'No', 'No', 'Good', 'No'],
               ['Elder', 'Yes', 'Yes', 'Good', 'Yes'],
               ['Elder', 'No', 'Yes', 'Perfet', 'Yes'],
               ['Elder', 'No', 'Yes', 'Perfet', 'Yes'],
               ['Older', 'No', 'Yes', 'Perfet', 'Yes'],
               ['Older', 'No', 'Yes', 'Good', 'Yes'],
               ['Older', 'Yes', 'No', 'Good', 'Yes'],
               ['Older', 'Yes', 'No', 'Perfet', 'Yes'],
               ['Older', 'No', 'No', 'Bad', 'No'],
               ]
    labels = ['Age', 'Work', 'House', 'Load', 'Class']
    return datasets, labels

In [3]:
datasets, labels = create_data()
train_data = pd.DataFrame(datasets, columns = labels)
train_data

Unnamed: 0,Age,Work,House,Load,Class
0,Young,No,No,Bad,No
1,Young,No,No,Good,No
2,Young,Yes,No,Good,Yes
3,Young,Yes,Yes,Bad,Yes
4,Young,No,No,Bad,No
5,Elder,No,No,Bad,No
6,Elder,No,No,Good,No
7,Elder,Yes,Yes,Good,Yes
8,Elder,No,Yes,Perfet,Yes
9,Elder,No,Yes,Perfet,Yes


In [12]:
d = {'Young':1, 'Elder':2, 'Older':3, 'Bad':1, 'Good':2, 'Perfet':3, 'Yes':0, 'No':1}
data = []
for i in range(15):
    tmp = []
    t = datasets[i]
    for tt in t:
        tmp.append(d[tt])
    data.append(tmp)
        
data = np.array(data)
print(data)
print(data.shape)
X, y = data[:,:-1], data[:, -1]

[[1 1 1 1 1]
 [1 1 1 2 1]
 [1 0 1 2 0]
 [1 0 0 1 0]
 [1 1 1 1 1]
 [2 1 1 1 1]
 [2 1 1 2 1]
 [2 0 0 2 0]
 [2 1 0 3 0]
 [2 1 0 3 0]
 [3 1 0 3 0]
 [3 1 0 2 0]
 [3 0 1 2 0]
 [3 0 1 3 0]
 [3 1 1 1 1]]
(15, 5)


# Build basic function

entropy：$H(x) = -\sum_{i=1}^{n}p_i\log{p_i}$

conditional entropy: $H(X|Y)=\sum{P(X|Y)}\log{P(X|Y)}$

information gain : $g(D, A)=H(D)-H(D|A)$

information gain ratio: $g_R(D, A) = \frac{g(D,A)}{H_{A}(D)}$

gini index:$Gini(D)=\sum_{k=1}^{K}p_k\log{p_k}=1-\sum_{k=1}^{K}p_k^2$

In [28]:
# Entropy
def entropy(y):
    N = len(y)
    count = []
    for value in set(y):
        count.append(len(y[y == value]))
    count = np.array(count)
    entro = -np.sum((count / N) * (np.log2(count / N)))
    return entro

entropy(y)

0.9709505944546686

In [27]:
# Conditional entropy
def cond_entropy(X, y, cond):
    N = len(y)
    cond_X = X[: , cond]
    tmp_entro = []
    for val in set(cond_X):
        tmp_y = y[np.where(cond_X == val)]
        tmp_entro.append(len(tmp_y) / N * entropy(tmp_y))
    cond_entro = sum(tmp_entro)
    return cond_entro
    
cond_entropy(X, y, 0)

0.8879430945988998

In [30]:
# Information gain
def info_gain(X, y, cond):
    return entropy(y) - cond_entropy(X, y, cond)

In [31]:
# information gain ratio
def info_gain_ratio(X, y, cond):
    return  (entropy(y) - cond_entropy(X, y, cond))/ cond_entropy(X, y, cond)

In [35]:
# Test
info_gain(X, y, 3)

0.36298956253708536

In [38]:
def best_split(X, y, method = "info_gain"):
    """
    According the method, information gain or information gain ratio,  to calcuate the maximum information gain or ratio, then return the axis of the feature.
    """
    _, M = X.shape
    info_gains = []
    if method == "info_gain":
        split = info_gain
    elif method == "info_gain_ratio":
        split = info_gain_ratio
    else:
        print("Not valid method, only info_gain and info_gain_ratio is valid")
        return

    for i in range(M):
        tmp_gain = split(X, y, i)
        info_gains.append(tmp_gain)
    best_feature = np.argmax(info_gains)

    return best_feature

In [39]:
# test
best_split(X, y)

2

In [41]:
def majorityCnt(y):
    """
    return the majority class
    """
    unique, counts = np.unique(y, return_counts=True)
    max_idx = np.argmax(counts)
    return unique[max_idx]

majorityCnt(y)

0

# ID3 and C.5 

In [54]:
class DecisionTreeClassifier:
    """
    Implementation for ID3 and C4.5 alogrithm:
    Method:
    info_gain: ID3 is based on entropy
    info_gain_ration: C4.5 is based on informantion gain ratio 
    """
    def __init__(self, threshold, method = "info_gain"):
        self.threshold = threshold
        self.method = method

    def fit(self, X, y, labels):
        labels = labels.copy()
        M, N = X.shape
        if len(np.unique(y)) == 1:
            return y[0]

        if N == 1:
            return majorityCnt(y)

        bestSplit = best_split(X, y, method=self.method)
        bestFeatureLabel = labels[bestSplit]
        Tree = {bestFeatureLabel: {}}
        del (labels[bestSplit])

        featureValues = np.unique(X[:, bestSplit])
        for val in featureValues:
            idx = np.where(X[:, bestSplit] == val)
            sub_X = X[idx]
            sub_y = y[idx]
            sub_labels = labels
            Tree[bestFeatureLabel][val] = self.fit(sub_X, sub_y, sub_labels)

        return Tree

My_tree = DecisionTreeClassifier(threshold=0.1, method = "info_gain_ratio") 
My_tree.fit(X, y, labels)

  return  (entropy(y) - cond_entropy(X, y, cond))/ cond_entropy(X, y, cond)


{'House': {0: 0, 1: {'Work': {0: 0, 1: 1}}}}

# Sklearn for Decision Tree

In [56]:
# data
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # print(data)
    return data[:,:2], data[:,-1]

X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [59]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.tree import export_graphviz
import graphviz

In [60]:
clf = DecisionTreeClassifier()
clf.fit(data[:,:-1], data[:, -1])

In [61]:
clf.predict(np.array([1,1, 0, 1]).reshape(1, -1))

array([0])

In [65]:
tree_pic = export_graphviz(clf, out_file= "mytree.pdf")
#with open("mytree.pdf") as f:
#    dot_graph = f.read()

In [None]:
graphviz.Source(dot_graph)

# Cart Tree
to be updated