In [36]:
from sklearn import datasets
import pandas as pd
import math

In [3]:
# Load scikit-learn's iris dataset; later cells read iris.data (features) and iris.target (outputs).
iris = datasets.load_iris()

In [11]:
# Wrap the iris feature matrix in a DataFrame with short column names.
# NOTE(review): the names presumably abbreviate sepal/petal length/width - confirm against iris.feature_names.
df = pd.DataFrame(iris.data, columns=["sl", "sw", 'pl', 'pw'])

In [13]:
#Function to find label for a value
#if MIN_Value <= val < (MIN_Value + Mean_Value)/2 then it is assigned label a
#if (MIN_Value + Mean_Value)/2 <= val < Mean_Value then it is assigned label b
#if Mean_Value <= val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <= val <= MAX_Value then it is assigned label d

def label(val, *boundaries):
    """Map a value to one of the bin labels 'a', 'b', 'c' or 'd'.

    The first three entries of ``boundaries`` are checked in order; the value
    receives the label of the first boundary it is strictly below, and 'd'
    when it is below none of them.
    """
    for position, tag in enumerate('abc'):
        # Index lazily so a boundary is only looked up when it is actually needed.
        if val < boundaries[position]:
            return tag
    return 'd'

#Function to convert continuous data into labelled data
#There are 4 labels - a, b, c, d
def toLabel(df, old_feature_name):
    """Return column ``old_feature_name`` of ``df`` with each value replaced by a bin label.

    Three cut points are derived from the column: the midpoint of its minimum
    and mean, the mean itself, and the midpoint of its mean and maximum. Every
    value is then passed to ``label`` together with those cut points.
    """
    column = df[old_feature_name]
    mean_value = column.mean()
    lower_cut = (column.min() + mean_value)/2
    upper_cut = (column.max() + mean_value)/2
    return column.apply(label, args=(lower_cut, mean_value, upper_cut))

In [14]:
#Convert all columns to labelled data
for feature in ['sl', 'sw', 'pl', 'pw']:
    # Each labelled column is stored next to its source under "<name>_labeled".
    df[feature + '_labeled'] = toLabel(df, feature)
df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a


In [15]:
# Keep only the labelled columns. Rebinding df (rather than inplace=True) together with
# errors='ignore' lets this cell be re-run after the raw columns are already gone,
# instead of raising KeyError on the second execution.
df = df.drop(columns=['sl', 'sw', 'pl', 'pw'], errors='ignore')

In [16]:
# Show the distinct labels present in the 'sl_labeled' column.
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [67]:
def cal_entropy(y, base):
    """Compute the entropy of the values in ``y[0]`` using logarithms to ``base``.

    Every distinct value contributes ``-p * log(p)``, where ``p`` is the
    fraction of entries of ``y[0]`` equal to that value.
    """
    values = y[0]

    # Tally how often each distinct value occurs.
    counts = {}
    for value in values:
        if value in counts:
            counts[value] += 1
        else:
            counts[value] = 1

    n_values = len(values)
    result = 0
    for value in set(values):
        share = counts[value]/n_values
        result = result + (-1 * share * math.log(share, base))
    return result

In [65]:
def cal_info_gain(df, y, f):
    """Compute the information gain (logarithm base 10) of splitting ``y`` on feature ``f``.

    The gain is the entropy of ``y`` minus the sum, over the distinct values of
    ``df[f]``, of the entropy of the matching rows of ``y`` weighted by the
    fraction of rows holding that value.
    """
    feature_values = df[f]
    n_rows = len(feature_values)

    # Number of rows holding each distinct feature value.
    counts = {}
    for value in feature_values:
        counts[value] = counts.get(value, 0) + 1

    weighted_entropy = 0
    for value in set(feature_values):
        subset_entropy = cal_entropy(y[feature_values == value], 10)
        weighted_entropy = weighted_entropy + (counts[value]/n_rows) * subset_entropy

    entropy_before_split = cal_entropy(y, 10)
    return entropy_before_split - weighted_entropy

In [66]:
def cal_split_info(df, f):
    """Compute the split information (logarithm base 10) of feature ``f`` in ``df``.

    This is the sum of ``-p * log(p, 10)`` over the distinct values of
    ``df[f]``, with ``p`` the fraction of rows holding that value.
    """
    column = df[f]
    n_rows = len(column)

    # Number of rows holding each distinct value of the feature.
    occurrences = {}
    for value in column:
        occurrences[value] = occurrences.get(value, 0) + 1

    total_info = 0
    for value in set(column):
        fraction = occurrences[value]/n_rows
        total_info = total_info + (-1 * fraction * math.log(fraction, 10))
    return total_info

In [42]:
def cal_gain_ratio(df, y, f):
    """Compute the gain ratio of feature ``f``: information gain divided by split information.

    Args:
        df: Frame holding the feature columns.
        y: Target frame, passed through to ``cal_info_gain``.
        f: Name of the feature column to evaluate.

    Returns:
        The gain ratio, or 0.0 when the split information is zero.
    """
    split_info = cal_split_info(df, f)
    # Split information is zero when all rows of df share a single value of f.
    # Such a feature cannot separate the rows, and dividing by it would raise
    # ZeroDivisionError, so report a gain ratio of zero instead.
    if split_info == 0:
        return 0.0
    return cal_info_gain(df, y, f)/split_info

In [71]:
def build_tree(df, y, unused_features, level):
    """Recursively print a decision tree whose splits are chosen by gain ratio.

    For the node described by ``df``/``y`` this prints the level, the count of
    each output value and the base-2 entropy. Unless the node is pure or no
    features remain, it then prints the feature with the highest gain ratio and
    recurses once for every distinct value of that feature.

    Args:
        df: Frame of labelled feature columns for the rows at this node.
        y: Frame whose column 0 holds the outputs for the same rows.
        unused_features: Collection of feature names still available for
            splitting at this node. It is not modified.
        level: Depth printed for this node; children are called with level + 1.

    Returns:
        None. The tree is only printed.
    """
    #Printing Level
    print("Level:", level)
    level += 1

    #Printing count of different outputs
    output_counts = {}
    for output in y[0]:
        output_counts[output] = output_counts.get(output, 0) + 1
    for output in output_counts:
        print("No of", output, "=", output_counts[output])

    #Calculating entropy
    print("Entropy =", cal_entropy(y,2))

    #base case
    # 1. Pure Node
    if (len(set(y[0])) == 1):
        print("Leaf Node")
        return
    # 2. No Features
    if (len(unused_features) == 0):
        print("No Features Left")
        return

    # Pick the feature with the highest gain ratio; on ties the first one seen wins.
    best_feature = ""
    best_gain_ratio = -math.inf
    for f in unused_features:
        current_gain_ratio = cal_gain_ratio(df,y,f)
        if (current_gain_ratio > best_gain_ratio):
            best_feature = f
            best_gain_ratio = current_gain_ratio

    print("Best Feature", best_feature)
    print("Best Gain Ratio", best_gain_ratio)

    # Build a fresh collection for the children instead of removing from the
    # one we were given. Removing in place would share a single collection
    # between all sibling branches (and the caller), so features consumed deep
    # inside one subtree would wrongly be unavailable in the next subtree.
    remaining_features = [f for f in unused_features if f != best_feature]

    # loop over possible values of best feature
    feature_column = df[best_feature]
    for value in set(feature_column):
        print()
        # call build tree recursively on the rows holding this value
        mask = feature_column == value
        build_tree(df[mask], y[mask], remaining_features, level)

In [72]:
# Put the targets in a DataFrame so that build_tree and cal_entropy can read them as y[0].
y = pd.DataFrame(iris.target)
# Every column currently in df is a candidate split feature.
unused_features = set(df.columns)
# The root of the tree is printed as level 0.
level = 0
build_tree(df, y, unused_features, level)

Level: 0
No of 0 = 50
No of 1 = 50
No of 2 = 50
Entropy = 1.584962500721156
Best Feature pw_labeled
Best Gain Ratio 0.6996382036222092

Level: 1
No of 1 = 10
Entropy = 0.0
Leaf Node

Level: 1
No of 0 = 50
Entropy = 0.0
Leaf Node

Level: 1
No of 1 = 40
No of 2 = 16
Entropy = 0.863120568566631
Best Feature pl_labeled
Best Gain Ratio 0.43340994956210654

Level: 2
No of 1 = 1
Entropy = 0.0
Leaf Node

Level: 2
No of 1 = 39
No of 2 = 8
Entropy = 0.6581912658132185
Best Feature sl_labeled
Best Gain Ratio 0.12674503775809323

Level: 3
No of 1 = 14
Entropy = 0.0
Leaf Node

Level: 3
No of 2 = 1
Entropy = 0.0
Leaf Node

Level: 3
No of 1 = 23
No of 2 = 7
Entropy = 0.783776947484701
Best Feature sw_labeled
Best Gain Ratio 0.07092036405148884

Level: 4
No of 1 = 14
No of 2 = 6
Entropy = 0.8812908992306927
No Features Left

Level: 4
No of 1 = 3
No of 2 = 1
Entropy = 0.8112781244591328
No Features Left

Level: 4
No of 1 = 6
Entropy = 0.0
Leaf Node

Level: 3
No of 1 = 2
Entropy = 0.0
Leaf Node

Level: 