In [15]:
from sklearn import datasets
import pandas as pd
import math

In [16]:
iris = datasets.load_iris()

In [17]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [18]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [19]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df.head(5)

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a


In [20]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [21]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

y = pd.DataFrame(iris.target)
unused_features = set(df.columns)
build_tree(df, y, unused_features)


In [22]:
#Calculating Entropy
def calculate_entropy(y, base):
    
    d={}
    for i in y[0]:
        d[i] = d.get(i,0) + 1
    
    possible_outputs = set(y[0])
    total = len(y[0])
    entropy = 0
    
    for output in possible_outputs:
        p = d[output]/total
        e = (-1) * p * math.log(p, base)
        entropy = entropy + e
    
    return entropy

In [23]:
#Calculating Split Info
def calculate_split_info(df, sf):
    
    d = {}
    for i in df[sf]:
        d[i] = d.get(i,0) + 1
    
    possible_values = set(df[sf])
    total = len(df[sf])
    split_info = 0
    
    for i in possible_values:
        p = d[i]/total
        si = (-1) * p * math.log(p,10)
        split_info = split_info + si
        
    return split_info
    

In [24]:
#Calculating Info Gain
def calculate_info_gain(df, y, sf):
    
    d = {}
    for i in df[sf]:
        d[i] = d.get(i,0) + 1
    
    possible_values = set(df[sf])
    total = len(df[sf])
    info_f = 0
    
    for val in possible_values:
        entropy = calculate_entropy(y[df[sf] == val], 10)
        p = (d[val]/total) * entropy
        info_f = info_f + p
    
    info_orig = calculate_entropy(y, 10)
    info_gain = info_orig - info_f
    
    return info_gain

In [25]:
#Calculating Gain Ratio
def calculate_gain_ratio(df, y, sf):
    gain_ratio = calculate_info_gain(df, y, sf)/calculate_split_info(df, sf)
    return gain_ratio

In [26]:
def build_tree(df, y, unused_features, level):
    
    #Printing the Current Level
    print('Level:',level)
    level = level+1
    
    #Calculating the Number of Each Possible Output
    dy = {}
    for i in y[0]:
        dy[i] = dy.get(i,0) + 1
    
    #Printing the Number of Each Possible Output
    for k, v in dy.items():
        print("Count of", k, "=", v)
    
    #Printing the Current Entropy
    print("Current Entropy  is =", calculate_entropy(y, 2))

    #base case
    # 1. y contains only one distinct value
    if len(set(y[0]))==1:
        print('Reached leaf Node')
        return
    
    # 2. unused is empty
    if len(unused_features)==0:
        print('No Features Feft')
        return
    
    possible_target = set(y[0]) 
    
    best_feature = ""
    best_gain_ratio = -10000
    
    #Iterating over each Unused Feature
    for f in unused_features:
        #Calculating the Gain Ratio for Each Feature
        gain_ratio_f = calculate_gain_ratio(df, y, f)
        
        # Finding the Best Feature
        if gain_ratio_f > best_gain_ratio:
            best_gain_ratio = gain_ratio_f
            best_feature = f
        
    
    # Printing the Splitting Feature
    print("Splitting on feature", best_feature, "with gain ratio", best_gain_ratio)
    
    # remove best feature from unused features
    #unused_features.discard(best_feature)
    unused_features_new = unused_features - {best_feature}
    
    # loop over possible values of best feature
    possible_values = set(df[best_feature])
    for j in possible_values:
        print()
        # call build tree recursively
        build_tree(df[df[best_feature]==j], y[df[best_feature]==j], unused_features_new, level)
    
    

## Descision Tree- Iris dataset

In [27]:
y = pd.DataFrame(iris.target)
unused_features = set(df.columns)
level = 0
build_tree(df, y, unused_features, level)

Level: 0
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy  is = 1.584962500721156
Splitting on feature pw_labeled with gain ratio 0.6996382036222092

Level: 1
Count of 1 = 10
Current Entropy  is = 0.0
Reached leaf Node

Level: 1
Count of 0 = 50
Current Entropy  is = 0.0
Reached leaf Node

Level: 1
Count of 2 = 34
Current Entropy  is = 0.0
Reached leaf Node

Level: 1
Count of 1 = 40
Count of 2 = 16
Current Entropy  is = 0.863120568566631
Splitting on feature pl_labeled with gain ratio 0.43340994956210654

Level: 2
Count of 1 = 1
Current Entropy  is = 0.0
Reached leaf Node

Level: 2
Count of 2 = 8
Current Entropy  is = 0.0
Reached leaf Node

Level: 2
Count of 1 = 39
Count of 2 = 8
Current Entropy  is = 0.6581912658132185
Splitting on feature sl_labeled with gain ratio 0.12674503775809323

Level: 3
Count of 1 = 14
Current Entropy  is = 0.0
Reached leaf Node

Level: 3
Count of 2 = 1
Current Entropy  is = 0.0
Reached leaf Node

Level: 3
Count of 1 = 2
Current Entropy  is = 0.

## Descision Tree- OR

In [28]:
x = [[1,1],[0,1],[1,0],[0,0]]
output = [1,1,1,0]

df_or = pd.DataFrame(x, columns=['X1','X2'])
unused_features_or = set(['X1','X2'])
y_or = pd.DataFrame(output)
level = 0

build_tree(df_or, y_or, unused_features_or, level)

Level: 0
Count of 1 = 3
Count of 0 = 1
Current Entropy  is = 0.8112781244591328
Splitting on feature X1 with gain ratio 0.3112781244591329

Level: 1
Count of 1 = 1
Count of 0 = 1
Current Entropy  is = 1.0
Splitting on feature X2 with gain ratio 1.0

Level: 2
Count of 0 = 1
Current Entropy  is = 0.0
Reached leaf Node

Level: 2
Count of 1 = 1
Current Entropy  is = 0.0
Reached leaf Node

Level: 1
Count of 1 = 2
Current Entropy  is = 0.0
Reached leaf Node
