# #               DECISION TREE IMPLEMENTATION ON IRIS DATASET

In [1]:
import pandas as pd
import numpy as np
import math as ma
from sklearn.model_selection import train_test_split
from sklearn import datasets
iris=datasets.load_iris()
df=pd.DataFrame(iris.data)
df.columns=['sl','sw','pl','pw']
bf=pd.DataFrame(iris.target)
bf.shape

(150, 1)

In [2]:
features=df.columns
features

Index(['sl', 'sw', 'pl', 'pw'], dtype='object')

In [3]:
def count(y):
    #iris_count=[0,0,0]
    setosa=np.array(y[:])
    iris_setosa=(setosa==0).sum()
    virsicolor=np.array(y[:])
    iris_virsicolor=(virsicolor==1).sum()
    virginica=np.array(y[:])
    iris_virginica=(virginica==2).sum()
    return iris_setosa,iris_virsicolor,iris_virginica

In [4]:
def entropy(y):
    total=0
    no_of_setosa,no_of_virsicolor,no_of_virginica=count(y)
    length=no_of_setosa+no_of_virsicolor+no_of_virginica
    p1=(no_of_setosa/length)
    p2=(no_of_virsicolor/length)
    p3=(no_of_virginica/length)
    if(p1!=0):
        total+=(p1*ma.log(p1,2))
    if(p2!=0):
        total+=(p2*ma.log(p2,2))
    if(p3!=0):
        total+=(p3*ma.log(p3,2))
    if(not(total)):
        return total

    return -1*total
    

In [5]:
def split(y_split1,y_split2,y):
    split_info=0
    p1=(len(y_split1)/len(y))
    p2=(len(y_split2)/len(y))
    if(p1!=0):
        split_info+=(p1*ma.log(p1,2))
    if(p2!=0):
        split_info+=(p2*ma.log(p2,2))
    return -1*split_info

In [6]:
def gain(info_gain,split_info):
    value=(info_gain/split_info)
    return value

In [7]:
def feature_split(x,y,feature):
    xt=np.array(x[feature])
    temp_g=-1
    temp_split=-1
    for i in range(1,len(x)):
        split_t=(xt[i-1]+xt[i])/2
        x_split1=x[x[feature]>split_t]
        x_split2=x[x[feature]<=split_t]
        y_split1=y[x[feature]>split_t]
        y_split2=y[x[feature]<=split_t]
        final_entropy=0
        initial_entropy=entropy(y)
        entropy1=entropy(y_split1)
        entropy2=entropy(y_split2)
        final_entropy+=(entropy1*(len(y_split1)/len(y)))
        final_entropy+=(entropy2*(len(y_split2)/len(y)))
        info_gain=(initial_entropy-final_entropy)
        split_info=split(y_split1,y_split2,y)
        gain_r=gain(info_gain,split_info)
        if(gain_r>temp_g):
            temp_g=gain_r
            temp_split=split_t
    return temp_g,temp_split
            
        
        

In [8]:
def build_tree(x,y,split_features,lvl):
    no_of_features_left=len(features)
    length_of_data=len(x)
    no_of_setosa,no_of_virsicolor,no_of_virginica=count(y)
    print("Level :",lvl)
    print("Count of iris_setosa :",no_of_setosa)
    print("Count of iris_virsicolor :",no_of_virsicolor)
    print("Count of iris_virginica :",no_of_virginica)
    print("Current Entropy :",entropy(y))
    
    if(no_of_features_left==0 or no_of_setosa==length_of_data or no_of_virsicolor==length_of_data or no_of_virginica==length_of_data ):
        print("Reached Leaf Node")
        print()
        return
    else:
        max_g=-1
        split_value=0
        for feature in features:
            temp_g,temp_split_value=feature_split(x,y,feature)
            if(max_g<temp_g):
                max_g=temp_g
                split_value=temp_split_value
                split_feature=feature
        print("Splitting on feature",split_feature,"with gain ratio",max_g)
        print()
        x1=x[x[split_feature]>split_value]
        x2=x[x[split_feature]<=split_value]
        y1=y[x[split_feature]>split_value]
        y2=y[x[split_feature]<=split_value]
        
        build_tree(x1,y1,features,lvl+1)
        build_tree(x2,y2,features,lvl+1)

In [9]:
build_tree(df,bf,features,0)

Level : 0
Count of iris_setosa : 50
Count of iris_virsicolor : 50
Count of iris_virginica : 50
Current Entropy : 1.584962500721156
Splitting on feature pw with gain ratio 0.9999999999999999

Level : 1
Count of iris_setosa : 0
Count of iris_virsicolor : 50
Count of iris_virginica : 50
Current Entropy : 1.0
Splitting on feature pw with gain ratio 0.6933647985912662

Level : 2
Count of iris_setosa : 0
Count of iris_virsicolor : 1
Count of iris_virginica : 45
Current Entropy : 0.15109697051711368
Splitting on feature pl with gain ratio 0.2622302372762406

Level : 3
Count of iris_setosa : 0
Count of iris_virsicolor : 0
Count of iris_virginica : 43
Current Entropy : 0.0
Reached Leaf Node

Level : 3
Count of iris_setosa : 0
Count of iris_virsicolor : 1
Count of iris_virginica : 2
Current Entropy : 0.9182958340544896
Splitting on feature sw with gain ratio 1.0

Level : 4
Count of iris_setosa : 0
Count of iris_virsicolor : 1
Count of iris_virginica : 0
Current Entropy : 0.0
Reached Leaf Node

L

  p1=(no_of_setosa/length)
  p2=(no_of_virsicolor/length)
  p3=(no_of_virginica/length)
  p1=(no_of_setosa/length)
  p2=(no_of_virsicolor/length)
  p3=(no_of_virginica/length)
  p1=(no_of_setosa/length)
  p2=(no_of_virsicolor/length)
  p3=(no_of_virginica/length)
  p1=(no_of_setosa/length)
  p2=(no_of_virsicolor/length)
  p3=(no_of_virginica/length)


Splitting on feature pl with gain ratio 0.6066178220203009

Level : 3
Count of iris_setosa : 0
Count of iris_virsicolor : 0
Count of iris_virginica : 2
Current Entropy : 0.0
Reached Leaf Node

Level : 3
Count of iris_setosa : 0
Count of iris_virsicolor : 49
Count of iris_virginica : 3
Current Entropy : 0.31821529768323314
Splitting on feature pl with gain ratio 0.2720453440631924

Level : 4
Count of iris_setosa : 0
Count of iris_virsicolor : 2
Count of iris_virginica : 2
Current Entropy : 1.0
Splitting on feature pw with gain ratio 1.0

Level : 5
Count of iris_setosa : 0
Count of iris_virsicolor : 2
Count of iris_virginica : 0
Current Entropy : 0.0
Reached Leaf Node

Level : 5
Count of iris_setosa : 0
Count of iris_virsicolor : 0
Count of iris_virginica : 2
Current Entropy : 0.0
Reached Leaf Node

Level : 4
Count of iris_setosa : 0
Count of iris_virsicolor : 47
Count of iris_virginica : 1
Current Entropy : 0.14609425012013633
Splitting on feature pw with gain ratio 0.26298064861912657
