# #      DECISION TREE IMPLEMENTATION ON IRIS DATASET

In [23]:
class bt:
    """Node of a binary decision tree.

    Each node stores the values handed to the constructor unchanged:

    * ``lvl``           - depth label of the node
    * ``entropy``       - entropy value recorded for the node
    * ``split_feature`` - feature (or message) describing the split
    * ``gain``          - gain value recorded for the split
    * ``cls_name``      - class name associated with the node

    ``left`` and ``right`` are the child links; both start as ``None`` and
    are attached by the code that builds the tree.
    """

    def __init__(self, lvl, entropy, split_feature, gain, cls_name):
        # Payload supplied by the caller.
        self.split_feature, self.lvl = split_feature, lvl
        self.entropy, self.gain = entropy, gain
        self.cls_name = cls_name
        # A new node has no children yet.
        self.right = self.left = None
    
def printb1(root, s):
    """Print the subtree under ``root`` in pre-order (node, left, right).

    ``s`` is a heading printed before the node's own fields.  Children are
    printed recursively with the headings "Left Node" and "Right Node".
    A ``None`` root prints nothing.
    """
    if root is not None:
        print(s)
        # One "label value" line per recorded field, in a fixed order.
        rows = (
            ("Level :", root.lvl),
            ("Entropy :", root.entropy),
            ("Split Feature :", root.split_feature),
            ("Gain Ratio :", root.gain),
            ("Class Name :", root.cls_name),
        )
        for label, value in rows:
            print(label, value)

        # Blank line separates consecutive nodes in the output.
        print()

        printb1(root.left, "Left Node")
        printb1(root.right, "Right Node")
        

In [24]:
import pandas as pd
import numpy as np
from sklearn import datasets
import math as ma
import sys
# Load the Iris dataset through scikit-learn's dataset helpers.
iris = datasets.load_iris()

# Feature table with named columns; the class labels live in their own frame.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
bf = pd.DataFrame(iris.target)
# Bare expression: shows the column index when run as a notebook cell.
df.columns


Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [25]:
def count(y):
    """Count how many labels in ``y`` carry each of the class codes 0, 1, 2.

    Parameters
    ----------
    y : array-like
        Class labels (list, NumPy array, pandas object, ...).  Every element
        is compared against the integer codes, so a single-column table is
        counted the same way as a flat sequence.

    Returns
    -------
    tuple
        ``(no_setosa, no_virsicolor, no_virginica)`` - the number of
        elements equal to 0, 1 and 2 respectively (NumPy integers).
    """
    # Convert once; the counts below only read from this array.
    labels = np.asarray(y)
    no_setosa = (labels == 0).sum()
    no_virsicolor = (labels == 1).sum()
    no_virginica = (labels == 2).sum()
    return no_setosa, no_virsicolor, no_virginica

In [26]:
def entropy(y):
    """Return the base-2 Shannon entropy of the class labels in ``y``.

    The class proportions are taken from the distinct label values that
    actually occur in ``y`` (visited in sorted order), so for labels coded
    0/1/2 the result is the usual three-class entropy.

    Parameters
    ----------
    y : array-like
        Class labels; a single-column table is treated like a flat sequence.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the classes present.  ``0`` is returned
        for an empty input and for an input containing a single class.
    """
    labels = np.asarray(y).ravel()
    length = len(labels)
    if length == 0:
        # Nothing to measure; without this guard the proportions below
        # would be 0/0.
        return 0

    _, class_counts = np.unique(labels, return_counts=True)

    final_entropy = 0
    for class_count in class_counts:
        # Every count is at least 1, so p > 0 and the logarithm is defined.
        p = class_count / length
        final_entropy += p * ma.log(p, 2)

    if not final_entropy:
        # Pure node: return the zero as-is rather than negating it to -0.0.
        return final_entropy
    return -1 * final_entropy

In [27]:
def split(y_split1, y_split2, y):
    """Return the split information of dividing ``y`` into two parts.

    Each part contributes ``-f * log2(f)`` where ``f`` is the part's length
    divided by ``len(y)``; a part with fraction 0 contributes nothing.
    Only the lengths of the three arguments are used.
    """
    total = len(y)
    fractions = (len(y_split1) / total, len(y_split2) / total)

    acc = 0
    for frac in fractions:
        if frac != 0:
            acc += frac * ma.log(frac, 2)
    return -1 * acc

In [28]:
def gain(info_gain, split_info):
    """Return the gain ratio: ``info_gain`` divided by ``split_info``."""
    return info_gain / split_info

In [37]:
def feature_split(x, y, sf):
    """Find the threshold on feature ``sf`` with the highest gain ratio.

    Candidate thresholds are the midpoints between consecutive *sorted,
    distinct* values of the feature.  Midpoints of neighbouring rows in
    storage order would miss real cut points when the rows are not sorted
    by this feature, and would also produce thresholds that leave one side
    empty.

    Parameters
    ----------
    x : table indexable by column key and by boolean mask
        Feature table (used as ``x[sf]``).
    y : table indexable by boolean mask
        Class labels, filtered with masks built from ``x``.
        # assumes y is row-aligned with x - confirm against the caller
    sf : column key
        Feature to evaluate.

    Returns
    -------
    tuple
        ``(max_g, split_value)`` - the best gain ratio found and its
        threshold.  ``(-1, 0)`` is returned when no candidate scores above
        -1, e.g. when the feature has fewer than two distinct values.
    """
    # np.unique returns the distinct values already sorted.
    distinct_values = np.unique(np.array(x[sf]))
    max_g = -1
    split_value = 0

    # Loop invariants: the parent entropy and size do not depend on the threshold.
    initial_entropy = entropy(y)
    total = len(y)

    for lower, upper in zip(distinct_values[:-1], distinct_values[1:]):
        mid = (lower + upper) / 2

        # Build each mask once and reuse it.
        above = x[sf] > mid
        not_above = x[sf] <= mid
        y_split1 = y[above]
        y_split2 = y[not_above]

        # Size-weighted entropy of the two children.
        final_entropy = (len(y_split1) / total) * entropy(y_split1)
        final_entropy += (len(y_split2) / total) * entropy(y_split2)

        info_gain = initial_entropy - final_entropy
        split_info = split(y_split1, y_split2, y)
        temp_g = gain(info_gain, split_info)

        # Strict comparison keeps the first (lowest) threshold among ties.
        if temp_g > max_g:
            max_g = temp_g
            split_value = mid

    return max_g, split_value
            
        
        
        

In [38]:
def dt(x, y, features, lvl):
    """Recursively build a binary decision tree and return its root node.

    At each node the feature/threshold pair with the highest gain ratio
    (as reported by ``feature_split``) is chosen; rows with a value above
    the threshold go to the left child, the rest to the right child.

    Parameters
    ----------
    x : table indexable by column key and by boolean mask
        Feature table for the rows reaching this node.
    y : table indexable by boolean mask
        Class labels for those rows, coded 0/1/2 as counted by ``count``.
    features : iterable of column keys
        Features considered for splitting; the same collection is passed
        unchanged to both children.
    lvl : int
        Depth label stored on the created node (children get ``lvl + 1``).

    Returns
    -------
    bt
        The node for this subset.  A leaf is returned when the subset holds
        a single class, when ``features`` is empty, or when no feature
        offers a usable split.
    """
    no_of_features = len(features)
    length_of_data = len(x)
    no_setosa, no_virsicolor, no_virginica = count(y)

    # Majority class; ties resolve in the order Setosa, Virsicolor, Virginica.
    max_cls = max(no_setosa, no_virsicolor, no_virginica)
    if no_setosa == max_cls:
        cls_name = "Setosa"
    elif no_virsicolor == max_cls:
        cls_name = "Virsicolor"
    else:
        cls_name = "Virginica"

    is_pure = length_of_data in (no_setosa, no_virginica, no_virsicolor)
    if is_pure or no_of_features == 0:
        return bt(lvl, entropy(y), "Can't split based on feature", 0, cls_name)

    max_g = -1
    split_value = 0
    split_feature = None
    for i in features:
        temp_g, temp_split_value = feature_split(x, y, i)
        if temp_g > max_g:
            max_g = temp_g
            split_value = temp_split_value
            split_feature = i

    if split_feature is None:
        # No feature scored above the initial -1 (for example, all rows
        # share identical feature values but differ in label).  Make this
        # node a leaf instead of failing on an unassigned split feature.
        return bt(lvl, entropy(y), "Can't split based on feature", 0, cls_name)

    root = bt(lvl, entropy(y), split_feature, max_g, cls_name)

    # Build each mask once and apply it to both the features and the labels.
    above = x[split_feature] > split_value
    not_above = x[split_feature] <= split_value

    root.left = dt(x[above], y[above], features, lvl + 1)
    root.right = dt(x[not_above], y[not_above], features, lvl + 1)

    return root
    
        
    

In [46]:
# Grow the tree on the full dataset, starting at depth 0, then dump it.
root = dt(x=df, y=bf, features=df.columns, lvl=0)
printb1(root, "Root Node")

  """
  
  import sys


Root Node
Level : 0
Entropy : 1.584962500721156
Split Feature : petal width (cm)
Gain Ratio : 0.9999999999999999
Class Name : Setosa

Left Node
Level : 1
Entropy : 1.0
Split Feature : petal width (cm)
Gain Ratio : 0.6933647985912662
Class Name : Virsicolor

Left Node
Level : 2
Entropy : 0.15109697051711368
Split Feature : petal length (cm)
Gain Ratio : 0.2622302372762406
Class Name : Virginica

Left Node
Level : 3
Entropy : 0.0
Split Feature : Can't split based on feature
Gain Ratio : 0
Class Name : Virginica

Right Node
Level : 3
Entropy : 0.9182958340544896
Split Feature : sepal width (cm)
Gain Ratio : 1.0
Class Name : Virginica

Left Node
Level : 4
Entropy : 0.0
Split Feature : Can't split based on feature
Gain Ratio : 0
Class Name : Virsicolor

Right Node
Level : 4
Entropy : 0.0
Split Feature : Can't split based on feature
Gain Ratio : 0
Class Name : Virginica

Right Node
Level : 2
Entropy : 0.44506485705083865
Split Feature : petal length (cm)
Gain Ratio : 0.6066178220203009
Class

In [48]:
def cost(root):
    """Return the cost of a node, which is the ``gain`` value stored on it."""
    node_gain = root.gain
    return node_gain

In [49]:
def pruning(root, lamba, leaves_count):
    """Prune the tree under ``root`` in place using a cost comparison.

    For each existing child, the child's subtree is pruned first; then the
    child itself is detached from ``root`` when
    ``cost(child) + lamba * (leaves_count - 1)`` is smaller than
    ``cost(root) + lamba * leaves_count``.  Each child cost is printed.

    Parameters
    ----------
    root : node with ``left`` / ``right`` / ``gain`` attributes, or None
        Subtree to prune; ``None`` and leaf nodes are left untouched.
    lamba : number
        Weight applied to the leaf count in the cost.
    leaves_count : int
        Leaf count used in the cost terms.

    Returns
    -------
    None
        The tree is modified in place.

    Notes
    -----
    ``leaves_count`` is a local value: the decrement made when a child is
    removed affects only the remaining work in this call (including the
    value passed down to the right child), not the caller.
    NOTE(review): the count drops by one per removed child regardless of
    how many leaves that child's subtree held - confirm this is intended.
    """
    if root is None:
        return
    if root.left is None and root.right is None:
        return

    # Does not depend on the children, so it is computed once up front.
    initial_cost = cost(root) + (lamba * leaves_count)

    # A node may have only one child (e.g. on a tree that was already
    # partially pruned); only evaluate children that exist.
    if root.left is not None:
        pruning(root.left, lamba, leaves_count)
        child_cost = cost(root.left) + (lamba * (leaves_count - 1))
        print(child_cost)
        if child_cost < initial_cost:
            leaves_count -= 1
            root.left = None

    if root.right is not None:
        pruning(root.right, lamba, leaves_count)
        child_cost = cost(root.right) + (lamba * (leaves_count - 1))
        print(child_cost)
        if child_cost < initial_cost:
            leaves_count -= 1
            root.right = None
        
        
    

In [50]:
def no_of_leaves(root):
    """Return the number of leaves (nodes with no children) under ``root``.

    A ``None`` root counts as zero leaves.
    """
    total = 0
    # Explicit stack of subtrees still to be inspected.
    pending = [root]
    while pending:
        node = pending.pop()
        if node is None:
            continue
        if node.left is None and node.right is None:
            total += 1
        else:
            pending.append(node.left)
            pending.append(node.right)
    return total
        

In [51]:
# Prune the tree in place with a leaf weight of 1, then dump what remains.
leaves_count = no_of_leaves(root)
pruning(root, lamba=1, leaves_count=leaves_count)
printb1(root, "Root Node")

Root Node
Level : 0
Entropy : 1.584962500721156
Split Feature : petal width (cm)
Gain Ratio : 0.9999999999999999
Class Name : Setosa

