In [1]:
from sklearn import datasets
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
iris=datasets.load_iris()
df=pd.DataFrame(iris.data)
df.columns=['sl','sw','pl','pw']
df['flower_type']=iris.target


In [3]:
def label(val,*boundary):
    if(val<boundary[0]):
        return 'a'
    if(val<boundary[1]):
        return 'b'
    if(val<boundary[2]):
        return 'c'
    else:
        return 'd'
    
def tolabel(x,feature):
#     print(type(label))
    minimum=x[feature].min()
    second=x[feature].mean()
    maximum=x[feature].max()
    first=(minimum+second)/2
    third=(maximum+second)/2
    
    return x[feature].apply(label,args=(first,second,third))
# df[0]

In [4]:
df['sl_labelled']=tolabel(df,'sl')
df['sw_labelled']=tolabel(df,'sw')
df['pl_labelled']=tolabel(df,'pl')
df['pw_labelled']=tolabel(df,'pw')

In [5]:
df.drop(['sl','sw','pl','pw'],axis=1,inplace=True)
df

Unnamed: 0,flower_type,sl_labelled,sw_labelled,pl_labelled,pw_labelled
0,0,b,c,a,a
1,0,a,b,a,a
2,0,a,c,a,a
3,0,a,c,a,a
4,0,a,c,a,a
...,...,...,...,...,...
145,2,c,b,c,d
146,2,c,a,c,d
147,2,c,b,c,d
148,2,c,c,d,d


In [6]:
def entrophy(df):
    en=0
    categories = np.unique(df['flower_type'])
    y=df['flower_type']
    # print(y)
    
    for cat in categories:
        p_cat = (y==cat).sum()/y.size
        if p_cat != 0:
            en+= -p_cat*math.log2(p_cat)
    # print(en)
    return en

In [9]:
def buildtree(df,y,unused_feature,level):
    if(len(unused_feature)==0):
        return
    best_feature=''
    maximum=0
    for f in unused_feature:
        entf=entrophy(df)
        possible_val=set(df[f])
        ent=0
        for i in possible_val:
            #calulating sum of weighted entrophy
            ent += (len(df[df[f] == i])/df.size)*entrophy(df[df[f] == i])
        #calculating information gain
        info_g=entf-ent
        if(maximum<=info_g):
            maximum=info_g
            best_feature=f
#             print("f",f)
    entf=entrophy(df)
    print("Level",level)
    y=df['flower_type']
    classes=set(df['flower_type'])
#     print(classes)
    for i in classes:
        print("count of ",i," = ",(y==i).sum())
        
    print("Current Entrophy is =",entf)
    print("splitting on Feature ", best_feature," information gain ",maximum)
    
    label_b=set(df[best_feature])
#     print(df)
    unused_feature.remove(best_feature)
#     print(label_b)
    for i in label_b:
#         print(df[df[best_feature] == i])
        buildtree(df[df[best_feature] == i],y,unused_feature,level+1)

In [10]:
y=pd.DataFrame(iris.target)
unused_feature=set(df.columns)
unused_feature.remove('flower_type')
level=0
buildtree(df,y,unused_feature,level)

Level 0
count of  0  =  50
count of  1  =  50
count of  2  =  50
Current Entrophy is = 1.584962500721156
splitting on Feature  pw_labelled  information gain  1.5205161649348475
Level 1
count of  2  =  34
Current Entrophy is = 0.0
splitting on Feature  sw_labelled  information gain  0.0
Level 2
count of  2  =  2
Current Entrophy is = 0.0
splitting on Feature  pl_labelled  information gain  0.0
Level 3
count of  2  =  2
Current Entrophy is = 0.0
splitting on Feature  sl_labelled  information gain  0.0
