In [60]:
import pandas as pd #for manipulating the csv data
import numpy as np #for mathematical calculation

eps = np.finfo(float).eps #the smallest representable number to avoid division by 0
from numpy import log2 as log

In [64]:
dataset = pd.read_csv("19BCE0120_id3.csv")
df = pd.DataFrame(dataset,columns=['Outlook','Temp','Humidity','Wind','Decision'])
print(df)

     Outlook  Temp Humidity    Wind Decision
0      Sunny   Hot    High     Weak       No
1      Sunny   Hot    High   Strong       No
2   Overcast   Hot    High     Weak      Yes
3       Rain  Mild    High     Weak      Yes
4       Rain  Cool   Normal    Weak      Yes
5       Rain  Cool   Normal  Strong       No
6   Overcast  Cool   Normal  Strong      Yes
7      Sunny  Mild    High     Weak       No
8      Sunny  Cool   Normal    Weak      Yes
9       Rain  Mild   Normal    Weak      Yes
10     Sunny  Mild   Normal  Strong      Yes
11  Overcast  Mild    High   Strong      Yes
12  Overcast   Hot   Normal    Weak      Yes
13      Rain   Hot    High   Strong       No


In [45]:
def find_entropy(df):
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    entropy = 0
    values = df[Class].unique()
    for value in values:
        fraction = df[Class].value_counts()[value]/len(df[Class])
        entropy += -fraction*np.log2(fraction)
    return entropy

In [46]:
def find_entropy_attribute(df,attribute):
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    target_variables = df[Class].unique()  #This gives all 'Yes' and 'No'
    variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in Temperature)
    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df[Class] ==target_variable])
            den = len(df[attribute][df[attribute]==variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        fraction2 = den/len(df)
        entropy2 += -fraction2*entropy
    return abs(entropy2)

In [47]:
def find_winner(df):
    Entropy_att = []
    IG = []
    for key in df.keys()[:-1]:
#         Entropy_att.append(find_entropy_attribute(df,key))
        IG.append(find_entropy(df)-find_entropy_attribute(df,key))
    return df.keys()[:-1][np.argmax(IG)]

In [48]:
def get_subtable(df, node,value):
    return df[df[node] == value].reset_index(drop=True)

In [65]:
def buildTree(df,tree=None): 
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    
    #Here we build our decision tree

    #Get attribute with maximum information gain
    node = find_winner(df)
    
    #Get distinct value of that attribute e.g Salary is node and Low,Med and High are values
    attValue = np.unique(df[node])
    
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree={}
        tree[node] = {}
    
   #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 

    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable['Decision'],return_counts=True)                        
        
        if len(counts)==1:#Checking purity of subset
            tree[node][value] = clValue[0]                                                    
        else:        
            tree[node][value] = buildTree(subtable) #Calling the function recursively 
                   
    return tree

In [68]:
tree = buildTree(df)
print(tree)

{'Outlook': {'Overcast': 'Yes', 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Sunny': {'Humidity': {'High ': 'No', 'Normal': 'Yes'}}}}


{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High ': 'No', 'Normal': 'Yes'}}}}
