In [304]:
# Functions

# Entropy
from math import log

def get_entropy(probabilities):
    en = 0
    for p in probabilities:
        if(p == 0):
            e = 0
        else:
            e = -p * log(p, 2)
        en += e
    return en

def calc_entropy(data, attribute):
    size = len(data)
    values_counts = data[attribute].value_counts()
    p = []
    for idx, count in values_counts.items():
        p.append(count/size)
    #print(p)
    return get_entropy(p)
    
def calc_conditional_entropy(data, attribute, target):
    size = len(data)

    # all values of target
    target_values_counts = data[target].value_counts()
    #print(target_values_counts)
    target_values = []
    for idx, count in target_values_counts.items():
        #print(target_values)
        target_values.append(idx)
    #print("target_values:", target_values)
    
    conditional_entropy = 0.0
    if(attribute) != target:
            #conditional entropy
            conditional_entropy = 0.0
            
            values = data[attribute].value_counts()
            for idx, count in values.items():
                p_c = count/size
                
                #print('for condition value', idx)
                data_c = data.loc[data[attribute] ==idx,:]
                p_target_c = []
                for target_value in target_values:
                    #print('for target value', target_value)
                    data_v_c = data_c.loc[data_c[target] == target_value]
                    #print(data_v_c)
                    p_target_c.append((len(data_v_c)/size)/p_c)
                
                #print('condition', p_c)
                #print("conditional probabilities", p_target_c)
                e = get_entropy(p_target_c)
                p_entropy = p_c * e
                conditional_entropy += p_entropy

    return conditional_entropy

def cal_conditional_entropies(data, target):
    size = len(data)
    attributes = data.columns
    entropies = {}
    for attribute in attributes:
        if(attribute) != target:
            entropy = calc_conditional_entropy(data, attribute, target)
            entropies[attribute] = entropy
    return entropies

def calc_target_entropy(data, target):
    size = len(data)

    # all values of target
    target_values_counts = data[target].value_counts()
    #print(target_values_counts)
    p = []
    for idx, count in target_values_counts.items():
        p.append(count/size)
    return get_entropy(p)

def get_gains_feature(entropy_value, conditional_entropies):
    gains = {}
    for key, value in conditional_entropies.items():
        gains[key] = entropy_value - value
    
    max_gain = 0
    feature = ""
    for key,gain in gains.items():
        if(max_gain < gain):
            max_gain = gain
            feature = key
    return gains, feature

def get_selected_feature(data, target):
    conditional_entropies = cal_conditional_entropies(data,target)
    entropy_value = calc_target_entropy(data,target)
    gains, selected_feature = get_gains_feature(entropy_value, conditional_entropies)
    print('gains:',gains)
    return selected_feature

# get entropies, if entropy ==0 , target is done for corresponding branch
def id3_fit(data, target):
    root = Node()
    selected_feature = get_selected_feature(data, target)
    print('selected_feature:', selected_feature)
    root.name = selected_feature

    children = []
    root.children = children

    selected_values = data[selected_feature].value_counts()
    for value, count in selected_values.items():
        #print('value:', value)
        new_data = data.loc[data[selected_feature]==value,:]

        entropy_value = calc_target_entropy(new_data,target)
        #print(entropy_value)

        child = Node()
        child.value = value

        if(entropy_value == 0):
            final = new_data[target].values[0]
            child.final = final
        else:
            child = id3_fit(new_data, target)
            child.value = value
        children.append(child)
    return root

def id3_inner_predict(node, instance):
    if(node.name):
        feature = node.name
        #print('feature:', feature)
        value_of_instance = instance[feature]
        #print('value_of_instance:', value_of_instance)
    
    if(node.children):
        children = node.children
        for child in children:
            if(child.value == value_of_instance):
                if(child.final):
                    return child.final
                else:
                    return id3_inner_predict(child, instance)
    
def id3_predict(node, test_data):
    results = {}
    for idx in range(len(test_data)):
        instance = test_data.loc[idx,:]
        
        ret = id3_inner_predict(node, instance)
        results[idx] = ret
    return results
        
def printTree(node, level):
    empty = '  '*level
    
    if(node.value):
        print(empty, node.value,":",node.name)
    else:
        print(empty, node.name)
        
    if(node.children):
        children = node.children
        for child in children:
            if(child.final):
                empty2 = '  '*(level+1)
                print(empty2, child.value+":", child.final)
            else:
                printTree(child, level + 1)

In [303]:
class Node():
    def __init__(self):
        self.children = None
        self.value = None # the value of data[root's name]
        self.name = None
        self.final = None # final class
    

# LOAD DATA

In [2]:
import pandas as pd

data = pd.read_csv("datasets/conditional_entropy.csv")


In [3]:
data.head()

Unnamed: 0,color,target attribute
0,R,a
1,R,a
2,R,b
3,R,a
4,G,b


# get insights

In [4]:
size = len(data)

In [5]:
# R
data_r = data.loc[data['color'] =="R",:]
data_r_a = data_r.loc[data_r['target attribute'] == 'a']
data_r_b = data_r.loc[data_r['target attribute'] == 'b']

p_r = len(data_r)/size
print(p_r)
p_ar = (len(data_r_a)/size)/p_r
print(p_ar)
p_br = (len(data_r_b)/size)/p_r
print(p_br)

0.4
0.7499999999999999
0.25


In [6]:
# G
data_g = data.loc[data['color'] =="G",:]
data_g_a = data_g.loc[data_g['target attribute'] == 'a']
data_g_b = data_g.loc[data_g['target attribute'] == 'b']

p_g = len(data_g)/size
print(p_g)
p_ag = (len(data_g_a)/size)/p_g
print(p_ag)
p_bg = (len(data_g_b)/size)/p_g
print(p_bg)

0.4
0.5
0.5


In [7]:
# B
data_B = data.loc[data['color'] =="B",:]
data_B_a = data_B.loc[data_B['target attribute'] == 'a']
data_B_b = data_B.loc[data_B['target attribute'] == 'b']

p_B = len(data_B)/size
print(p_B)
p_aB = (len(data_B_a)/size)/p_B
print(p_aB)
p_bB = (len(data_B_b)/size)/p_B
print(p_bB)

0.2
0.5
0.5


# Gain

In [8]:
data.head()

Unnamed: 0,color,target attribute
0,R,a
1,R,a
2,R,b
3,R,a
4,G,b


In [9]:
# Calculate the conditional entropies for each feature
conditional_entropy = calc_conditional_entropy(data, 'color', 'target attribute')
print(conditional_entropy)


target_values: ['a', 'b']
condition 0.4
conditional probabilities [0.5, 0.5]
condition 0.4
conditional probabilities [0.7499999999999999, 0.25]
condition 0.2
conditional probabilities [0.5, 0.5]
0.9245112497836532


In [10]:
conditional_entropies = cal_conditional_entropies(data, 'target attribute')
print(conditional_entropies)

target_values: ['a', 'b']
condition 0.4
conditional probabilities [0.5, 0.5]
condition 0.4
conditional probabilities [0.7499999999999999, 0.25]
condition 0.2
conditional probabilities [0.5, 0.5]
{'color': 0.9245112497836532}


In [11]:
# Calculate entropy
entropy = calc_entropy(data, 'color')
print(entropy)

[0.4, 0.4, 0.2]
1.5219280948873621


# ID3

In [12]:
import pandas as pd

data = pd.read_csv("datasets/id3.csv")

In [13]:
data.head()

Unnamed: 0,Motor,Wheels,Doors,Size,Efficiency
0,no,two,none,small,good
1,no,three,none,small,bad
2,yes,two,none,small,good
3,yes,four,two,small,bad
4,yes,four,three,medium,good


In [14]:
target = 'Efficiency'

In [15]:
conditional_entropies = cal_conditional_entropies(data,target)
print(conditional_entropies)

target_values: ['good', 'bad']
condition 0.7142857142857143
conditional probabilities [0.6, 0.39999999999999997]
condition 0.2857142857142857
conditional probabilities [0.5, 0.5]
target_values: ['good', 'bad']
condition 0.5714285714285714
conditional probabilities [0.5, 0.5]
condition 0.2857142857142857
conditional probabilities [1.0, 0.0]
condition 0.14285714285714285
conditional probabilities [0.0, 1.0]
target_values: ['good', 'bad']
condition 0.42857142857142855
conditional probabilities [0.6666666666666666, 0.3333333333333333]
condition 0.2857142857142857
conditional probabilities [0.5, 0.5]
condition 0.14285714285714285
conditional probabilities [0.0, 1.0]
condition 0.14285714285714285
conditional probabilities [1.0, 0.0]
target_values: ['good', 'bad']
condition 0.5714285714285714
conditional probabilities [0.5, 0.5]
condition 0.2857142857142857
conditional probabilities [1.0, 0.0]
condition 0.14285714285714285
conditional probabilities [0.0, 1.0]
{'Motor': 0.9792504246104776, 'Wh

In [16]:
entropy = calc_target_entropy(data,target)
print(entropy)

0.9852281360342516


In [17]:
gains = {}
for key, value in conditional_entropies.items():
    gains[key] = entropy - value
print(gains)

{'Motor': 0.0059777114237740125, 'Wheels': 0.41379956460568024, 'Doors': 0.3059584928680419, 'Size': 0.41379956460568024}


In [18]:
data

Unnamed: 0,Motor,Wheels,Doors,Size,Efficiency
0,no,two,none,small,good
1,no,three,none,small,bad
2,yes,two,none,small,good
3,yes,four,two,small,bad
4,yes,four,three,medium,good
5,yes,four,four,medium,good
6,yes,four,four,large,bad


# ID3 example

In [98]:
import pandas as pd

data = pd.read_csv("datasets/id3 example.csv")
data.head()

Unnamed: 0,Height,Hair,Eyes,Sensitivity
0,short,blond,blue,yes
1,tall,blond,brown,no
2,tall,red,blue,yes
3,tall,dark,brown,no
4,short,dark,blue,no


In [99]:
target = 'Sensitivity'

In [100]:
conditional_entropies = cal_conditional_entropies(data,target)
print(conditional_entropies)

target_values: ['no', 'yes']
condition 0.625
conditional probabilities [0.6, 0.4]
condition 0.375
conditional probabilities [0.6666666666666666, 0.3333333333333333]
target_values: ['no', 'yes']
condition 0.5
conditional probabilities [0.5, 0.5]
condition 0.375
conditional probabilities [1.0, 0.0]
condition 0.125
conditional probabilities [0.0, 1.0]
target_values: ['no', 'yes']
condition 0.625
conditional probabilities [0.4, 0.6]
condition 0.375
conditional probabilities [1.0, 0.0]
{'Height': 0.9512050593046015, 'Hair': 0.5, 'Eyes': 0.6068441215341679}


In [101]:
entropy_value = calc_target_entropy(data,target)
print(entropy_value)

0.9544340029249649


In [102]:
gains = {}
for key, value in conditional_entropies.items():
    gains[key] = entropy - value
print(gains)

{'Height': 0.034023076729650104, 'Hair': 0.48522813603425163, 'Eyes': 0.37838401450008374}


In [103]:
gains, selected_feature = get_gains_feature(entropy, conditional_entropies)
print(gains)
print(selected_feature)

{'Height': 0.034023076729650104, 'Hair': 0.48522813603425163, 'Eyes': 0.37838401450008374}
Hair


In [104]:
data.head()

Unnamed: 0,Height,Hair,Eyes,Sensitivity
0,short,blond,blue,yes
1,tall,blond,brown,no
2,tall,red,blue,yes
3,tall,dark,brown,no
4,short,dark,blue,no


In [212]:
tree = id3_fit(data, target)

target_values: ['no', 'yes']
condition 0.625
conditional probabilities [0.6, 0.4]
condition 0.375
conditional probabilities [0.6666666666666666, 0.3333333333333333]
target_values: ['no', 'yes']
condition 0.5
conditional probabilities [0.5, 0.5]
condition 0.375
conditional probabilities [1.0, 0.0]
condition 0.125
conditional probabilities [0.0, 1.0]
target_values: ['no', 'yes']
condition 0.625
conditional probabilities [0.4, 0.6]
condition 0.375
conditional probabilities [1.0, 0.0]
selected_feature: Hair
value: blond
1.0
target_values: ['no', 'yes']
condition 0.5
conditional probabilities [0.5, 0.5]
condition 0.5
conditional probabilities [0.5, 0.5]
target_values: ['no', 'yes']
condition 1.0
conditional probabilities [0.5, 0.5]
target_values: ['no', 'yes']
condition 0.5
conditional probabilities [1.0, 0.0]
condition 0.5
conditional probabilities [0.0, 1.0]
selected_feature: Eyes
value: brown
0.0
value: blue
0.0
value: dark
0.0
value: red
0.0


In [216]:
printTree(tree,0)

 Hair
   blond : Eyes
     brown: no
     blue: yes
   dark: no
   red: yes


# HOMEWORK

In [276]:
import pandas as pd

data = pd.read_csv("datasets/homework1.csv")
data.head()

Unnamed: 0,WRITABLE,UPDATED,SIZE,CLASS
0,yes,no,small,text
1,yes,yes,large,text
2,no,yes,med,text
3,no,no,med,executable
4,yes,no,large,executable


In [277]:
target = 'CLASS'

In [299]:
tree = id3_fit(data, target)

gains: {'WRITABLE': 0.08170416594551044, 'UPDATED': 0.4591479170272448, 'SIZE': 0.20751874963942196}
selected_feature: UPDATED
gains: {'WRITABLE': 0.31127812445913283, 'UPDATED': 0.0, 'SIZE': 0.8112781244591328}
selected_feature: SIZE


In [300]:
printTree(tree,0)

 UPDATED
   no : SIZE
     large: executable
     small: text
     med: executable
   yes: text


In [305]:
from pandas import DataFrame

test_data = DataFrame({
    "WRITABLE": ['yes', 'no',],
    "UPDATED": ['yes', 'no'],
    "SIZE": ['large', 'small'],
}).sort_index()


result = id3_predict(tree, test_data)
print(result)

{0: 'text', 1: 'text'}


In [306]:
get_entropy([1/2,1/4,1/4])

1.5

In [307]:
get_entropy([1/2,1/2])

1.0