- ID3 stands for Iterative Dichotomiser 3.
- The attributes we're using to build the tree also have to be categorical. Attributes can have many categories but they must be known and countable.

### Entropy calculation example

In [1]:
from math import log2

# Put your calculations below

# Calculate the weighted entropy of a split on "height is tall".
# The weights below (5/20 and 15/20) show 5 of the 20 observations in the
# "tall" branch and 15 in the "not tall" branch. Each branch entropy is
# written as sum(p * log2(1 / p)), which is the same as -sum(p * log2(p)).
height_tall = 4 / 5 * log2(5 / 4) + 1 / 5 * log2(5 / 1)
height_not_tall = 8 / 15 * log2(15 / 8) + 7 / 15 * log2(15 / 7)

# Weight each branch entropy by the share of observations in that branch.
entropy_height_tall = 5 / 20 * (height_tall) + 15 / 20 * (height_not_tall)

# Calculate the weighted entropy of a split on "height is medium"
# (8 of 20 observations in the "medium" branch, 12 in the "not medium" branch).
height_medium = 6/8 * log2(8/6) + 2/8 * log2(8/2)
height_not_medium = 6/12 * log2(12/6) + 6/12 * log2(12/6)

entropy_height_medium = 8/20 * (height_medium) + 12/20 * (height_not_medium)


# The split with the lower weighted entropy is the better rule.
print(entropy_height_tall, entropy_height_medium)

0.9280757477080679 0.9245112497836532


In [None]:
#1 Algorithm(Observations, Outcome, Attributes)
#2    Create a root node.
#3    If all observations are 'A', label root node 'A' and return.
#4    If all observations are 'B', label root node 'B' and return.
#5    If there are no attributes left, return the root node labeled with the most common Outcome.
#6    Otherwise, start:
#7        For each value vi of each attribute ai, calculate the entropy.
#8        The attribute ai and value vi with the lowest entropy is the best rule.
#9        The attribute for this node is then ai
#10            Split the tree below this node based on the rule ai = vi
#11            Observations vi is the subset of observations with value vi
#12            If Observations vi is empty, cap the branch with a node labeled with the most common Outcome
#13            Else, at the new node, start a subtree (Observations vi, Target Outcome, Attributes - {ai}) and repeat the algorithm


In [6]:
import math

#find item in a list
def find(item, list):
    """Return True if the predicate ``item`` holds for any element of ``list``.

    Args:
        item: Callable invoked with each element; its result is tested for
            truthiness.
        list: Iterable of elements to scan. The name shadows the builtin but
            is kept so that existing keyword callers keep working.

    Returns:
        True as soon as one element satisfies the predicate, otherwise False
        (including for an empty input).
    """
    # The previous loop returned from its ``else`` branch on the very first
    # element, so later elements were never examined and an empty input
    # fell through to an implicit ``None``.
    return any(item(element) for element in list)

In [7]:
#find most common value for an attribute
def majority(attributes, data, target):
    """Return the most frequent value found in the ``target`` column of ``data``.

    Args:
        attributes: List of column names; the position of ``target`` in this
            list is the column index used for every record.
        data: Iterable of indexable records (one value per attribute).
        target: Name of the column whose values are counted.

    Returns:
        The value with the highest count. On a tie, the value that was seen
        first wins. Returns ``""`` when ``data`` is empty.

    Raises:
        ValueError: If ``target`` is not in ``attributes``.
    """
    index = attributes.index(target)

    # Count occurrences with ``dict.get``; ``dict.has_key`` was removed in
    # Python 3 and raised AttributeError here.
    valFreq = {}
    for record in data:
        value = record[index]
        valFreq[value] = valFreq.get(value, 0) + 1

    if not valFreq:
        return ""
    # ``max`` returns the first maximal key in insertion order, which matches
    # the strict ``>`` comparison of the original scan.
    return max(valFreq, key=valFreq.get)

In [8]:
#Calculates the entropy of the given data set for the target attr
def entropy(attributes, data, targetAttr): #7
    """Return the Shannon entropy (in bits) of the ``targetAttr`` column.

    Args:
        attributes: List of column names; the position of ``targetAttr`` in
            this list is the column index used for every record.
        data: Sequence of indexable records (one value per attribute).
        targetAttr: Name of the column whose value distribution is measured.

    Returns:
        ``-sum(p * log2(p))`` over the distinct values of the column, where
        ``p`` is the fraction of records holding that value. Returns 0.0 for
        empty ``data``.

    Raises:
        ValueError: If ``targetAttr`` is not in ``attributes``.
    """
    # Find the index of the target attribute. The previous manual search used
    # ``++i``, which is a no-op in Python (two unary pluses), so the index was
    # always 0 and the wrong column was measured.
    index = attributes.index(targetAttr)

    # Calculate the frequency of each of the values in the target attribute.
    # ``dict.get`` replaces ``dict.has_key``, which no longer exists in Python 3.
    valFreq = {}
    for entry in data:
        value = entry[index]
        valFreq[value] = valFreq.get(value, 0) + 1

    # Calculate the entropy of the data for the target attribute.
    total = len(data)
    dataEntropy = 0.0
    for freq in valFreq.values():
        prob = freq / total
        dataEntropy -= prob * math.log2(prob)

    return dataEntropy

In [9]:
def gain(attributes, data, attr, targetAttr):
    """
    Calculates the information gain (reduction in entropy) that would
    result by splitting the data on the chosen attribute (attr).

    Args:
        attributes: List of column names; used to locate ``attr`` and passed
            through to ``entropy``.
        data: Sequence of indexable records (one value per attribute).
        attr: Name of the attribute to split on.
        targetAttr: Name of the attribute whose entropy is being reduced.

    Returns:
        The entropy of ``data`` with respect to ``targetAttr`` minus the
        weighted sum of the entropies of the subsets produced by the split.

    Raises:
        ValueError: If ``attr`` is not in ``attributes``.
    """
    # Find the index of the attribute we are splitting on.
    i = attributes.index(attr)

    # Group the records by their value of the split attribute in one pass.
    # This replaces the ``dict.has_key`` frequency count (removed in
    # Python 3) and the per-value rescan of the whole data set.
    subsets = {}
    for entry in data:
        subsets.setdefault(entry[i], []).append(entry)

    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occurring in the training set.
    total = len(data)
    subsetEntropy = 0.0
    for dataSubset in subsets.values():
        valProb = len(dataSubset) / total
        subsetEntropy += valProb * entropy(attributes, dataSubset, targetAttr)

    # Subtract the weighted subset entropy from the entropy of the whole
    # data set with respect to the target attribute (and return it).
    return (entropy(attributes, data, targetAttr) - subsetEntropy)


In [10]:
#choose best attribute #8,9
def chooseAttr(data, attributes, target):
    """Return the attribute with the highest information gain for ``target``.

    Args:
        data: Sequence of indexable records (one value per attribute).
        attributes: List of column names, including ``target``.
        target: Name of the attribute being predicted; it is never offered
            as a split candidate.

    Returns:
        The non-target attribute whose ``gain`` is largest. When no attribute
        has a positive gain, the first non-target attribute is returned. If
        ``attributes`` holds nothing but the target, ``attributes[0]`` is
        returned, as before.
    """
    # Splitting on the target itself always yields the maximum possible gain
    # (every subset is pure), so it must be excluded; otherwise it would be
    # chosen and then removed from the attribute list the tree still needs.
    candidates = [attr for attr in attributes if attr != target]
    if not candidates:
        return attributes[0]

    best = candidates[0]
    maxGain = 0
    for attr in candidates:
        newGain = gain(attributes, data, attr, target)
        if newGain > maxGain:
            maxGain = newGain
            best = attr
    return best

In [11]:
#get values in the column of the given attribute 
def getValues(data, attributes, attr):
    """Collect the distinct values of column ``attr``, in first-seen order.

    ``attributes`` maps the column name to its position in each record of
    ``data``. The result is a new list with one entry per distinct value.
    """
    column = attributes.index(attr)
    seen = []
    for row in data:
        cell = row[column]
        if cell in seen:
            continue
        seen.append(cell)
    return seen

In [12]:
def getExamples(data, attributes, best, val):
    """Select the records whose ``best`` column equals ``val``, minus that column.

    Each matching record is returned as a new list holding every value except
    the one in the ``best`` column, in the original record order.
    """
    column = attributes.index(best)
    return [
        [cell for position, cell in enumerate(row) if position != column]
        for row in data
        if row[column] == val
    ]

In [13]:
def makeTree(data, attributes, target, recursion): #1
    """Build a decision tree for ``target`` from the records in ``data``.

    The result is either a leaf (a single target value) or a nested dict of
    the form ``{attribute: {value: subtree, ...}}``. ``recursion`` is the
    depth counter, incremented and handed to each recursive call. The
    ``#n`` markers refer to the numbered steps of the algorithm outline.
    """
    depth = recursion + 1
    rows = data[:]
    targetIndex = attributes.index(target)
    labels = [row[targetIndex] for row in rows]
    fallback = majority(attributes, rows, target)

    # Nothing left to split on: no records, or only the target attribute
    # remains (hence the "- 1"). Answer with the majority value. #5,12
    if not rows or (len(attributes) - 1) <= 0:
        return fallback

    # Every record carries the same classification, so this is a leaf. #3,4
    if labels.count(labels[0]) == len(labels):
        return labels[0]

    # Otherwise split on the attribute that best classifies the data. #6
    best = chooseAttr(rows, attributes, target)

    # One branch per value observed in the chosen attribute. #10,11
    branches = {}
    for val in getValues(rows, attributes, best):
        examples = getExamples(rows, attributes, best, val)
        remaining = attributes[:]
        remaining.remove(best)
        # Recurse on the matching records without the used attribute. #13
        branches[val] = makeTree(examples, remaining, target, depth)

    # The node is keyed by the chosen attribute. #2
    return {best: branches}