In [75]:
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
    
]

In [76]:
def unique_vals(Data,col):
    return set([row[col] for row in Data])

In [77]:
unique_vals(training_data, 0)

{'Green', 'Red', 'Yellow'}

In [78]:
unique_vals(training_data, 2)

{'Apple', 'Grape', 'Lemon'}

In [79]:
def class_counts(Data):
    counts = {}
    for row in Data:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] +=1
    return counts

In [80]:
class_counts(training_data)

{'Apple': 2, 'Grape': 2, 'Lemon': 1}

In [81]:
header = ["Colour", "Diameter", "label"]

In [82]:
class Question:
    def __init__(self, column, value): #1,3 #Is Diameter
        self.column = column
        self.value = value
        
    def match(self, example): #Green 3 Apple
        val = example[self.column] #3
        return val == self.value #3==3
    
    def __repr__(self):
        return "Is %s %s %s?" %(
        header[self.column],"==", str(self.value))

In [83]:
Question(1,3)

Is Diameter == 3?

In [84]:
q = Question(1,3)

In [85]:
q.match(training_data[2])

False

In [86]:
def partition(rows, question):
    true_rows, false_rows =[], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [87]:
true_rows, false_rows = partition(training_data, Question(1,3) )

In [88]:
def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl]/float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

In [89]:
x=[['apple'],['apple'],['abc'],['abc']]
gini(training_data)

0.6399999999999999

In [90]:
def info_gain(left,right, current_uncertainty):
    p = float(len(left))/(len(left) + len(right))
    q = 1-p
    return current_uncertainty - p*gini(left) - q*gini(right)

In [91]:
info_gain(true_rows, false_rows, 0.639)

0.37233333333333335

In [92]:
def find_best_split(rows):
    best_gain = 0 
    best_question = None
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1
    for col in range(n_features):
        values = set([row[col] for row in rows])
        for val in values:
            question = Question(col, val)
            true_rows,false_rows = partition(rows,question)
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            if (gain>best_gain):
                best_gain= gain
                best_question = question
    return best_gain,best_question

In [93]:
find_best_split(training_data)

(0.37333333333333324, Is Colour == Red?)

In [94]:
class Leaf:
    def __init__(self,rows):
        self.predictions = class_counts(rows)

In [95]:
class Decision_Node:
    def __init__(self,
                question,
                true_branch,
                false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [96]:
def build_tree(rows):
    
    gain, question = find_best_split(rows)
    
    if gain == 0:
        return Leaf(rows)
    
    true_rows, false_rows = partition(rows,question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return Decision_Node(question, true_branch, false_branch)

In [97]:
my_tree = build_tree(training_data)
print(my_tree)

<__main__.Decision_Node object at 0x105ee75c0>


In [98]:
def print_tree(node, spacing = " "):
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return
    print (spacing + str(node.question))
    print (spacing + '--> True :')
    print_tree(node.true_branch, spacing = " ")
    print (spacing + '--> False:')
    print_tree(node.false_branch, spacing + " ")

In [99]:
print_tree(my_tree)

 Is Colour == Red?
 --> True :
 Predict {'Grape': 2}
 --> False:
  Is Colour == Yellow?
  --> True :
 Predict {'Apple': 1, 'Lemon': 1}
  --> False:
   Predict {'Apple': 1}


In [100]:
def print_leaf(count):
    """A nicer way to print the predicitions at a leaf."""
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl]/ totsl * 100)) + "%"
    return probs

In [101]:
def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions
    
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [102]:
testing_data = [
    ['red', 1, 'apple'],
    ['Yellow', 3 , 'apple']
]

In [74]:
for row in testing_data:
    print("Actual: %s. Predicted: %s"%(row[-1], print_leaf(classify(row, mytree))))

NameError: name 'mytree' is not defined