In [1]:
from __future__ import print_function

In [2]:
# Toy dataset.
# Format: each row is an example.
# The last column is the label; the columns before it are features.
# Feel free to play with it by adding more features & examples.
# Interesting note: the 2nd and 5th examples have the same features
# but different labels, so we can see how the tree handles this case.
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon']
]

In [3]:
# Column labels.
# These are used only for printing (the tree, and questions via
# Question.__repr__); they play no part in the computation itself.
header = ['color', 'diameter', 'label']

In [6]:
def unique_cals(rows, col):
    """Collect the distinct values found in one column of a dataset.

    rows: iterable of indexable examples.
    col:  index of the column to inspect.

    Returns a set holding every value seen at that index.
    """
    distinct = set()
    for example in rows:
        distinct.add(example[col])
    return distinct

In [7]:
######
# Demo: distinct values in column 0 (color).
unique_cals(training_data,0)
# Try column 1 (diameter) as well: unique_cals(training_data,1)
######

{'Green', 'Red', 'Yellow'}

In [8]:
def class_counts(rows):
    """Tally how many examples carry each label.

    rows: iterable of examples; the label is read from the last column
          of every example.

    Returns a dict mapping each label to its number of occurrences.
    """
    tally = {}
    for example in rows:
        # By convention in this dataset the label sits in the final column.
        label = example[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally

In [9]:
#####
# Demo: count how many training examples there are of each label.
class_counts(training_data)
#####

{'Apple': 2, 'Grape': 2, 'Lemon': 1}

In [10]:
def is_numeric(value):
    """Return True when `value` is an int or a float, False otherwise.

    Note that bool is a subclass of int, so True/False also count as numeric.
    """
    return isinstance(value, (int, float))

In [11]:
#####
# Demo: an int is numeric.
is_numeric(7)
# A string is not: is_numeric('Red') gives False.
#####

True

In [41]:
class Question:
    """A yes/no test on a single column, used to partition a dataset.

    A question stores a column index (e.g. 0 for color) and a reference
    value (e.g. 'Green'). The `match` method compares the corresponding
    feature of an example against that stored value.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Tell whether `example` satisfies this question.

        Numeric features are tested with `>=`; all other features are
        tested with `==`.
        """
        feature = example[self.column]
        # The kind of comparison is chosen from the example's own value,
        # not from the value stored in the question.
        if not is_numeric(feature):
            return feature == self.value
        return feature >= self.value

    def __repr__(self):
        """Render the question readably, naming the column via `header`."""
        # The operator displayed depends on the type of the stored value.
        operator = ">=" if is_numeric(self.value) else "=="
        return "Is %s %s %s?" % (
            header[self.column], operator, str(self.value))

In [42]:
#####
# Demo
# Let's write a question for a numeric attribute.
Question(1,3)
#####

Is diameter >= 3?

In [43]:
# How about one for a categorical attribute?
q = Question(0,'Red')
q

Is color == Red?

In [44]:
def partition(rows, question):
    """Split a dataset in two according to a question.

    Every row is passed to `question.match` exactly once. Rows for which
    it answers true go into the first list, all others into the second;
    the original row order is kept within each list.

    Returns a `(true_rows, false_rows)` tuple of lists.
    """
    matched, unmatched = [], []
    for example in rows:
        bucket = matched if question.match(example) else unmatched
        bucket.append(example)
    return matched, unmatched

In [45]:
# Demo
# Let's partition the training data based on whether rows are Red.
true_rows,false_rows = partition(training_data, Question(0,'Red'))
# This will contain all the 'Red' rows.
true_rows
#####

[['Red', 1, 'Grape'], ['Red', 1, 'Grape']]

In [46]:
# This will contain everything else (all rows that are not 'Red').
false_rows
######

[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]