### Decision Tree From Scratch

Adaptation of ["Let's Write a Decision Tree Classifier from Scratch"](https://www.youtube.com/watch?v=LDRbO9a6XPU), with the original code available [here](https://github.com/random-forests/tutorials/blob/master/decision_tree.ipynb).



### Vocab

**Impurity** (Gini): the chance of being incorrect if you randomly assign one of the labels in a set to a randomly chosen example from that same set. A set with a single label has impurity 0.

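**Information Gain**: how much a question reduces impurity, i.e. the starting impurity minus the weighted average impurity of the two groups the question produces. It is used to find the best question to ask.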

#### Functions

In [1]:
def unique_vals(rows, col):
    """Return the set of distinct values found in column `col` of `rows`."""
    return {record[col] for record in rows}

def class_counts(rows):
    """Tally how many rows carry each label.

    The label is taken from the last element of every row. Returns a dict
    mapping label -> number of occurrences, keyed in first-seen order.
    """
    tally = {}
    for record in rows:
        tag = record[-1]  # the label sits in the final column
        tally[tag] = tally.get(tag, 0) + 1
    return tally

def is_numeric(value):
    """Return True when `value` is an instance of int or float."""
    return isinstance(value, (int, float))

def gini(rows):
    """Compute the Gini impurity of `rows` (label in the last column).

    Starts at 1 and subtracts the squared share of each label, so a set
    holding a single label scores 0.
    """
    label_totals = class_counts(rows)
    remaining = 1
    for occurrences in label_totals.values():
        share = occurrences / float(len(rows))
        remaining -= share ** 2
    return remaining

def info_gain(left, right, current_uncertainty):
    """Return how much a split lowers impurity.

    The result is `current_uncertainty` minus the Gini impurity of each
    partition, weighted by that partition's share of the combined rows.
    """
    left_share = float(len(left)) / (len(left) + len(right))
    weighted_left = left_share * gini(left)
    weighted_right = (1 - left_share) * gini(right)
    return current_uncertainty - weighted_left - weighted_right

def find_best_split(rows):
    """Find the question that splits `rows` with the highest information gain.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a `Question`.

    Args:
        rows: List of rows; each row is a sequence whose last element is
            the label.

    Returns:
        A `(best_gain, best_question)` tuple. `best_question` is None (and
        `best_gain` is 0) if `rows` is empty or no candidate was accepted.
    """
    best_gain = 0
    best_question = None

    # Nothing to split: without this guard `rows[0]` below raises IndexError.
    if len(rows) == 0:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # last column is the label

    for col in range(n_features):
        for val in unique_vals(rows, col):
            question = Question(col, val)
            true_rows, false_rows = question.partition(rows)

            # A question that leaves one side empty does not divide the data.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # `>=` means that on a tie the most recently tried question wins.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

def build_tree(rows):
    """Recursively grow a decision tree from `rows`.

    Returns a `Leaf` when the best available split has zero gain; otherwise
    returns a `Decision_Node` holding the best question and the subtrees
    built from the rows that do and do not match it.
    """
    gain, question = find_best_split(rows)

    if gain != 0:
        matching, remainder = question.partition(rows)
        subtree_if_true = build_tree(matching)
        subtree_if_false = build_tree(remainder)
        return Decision_Node(question, subtree_if_true, subtree_if_false)

    # No question improves on the current rows, so stop here.
    return Leaf(rows)

def print_tree(node, spacing=" "):
    """Print the tree rooted at `node`, indenting one extra space per level."""
    if not isinstance(node, Leaf):
        print(spacing + str(node.question))
        deeper = spacing + " "
        branches = (
            ("--> True:", node.true_branch),
            ("--> False:", node.false_branch),
        )
        for heading, branch in branches:
            print(spacing + heading)
            print_tree(branch, deeper)
        return

    # Leaves just show their label counts.
    print(spacing + f"Predict {node.predictions}")
    
def classify(row, node):
    """Walk the tree from `node` and return the predictions of the leaf `row` reaches.

    At every decision node the row follows the true branch when the node's
    question matches it, and the false branch otherwise.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions
    
def print_leaf(counts):
    """Convert a leaf's label counts into percentage strings.

    Despite the name, nothing is printed; the formatted dict is returned.

    Args:
        counts: Mapping of label -> number of rows carrying that label.

    Returns:
        A dict mapping each label to its share of the total, truncated to a
        whole percent and formatted like "50%". An empty mapping yields an
        empty dict.
    """
    total = sum(counts.values())
    # Scale first and use floor division so the truncation is exact:
    # `29 / 100.0 * 100` evaluates to 28.999999999999996 and was shown as "28%".
    return {lbl: f"{int(count * 100 // total)}%" for lbl, count in counts.items()}

class Question:
    """A yes/no test on one column, used to partition rows.

    Holds a column index and a reference value; `match` compares a row's
    entry in that column against the reference value.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Return whether `example` satisfies this question.

        A numeric entry must be >= the stored value; any other entry must
        be equal to it.
        """
        candidate = example[self.column]
        return (
            candidate >= self.value
            if is_numeric(candidate)
            else candidate == self.value
        )

    def __repr__(self):
        """Render the question as readable text, e.g. "Is diameter >= 3?"."""
        # The column name is looked up in the module-level `header` list.
        condition = ">=" if is_numeric(self.value) else "=="
        return f"Is {header[self.column]} {condition} {str(self.value)}?"

    def partition(self, rows):
        """Split `rows` into (rows that match, rows that do not), keeping order."""
        matched, unmatched = [], []
        for row in rows:
            bucket = matched if self.match(row) else unmatched
            bucket.append(row)
        return matched, unmatched
    
class Leaf:
    """Terminal tree node that stores the label counts of the rows reaching it."""

    def __init__(self, rows):
        # Mapping of label -> number of rows with that label.
        self.predictions = class_counts(rows)
        
class Decision_Node:
    """Internal tree node: a question plus the subtree for each answer."""

    def __init__(self, question, true_branch, false_branch):
        # `true_branch` is followed when the question matches a row,
        # `false_branch` otherwise.
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

#### Data

In [2]:
# Column names; the last column of every row is the label.
header = ['color', 'diameter', 'label']

# Rows used to build the tree.
train = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

# Rows used to try out the finished tree.
test = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

In [3]:
unique_vals(train, 0)

{'Green', 'Red', 'Yellow'}

In [4]:
class_counts(train)

{'Apple': 2, 'Grape': 2, 'Lemon': 1}

In [5]:
is_numeric(7.0)

True

In [6]:
Question(1,3)

Is diameter >= 3?

In [7]:
q = Question(0,'Green')
q

Is color == Green?

In [8]:
example = train[0]
example

['Green', 3, 'Apple']

In [9]:
q.match(example)

True

In [10]:
true_rows, false_rows = Question(0,'Red').partition(train)

In [11]:
true_rows

[['Red', 1, 'Grape'], ['Red', 1, 'Grape']]

In [12]:
false_rows

[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]

In [13]:
no_mixing = [['Apple'],['Apple']]
gini(no_mixing)

0.0

In [14]:
some_mixing = [['Apple'],['Orange']]

gini(some_mixing)

0.5

In [15]:
lots_of_mixing = [['Apple'],['Orange'],['Grape'],['Grapefruit'],['Blueberry']]

gini(lots_of_mixing)

0.7999999999999998

In [16]:
current_uncertainty = gini(train)

In [17]:
current_uncertainty

0.6399999999999999

In [18]:
true_rows, false_rows = Question(0,'Green').partition(train)
info_gain(true_rows, false_rows, current_uncertainty)

0.1399999999999999

In [19]:
true_rows

[['Green', 3, 'Apple']]

In [20]:
false_rows

[['Yellow', 3, 'Apple'],
 ['Red', 1, 'Grape'],
 ['Red', 1, 'Grape'],
 ['Yellow', 3, 'Lemon']]

In [21]:
true_rows, false_rows = Question(0,'Red').partition(train)
info_gain(true_rows, false_rows, current_uncertainty)

0.37333333333333324

In [22]:
true_rows

[['Red', 1, 'Grape'], ['Red', 1, 'Grape']]

In [23]:
false_rows

[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]

In [24]:
best_gain, best_question = find_best_split(train)
best_question

Is diameter >= 3?

In [25]:
my_tree = build_tree(train)

In [26]:
print_tree(my_tree)

 Is diameter >= 3?
 --> True:
  Is color == Yellow?
  --> True:
   Predict {'Apple': 1, 'Lemon': 1}
  --> False:
   Predict {'Apple': 1}
 --> False:
  Predict {'Grape': 2}


In [27]:
classify(train[0], my_tree)

{'Apple': 1}

In [28]:
print_leaf(classify(train[0], my_tree))

{'Apple': '100%'}

In [29]:
print_leaf(classify(train[1], my_tree))

{'Apple': '50%', 'Lemon': '50%'}

In [30]:
for row in test:
    print(f"Actual: {row[-1]} Predicted: {print_leaf(classify(row,my_tree))}")

Actual: Apple Predicted: {'Apple': '100%'}
Actual: Apple Predicted: {'Apple': '50%', 'Lemon': '50%'}
Actual: Grape Predicted: {'Grape': '100%'}
Actual: Grape Predicted: {'Grape': '100%'}
Actual: Lemon Predicted: {'Apple': '50%', 'Lemon': '50%'}
