In [1]:
from __future__ import division

In [2]:
import math
import numpy as np
import pandas as pd

In [42]:
from abc import ABCMeta, abstractmethod


class Node(object):
    __metaclass__ = ABCMeta
    
    @abstractmethod
    def predict(self, df, leaf_score_map):
        raise NotImplemented()
        
    @abstractmethod
    def prn(self, indent=None):
        raise NotImplemented()


class BranchNode(Node):
    
    def __init__(self, var_name, split, left, right):
        self.var_name = var_name
        self.split = split
        self.left = left
        self.right = right
         
    def predict(self, df, leaf_score_map):                
        idx_left = (df[self.var_name] <= self.split)        
        left_values = self.left.predict(df[idx_left], leaf_score_map)            
        right_values = self.right.predict(df[~idx_left], leaf_score_map)                
        return pd.concat([left_values, right_values]).loc[df.index]
    
    def prn(self, indent=None):
        
        indent = indent or 0
        
        if self.left:
            self.left.prn(indent+1)
        
        for _ in range(indent):
            print '\t',
        print "{} {}\n".format(self.var_name, self.split)
            
        if self.right:
            self.right.prn(indent+1)
                    
    
class LeafNode(Node):
    
    def __init__(self):
        pass
        #self._hash = random.random()
        #self.num_good = num_good
        #self.num_bad = num_bad
        
    def predict(self, df, leaf_score_map):
        #leaf_ = leaf_score_map[self]
        #num_good = leaf_data['num_good']
        #num_bad = leaf_data['num_bad']
        return pd.Series(leaf_score_map[self], index=df.index)
    
    def prn(self, indent=None):
        
        indent = indent or 0
        
        for _ in range(indent):
            print '\t',
        print "Leaf({})\n".format(hash(self)) #[{}] ({}, {})\n".format(self.num_good/(self.num_good+self.num_bad), self.num_good, self.num_bad)

In [43]:
# Loss Fun:
# loss_fn(predicted, truth)

def _single_variable_best_split(srs, target, loss_fn):
    
    # Convention:
    # Left is BAD
    # Right is GOOD
    
    if len(srs) < 100:
        candidates = srs.values
    else:
        _, candidates = pd.qcut(srs, 100, labels=False, retbins=True)
    
    best_loss = None
    best_split = None
    
    for val in candidates:
        left_idx = (srs <= val)
                
        left_truth = target[left_idx]
        left_predicted = [left_truth.mean() for _ in left_truth]
                
        right_truth = target[~left_idx]
        right_predicted = [right_truth.mean() for _ in right_truth]
                
        loss = loss_fn(left_predicted, left_truth) + loss_fn(right_predicted, right_truth)
        
        if best_loss is None or loss < best_loss:
            best_split = val
            best_loss = loss
            
    return best_split, best_loss
 

def get_best_split(df, target, loss_fn):
    # Return:
    # (var, split, loss)
    
    best_var = None
    best_split = None
    best_loss = None
    
    for var in df.columns:
        split, loss = _single_variable_best_split(df[var], target, loss_fn)
        if best_loss is None or loss < best_loss:
            best_var = var
            best_split = split
            best_loss = loss
            
    return (best_var, best_split, best_loss)    

In [44]:
def leaf_good_rate(num_good, num_bad):
    return num_good / (num_bad + num_good)
    

def train_greedy_tree(df, target, loss_fn,
                      max_depth=None,
                      min_to_split=None,
                      leaf_map=None,
                      leaf_value_fn=leaf_good_rate):
    """
    Returns a tree and its leaf map
    """
    
    if leaf_map is None:
        leaf_map = {}
    
    counts = dict(target.value_counts())
    num_good = counts.get(1, 0)
    num_bad = counts.get(0, 0)
    good_rate = num_good / (num_good + num_bad)
    
    current_loss = loss_fn([good_rate for _ in target], target)
    
    if len(df) == 1 or (max_depth is not None and max_depth <= 0) or (min_to_split is not None and len(df) < min_to_split):
        leaf = LeafNode()
        leaf_map[leaf] = leaf_value_fn(num_good, num_bad) #{'num_good': num_good,
                          #'num_bad': num_bad}
        return leaf, leaf_map
    
    var, split, loss = get_best_split(df, target, loss_fn)

    if loss >= current_loss:
        leaf = LeafNode()
        leaf_map[leaf] = leaf_value_fn(num_good, num_bad) #{'num_good': num_good,
                          #'num_bad': num_bad}
        return leaf, leaf_map
        
    
    left_idx = df[var] <= split
    
    left_tree, left_map = train_greedy_tree(df[left_idx], target[left_idx],
                                            loss_fn,
                                            max_depth = max_depth-1 if max_depth else None,
                                            min_to_split=min_to_split,
                                            leaf_map=leaf_map,
                                            leaf_value_fn=leaf_value_fn)
                                  
    right_tree, right_map = train_greedy_tree(df[~left_idx], target[~left_idx],
                                              loss_fn,
                                              max_depth = max_depth-1 if max_depth else None,
                                              min_to_split=min_to_split,
                                              leaf_map=leaf_map,
                                              leaf_value_fn=leaf_value_fn)
    
    leaf_map.update(left_map)
    leaf_map.update(right_map)
    
    return (BranchNode(var, split,
                      left_tree, right_tree),
            leaf_map)

In [45]:
def mate(mother, father):
    """
    Create a child tree
    """
    
    
    

In [46]:
data = pd.DataFrame({'A': [0.1, 10, .02],
                     'B': [10, 20, 30]},
                    index=['foo', 'bar', 'baz'])

In [47]:
t = BranchNode('A', 0.5, None, None)
t.left = LeafNode() #'A', 0.5, 10, 20)
t.right = LeafNode() #'A', 0.5, 100, 0)


leaf_map = {t.left: 10,
           t.right: 20}

t.predict(data, leaf_map)

foo    10
bar    20
baz    10
dtype: int64

In [48]:
def cut(x, min, max):
    if x < min:
        return min
    elif x > max:
        return max
    else:
        return x

def cross_entropy(predicted, truth, threshold=0.5):
    
    if len(predicted) == 0:
        return 0
    
    loss = 0
    
    for (p, t) in zip(predicted, truth):
    
        # Correct prediction
        if (p >= threshold and t==1) or (p < threshold and t==0):
            loss += math.log(cut(p, 0.001, 0.999))
        else:
            loss += math.log(cut(1.0-p, 0.001, 0.999))
        
    return loss

In [49]:
_single_variable_best_split(pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
                            pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
                            cross_entropy)

(2, -19.108016463228132)

In [50]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

tree, leaf_map = train_greedy_tree(df, target, cross_entropy)

In [51]:
tree.prn()

	Leaf(278867597)

B 40

		Leaf(278859125)

	A 10

		Leaf(278859013)



In [52]:
leaf_map

{<__main__.LeafNode at 0x109f0d050>: 0.0,
 <__main__.LeafNode at 0x109f0d750>: 0.83333333333333337,
 <__main__.LeafNode at 0x109f2e8d0>: 0.0}