In [2]:
from __future__ import division

In [3]:
import sys
# Add the parent directory to the module search path.
# Presumably this is where gtree / tools / tree live — confirm against the repo layout.
sys.path.append('..')

In [4]:
import math
import random
import numpy as np
import pandas as pd

In [5]:
import gtree

In [6]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered through %aimport before user code is executed.
%load_ext autoreload
%autoreload 1

# %aimport both imports a module and marks it for reloading; `tools` and `tree`
# have no separate `import` statement in this notebook, so later cells rely on
# these lines to bring them into the namespace.
%aimport gtree
%aimport tools
%aimport tree._my_tree

# The goal of this notebook is to explore ways of optimizing our splitting algorithm, which is a huge part of the cost of this code

In [7]:
from timeit import default_timer as timer

class benchmark(object):
    """Context manager that times the enclosed block and reports the result.

    On exit it prints ``"<msg> : <elapsed> seconds"``, with the elapsed time
    rendered through ``fmt``, and then stores the raw elapsed seconds on the
    instance as ``time``.

    Parameters
    ----------
    msg : label printed in front of the timing.
    fmt : %-style format applied to the elapsed seconds (default ``"%0.3g"``).
    """

    def __init__(self, msg, fmt="%0.3g"):
        self.msg, self.fmt = msg, fmt

    def __enter__(self):
        # Remember when the block started; returning self allows ``as b``.
        self.start = timer()
        return self

    def __exit__(self, *args):
        elapsed = timer() - self.start
        template = "%s : " + self.fmt + " seconds"
        print(template % (self.msg, elapsed))
        # Kept after the print, so ``time`` is only set once reporting succeeded.
        self.time = elapsed

In [8]:
# Smoke test of the benchmark context manager: the body prints "Foo", then the
# manager prints the elapsed time on exit.
# print(...) with one argument behaves the same on Python 2 and 3 and matches
# the call style already used inside benchmark.__exit__.
with benchmark("Just A test"):
    print("Foo")

Foo
Just A test : 0.000241 seconds


# Investigate Optimal Splitting

Can we do splitting faster using raw numpy arrays rather than Pandas DataFrames?

In [9]:
# Generate the benchmark dataset with the project helper (argument 5000 is
# presumably the number of samples — confirm in tools), then coerce the
# features to a float32 DataFrame and the targets to a float32 Series.
# NOTE(review): no random seed is set in this notebook, so the recorded split
# values and losses may not reproduce exactly — confirm whether tools seeds internally.
df_features, df_targets = tools.make_random_classification(5000)
df_features = pd.DataFrame(df_features, dtype='float32')
df_targets = pd.Series(df_targets, dtype='float32')

In [10]:
# float32 numpy copies of the same features/targets, used by the array-based
# split finders further down.
# NOTE: `np_feaures` is a misspelling of "features", but later cells reference
# this exact name, so it must be renamed everywhere at once or not at all.
np_feaures = np.array(df_features.values, dtype='float32')
np_targets = np.array(df_targets.values, dtype='float32')

In [11]:
# Column labels of the feature frame; used as keys of candidate_map and as the
# loop variable of the multi-feature timing cells.
feature_names = df_features.columns

In [12]:
# Get the candidates in advance
# Maps each feature name to the split candidates gtree derives from that column,
# so candidate generation is not included in the timings below.
candidate_map = {v: gtree._get_split_candidates(df_features[v]) for v in feature_names}

In [32]:
# Loss function and leaf-prediction builder shared by every gtree timing cell below.
loss_fn = gtree.cross_entropy_loss
prediction_builder = gtree.leaf_good_rate_prediction_builder

In [None]:
# Create the numpy version

In [None]:
def np_leaf_good_rate_split_builder(features, target):
    """
    Build a constant leaf predictor from the training targets of a leaf.

    Assume the target consists of 0, 1, so the mean of ``target`` is the
    "good rate" of the leaf.

    Parameters
    ----------
    features : rows that fell into the leaf (unused; kept for the builder
        interface shared with the split finders).
    target : array-like of training targets for those rows.

    Returns
    -------
    callable
        ``predict(arr)`` returning a 1-D array with one entry per row of
        ``arr``, every entry equal to the mean of ``target`` (``0.0`` when
        ``target`` is empty).
    """
    # np.mean replaces the builtin sum()/len(): it is vectorised and does not
    # depend on true division being enabled in the calling module.
    mean = np.mean(target) if len(target) > 0 else 0.0

    # Size the prediction from the rows being predicted, not from the training
    # target: the predictor must stay valid for inputs of any length.
    return lambda arr: np.full(len(arr), mean)

In [None]:
def np_naive_single_variable_best_split(features, var_idx, target, loss_fn, leaf_prediction_builder, candidates=None):
    """Exhaustively search one feature column for the threshold with the lowest loss.

    For each candidate threshold the rows are partitioned into
    ``column <= threshold`` (left) and ``column > threshold`` (right). A leaf
    predictor is built for each side with ``leaf_prediction_builder``, scored
    on that same side with ``loss_fn``, and the two losses are combined as an
    average weighted by the number of rows on each side.

    Parameters
    ----------
    features : 2-D array, indexed as ``features[rows, columns]``.
    var_idx : index of the column to split on.
    target : array of targets, indexed with the same row masks as ``features``.
    loss_fn : callable ``(predicted, target) -> loss``.
    leaf_prediction_builder : callable ``(features, target) -> predict_fn``.
    candidates : iterable of thresholds to try; defaults to every value in the column.

    Returns
    -------
    (best_split, best_loss) : the first threshold reaching the lowest weighted
    loss and that loss, or ``(None, None)`` when there are no candidates.
    """
    column = features[:, var_idx]
    thresholds = list(column) if candidates is None else candidates

    def side_loss(mask):
        # Fit a leaf on the selected rows and score it on those same rows.
        rows = features[mask, :]
        labels = target[mask]
        predict = leaf_prediction_builder(rows, labels)
        return loss_fn(predict(rows), labels)

    best_split, best_loss = None, None

    for threshold in thresholds:
        goes_left = (column <= threshold)
        loss_left = side_loss(goes_left)

        goes_right = (column > threshold)
        loss_right = side_loss(goes_right)

        # Row-count weighted average of the two partition losses.
        weighted = (loss_left * goes_left.sum() + loss_right * goes_right.sum()) / (len(features))

        # Strict '<' keeps the earliest candidate on ties.
        if best_loss is None or weighted < best_loss:
            best_split, best_loss = threshold, weighted

    return best_split, best_loss

In [None]:
def sort_by_col(fs, t, idx):
    """Return ``(fs, t)`` with rows reordered by ascending values of column ``idx`` of ``fs``."""
    order = np.argsort(fs[:, idx])
    return (fs[order], t[order])


def np_single_variable_best_split(features, var_idx, target, loss_fn, leaf_prediction_builder, candidates=None):
    """Find the best split of one feature column using a single sorted pass.

    The rows are sorted by the chosen column, so every split is a pair of
    contiguous slices instead of a boolean mask. A split is evaluated at the
    first row of each run of equal values:

        1 1 1 2 2 2 3 3 3
              ^ -- SPLIT (split value 2)

    so the left partition holds the rows strictly below the split value and the
    right partition holds the rows at or above it.
    NOTE(review): np_naive_single_variable_best_split uses ``<= val`` on the
    left, so the two functions can report different threshold values for the
    same partition of rows.

    Parameters
    ----------
    features : 2-D array, indexed as ``features[rows, columns]``.
    var_idx : index of the column to split on.
    target : array of targets aligned with the rows of ``features``.
    loss_fn : callable ``(predicted, target) -> loss``.
    leaf_prediction_builder : callable ``(features, target) -> predict_fn``.
    candidates : container of admissible split values; defaults to every
        distinct value in the column.

    Returns
    -------
    (best_split, best_loss) : split value with the lowest row-weighted loss and
    that loss, or ``(None, None)`` if no candidate was evaluated.
    """
    # First, we sort the features by the ith idx
    features, target = sort_by_col(features, target, var_idx)
    srs = features[:, var_idx]
    if candidates is None:
        candidates = set(srs)

    n_rows = len(srs)
    best_loss = None
    best_split = None

    split_value = None

    for idx in range(n_rows):

        # Only the first row of a run of equal values starts a new split.
        if split_value is not None and srs[idx] == split_value:
            continue
        split_value = srs[idx]

        if split_value not in candidates:
            continue

        # Left partition: rows strictly below the split value. It is empty at
        # idx == 0, which is still passed to the builder and loss function.
        feat_left = features[:idx, :]
        target_left = target[:idx]
        left_leaf_predict_fn = leaf_prediction_builder(feat_left, target_left)
        left_predicted = left_leaf_predict_fn(feat_left)
        left_loss = loss_fn(left_predicted, target_left)

        # Right partition: every remaining row. The slice must run to the end
        # of the array; the previous `idx:-1` dropped the last sorted row while
        # the weight below still counted it.
        feat_right = features[idx:, :]
        target_right = target[idx:]
        right_leaf_predict_fn = leaf_prediction_builder(feat_right, target_right)
        right_predicted = right_leaf_predict_fn(feat_right)
        right_loss = loss_fn(right_predicted, target_right)

        # Row-count weighted average of the two partition losses.
        avg_loss = (left_loss * idx + right_loss * (n_rows - idx)) / n_rows
        if best_loss is None or avg_loss < best_loss:
            best_split = split_value
            best_loss = avg_loss

    return best_split, best_loss

In [14]:
# Small hand-written frame/series pair (6 rows, two columns, 0/1 targets) for
# manual experiments; none of the timing cells below reference x or y.
x = pd.DataFrame({
    'a': [1, 2, 3, 4, 3, 6],
    'b': [10, 20, 30, 40, 50, 60]
})
y = pd.Series([1, 0, 0, 1, 0, 1])

In [42]:
# Feature used by the single-feature timings: `var` is its column label for the
# DataFrame-based functions and `var_idx` its position for the array-based ones.
# NOTE(review): the two are kept in sync by hand — confirm 'feature_15' is column 15.
var = 'feature_15'
var_idx = 15
# Set of precomputed candidates for that feature.
candidates = set(candidate_map[var])

In [46]:
# Time gtree's DataFrame-based best-split search on the selected feature.
with benchmark("GTREE DF Single"):
    s, l = gtree._df_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn, #gtree.error_rate_loss,
            leaf_prediction_builder=prediction_builder,
            candidates = candidates)
    # Prints "<split> <loss>" exactly as the old `print s, l` statement did,
    # but is also valid on Python 3.
    print("%s %s" % (s, l))

-1.08545 0.692860737821
GTREE DF Single : 0.823 seconds


In [47]:
# Time gtree's hybrid best-split search on the selected feature.
with benchmark("GTREE HYBRID Single"):
    s, l = gtree._hybrid_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn, #gtree.error_rate_loss,
            leaf_prediction_builder=prediction_builder,
            candidates = candidates)
    # Prints "<split> <loss>" exactly as the old `print s, l` statement did,
    # but is also valid on Python 3.
    print("%s %s" % (s, l))

0.289631 0.692857644557
GTREE HYBRID Single : 0.25 seconds


In [48]:
# Time gtree's numpy-based best-split search on the selected feature.
with benchmark("GTREE NP Single"):
    s, l = gtree._np_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn, #gtree.error_rate_loss,
            leaf_prediction_builder=prediction_builder,
            candidates = candidates)
    # Prints "<split> <loss>" exactly as the old `print s, l` statement did,
    # but is also valid on Python 3.
    print("%s %s" % (s, l))

0.289631 0.692857644557
GTREE NP Single : 0.0969 seconds


In [49]:
# Build the leaf mapper, loss and split finder from the tree._my_tree extension
# module (the Cython implementation, per the benchmark label below).
lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
spliter = tree._my_tree.SpitFinder()

# Time the split search on the raw numpy arrays.
with benchmark("NP CYTHON"):
    s, l = spliter.getBestSplit(
        var_idx,
        candidates,
        np_feaures,
        np_targets,
        lmb,
        ce)
    # Prints "<split> <loss>" exactly as the old `print s, l` statement did,
    # but is also valid on Python 3.
    print("%s %s" % (s, l))

Split Idx:  3300 split val:  0.418152064085 Loss 0.693133592606
Split Idx:  4100 split val:  0.922379493713 Loss 0.693139672279
-1.0854524374 0.692860722542
NP CYTHON : 0.0459 seconds


In [None]:
# NOTE(review): apart from the benchmark label this cell is identical to the
# "NP CYTHON" cell; it looks like a placeholder for an sklearn-adapted splitter
# that has not been wired in yet — confirm.
lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
spliter = tree._my_tree.SpitFinder()

with benchmark("SKLEARN ADAPTED"):
    s, l = spliter.getBestSplit(
        var_idx,
        candidates,
        np_feaures,
        np_targets,
        lmb,
        ce)
    # Was `print s, la\`: `la` is undefined and the trailing backslash left a
    # dangling line continuation. Print the split and loss like the other cells.
    print("%s %s" % (s, l))

In [19]:
#with benchmark("NP NAIVE"):
#    s, l = np_naive_single_variable_best_split(
#        features=np_feaures, var_idx=var_idx, target=np_targets,
#        loss_fn=loss_fn, #gtree.error_rate_loss,
#        leaf_prediction_builder=np_leaf_good_rate_split_builder,
#        candidates=candidates)
#    print s, l

In [21]:
#with benchmark("NP SMART"):
#    s, l = np_single_variable_best_split(
#        features=np_feaures, var_idx=15, target=np_targets,
#        loss_fn=loss_fn, #gtree.error_rate_loss,
#        leaf_prediction_builder=np_leaf_good_rate_split_builder,
#        candidates=candidates)
#    print s, l

# Test multiple splits

-------------------

In [26]:
# Test Df Splitting
# Times the DataFrame-based best-split search across every feature column,
# using each column's precomputed candidates; the results are discarded.
with benchmark("DF Test"):
    for var in feature_names:
        gtree._df_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn,
            leaf_prediction_builder=prediction_builder,
            candidates = candidate_map[var])

DF Test : 86.1 seconds


In [39]:
# Same loop as the "DF Test" cell, but timing gtree's numpy-based search.
# NOTE: the loop rebinds the notebook-level `var`, which ends up as the last
# feature name rather than 'feature_15'.
with benchmark("NP Test"):
    for var in feature_names:
        gtree._np_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn,
            leaf_prediction_builder=prediction_builder,
            candidates = candidate_map[var])

NP Test : 10.9 seconds


# Misc

In [None]:
# Tiny 3x3 matrix and matching label vector for checking sort_by_col by hand.
X = np.array([[1, 7, 3],
              [4, 5, 6],
              [7, 8, 9]])
Y = np.array([1, 2, 3])

In [None]:
# Reorder the rows of X (and Y alongside) by column 1, whose values are 7, 5, 8.
sort_by_col(X, Y, 1)

In [None]:
# Display Y again: sort_by_col returns reordered copies and does not assign to Y.
Y

In [None]:
# Same toy frame/series pair as defined earlier in the notebook, re-created here.
x = pd.DataFrame({
    'a': [1, 2, 3, 4, 3, 6],
    'b': [10, 20, 30, 40, 50, 60]
})
y = pd.Series([1, 0, 0, 1, 0, 1])

In [None]:
# Experiment with scikit-learn's private tree internals (underscore modules,
# so presumably not a stable API across versions — confirm for the installed release).
from sklearn.tree import _criterion, _splitter

# Gini criterion built with arguments (1, [2]) — presumably one output with two classes; confirm.
c = _criterion.Gini(1, np.array([2]))
# NOTE: `s` is bound to the BestSplitter class itself here and shadows the `s`
# that held split values in the timing cells above.
s = _splitter.BestSplitter

In [None]:
# max_features is a number of feature columns, so take it from the column axis.
# len(df_features) is the number of rows (samples) in the frame.
max_features = df_features.shape[1]

# NOTE(review): sklearn's own tree code passes a RandomState instance for
# random_state rather than an int — confirm what this private constructor expects.
splitter = s(c,
             max_features,
             min_samples_leaf=1,
             min_weight_leaf=1,
             random_state=1,
             presort=False)

In [None]:
# Was the incomplete attribute access `splitter.` (left over from tab
# completion), which is a syntax error; display the object instead.
splitter

In [None]:
# Look up node_split on the BestSplitter class bound to `s`.
s.node_split

In [None]:
# Rebinds `x` (previously the toy DataFrame) to a three-element Series.
x = pd.Series([1, 2, 3])

In [None]:
# Index the Series' index with a list of positions to get its labels in the order 2, 0, 1.
x.index[[2, 0, 1]]