In [1]:
from __future__ import division

In [2]:
import math
import random
import numpy as np
import pandas as pd

In [5]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
from timeit import default_timer as timer

class benchmark(object):

    def __init__(self, msg, fmt="%0.3g"):
        self.msg = msg
        self.fmt = fmt

    def __enter__(self):
        self.start = timer()
        return self

    def __exit__(self, *args):
        t = timer() - self.start
        print(("%s : " + self.fmt + " seconds") % (self.msg, t))
        self.time = t

In [26]:
with benchmark("Just A test"):
    
    print "Foo"

Foo
Just A test : 0.000242 seconds


# Investigate Optimal Splitting

Can we do splitting faster using raw numpy arrays rather than Pandas DataFrames?

In [7]:
df_features, df_targets = tools.make_random_classification(5000)

In [None]:
np_feaures = df_features.values
np_targets = df_targets.values

In [13]:
feature_names = df_features.columns

In [19]:
# Get the candidates in advance
candidate_map = {v: gtree._get_split_candidates(df_features[v]) for v in feature_names}

In [16]:
loss_fn = gtree.cross_entropy_loss
prediction_builder = gtree.leaf_good_rate_split_builder

In [None]:
# Create the numpy version

In [47]:
def np_leaf_good_rate_split_builder(features, target):
    """
    Assume the target consists of 0, 1
    """
    if len(target) > 0:
        mean = sum(target) / len(target)
    else:
        mean = 0

    return lambda arr: np.full(target.shape, mean) #pd.Series([mean for _ in range(len(df))], index=df.index)

In [168]:
def np_naive_single_variable_best_split(features, var_idx, target, loss_fn, leaf_prediction_builder, candidates=None):

    # Select the ith column
    srs = features[:, var_idx]
    candidates = list(srs) 

    best_loss = None
    best_split = None

    for val in candidates:
        
        #print srs, srs.shape
        #print features, features.shape
        
        left_condition = (srs <= val)
        # Get feature rows satisfying the constraint
        feat_left = features[left_condition, :]
        target_left = target[left_condition]
        left_leaf_predict_fn = leaf_prediction_builder(feat_left, target_left)
        left_predicted = left_leaf_predict_fn(feat_left)
        left_loss = loss_fn(left_predicted, target_left)
        #print "LEFT PREDICTED: ", left_predicted
        
        right_condition = (srs > val)
        # Get feature rows satisfying the constraint
        feat_right = features[right_condition, :]
        target_right = target[right_condition]
        right_leaf_predict_fn = leaf_prediction_builder(feat_right, target_right)
        right_predicted = right_leaf_predict_fn(feat_right)
        right_loss = loss_fn(right_predicted, target_right)
        #print "RIGHT PREDICTED: ", right_predicted

        #print "Left loss: ", left_loss, "Right Loss: ", right_loss
        avg_loss = (left_loss * left_condition.sum() + right_loss * right_condition.sum()) / (len(features))
        
        if best_loss is None or avg_loss < best_loss:
            best_split = val
            best_loss = avg_loss

    return best_split, best_loss

In [225]:
def sort_by_col(fs, t, idx):
    order = np.argsort(fs[:, idx])    
    return (fs[order], t[order])

def np_single_variable_best_split(features, var_idx, target, loss_fn, leaf_prediction_builder, candidates=None):

    # First, we sort the features by the ith idx
    print "SORTING"
    features, target = sort_by_col(features, target, var_idx)
    print "DONE"
    srs = features[:, var_idx]
    candidates = set(srs) 

    best_loss = None
    best_split = None
    
    split_value = None

    for idx in range(len(srs)):
        
        # We consider splits only at the first value
        # in a series
        #  1 1 1 2 2 2 3 3 3
        #       ^ -- SPLIT
        #
        if srs[idx] == split_value:
            continue
        else:
            split_value = srs[idx]
            
        if split_value not in candidates:
            continue
        
        #left_condition = (srs <= val)
        feat_left = features[0:idx, :]
        target_left = target[0:idx]
        left_leaf_predict_fn = leaf_prediction_builder(feat_left, target_left)
        left_predicted = left_leaf_predict_fn(feat_left)
        left_loss = loss_fn(left_predicted, target_left)
        
        #right_condition = (srs > val)
        feat_right = features[idx:-1, :]
        target_right = target[idx:-1]
        right_leaf_predict_fn = leaf_prediction_builder(feat_right, target_right)
        right_predicted = right_leaf_predict_fn(feat_right)
        right_loss = loss_fn(right_predicted, target_right)

        avg_loss = (left_loss * idx + right_loss * (len(target)-idx)) / (len(features))
        if best_loss is None or avg_loss < best_loss:
            best_split = split_value
            best_loss = avg_loss

    return best_split, best_loss

In [103]:
x = pd.DataFrame({
    'a': [1, 2, 3, 4, 3, 6],
    'b': [10, 20, 30, 40, 50, 60]
})
y = pd.Series([1, 0, 0, 1, 0, 1])

In [217]:
with benchmark("DF Single"):
    gtree._single_variable_best_split(
            df_features, 'feature_15', df_targets,
            loss_fn=loss_fn, #gtree.error_rate_loss,
            leaf_prediction_builder=prediction_builder,
            candidates = candidate_map[var])

DF Single : 0.834 seconds


In [224]:
with benchmark("NP NAIVE"):
    np_naive_single_variable_best_split(
        features=np_feaures, var_idx=15, target=np_targets,
        loss_fn=loss_fn, #gtree.error_rate_loss,
        leaf_prediction_builder=np_leaf_good_rate_split_builder)

NP Single : 10.7 seconds


In [226]:
with benchmark("NP SMART"):
    np_single_variable_best_split(
        features=np_feaures, var_idx=15, target=np_targets,
        loss_fn=loss_fn, #gtree.error_rate_loss,
        leaf_prediction_builder=np_leaf_good_rate_split_builder)

SORTING
DONE
NP SMART : 3.49 seconds


In [144]:
# Test Df Splitting
with benchmark("DF Test"):
    for var in feature_names[:5]:
        gtree._single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn,
            leaf_prediction_builder=prediction_builder,
            candidates = candidate_map[var])

DF Test : 3.45 seconds


In [171]:
with benchmark("NP Test"):
    for idx in range(df_features.shape[1])[:5]:
        np_single_variable_best_split(
            np_feaures, idx, np_targets,
            loss_fn=loss_fn,
            leaf_prediction_builder=np_leaf_good_rate_split_builder,
            candidates = candidate_map[var])

NP Test : 17.2 seconds


In [155]:
X = np.array([[1, 7, 3],
              [4, 5, 6],
              [7, 8, 9]])
Y = np.array([1, 2, 3])

In [156]:
sort_by_col(X, Y, 1)

(array([[4, 5, 6],
        [1, 7, 3],
        [7, 8, 9]]), array([2, 1, 3]))

In [153]:
Y

array([1, 2, 3])

In [None]:
x = pd.DataFrame({
    'a': [1, 2, 3, 4, 3, 6],
    'b': [10, 20, 30, 40, 50, 60]
})
y = pd.Series([1, 0, 0, 1, 0, 1])

In [239]:
from sklearn.tree import _criterion, _splitter

c = _criterion.Gini(1, np.array([2]))
s = _splitter.BestSplitter

In [243]:
max_features = len(df_features)

splitter = s(c,
             max_features,
             min_samples_leaf=1,
             min_weight_leaf=1,
             random_state=1,
             presort=False)

In [244]:
splitter.

SyntaxError: invalid syntax (<ipython-input-244-22ab8b1cb9af>, line 1)

In [247]:
s.node_split

AttributeError: type object 'sklearn.tree._splitter.BestSplitter' has no attribute 'node_split'