In [1]:
from __future__ import division

In [2]:
import sys
sys.path.append('..')

In [3]:
import math
import random
import numpy as np
import pandas as pd

In [4]:
import gtree

In [5]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools
%aimport tree._my_tree

In [6]:
# Smoke check that tree._my_tree (loaded through %aimport above) is callable.
tree._my_tree.foo(10)

100.0

# The goal of this notebook is to explore ways to optimize our splitting algorithm, which is a huge part of the cost of this code

In [7]:
from timeit import default_timer as timer

class benchmark(object):
    """Context manager that times the body of a ``with`` block.

    On exit it prints ``"<msg> : <elapsed> seconds"`` (the elapsed value is
    rendered with ``fmt``) and then stores the elapsed seconds, as measured by
    ``timeit.default_timer``, on the instance as ``time``.  ``__exit__``
    returns nothing, so exceptions raised inside the block still propagate.
    """

    def __init__(self, msg, fmt="%0.3g"):
        # msg: label printed in front of the timing.
        # fmt: %-style format applied to the elapsed seconds.
        self.msg, self.fmt = msg, fmt

    def __enter__(self):
        self.start = timer()
        return self

    def __exit__(self, *args):
        elapsed = timer() - self.start
        template = "%s : " + self.fmt + " seconds"
        print(template % (self.msg, elapsed))
        # Recorded after printing, so callers can read it once the block ends.
        self.time = elapsed

In [8]:
# Quick check that the benchmark context manager prints its timing line.
# print(...) with a single argument emits the same text on Python 2 and 3.
with benchmark("Just A test"):
    print("Foo")

Foo
Just A test : 0.000389 seconds


# Investigate Optimal Splitting

Can we do splitting faster using raw numpy arrays rather than Pandas DataFrames?

In [9]:
# Generate a random classification problem (5000 is presumably the number of
# samples — TODO confirm against tools.make_random_classification) and hold
# the features / targets as float32 pandas objects.
df_features, df_targets = tools.make_random_classification(5000)
df_features = pd.DataFrame(df_features, dtype='float32')
df_targets = pd.Series(df_targets, dtype='float32')

In [10]:
# float32 NumPy copies of the same features / targets for the array-based
# split finders timed further down.
# NOTE: `np_feaures` is a misspelling of "np_features", but later cells refer
# to it by this exact name, so it must stay as is unless all of them change.
np_feaures = np.array(df_features.values, dtype='float32')
np_targets = np.array(df_targets.values, dtype='float32')

In [11]:
# Column labels of the feature frame; used below as candidate_map keys and as
# the iteration order of the multi-feature timing loops.
feature_names = df_features.columns

In [12]:
# Compute every feature's split candidates once, up front, keyed by column
# label, so the timed cells below can look them up instead of recomputing.
candidate_map = {v: gtree._get_split_candidates(df_features[v]) for v in feature_names}

In [13]:
# Loss function and leaf-prediction builder handed to the gtree DataFrame and
# hybrid split finders in the timing cells below.
loss_fn = gtree.cross_entropy_loss
prediction_builder = gtree.leaf_good_rate_prediction_builder

In [14]:
# Create the numpy version

In [15]:
# Test Loss Function
# Feeds the targets themselves as predictions against an all-zero truth
# vector (both float32) into tree._my_tree's CrossEntropyLoss.
# NOTE(review): the argument order here is (truth, pred), whereas the pure
# Python cross_entropy_loss defined in this notebook takes (predicted, truth).

pred = pd.Series(df_targets, dtype=np.float32).values
truth = np.zeros(len(df_targets), np.float32)

tree._my_tree.CrossEntropyLoss().loss(truth, pred)


8.300631523132324

In [38]:
def cross_entropy_loss(predicted, truth):
    """Return the mean binary cross-entropy of ``predicted`` against ``truth``.

    ``predicted`` is clipped to ``[0.000001, 0.999999]`` before taking logs so
    that neither ``log(p)`` nor ``log(1 - p)`` is evaluated at zero.  An empty
    ``truth`` yields ``0.0``.  Both arguments are expected to support NumPy
    elementwise arithmetic.
    """
    if len(truth) > 0:
        clipped = np.clip(predicted, 0.000001, .999999)
        # Contribution of the positive class: -t * log(p)
        positive_term = -1.0 * truth * np.log(clipped)
        # Contribution of the negative class: (1 - t) * log(1 - p)
        negative_term = (1.0 - truth) * np.log(1.0 - clipped)
        return (positive_term - negative_term).mean()

    return 0.0

    
def leaf_good_rate_prediction_builder(_, target):
    """Build a leaf predictor that always predicts the mean of ``target``.

    Parameters
    ----------
    _ : ignored
        Unused first argument (presumably the leaf's features, kept so all
        builders share one signature — TODO confirm).
    target : array-like
        Leaf targets; must support ``.sum()`` and ``len()``.

    Returns
    -------
    callable
        ``predict(fs)`` returning a 1-D NumPy array of length ``len(fs)``
        whose every entry is the target mean (``0.0`` for an empty target).
    """
    # Use 0.0 rather than the int 0 so an empty leaf still yields
    # floating-point predictions like every other leaf.
    mean = target.sum() / len(target) if len(target) > 0 else 0.0

    # np.full allocates the constant vector directly instead of building a
    # Python list element by element.
    return lambda fs: np.full(len(fs), mean)

In [17]:
# Feature used by the single-split timing cells below, plus its candidates.
# NOTE(review): var_idx is presumably the column position of `var` inside
# np_feaures — the two are set by hand and must be kept in sync; confirm.
var = 'feature_15'
var_idx = 15
candidates = set(candidate_map[var])

In [31]:
# Time the pandas DataFrame split finder on the single feature `var`.
with benchmark("GTREE DF Single"):
    s, l = gtree._df_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn,  # alternative: gtree.error_rate_loss
            leaf_prediction_builder=prediction_builder,
            candidates=candidates)
    # s, l are presumably the best split value and its loss.  Explicit
    # formatting prints "s l" identically on Python 2 and Python 3.
    print("%s %s" % (s, l))

GTREE DF Single : 0.0103 seconds


TypeError: 'tree._my_tree.MeanLeafMapperBuilder' object is not callable

In [None]:
# Time the hybrid split finder on the single feature `var`.
with benchmark("GTREE HYBRID Single"):
    s, l = gtree._hybrid_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=loss_fn,  # alternative: gtree.error_rate_loss
            leaf_prediction_builder=prediction_builder,
            candidates=candidates)
    # Explicit formatting prints "s l" identically on Python 2 and Python 3.
    print("%s %s" % (s, l))

In [34]:
# Time gtree's NumPy split finder on the single feature `var`, using the
# pure-Python loss and leaf builder defined earlier in this notebook.
with benchmark("GTREE NP Single"):
    s, l = gtree._np_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=cross_entropy_loss,  # alternative: gtree.error_rate_loss
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates=candidates)
    # Explicit formatting prints "s l" identically on Python 2 and Python 3.
    print("%s %s" % (s, l))

1.58485 0.692938759426
GTREE NP Single : 0.0955 seconds


In [35]:
lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
#spliter = tree._my_tree.SpitFinder()

# NOTE(review): lmb and ce are built above, but the call below passes gtree's
# loss and leaf builder instead; the multi-feature "NP CYTHON" cell further
# down passes ce / lmb.  Confirm which pairing this timing should measure.
with benchmark("NP CYTHON"):
    s, l = tree._my_tree.getBestSplit(
        np_feaures,
        var_idx,
        np_targets,
        gtree.cross_entropy_loss,
        gtree.leaf_good_rate_prediction_builder,
        candidates)
    # Explicit formatting prints "s l" identically on Python 2 and Python 3.
    print("%s %s" % (s, l))

2.44881868362 6.15088033676
NP CYTHON : 0.0407 seconds


In [36]:
#lmb = tree._my_tree.MeanLeafMapperBuilder()
#ce = tree._my_tree.ErrorRateLoss()
#spliter = tree._my_tree.SpitFinder()

# Same tree._my_tree.getBestSplit call as the "NP CYTHON" cell, but with
# gtree's error-rate loss instead of cross-entropy.
with benchmark("NP CYTHON ERROR RATE LOSS"):
    s, l = tree._my_tree.getBestSplit(
        np_feaures,
        var_idx,
        np_targets,
        gtree.error_rate_loss,
        gtree.leaf_good_rate_prediction_builder,
        candidates)
    # Explicit formatting prints "s l" identically on Python 2 and Python 3.
    print("%s %s" % (s, l))

1.25406587124 0.49979993701
NP CYTHON ERROR RATE LOSS : 0.0125 seconds


In [None]:
lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
# NOTE(review): `spliter` is never bound in the visible notebook — its only
# definition is the commented-out line below — so this cell raises NameError
# until that is resolved.
#spliter = tree._my_tree.SpitFinder()

with benchmark("SKLEARN ADAPTED"):
    s, l = spliter.getBestSplit(
        var_idx,
        candidates,
        np_feaures,
        np_targets,
        lmb,
        ce)
    # Previously `print s, la\`: an undefined name followed by a stray line
    # continuation.  Print the unpacked pair like the other timing cells do.
    print("%s %s" % (s, l))

In [None]:
#with benchmark("NP NAIVE"):
#    s, l = np_naive_single_variable_best_split(
#        features=np_feaures, var_idx=var_idx, target=np_targets,
#        loss_fn=loss_fn, #gtree.error_rate_loss,
#        leaf_prediction_builder=np_leaf_good_rate_split_builder,
#        candidates=candidates)
#    print s, l

In [None]:
#with benchmark("NP SMART"):
#    s, l = np_single_variable_best_split(
#        features=np_feaures, var_idx=15, target=np_targets,
#        loss_fn=loss_fn, #gtree.error_rate_loss,
#        leaf_prediction_builder=np_leaf_good_rate_split_builder,
#        candidates=candidates)
#    print s, l

# Test multiple splits

-------------------

In [None]:
# Time the DataFrame split finder across every feature.
with benchmark("DF Test"):
    # The loop variable is deliberately not named `var`: that global selects
    # the feature for the single-split cells above (paired with var_idx) and
    # must not be overwritten by running this cell.
    for feature_name in feature_names:
        gtree._df_single_variable_best_split(
            df_features, feature_name, df_targets,
            loss_fn=loss_fn,
            leaf_prediction_builder=prediction_builder,
            candidates=candidate_map[feature_name])

In [48]:
# Time gtree's NumPy split finder across every feature.
with benchmark("NP Test"):
    # The loop variable is deliberately not named `var`: that global selects
    # the feature for the single-split cells above (paired with var_idx) and
    # must not be overwritten by running this cell.
    for feature_name in feature_names:
        gtree._np_single_variable_best_split(
            df_features, feature_name, df_targets,
            loss_fn=cross_entropy_loss,
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates=candidate_map[feature_name])

NP Test : 10.5 seconds


In [49]:
lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
#spliter = tree._my_tree.SpitFinder()

# Time tree._my_tree.getBestSplit across every feature, addressing columns of
# np_feaures by position.
with benchmark("NP CYTHON"):
    # The loop names avoid `var`: that global selects the feature for the
    # single-split cells above and must not be overwritten by this cell.
    for feature_idx, feature_name in enumerate(feature_names):
        tree._my_tree.getBestSplit(
            np_feaures,
            feature_idx,
            np_targets,
            ce,
            lmb,
            set(candidate_map[feature_name]))

NP CYTHON : 3.93 seconds


# Misc

In [None]:
# Small 3x3 matrix and matching 3-element vector used as inputs to the
# sort_by_col call in the next cell.
X = np.array([[1, 7, 3],
              [4, 5, 6],
              [7, 8, 9]])
Y = np.array([1, 2, 3])

In [None]:
# NOTE(review): sort_by_col is not defined or imported anywhere in the
# visible notebook, so this cell fails on a fresh kernel unless it is
# provided elsewhere.
sort_by_col(X, Y, 1)

In [None]:
# Display Y after the sort_by_col call (presumably to see whether it was
# reordered in place — TODO confirm).
Y

In [None]:
# Toy two-column frame and binary labels.  No later visible cell reads them;
# `x` is rebound to a Series further down.
x = pd.DataFrame({
    'a': [1, 2, 3, 4, 3, 6],
    'b': [10, 20, 30, 40, 50, 60]
})
y = pd.Series([1, 0, 0, 1, 0, 1])

In [None]:
from sklearn.tree import _criterion, _splitter

# NOTE(review): _criterion and _splitter are private sklearn modules; the Gini
# arguments are presumably (n_outputs, n_classes) — confirm against the
# installed sklearn version.
c = _criterion.Gini(1, np.array([2]))
# Binds the BestSplitter class itself, not an instance.  This rebinds `s`,
# which the timing cells above used for the first value returned by the
# split finders.
s = _splitter.BestSplitter

In [None]:
# max_features is the number of features (columns) the splitter may consider.
# len() of a DataFrame is its row count, so read the column count from shape.
max_features = df_features.shape[1]

# NOTE(review): the argument list follows the private sklearn splitter
# constructor of the version this notebook was written against — confirm it
# (including whether random_state must be a RandomState object) before
# running on another sklearn release.
splitter = s(c,
             max_features,
             min_samples_leaf=1,
             min_weight_leaf=1,
             random_state=1,
             presort=False)

In [None]:
# The cell previously held a bare `splitter.` (presumably left over from
# tab-completion), which is a SyntaxError.  List the attributes explicitly.
dir(splitter)

In [None]:
# Look up node_split on the BestSplitter class bound to `s` above.
s.node_split

In [None]:
# Three-element Series with the default integer index, used in the next cell.
x = pd.Series([1, 2, 3])

In [None]:
# Index the Series' index with a list of positions: yields the labels at
# positions 2, 0, 1 in that order.
x.index[[2, 0, 1]]