In [1]:
from __future__ import division

In [2]:
import sys
sys.path.append('..')

In [3]:
import math
import random
import numpy as np
import pandas as pd

In [4]:
import gtree

In [5]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools
%aimport tree._my_tree

# The goal of this notebook is to explore ways to optimize our splitting algorithm, which accounts for a large part of this code's runtime cost

In [18]:
from timeit import default_timer as timer

class benchmark(object):
    """Context manager that times its ``with`` body and prints the duration.

    ``msg`` labels the printed line and ``fmt`` is the %-style format applied
    to the elapsed seconds. Once the block has exited, the measured duration
    is also available on the instance as ``time``.
    """

    def __init__(self, msg, fmt="%0.3g"):
        self.msg, self.fmt = msg, fmt

    def __enter__(self):
        # Remember when the block started; return the instance so it can be
        # bound with ``as``.
        self.start = timer()
        return self

    def __exit__(self, *args):
        # Nothing is returned, so an exception raised in the body propagates.
        elapsed = timer() - self.start
        template = "%s : " + self.fmt + " seconds"
        print(template % (self.msg, elapsed))
        self.time = elapsed

In [19]:
# Smoke test for the benchmark context manager: the body prints "Foo" and
# benchmark prints the elapsed time when the block exits.
with benchmark("Just A test"):
    print "Foo"

Foo
Just A test : 0.00032 seconds


# Define the 'alternative' split functions

In [7]:
def cross_entropy_loss(predicted, truth):
    """Mean binary cross-entropy between ``predicted`` and ``truth``.

    Returns 0.0 when ``truth`` is empty. Otherwise the predictions are
    clipped to [0.000001, 0.999999] so that neither logarithm receives 0,
    and the per-element losses are averaged.
    """
    if len(truth) == 0:
        return 0.0

    clipped = np.clip(predicted, 0.000001, .999999)
    per_sample = -1.0 * truth * np.log(clipped) - (1.0 - truth) * np.log(1.0 - clipped)
    return per_sample.mean()

    
def leaf_good_rate_prediction_builder(_, target):
    """Build a predictor that always outputs the mean of ``target``.

    The first argument (the leaf's features) is ignored. The mean is
    ``target.sum() / len(target)``, or 0 when ``target`` is empty. The
    returned callable takes a feature collection ``fs`` and returns a numpy
    array holding that mean once per element of ``fs``.
    """
    n_rows = len(target)
    mean = target.sum() / n_rows if n_rows > 0 else 0

    def predict(fs):
        return np.array([mean] * len(fs))

    return predict

In [23]:
def sort_by_col(fs, t, idx):
    """Reorder the rows of ``fs`` and the entries of ``t`` by column ``idx`` of ``fs``.

    Returns a ``(features, targets)`` tuple of new arrays in ascending order
    of that column; the inputs are not modified.
    """
    permutation = fs[:, idx].argsort()
    sorted_features = fs[permutation]
    sorted_targets = t[permutation]
    return sorted_features, sorted_targets

def _np_single_variable_best_split(df, var, target, loss_fn, leaf_prediction_builder, candidates=None):
    X = df.values.copy()
    Y = target.values.copy()
    var_idx = list(df.columns).index(var)

    # First, we sort the features by the ith idx
    # order = np.argsort(df[:, var_idx])
    # features = X[order]
    # target = Y[order]
    features, target = sort_by_col(X, Y, var_idx)
    # IDX = df.index[order]
    srs = X[:, var_idx]
    if candidates is None:
        candidates = _get_split_candidates(srs)

        # candidates = set(srs)

    best_loss = None
    best_split = None

    split_value = None

    for idx in range(len(srs)):

        # We consider splits only at the first value
        # in a series
        #  0  1  2  3  4  5  6  7  8
        #  1  1  1  2  2  2  3  3  3
        #           ^ --- SPLIT
        #    [0:3]    [3:-1]
        #
        #
        if srs[idx] == split_value:
            continue
        else:
            split_value = srs[idx]

        if split_value not in candidates:
            continue

        # left_condition = (srs < val)
        np_feat_left = features[0:idx, :]
        np_target_left = target[0:idx]

        left_leaf_predict_fn = leaf_prediction_builder(np_feat_left, np_target_left)
        left_predicted = left_leaf_predict_fn(np_feat_left)
        left_loss = loss_fn(left_predicted, np_target_left)

        # right_condition = (srs >= val)
        np_feat_right = features[idx:len(X), :]
        np_target_right = target[idx:len(X)]

        right_leaf_predict_fn = leaf_prediction_builder(np_feat_right, np_target_right)
        right_predicted = right_leaf_predict_fn(np_feat_right)
        right_loss = loss_fn(right_predicted, np_target_right)

        avg_loss = (left_loss * len(np_feat_left) + right_loss * (len(np_feat_right))) / (len(features))

        # print "Idx: {} Split Val: {:.3f} Left Loss: {:.3f} Right Loss: {:.3f} Avg Loss: {:.3f} Is Best?: {}".format(
        #    idx, split_value, left_loss, right_loss, avg_loss, avg_loss < best_loss
        # )

        if best_loss is None or avg_loss < best_loss:
            best_split = split_value
            best_loss = avg_loss

    if best_loss is None:
        raise Exception()

    return best_split, best_loss


def _hybrid_single_variable_best_split(df, var, target, loss_fn, leaf_prediction_builder, candidates=None):
    X = df.values.copy()
    Y = target.values.copy()
    var_idx = list(df.columns).index(var)

    # First, we sort the features by the ith idx
    # order = np.argsort(df[:, var_idx])
    # features = X[order]
    # target = Y[order]
    features, target = sort_by_col(X, Y, var_idx)
    # IDX = df.index[order]
    srs = X[:, var_idx]
    if candidates is None:
        candidates = _get_split_candidates(srs)

        # candidates = set(srs)

    best_loss = None
    best_split = None

    split_value = None

    for idx in range(len(srs)):

        # We consider splits only at the first value
        # in a series
        #  0  1  2  3  4  5  6  7  8
        #  1  1  1  2  2  2  3  3  3
        #           ^ --- SPLIT
        #    [0:3]    [3:-1]
        #
        #
        if srs[idx] == split_value:
            continue
        else:
            split_value = srs[idx]

        if split_value not in candidates:
            continue

        # left_condition = (srs < val)
        np_feat_left = features[0:idx, :]
        np_target_left = target[0:idx]
        idx_left = df.index[0:idx]

        df_feat_left = pd.DataFrame(np_feat_left, index=idx_left)
        df_targ_left = pd.Series(np_target_left, index=idx_left)

        left_leaf_predict_fn = leaf_prediction_builder(df_feat_left, df_targ_left)
        left_predicted = left_leaf_predict_fn(df_feat_left)
        left_loss = loss_fn(left_predicted, df_targ_left)

        # right_condition = (srs >= val)
        np_feat_right = features[idx:len(X), :]
        np_target_right = target[idx:len(X)]
        idx_right = df.index[idx:len(X)]

        df_feat_right = pd.DataFrame(np_feat_right, index=idx_right)
        df_targ_right = pd.Series(np_target_right, index=idx_right)

        right_leaf_predict_fn = leaf_prediction_builder(df_feat_right, df_targ_right)
        right_predicted = right_leaf_predict_fn(df_feat_right)
        right_loss = loss_fn(right_predicted, df_targ_right)

        avg_loss = (left_loss * len(np_feat_left) + right_loss * (len(np_feat_right))) / (len(features))

        # print "Idx: {} Split Val: {:.3f} Left Loss: {:.3f} Right Loss: {:.3f} Avg Loss: {:.3f} Is Best?: {}".format(
        #    idx, split_value, left_loss, right_loss, avg_loss, avg_loss < best_loss
        # )

        if best_loss is None or avg_loss < best_loss:
            best_split = split_value
            best_loss = avg_loss

    if best_loss is None:
        raise Exception()

    return best_split, best_loss


def _df_single_variable_best_split(df, var, target, loss_fn, leaf_prediction_builder, candidates=None):
    # Convention:
    # Left is BAD
    # Right is GOOD

    # TODO: Optimize me!
    # Try: df.reindex_axis(index, copy=False)
    # or:  df.reindex(index=['a', 'b'], copy=False)
    # or even: df._reindex_axes(axes={'index':df.index, 'columns': df.columns},
    # copy=False, level=None, limit=None, tolerance=None, method=None, fill_value=None)
    # From generic.py: 2594


    df = df.sort_values(by=var)
    target = target.loc[df.index]

    srs = df[var]

    if candidates is None:
        candidates = _get_split_candidates(srs)

    if len(srs) <= len(candidates):
        candidates = srs.values

    best_loss = None
    best_split = None

    for idx in range(len(df)):

        val = df.iloc[idx][var]

        if val not in candidates:
            continue

        # left_idx = df.iloc[0, idx] #index[(srs <= val)]
        df_left = df.iloc[0:idx]  # df.reindex_axis(left_idx, copy=False)  # df.loc[left_idx]
        target_left = target.iloc[0:idx]
        left_leaf_predict_fn = leaf_prediction_builder(df_left, target_left)
        left_predicted = left_leaf_predict_fn(df_left)

        # right_idx = df.index[(srs > val)]
        df_right = df.iloc[idx:len(df)]  # reindex_axis(right_idx, copy=False)  # df.loc[right_idx]
        target_right = target.iloc[idx:len(df)]  # .loc[right_idx]
        right_leaf_predict_fn = leaf_prediction_builder(df_right, target_right)
        right_predicted = right_leaf_predict_fn(df_right)

        left_loss = loss_fn(left_predicted, target_left)
        assert pd.notnull(left_loss), "Loss yielded null value"
        right_loss = loss_fn(right_predicted, target_right)
        assert pd.notnull(right_loss), "Loss yielded null value"

        avg_loss = (left_loss * len(df_left) + right_loss * len(df_right)) / (len(df))

        if best_loss is None or avg_loss < best_loss:
            best_split = val
            best_loss = avg_loss

    return best_split, best_loss

# Investigate Optimal Splitting

Can we do splitting faster using raw numpy arrays rather than Pandas DataFrames?

In [9]:
# Generate a synthetic classification dataset with the project's tools module
# (presumably 5000 is the number of samples; confirm against
# tools.make_random_classification), then cast features and targets to float32.
df_features, df_targets = tools.make_random_classification(5000)
df_features = pd.DataFrame(df_features, dtype='float32')
df_targets = pd.Series(df_targets, dtype='float32')

In [10]:
# Plain float32 numpy versions of the same data, passed to
# tree._my_tree.getBestSplit in the cells below.
# NOTE: `np_feaures` is a misspelling of "features"; later cells rely on this
# exact spelling, so renaming it requires updating them all together.
np_feaures = np.array(df_features.values, dtype='float32')
np_targets = np.array(df_targets.values, dtype='float32')

In [11]:
# Column labels of the feature frame; used as keys for candidate_map and as
# the iteration order in the multi-feature benchmarks.
feature_names = df_features.columns

In [12]:
# Get the candidates in advance
# One entry per feature column, so the benchmark cells below can pass
# candidates in instead of computing them inside the timed block.
candidate_map = {v: gtree._get_split_candidates(df_features[v]) for v in feature_names}

In [14]:
#loss_fn = gcross_entropy_loss
#prediction_builder = gtree.leaf_good_rate_prediction_builder

In [None]:
# Create the numpy version

In [15]:
# Test Loss Function
# `pred` holds the float32 targets and `truth` is all zeros.
# NOTE(review): the call passes (truth, pred); confirm that this matches the
# argument order of CrossEntropyLoss.loss.
pred = pd.Series(df_targets, dtype=np.float32).values
truth = np.zeros(len(df_targets), np.float32)

tree._my_tree.CrossEntropyLoss().loss(truth, pred)


8.300631523132324

In [16]:
# Feature used by every single-variable benchmark below.
var = 'feature_15'
# Derive the column position from the label (same lookup the Python split
# functions do) so the array-based calls always time the same column.
var_idx = list(feature_names).index(var)
# A set gives constant-time membership tests inside the split loops.
candidates = set(candidate_map[var])

In [20]:
# Time the pandas-only implementation defined in this notebook on the single
# feature `var`, then print the (split, loss) pair it found.
with benchmark("GTREE DF Single"):
    s, l = _df_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=cross_entropy_loss,
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates = candidates)
    print s, l

0.444324 0.692970100312
GTREE DF Single : 0.856 seconds


In [24]:
# Time the hybrid implementation (numpy sorting, pandas wrappers for each
# side) with the same inputs, loss, and leaf builder as the DF benchmark.
with benchmark("GTREE HYBRID Single"):
    s, l = _hybrid_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=cross_entropy_loss,
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates = candidates)
    print s, l

-0.83052 0.692961128812
GTREE HYBRID Single : 0.275 seconds


In [25]:
# Time the numpy-only implementation with the same inputs, loss, and leaf
# builder as the DF and hybrid benchmarks.
with benchmark("GTREE NP Single"):
    s, l = _np_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=cross_entropy_loss, #gtree.error_rate_loss,
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates = candidates)
    print s, l

-0.83052 0.692961128812
GTREE NP Single : 0.0972 seconds


In [27]:
# Time tree._my_tree.getBestSplit (presumably the Cython implementation, per
# the benchmark label) on the float32 arrays, using its own CrossEntropyLoss
# and MeanLeafMapperBuilder objects. The column is selected by position
# (var_idx) rather than by label.
with benchmark("NP CYTHON"):
    s, l = tree._my_tree.getBestSplit(
        np_feaures,
        var_idx,
        np_targets,
        tree._my_tree.CrossEntropyLoss(),
        tree._my_tree.MeanLeafMapperBuilder(),
        candidates)
    print s, l

-0.00128457322717 5.64174222946
NP CYTHON : 0.0227 seconds


In [29]:
#lmb = 
#ce = 
#spliter = tree._my_tree.SpitFinder()

# Same call as the "NP CYTHON" benchmark, with ErrorRateLoss substituted for
# the loss object.
with benchmark("NP CYTHON ERROR RATE LOSS"):
    s, l = tree._my_tree.getBestSplit(
        np_feaures,
        var_idx,
        np_targets,
        tree._my_tree.ErrorRateLoss(),
        tree._my_tree.MeanLeafMapperBuilder(),
        candidates)
    print s, l

-1.63080155849 0.49979993701
NP CYTHON ERROR RATE LOSS : 0.0131 seconds


In [30]:
# Same call as the "NP CYTHON" benchmark, with RandomLoss substituted for the
# loss object (presumably a cheap loss used to isolate the cost of the split
# loop itself; confirm against tree._my_tree).
with benchmark("NP CYTHON RANDOM LOSS"):
    s, l = tree._my_tree.getBestSplit(
        np_feaures,
        var_idx,
        np_targets,
        tree._my_tree.RandomLoss(),
        tree._my_tree.MeanLeafMapperBuilder(),
        candidates)
    print s, l

1.93952488899 0.029166419059
NP CYTHON RANDOM LOSS : 0.0097 seconds


In [None]:
# NOT YET IMPLEMENTED
# NOTE: `spliter` is only assigned in the commented-out line below, so running
# this cell as-is raises NameError at the getBestSplit call.

lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
#spliter = tree._my_tree.SpitFinder()

with benchmark("SKLEARN ADAPTED"):
    s, l = spliter.getBestSplit(
        var_idx,
        candidates,
        np_feaures,
        np_targets,
        lmb,
        ce)
    print s, l

In [None]:
#with benchmark("NP NAIVE"):
#    s, l = np_naive_single_variable_best_split(
#        features=np_feaures, var_idx=var_idx, target=np_targets,
#        loss_fn=loss_fn, #gtree.error_rate_loss,
#        leaf_prediction_builder=np_leaf_good_rate_split_builder,
#        candidates=candidates)
#    print s, l

In [None]:
#with benchmark("NP SMART"):
#    s, l = np_single_variable_best_split(
#        features=np_feaures, var_idx=15, target=np_targets,
#        loss_fn=loss_fn, #gtree.error_rate_loss,
#        leaf_prediction_builder=np_leaf_good_rate_split_builder,
#        candidates=candidates)
#    print s, l

In [None]:
# Inspect what kind of object gtree.error_rate_loss is.
type(gtree.error_rate_loss)

# Test multiple splits

-------------------

In [None]:
# Test Df Splitting
# Time the gtree module's DataFrame-based splitter over every feature column.
# The loss and leaf builder are the functions defined in this notebook, the
# same ones the "NP Test" cell passes; `loss_fn` / `prediction_builder` are
# never assigned anywhere in the notebook.
with benchmark("DF Test"):
    for var in feature_names:
        gtree._df_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=cross_entropy_loss,
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates = candidate_map[var])

In [None]:
# Time the gtree module's numpy-based splitter over every feature column.
# NOTE: this calls gtree._np_single_variable_best_split, not the function of
# the same name defined earlier in this notebook.
with benchmark("NP Test"):
    for var in feature_names:
        gtree._np_single_variable_best_split(
            df_features, var, df_targets,
            loss_fn=cross_entropy_loss,
            leaf_prediction_builder=leaf_good_rate_prediction_builder,
            candidates = candidate_map[var])

In [None]:
# Build the leaf-mapper and loss objects once, outside the timed block, then
# time tree._my_tree.getBestSplit over every feature column. The column is
# addressed by its position `i`; candidates are converted to a set per feature
# inside the timed loop.
lmb = tree._my_tree.MeanLeafMapperBuilder()
ce = tree._my_tree.CrossEntropyLoss()
#spliter = tree._my_tree.SpitFinder()

with benchmark("NP CYTHON"):
    for i, var in enumerate(feature_names):
        tree._my_tree.getBestSplit(
            np_feaures,
            i,
            np_targets,
            ce,
            lmb,
            set(candidate_map[var]))

# Misc

In [None]:
# Tiny 3x3 feature matrix and matching targets for trying out sort_by_col.
# Column 1 (7, 5, 8) is the only column that is not already in ascending order.
X = np.array([[1, 7, 3],
              [4, 5, 6],
              [7, 8, 9]])
Y = np.array([1, 2, 3])

In [None]:
# Sort the toy rows and targets by column 1 and display the resulting tuple.
sort_by_col(X, Y, 1)

In [None]:
# Display Y again (presumably to confirm the call above left it unchanged;
# sort_by_col returns new arrays instead of sorting in place).
Y

In [None]:
# Small toy frame and target. Column 'a' is unsorted and contains a repeated
# value (3).
x = pd.DataFrame({
    'a': [1, 2, 3, 4, 3, 6],
    'b': [10, 20, 30, 40, 50, 60]
})
y = pd.Series([1, 0, 0, 1, 0, 1])

In [None]:
# NOTE: _criterion and _splitter are private scikit-learn modules (leading
# underscore); their constructor signatures may differ between versions.
from sklearn.tree import _criterion, _splitter

# Gini criterion built with arguments (1, [2]); presumably one output with
# two classes (confirm against the installed scikit-learn).
c = _criterion.Gini(1, np.array([2]))
# `s` is the BestSplitter class itself, not an instance.
s = _splitter.BestSplitter

In [None]:
# max_features is a count of feature columns; len(df_features) would be the
# number of rows.
max_features = df_features.shape[1]

# NOTE(review): BestSplitter is a private scikit-learn class; this argument
# list has not been checked against the installed version.
splitter = s(c,
             max_features,
             min_samples_leaf=1,
             min_weight_leaf=1,
             random_state=1,
             presort=False)

In [None]:
# List the attributes available on the splitter instance (the bare
# `splitter.` left over from tab completion is a SyntaxError when run).
dir(splitter)

In [None]:
# Look at the node_split attribute on the BestSplitter class.
s.node_split

In [None]:
# NOTE: rebinds `x`, which an earlier cell bound to a DataFrame.
x = pd.Series([1, 2, 3])

In [None]:
# Indexing an Index with a list of positions returns the labels in that order.
x.index[[2, 0, 1]]