In [1]:
from __future__ import division

In [2]:
import math
import random
import numpy as np
import pandas as pd

In [3]:
from sklearn import datasets

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline

In [12]:
%load_ext autoreload
%autoreload 1

%aimport gtree

In [13]:
import logging
%config Application.log_level="INFO"
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

## Goals:

### Separate the structure of a tree from the data of a tree.  In other words,
  fitting a tree does two things: It creates the structure of a tree and it
  creates a mapping of each leaf to a value.  Lookup therefore requires both
  finding the leaf node AND using the map to lookup the value.
  
### The loss function optimized by the tree is configurable, as is the leaf prediction function (see `leaf_prediction_builder` below).


## Terms:

### Tree
A Tree is an object that takes input data and determines what leaf it ends up in.  Unlike many tree implementations, the Tree itself doesn't store data about the value of a leaf.  That is stored externally.


### loss_fn
A loss_fn is a function that takes the predicted targets for a set of rows and the actual targets for those same rows, and returns a single value that measures the "LOSS" or "COST" of that prediction (lower cost/loss is better).

```
def loss_fn(predicted_targets, actual_targets) -> float
```

A loss function must be additive (so, one should not apply a mean as a part of it)

### leaf_prediction_fn
A leaf_prediction_fn is a function which takes the features of the rows that end up in a leaf and returns a Series holding one prediction per row.  It is typically a constant function whose value is either the mean good rate in that leaf (among the actual training targets) or the median target, but it can be anything else.

```
def leaf_prediction_fn(features) -> pd.Series
```


### leaf_prediction_builder
A leaf_prediction_builder is a function which takes the features and actual targets that end up in a TRAINING leaf and returns a leaf_prediction_fn.  This leaf_prediction_fn is used to predict the value of testing rows that end up in the same leaf.

```
def leaf_prediction_builder(features, actual_targets) -> leaf_prediction_fn
```


### leaf_prediction_map
A leaf_prediction_map is a map of leaf ids (e.g. their hash) to the leaf_prediction_fn for that leaf.  One can only use a tree to score data if one has a leaf_prediction_map.  This design allows one to use the same tree as a subset of another tree without having their leaf values become entangled.

--------------

## Test Tree Manipulation Functions

In [7]:
%pdb off

Automatic pdb calling has been turned OFF


In [8]:
# Hand-build tree `t`: a root branch labelled 'A' (value 0.5) whose right child
# is a second branch labelled 'B' (value 0.9). Children are passed as None and
# attached afterwards through the .left / .right attributes.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.LeafNode()

# A second, independent one-branch tree `o`; the following cells graft it into `t`.
o = gtree.BranchNode('C', 0.1, None, None)
o.left = gtree.LeafNode()
o.right = gtree.LeafNode()

# Print both trees so the effect of the manipulation functions can be compared
# against this starting state.
t.prn()
print '\n\n'
o.prn()

	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)




	Leaf(0.97798875491)

C 0.1

	Leaf(0.72512513995)



In [9]:
u = gtree.replace_branch_split(t, t.right, o)
u.prn()
print '\n\n'
t.prn()

	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	C 0.1

		Leaf(0.453760363577)




	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)



In [10]:
v = gtree.replace_node(t, t.left, o)
v.prn()
print '\n\n'
t.prn()

		Leaf(0.97798875491)

	C 0.1

		Leaf(0.72512513995)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)




	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)



In [11]:
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.BranchNode('C', 0.9, None, None)
t.right.right.right = gtree.LeafNode()
t.right.right.left = gtree.LeafNode()

t.prn()
print '\n\n'
gtree.prune(t, 2).prn()
print '\n\n'
t.prn()

	Leaf(0.256080594178)

A 0.5

		Leaf(0.969367371707)

	B 0.9

			Leaf(0.187523824675)

		C 0.9

			Leaf(0.902891676411)




	Leaf(0.256080594178)

A 0.5

		Leaf(0.920887642809)

	B 0.9

		Leaf(0.171126608762)




	Leaf(0.256080594178)

A 0.5

		Leaf(0.969367371707)

	B 0.9

			Leaf(0.187523824675)

		C 0.9

			Leaf(0.902891676411)



In [12]:
data = pd.DataFrame({'A': [0.1, 10, .02],
                     'B': [10, 20, 30]},
                    index=['foo', 'bar', 'baz'])

In [13]:
def leaf_count_fn(val):
    """Build a constant leaf predictor.

    The returned callable takes a DataFrame and produces a Series that holds
    `val` once for every row, indexed exactly like the frame it was given.
    """
    def constant_prediction(df):
        repeated = [val] * len(df)
        return pd.Series(repeated, index=df.index)

    return constant_prediction

In [14]:
# Smallest possible tree: a single branch on column 'A' (value 0.5) with two leaves.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()   # mapped to the constant 10 below
t.right = gtree.LeafNode()  # mapped to the constant 20 below

# The tree itself stores no leaf values; predictions come from this external
# map, keyed by hash(leaf), whose values are leaf prediction functions.
leaf_map = {hash(t.left): leaf_count_fn(10),
            hash(t.right): leaf_count_fn(20)}

# In the recorded output the rows with A below 0.5 ('foo', 'baz') receive the
# left leaf's value and 'bar' (A = 10) receives the right leaf's value.
t.predict(data, leaf_map)

foo    10
bar    20
baz    10
dtype: int64

In [15]:
# Create a split on a DataFrame

df = pd.DataFrame({'foo': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})

gtree._single_variable_best_split(
    df,
    'foo',
    pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
    gtree.error_rate_loss,
    gtree.leaf_good_rate_split_builder)

(5, 0.19999999999999996)

In [16]:
threshold = 0.5
truth     = pd.Series([1, 0, 1])
predicted = pd.Series([0, 1, 0])

gtree.error_rate_loss(truth, predicted)


1.0 - ((predicted >= threshold) == truth).mean() #+ (predicted < threshold) * (1 - truth)

1.0

In [17]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

tree, leaf_map = gtree.train_greedy_tree(df, target, loss_fn=gtree.error_rate_loss)

print '\nTree:\n'
tree.prn()

print leaf_map

03:35:13 tree INFO:Training.  Depth 0 Current Loss: 0.4167 Best Split: B 40.0000 0.2500
03:35:13 tree INFO:Training.  Depth 1 Current Loss: 0.0000 Best Split: A 1.0000 0.0000
03:35:13 tree INFO:No split improves loss.  Returning
03:35:14 tree INFO:Training.  Depth 1 Current Loss: 0.3750 Best Split: A 10.0000 0.1250
03:35:14 tree INFO:Training.  Depth 2 Current Loss: 0.1667 Best Split: A 3.0000 0.1667
03:35:14 tree INFO:No split improves loss.  Returning
03:35:14 tree INFO:Training.  Depth 2 Current Loss: 0.0000 Best Split: A 11.0000 0.0000
03:35:14 tree INFO:No split improves loss.  Returning



Tree:

	Leaf(0.729188252837)

B 40

		Leaf(0.2499323379)

	A 10

		Leaf(0.429783676767)

{2167387241: <function <lambda> at 0x1106be320>, 4259768738: <function <lambda> at 0x110684848>, 1996908948: <function <lambda> at 0x1106be410>}


In [18]:
gtree.calculate_leaf_map(tree, df, target)

{1996908948: <function gtree.<lambda>>,
 2167387241: <function gtree.<lambda>>,
 4259768738: <function gtree.<lambda>>}

In [19]:
gtree.random_node(tree)

<gtree.LeafNode at 0x110665c90>

In [20]:
print gtree.get_all_nodes(tree)

[<gtree.BranchNode object at 0x1106add10>, <gtree.LeafNode object at 0x110665e50>, <gtree.BranchNode object at 0x1106b8450>, <gtree.LeafNode object at 0x110665e50>, <gtree.BranchNode object at 0x1106b8450>, <gtree.LeafNode object at 0x110665c90>, <gtree.LeafNode object at 0x1106c5490>, <gtree.LeafNode object at 0x110665c90>, <gtree.LeafNode object at 0x1106c5490>]


In [21]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

tree, leaf_map = gtree.train_greedy_tree(df, target, loss_fn=gtree.error_rate_loss,
                                         feature_sample_rate=.5,
                                         row_sample_rate=.5)

print '\nTree:\n'
tree.prn()

print leaf_map

03:35:19 tree INFO:Training.  Depth 0 Current Loss: 0.4167 Best Split: A 1.0000 0.1667
03:35:19 tree INFO:Reached leaf node, or constraints force termination.  Returning
03:35:19 tree INFO:Training.  Depth 1 Current Loss: 0.4545 Best Split: B 40.0000 0.2727
03:35:19 tree INFO:Training.  Depth 2 Current Loss: 0.0000 Best Split: A 2.0000 0.0000
03:35:19 tree INFO:No split improves loss.  Returning
03:35:19 tree INFO:Training.  Depth 2 Current Loss: 0.3750 Best Split: A 10.0000 0.1250
03:35:19 tree INFO:Training.  Depth 3 Current Loss: 0.1667 Best Split: A 3.0000 0.1667
03:35:19 tree INFO:No split improves loss.  Returning
03:35:19 tree INFO:Training.  Depth 3 Current Loss: 0.0000 Best Split: A 11.0000 0.0000
03:35:19 tree INFO:No split improves loss.  Returning



Tree:

	Leaf(0.251727490486)

A 1

		Leaf(0.930525089142)

	B 40

			Leaf(0.720837462476)

		A 10

			Leaf(0.60678913307)

{1383260795: <function <lambda> at 0x1106a21b8>, 4113978420: <function <lambda> at 0x110684cf8>, 1413170733: <function <lambda> at 0x1106a29b0>, 2691078375: <function <lambda> at 0x1106e51b8>}


In [22]:
gtree.mate(tree, tree).prn()

	Leaf(0.251727490486)

B 40

		Leaf(0.930525089142)

	A 1

			Leaf(0.720837462476)

		A 10

			Leaf(0.60678913307)



In [23]:
def make_hastie_sample(n_samples, random_state=None):
    """Generate a Hastie 10.2 binary classification sample as pandas objects.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``datasets.make_hastie_10_2``. Pass an int to obtain the
        same sample on every run; the default (None) keeps the previous,
        unseeded behaviour.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``features`` with columns ``feature_0 .. feature_{k-1}`` and
        ``targets`` (named 'target') holding 1.0 where the generated label is
        greater than zero and 0.0 otherwise.
    """
    raw_features, raw_targets = datasets.make_hastie_10_2(n_samples=n_samples,
                                                          random_state=random_state)

    column_names = ['feature_{}'.format(i) for i in range(raw_features.shape[1])]
    features = pd.DataFrame(raw_features, columns=column_names)

    # Recode the generated labels to {0.0, 1.0}: anything > 0 is the positive class.
    targets = (pd.Series(raw_targets, name='target') > 0).astype(float)
    return features, targets



In [14]:
def make_kddcup(n_samples, random_state=None):
    """Load the KDD Cup '99 'smtp' subset and return a random sample of it.

    Parameters
    ----------
    n_samples : int
        Number of rows to draw (without replacement) from the subset.
    random_state : int, RandomState instance or None, optional
        Seed for the row sample; the default (None) draws a different sample
        on every call.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``features`` with columns ``feature_0 .. feature_{k-1}`` and the
        matching ``targets`` (named 'target', same index as ``features``)
        encoded as 0.0 / 1.0.
    """
    def _label_to_binary(label):
        # The KDD labels are byte strings such as b'normal.' or b'smurf.', so
        # comparing them with 0 is meaningless (always true on Python 2, a
        # TypeError on Python 3).
        # NOTE(review): attacks are encoded as the positive class (1.0) and
        # 'normal.' traffic as 0.0 -- confirm this is the intended polarity.
        if isinstance(label, bytes):
            label = label.decode('utf-8')
        if isinstance(label, type(u'')):
            return 0.0 if label == u'normal.' else 1.0
        # Numeric labels keep the original "> 0 is positive" rule.
        return 1.0 if label > 0 else 0.0

    # fetch_kddcup99 returns a Bunch, not an (X, y) tuple; read its attributes
    # instead of tuple-unpacking it (which would unpack the Bunch's keys).
    kddcup = datasets.fetch_kddcup99(subset='smtp')
    raw_features, raw_targets = kddcup.data, kddcup.target

    column_names = ['feature_{}'.format(i) for i in range(raw_features.shape[1])]
    features = pd.DataFrame(raw_features, columns=column_names)
    targets = pd.Series(raw_targets, name='target').map(_label_to_binary)

    # The original referenced the misspelled name `featurse` here (NameError).
    features = features.sample(n=n_samples, random_state=random_state)

    return features, targets.loc[features.index]
    

def make_random_classification(n_samples, n_features=100, random_state=None):
    """Generate a synthetic two-class problem as pandas objects.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int, optional
        Total number of feature columns (default 100). It has to be large
        enough for the 8 informative features requested below.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``datasets.make_classification``. Pass an int to obtain
        the same data on every run; the default (None) keeps the previous,
        unseeded behaviour.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``features`` with columns ``feature_0 .. feature_{n_features-1}`` and
        ``targets`` (named 'target') holding 1.0 where the generated class is
        greater than zero and 0.0 otherwise.
    """
    raw_features, raw_targets = datasets.make_classification(n_samples=n_samples,
                                                             n_features=n_features,
                                                             n_informative=8,
                                                             n_classes=2,
                                                             n_clusters_per_class=4,
                                                             random_state=random_state)

    column_names = ['feature_{}'.format(i) for i in range(raw_features.shape[1])]
    features = pd.DataFrame(raw_features, columns=column_names)

    # Recode the class labels to floats {0.0, 1.0}.
    targets = (pd.Series(raw_targets, name='target') > 0).astype(float)

    # Both objects are built from freshly generated arrays and therefore share
    # the same default index, so no re-alignment of targets is required.
    return features, targets

In [15]:

#features, targets = make_hastie_sample(10000)
features, targets = make_random_classification(10000)

In [16]:
features.shape

(10000, 100)

In [17]:
targets.value_counts()

0.0    5012
1.0    4988
Name: target, dtype: int64

In [18]:
gtree.tree_logger.setLevel(logging.INFO)
tree, leaf_map = gtree.train_greedy_tree(features, targets, loss_fn=gtree.error_rate_loss, max_depth=8)

06:54:15 tree INFO:Training.  Depth 0 Current Loss: 0.4988 Best Split: feature_83 0.0698 0.3852
06:56:11 tree INFO:Training.  Depth 1 Current Loss: 0.3886 Best Split: feature_3 -1.8243 0.2763
06:57:17 tree INFO:Training.  Depth 2 Current Loss: 0.2949 Best Split: feature_17 -3.2804 0.2878
06:57:27 tree INFO:Training.  Depth 3 Current Loss: 0.2727 Best Split: feature_83 -2.9895 0.0455
06:57:29 tree INFO:Training.  Depth 4 Current Loss: 0.0000 Best Split: feature_0 0.2749 0.0000
06:57:29 tree INFO:No split improves loss.  Returning
06:57:37 tree INFO:Training.  Depth 4 Current Loss: 0.0588 Best Split: feature_3 -5.1263 0.0000
06:57:37 tree INFO:Reached leaf node, or constraints force termination.  Returning
06:57:45 tree INFO:Training.  Depth 5 Current Loss: 0.0000 Best Split: feature_0 -0.2473 0.0000
06:57:45 tree INFO:No split improves loss.  Returning
06:58:44 tree INFO:Training.  Depth 3 Current Loss: 0.2880 Best Split: feature_32 2.1051 0.2829
06:59:41 tree INFO:Training.  Depth 4 Cu

07:20:42 tree INFO:No split improves loss.  Returning
07:20:44 tree INFO:Training.  Depth 6 Current Loss: 0.2500 Best Split: feature_0 -0.4574 0.0000
07:20:44 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:20:45 tree INFO:Training.  Depth 7 Current Loss: 0.0000 Best Split: feature_0 -0.4382 0.0000
07:20:45 tree INFO:No split improves loss.  Returning
07:20:47 tree INFO:Training.  Depth 5 Current Loss: 0.2000 Best Split: feature_1 -2.3422 0.0000
07:20:47 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:20:49 tree INFO:Training.  Depth 6 Current Loss: 0.0000 Best Split: feature_0 0.6537 0.0000
07:20:49 tree INFO:No split improves loss.  Returning
07:21:42 tree INFO:Training.  Depth 4 Current Loss: 0.1684 Best Split: feature_31 -1.3284 0.1528
07:21:51 tree INFO:Training.  Depth 5 Current Loss: 0.3750 Best Split: feature_37 -1.3136 0.1250
07:21:53 tree INFO:Training.  Depth 6 Current Loss: 0.0000 Best Split: feature_0 -1.7719 0.0000

KeyboardInterrupt: 

In [None]:
tree.prn()

In [None]:
leaf_map

In [None]:
tree.predict(features, leaf_map)

In [None]:
results = pd.DataFrame({'truth': targets, 'prediction': tree.predict(features, leaf_map)})

In [None]:
1.0 - gtree.error_rate_loss(results.prediction, results.truth) / len(targets)

In [None]:
results.plot(kind='scatter', x='prediction', y='truth')

In [None]:
fig = plt.figure(figsize=(12,8))

for label, grp in tree.predict(features, leaf_map).groupby(targets):
    grp.hist(normed=True, alpha=0.5, label=str(label)) #, label=label)
plt.legend(loc='best')
None

# Compare Methods

In [19]:
features, targets = make_random_classification(5000)

features_validation = features.sample(frac=.3)
targets_validation = targets.loc[features_validation.index]

features = features[~features.index.isin(features_validation.index)]
targets = targets.loc[features.index]

In [None]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, targets,
                                   loss_fn=gtree.cross_entropy_loss,
                                   max_depth=2, min_to_split=10,
                                   num_generations=2, num_survivors=10,
                                   num_children=100, num_seed_trees=2)


06:02:41 evolution DEBUG:Growing Seed: 1 of 2
  return (-1.0 * truth * np.log(predicted) - (1.0 - truth) * np.log(1.0 - predicted)).mean()
06:03:36 evolution DEBUG:Growing Seed: 2 of 2
06:04:32 evolution DEBUG:Resplitting the data
06:04:32 evolution DEBUG:Mating to create 100 children


In [48]:
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

0.297333333333


In [56]:
generations[-1]['best_of_generation']['tree'].find_leaves(features).value_counts()

2078480799    1414
2437144351    1148
2570464104     938
dtype: int64

In [52]:
# Show every tree of the last generation under a banner carrying its testing loss.
final_generation = generations[-1]['generation']
for member in final_generation:
    banner = '--------------------{:.4f}----------------------------'.format(member['loss_testing'])
    print(banner)
    member['tree'].prn()

--------------------0.6624----------------------------
	Leaf(0.922108366641)

feature_50 -0.488387233513

		Leaf(0.78625028227)

	feature_99 -0.565015852089

		Leaf(0.00848191813269)

--------------------0.6642----------------------------
		Leaf(0.163847666623)

	feature_17 -0.565015852089

		Leaf(0.12723466858)

feature_50 -0.625108536211

		Leaf(0.957803398685)

	feature_47 0.756409780254

		Leaf(0.516671038657)

--------------------0.6650----------------------------
		Leaf(0.802550722777)

	feature_50 -0.565015852089

		Leaf(0.599164400845)

feature_50 1.26675090679

		Leaf(0.998781959327)

	feature_47 1.08799997383

		Leaf(0.470929887087)

--------------------0.6652----------------------------
		Leaf(0.402819908511)

	feature_60 -0.709362631916

		Leaf(0.563559424693)

feature_50 -0.113775250463

		Leaf(0.646840213024)

	feature_47 0.756409780254

		Leaf(0.0183416109128)

--------------------0.6652----------------------------
		Leaf(0.825303177451)

	feature_60 -0.709362631916

		L

In [None]:
# Print every tree of the last generation.
# The loop variable is deliberately NOT called `result`: that name holds the
# value returned by gtree.evolve and other cells read `result['tree']`, so
# reusing it here would silently replace the evolved result with the last
# member of the generation.
for gen in generations[-1]['generation']:
    print('---------------------------------------------')
    gen['tree'].prn()

In [None]:
result = gtree.train_random_trees(features, targets, loss_fn=gtree.error_rate_loss,
                                  max_depth=2,
                                  min_to_split=10,
                                  num_trees=10)

In [None]:
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

In [34]:
# Import the sklearn module under an alias: a bare `from sklearn import tree`
# rebinds the notebook-level name `tree`, which other cells use for the trained
# gtree model (tree.prn(), tree.predict(...)), and breaks them on re-run.
from sklearn import tree as sklearn_tree
from sklearn.model_selection import train_test_split

# Baseline for comparison: a depth-2 sklearn decision tree fitted on the same
# training split as the gtree models.
clf = sklearn_tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(features, targets)

In [35]:
predictions = pd.Series(clf.predict_proba(features_validation)[:, 1], index=features_validation.index)
gtree.error_rate_loss(predictions, targets_validation)

0.29600000000000004

In [36]:
from sklearn.externals.six import StringIO  
from sklearn import tree as sklearn_tree
import pydot 
dot_data = StringIO() 
sklearn_tree.export_graphviz(clf, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("iris.pdf") 

Oct 15 17:03:49  dot[71655] <Error>: The function ‘CGFontGetGlyphPath’ is obsolete and will be removed in an upcoming update. Unfortunately, this application, or a library it uses, is using this obsolete function, and is thereby contributing to an overall degradation of system performance.
Oct 15 17:03:49  dot[71655] <Error>: The function ‘CGFontGetGlyphPaths’ is obsolete and will be removed in an upcoming update. Unfortunately, this application, or a library it uses, is using this obsolete function, and is thereby contributing to an overall degradation of system performance.



True

In [37]:
%alias_magic t timeit

Created `%t` as an alias for `%timeit`.
Created `%%t` as an alias for `%%timeit`.


In [38]:
sel = features[features['feature_3'] < 0].index

In [39]:
sel

Int64Index([   0,    1,    2,    4,    7,    9,   13,   16,   17,   18,
            ...
            4961, 4969, 4973, 4975, 4981, 4982, 4987, 4990, 4994, 4997],
           dtype='int64', length=1755)

In [46]:
%t features.loc[sel]

The slowest run took 10.91 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 513 µs per loop


In [43]:
%t df.reindex_axis(sel, copy=False)

1000 loops, best of 3: 213 µs per loop
