In [1]:
from __future__ import division

In [2]:
import sys
sys.path.append('..')

In [3]:
import math
import random
import numpy as np
import pandas as pd

In [4]:
from sklearn import datasets

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools

In [7]:
import logging
%config Application.log_level="INFO"
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

## Goals:

### Separate the structure of a tree from the data of a tree.  In other words,
  fitting a tree does two things: It creates the structure of a tree and it
  creates a mapping of each leaf to a value.  Lookup therefore requires both
  finding the leaf node AND using the map to lookup the value.
  
### The loss function optimized by the tree is configurable, as is the leaf prediction function.


## Terms:

### Tree
A Tree is an object that takes input data and determines what leaf it ends up in.  Unlike many tree implementations, the Tree itself doesn't store data about the value of a leaf.  That is stored externally.


### loss_fn
A loss_fn is a function that takes the predicted targets for a set of rows and the actual targets for those same rows, and returns a single value: the "LOSS" or "COST" of that prediction (lower cost/loss is better).

```
def loss_fn(predicted_targets, actual_targets) -> float
```

A loss function must be additive (so, one should not apply a mean as a part of it)

### leaf_prediction_fn
A leaf_prediction_fn is a function which takes the features of the rows that end up in a leaf and returns a Series of the predictions for each of those rows.  It is typically a constant function whose value is either the mean good rate in that leaf (among the actual targets seen in training) or the median target, but it can be anything else.

```
def leaf_prediction_fn(features) -> pd.Series
```


### leaf_prediction_builder
A leaf_prediction_builder is a function which takes the features and actual targets that end up in a TRAINING leaf and returns a leaf_prediction_fn.  This leaf_prediction_fn is used to predict the value of testing rows that end up in the same leaf.

```
def leaf_prediction_builder(features, actual_targets) -> leaf_prediction_fn
```


### leaf_prediction_map
A leaf_prediction_map is a map of leaf ids (e.g. their hash) to the leaf_prediction_fn for that leaf.  One can only use a tree to score data if one has a leaf_prediction_map.  This design allows one to use the same tree as a subset of another tree without having their leaf values become entangled.

--------------

## Test Tree Manipulation Functions

In [None]:
%pdb off

In [None]:
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.LeafNode()

o = gtree.BranchNode('C', 0.1, None, None)
o.left = gtree.LeafNode()
o.right = gtree.LeafNode()

t.prn()
print '\n\n'
o.prn()

In [None]:
u = gtree.replace_branch_split(t, t.right, o)
u.prn()
print '\n\n'
t.prn()

In [None]:
v = gtree.replace_node(t, t.left, o)
v.prn()
print '\n\n'
t.prn()

In [None]:
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.BranchNode('C', 0.9, None, None)
t.right.right.right = gtree.LeafNode()
t.right.right.left = gtree.LeafNode()

t.prn()
print '\n\n'
gtree.prune(t, 2).prn()
print '\n\n'
t.prn()

In [None]:
data = pd.DataFrame({'A': [0.1, 10, .02],
                     'B': [10, 20, 30]},
                    index=['foo', 'bar', 'baz'])

In [None]:
def leaf_count_fn(val):
    """Build a constant leaf predictor.

    Returns a function that, given a DataFrame, produces a Series holding
    `val` once per row, indexed like the DataFrame.
    """
    def predict_constant(df):
        # One copy of `val` per incoming row, aligned to the frame's index.
        return pd.Series([val] * len(df), index=df.index)

    return predict_constant

In [None]:
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode() #'A', 0.5, 10, 20)
t.right = gtree.LeafNode() #'A', 0.5, 100, 0)

leaf_map = {hash(t.left): leaf_count_fn(10),
            hash(t.right): leaf_count_fn(20)}

t.predict(data, leaf_map)

In [None]:
# Create a split on a DataFrame

df = pd.DataFrame({'foo': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})

gtree._single_variable_best_split(
    df,
    'foo',
    pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
    gtree.error_rate_loss,
    gtree.leaf_good_rate_prediction_builder)

In [None]:
# Sanity-check gtree.error_rate_loss against a hand-computed error rate.
threshold = 0.5
truth     = pd.Series([1, 0, 1])
predicted = pd.Series([0, 1, 0])

# Loss functions are called as loss_fn(predicted_targets, actual_targets),
# matching the signature described in the Terms section and the other
# error_rate_loss calls in this notebook.
gtree.error_rate_loss(predicted, truth)


# Hand-computed reference: fraction of rows whose thresholded prediction
# disagrees with the truth.  Being the last expression, this is the value
# the cell displays.
1.0 - ((predicted >= threshold) == truth).mean() #+ (predicted < threshold) * (1 - truth)

# Test Split Finding

In [18]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]}, dtype=np.float32)
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0], dtype=np.float32)

tree, leaf_map = gtree.train_greedy_tree(df, target, loss='error_rate')

print '\nTree:\n'
tree.prn()

print leaf_map

07:49:00 tree INFO:Training.  Depth 0 Current Loss: 0.0000 Best Split: A 1.0000 0.4167
07:49:00 tree INFO:No split improves loss.  Returning



Tree:

Leaf(id=0.514557152326)

{2353669182: <tree._my_tree.MeanLeafMapper object at 0x1108b0790>}


In [19]:
gtree.calculate_leaf_map(tree, df, target)

{2353669182: <tree._my_tree.MeanLeafMapper at 0x1108b0750>}

In [20]:
gtree.random_node(tree)

<gtree.LeafNode at 0x111184210>

In [21]:
print gtree.get_all_nodes(tree)

[<gtree.LeafNode object at 0x111184210>]


In [29]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]}, dtype=np.float32)
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0], dtype=np.float32)

In [30]:
gtree._single_variable_best_split(df,'B', target,
                                  loss='error_rate',
                                  leaf_prediction='mean')

(10.0, 0.4166666567325592)

In [31]:
tree, leaf_map = gtree.train_greedy_tree(df, target,
                                         loss='error_rate',
                                         feature_sample_rate=.5,
                                         row_sample_rate=.5)

print '\nTree:\n'
tree.prn()

print leaf_map

07:51:08 tree INFO:Training.  Depth 0 Current Loss: 0.0000 Best Split: A 7.0000 0.3333
07:51:08 tree INFO:No split improves loss.  Returning



Tree:

Leaf(id=0.773447212481)

{2506347385: <tree._my_tree.MeanLeafMapper object at 0x1108b08d0>}


In [32]:
gtree.mate(tree, tree).prn()

Leaf(id=0.773447212481)



In [None]:
#def make_hastie_sample(n_samples):
#
#    features, targets = datasets.make_hastie_10_2(n_samples=n_samples)
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    return features, targets



In [None]:
#def make_kddcup(n_samples):
#    
#    features, targets = datasets.fetch_kddcup99(subset='smtp')
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    
#    features = features.sample(n=n_samples)
#    
#    return features, targets.loc[features.index]
    

#def make_random_classification(n_samples, n_features=100):
       
#    features, targets = datasets.make_classification(n_samples=n_samples,
#                                                     n_features=n_features,
#                                                     n_informative=8,
#                                                     n_classes=2,
#                                                     n_clusters_per_class=4)
#                                                     
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#        
#    return features, targets.loc[features.index]

# Start the Test Analysis Here

In [37]:
#features, targets = make_hastie_sample(10000)
features, targets = tools.make_random_classification(10000)
features = pd.DataFrame(features, dtype=np.float32)
targets = pd.Series(targets, dtype=np.float32)

In [38]:
features.shape

(10000, 100)

In [39]:
targets.value_counts()

0.0    5003
1.0    4997
Name: target, dtype: int64

In [41]:
gtree.tree_logger.setLevel(logging.INFO)
tree, leaf_map = gtree.train_greedy_tree(features, targets, loss='cross_entropy', max_depth=8)

07:52:16 tree INFO:Training.  Depth 0 Current Loss: 8.2986 Best Split: feature_14 0.1281 6.1064
07:52:21 tree INFO:Training.  Depth 1 Current Loss: 6.4319 Best Split: feature_97 -1.0547 4.5129
07:52:24 tree INFO:Training.  Depth 2 Current Loss: 8.3439 Best Split: feature_82 -0.0740 5.2206
07:52:25 tree INFO:Training.  Depth 3 Current Loss: 8.6334 Best Split: feature_26 1.0266 4.8444
07:52:26 tree INFO:Training.  Depth 4 Current Loss: 9.7062 Best Split: feature_39 2.9441 4.7745
07:52:26 tree INFO:Training.  Depth 5 Current Loss: 7.3270 Best Split: feature_39 1.2425 3.2949
07:52:27 tree INFO:Training.  Depth 6 Current Loss: 5.3484 Best Split: feature_39 0.3246 2.1701
07:52:28 tree INFO:Training.  Depth 7 Current Loss: 3.6360 Best Split: feature_36 -0.4634 1.3878
07:52:28 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:52:28 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:52:28 tree INFO:Training.  Depth 7 Current Loss: 9.6259 Best

07:52:47 tree INFO:Training.  Depth 7 Current Loss: 8.1152 Best Split: feature_97 -1.7958 2.6449
07:52:47 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:52:47 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:52:48 tree INFO:Training.  Depth 7 Current Loss: 11.3084 Best Split: feature_64 0.3080 3.4134
07:52:48 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:52:48 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:52:49 tree INFO:Training.  Depth 4 Current Loss: 9.2751 Best Split: feature_0 -0.1757 4.6016
07:52:49 tree INFO:Training.  Depth 5 Current Loss: 10.3306 Best Split: feature_60 0.0506 4.3405
07:52:50 tree INFO:Training.  Depth 6 Current Loss: 11.4435 Best Split: feature_39 3.5772 4.0435
07:52:50 tree INFO:Training.  Depth 7 Current Loss: 10.1083 Best Split: feature_61 -0.4925 3.0714
07:52:50 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:

07:53:18 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:18 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:19 tree INFO:Training.  Depth 6 Current Loss: 5.4723 Best Split: feature_97 2.6921 2.2665
07:53:19 tree INFO:Training.  Depth 7 Current Loss: 3.7986 Best Split: feature_56 1.2745 1.4747
07:53:19 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:19 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:20 tree INFO:Training.  Depth 7 Current Loss: 9.7415 Best Split: feature_23 1.7563 3.0126
07:53:20 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:20 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:21 tree INFO:Training.  Depth 3 Current Loss: 8.7940 Best Split: feature_26 0.7853 4.8240
07:53:22 tree INFO:Training.  Depth 4 Current Loss: 11.2575 Best Split: feature_26 -0.3813 5.2756
07:53:22 tree INFO:T

07:53:42 tree INFO:Training.  Depth 6 Current Loss: 6.0534 Best Split: feature_42 0.5475 2.4415
07:53:43 tree INFO:Training.  Depth 7 Current Loss: 4.7792 Best Split: feature_4 0.0194 1.7411
07:53:43 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:43 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:43 tree INFO:Training.  Depth 7 Current Loss: 8.4116 Best Split: feature_78 0.5224 2.5809
07:53:43 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:43 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:44 tree INFO:Training.  Depth 6 Current Loss: 8.3039 Best Split: feature_49 1.1887 3.0853
07:53:44 tree INFO:Training.  Depth 7 Current Loss: 5.9318 Best Split: feature_57 -0.2629 1.9813
07:53:44 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:44 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:53:44 tree INFO:Tra

07:53:59 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:00 tree INFO:Training.  Depth 5 Current Loss: 12.1518 Best Split: feature_47 0.1677 4.8748
07:54:00 tree INFO:Training.  Depth 6 Current Loss: 10.8750 Best Split: feature_47 -0.5957 3.7899
07:54:00 tree INFO:Training.  Depth 7 Current Loss: 12.7932 Best Split: feature_5 0.4295 3.4828
07:54:00 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:00 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:00 tree INFO:Training.  Depth 7 Current Loss: 9.4182 Best Split: feature_17 0.2230 2.8225
07:54:01 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:01 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:01 tree INFO:Training.  Depth 6 Current Loss: 13.6030 Best Split: feature_25 0.1390 4.5179
07:54:01 tree INFO:Training.  Depth 7 Current Loss: 15.4229 Best Split: feature_11 0.1833 4.0784
07:5

07:54:17 tree INFO:Training.  Depth 7 Current Loss: 16.1824 Best Split: feature_0 -0.5205 4.1435
07:54:17 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:17 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:18 tree INFO:Training.  Depth 7 Current Loss: 16.1588 Best Split: feature_0 0.6038 4.1699
07:54:18 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:18 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:18 tree INFO:Training.  Depth 5 Current Loss: 14.0655 Best Split: feature_29 0.0332 5.5166
07:54:19 tree INFO:Training.  Depth 6 Current Loss: 14.9038 Best Split: feature_18 -0.5595 4.8904
07:54:19 tree INFO:Training.  Depth 7 Current Loss: 15.7879 Best Split: feature_1 -0.4542 4.1435
07:54:19 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:19 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:54:19 tree INF

In [None]:
#gtree.cross_entropy_loss(targets[features.feature_45 < .2326])

In [None]:
#gtree._single_variable_best_split(features, 'feature_45', targets, None, None, None)

In [None]:
tree.prn()

In [None]:
set(pd.Series([1, 2, 3]))

In [None]:
tree.predict(features, leaf_map)

In [None]:
results = pd.DataFrame({'truth': targets, 'prediction': tree.predict(features, leaf_map)})

In [None]:
1.0 - gtree.error_rate_loss(results.prediction, results.truth) / len(targets)

In [None]:
results.plot(kind='scatter', x='prediction', y='truth')

In [None]:
fig = plt.figure(figsize=(12,8))

for label, grp in tree.predict(features, leaf_map).groupby(targets):
    grp.hist(normed=True, alpha=0.5, label=str(label)) #, label=label)
plt.legend(loc='best')
None

# Compare Methods

In [42]:
features, targets = tools.make_random_classification(5000)
features = pd.DataFrame(features, dtype=np.float32)
targets = pd.Series(targets, dtype=np.float32)

features_validation = features.sample(frac=.3)
targets_validation = targets.loc[features_validation.index]

features = features[~features.index.isin(features_validation.index)]
targets = targets.loc[features.index]

In [None]:
%pdb off

In [None]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, targets,
                                   loss='cross_entropy',
                                   max_depth=3, min_to_split=10,
                                   num_generations=15, num_survivors=10,
                                   num_children=200, num_seed_trees=2)


08:13:23 evolution DEBUG:Growing Seed: 1 of 2
08:13:28 evolution DEBUG:Growing Seed: 2 of 2
08:13:34 evolution DEBUG:Resplitting the data
08:13:34 evolution DEBUG:Mating to create 200 children
08:13:47 evolution DEBUG:Surviving Generation: 1:7.8922, 1:7.9039, 1:7.9091, 1:7.9109, 1:7.9120, 1:7.9121, 1:7.9121, 1:7.9123, 1:7.9124, 1:7.9124
08:13:47 evolution INFO:Generation 0 Training Loss: 8.4223 Hold Out Loss 7.8922

08:13:47 evolution DEBUG:Resplitting the data
08:13:47 evolution DEBUG:Mating to create 200 children
08:14:00 evolution DEBUG:Surviving Generation: 2:7.8846, 2:7.8882, 1:7.8882, 2:7.8882, 2:7.8923, 2:7.8992, 2:7.9006, 2:7.9012, 2:7.9020, 2:7.9022
08:14:00 evolution INFO:Generation 1 Training Loss: 8.3367 Hold Out Loss 7.8846

08:14:00 evolution DEBUG:Resplitting the data
08:14:00 evolution DEBUG:Mating to create 200 children
08:14:13 evolution DEBUG:Surviving Generation: 3:7.8955, 3:7.8982, 3:7.8989, 3:7.8990, 3:7.9014, 3:7.9029, 2:7.9031, 3:7.9037, 3:7.9047, 3:7.9048
08:14

In [None]:
# Build the leaf map for the evolved tree from the training rows, then print
# its error-rate loss on the validation rows.
# NOTE(review): the executed calculate_leaf_map call earlier in this notebook
# passes only (tree, features, targets), and the builder is spelled
# gtree.leaf_good_rate_prediction_builder in the split-finding cell -- confirm
# that this 4-argument form and builder name still exist in gtree.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

In [None]:
generations[-1]['best_of_generation']['tree'].find_leaves(features).value_counts()

In [None]:
for gen in generations[-1]['generation']:
    print '--------------------{:.4f}----------------------------'.format(gen['loss_testing'])
    gen['tree'].prn()

In [None]:
# Dump every tree in the final generation.
# The loop variable is deliberately not named `result`: that name holds the
# value returned by gtree.evolve, which other cells read via result['tree'],
# and a loop variable of the same name would silently overwrite it.
for member in generations[-1]['generation']:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('---------------------------------------------')
    member['tree'].prn()

In [None]:
result = gtree.train_random_trees(features, targets, loss_fn=gtree.error_rate_loss,
                                  max_depth=2,
                                  min_to_split=10,
                                  num_trees=10)

In [None]:
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

In [None]:
# Import scikit-learn's tree module under an alias: a bare `tree` would rebind
# the notebook-level `tree` variable that holds the trained gtree tree, which
# breaks earlier cells (tree.prn(), tree.predict(...)) if they are re-run.
from sklearn import tree as sklearn_tree
from sklearn.model_selection import train_test_split

# Baseline for comparison: a depth-2 scikit-learn decision tree fitted on the
# same training split.
clf = sklearn_tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(features, targets)

In [None]:
predictions = pd.Series(clf.predict_proba(features_validation)[:, 1], index=features_validation.index)
gtree.error_rate_loss(predictions, targets_validation)

In [None]:
from sklearn import tree as sklearn_tree
import pydot

# export_graphviz returns the DOT source as a string when out_file is None,
# so no StringIO buffer is needed -- and with it no import from
# sklearn.externals.six, which newer scikit-learn releases no longer provide.
dot_data = sklearn_tree.export_graphviz(clf, out_file=None)

# Newer pydot releases return a list of graphs from graph_from_dot_data,
# older ones return a single graph; accept both.
graphs = pydot.graph_from_dot_data(dot_data)
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs

# Render the fitted baseline tree to a PDF in the working directory.
graph.write_pdf("iris.pdf")

In [None]:
%alias_magic t timeit

In [None]:
sel = features[features['feature_3'] < 0].index

In [None]:
sel

In [None]:
%t features.loc[sel]

In [None]:
%t df.reindex_axis(sel, copy=False)

In [None]:
# BC Dataset

In [None]:
bc_info = datasets.load_breast_cancer()
features = pd.DataFrame(bc_info['data'])
target = pd.Series(bc_info['target']).dropna()
features = features.loc[target.index]

In [None]:
target.value_counts(dropna=False)

In [None]:
%pdb off

In [None]:
gtree.tree_logger.setLevel(logging.WARNING)


In [None]:
# Keep per-split INFO logging from the tree builder out of the cell output.
gtree.tree_logger.setLevel(logging.WARNING)

# Evolve trees on the breast-cancer data.  The loss is selected by name
# (loss='cross_entropy'), the same form as the executed gtree.evolve call in
# the "Compare Methods" section of this notebook.
result, generations = gtree.evolve(features, target,
                                   loss='cross_entropy',
                                   max_depth=3, min_to_split=10,
                                   num_generations=30, num_survivors=10,
                                   num_children=50, num_seed_trees=5)

In [None]:
X = np.array([[1, 2, 3], [4, 5, 6]])
Y = X[0:2]

In [None]:
type(X)

In [None]:
id(Y.data)

In [None]:
[1, 2, 3][0:3]

In [None]:
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
X

In [None]:
fns = {0: lambda x: (x[:,0] + x[:,1] + x[:,2]).reshape(len(x), 1), 1: lambda x: (-1*x[:,0]).reshape(len(x), 1)}

hashes = np.array([[0], [1], [0]])
hashes

In [None]:
predictions = np.zeros((len(X), 1))
predictions

In [None]:
zero = np.zeros((len(X), 1))

# Flatten the (n, 1) column of leaf ids once so it can serve as a row mask.
leaf_ids = hashes.reshape(len(X))

for i in [0, 1]:
    rows = leaf_ids == i
    # Evaluate leaf i's function on just its own rows and write the
    # resulting column back into the matching rows of `predictions`.
    predictions[rows] = fns[i](X[rows, :])

predictions

In [None]:
# NOTE(review): `comparison` is only assigned in a commented-out line of the
# prediction loop cell, so this expression raises NameError on a fresh kernel.
X[hashes==comparison, :]

In [None]:
X[:,0].reshape(3, 1)

In [None]:
X[hashes==np.array([1]).reshape((len(X), 1))]

In [None]:
hashes==1

In [None]:
X[np.array([True, False, True]), :]