In [1]:
from __future__ import division

In [2]:
import sys
sys.path.append('..')

In [3]:
import math
import random
import numpy as np
import pandas as pd

In [4]:
from sklearn import datasets

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools

In [7]:
import logging
%config Application.log_level="INFO"
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

## Goals:

### Separate the structure of a tree from the data of a tree.  In other words,
  fitting a tree does two things: It creates the structure of a tree and it
  creates a mapping of each leaf to a value.  Lookup therefore requires both
  finding the leaf node AND using the map to lookup the value.
  
### The loss function optimized by the tree is configurable, as is the leaf prediction function (see `leaf_prediction_fn` and `leaf_prediction_builder` under Terms).


## Terms:

### Tree
A Tree is an object that takes input data and determines what leaf it ends up in.  Unlike many tree implementations, the Tree itself doesn't store data about the value of a leaf.  That is stored externally.


### loss_fn
A loss_fn is a function that takes the predicted targets for a set of rows and the actual targets for those same rows, and returns a single value: the "LOSS" or "COST" of that prediction (lower cost/loss is better).

```
def loss_fn(predicted_targets, actual_targets) -> float
```

A loss function must be additive (so, one should not apply a mean as a part of it)

### leaf_prediction_fn
A leaf_prediction_fn is a function which takes the features of the rows that end up in a leaf and returns a Series of predictions, one for each of those rows.  It is typically a constant function whose value is either the mean good rate in that leaf (among the actual targets) or the median target, but it can be anything else.

```
def leaf_prediction_fn(features) -> pd.Series
```


### leaf_prediction_builder
A leaf_prediction_builder is a function which takes the features and actual targets that end up in a TRAINING leaf and returns a leaf_prediction_fn.  This leaf_prediction_fn is used to predict the value of testing rows that end up in the same leaf.

```
def leaf_prediction_builder(features, actual_targets) -> leaf_prediction_fn
```


### leaf_prediction_map
A leaf_prediction_map is a map of leaf ids (e.g. their hash) to the leaf_prediction_fn for that leaf.  One can only use a tree to score data if one has a leaf_prediction_map.  This design allows one to use the same tree as a subset of another tree without having their leaf values become entangled.

--------------

## Test Tree Manipulation Functions

In [None]:
%pdb off

In [None]:
# Hand-built tree `t`: a split on 'A' at 0.5 whose right child splits again on
# 'B' at 0.9. Children are attached afterwards through .left / .right.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.LeafNode()

# Second, independent one-split tree `o`, built the same way.
o = gtree.BranchNode('C', 0.1, None, None)
o.left = gtree.LeafNode()
o.right = gtree.LeafNode()

t.prn()
# Single-argument call form: prints the same text on Python 2 and is also
# valid syntax on Python 3.
print('\n\n')
o.prn()

In [None]:
# Replace the split at t.right with `o`, then print the returned tree followed
# by `t` so the two can be compared side by side.
u = gtree.replace_branch_split(t, t.right, o)
u.prn()
# Parenthesised form keeps the Python 2 output and also parses on Python 3.
print('\n\n')
t.prn()

In [None]:
# Replace the node at t.left with `o`, then print the returned tree followed
# by `t` so the two can be compared side by side.
v = gtree.replace_node(t, t.left, o)
v.prn()
# Parenthesised form keeps the Python 2 output and also parses on Python 3.
print('\n\n')
t.prn()

In [None]:
# Three-level hand-built tree: A -> (leaf, B -> (leaf, C -> (leaf, leaf))).
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.BranchNode('C', 0.9, None, None)
t.right.right.right = gtree.LeafNode()
t.right.right.left = gtree.LeafNode()

# Print the tree, the result of gtree.prune(t, 2), and the tree again.
# NOTE(review): the second argument of prune is presumably a maximum depth — confirm.
t.prn()
# Parenthesised form keeps the Python 2 output and also parses on Python 3.
print('\n\n')
gtree.prune(t, 2).prn()
print('\n\n')
t.prn()

In [None]:
data = pd.DataFrame({'A': [0.1, 10, .02],
                     'B': [10, 20, 30]},
                    index=['foo', 'bar', 'baz'])

In [None]:
def leaf_count_fn(val):
    """Build a constant predictor for one leaf.

    Returns a function that takes a DataFrame and produces a Series holding
    `val` once per row, indexed exactly like the DataFrame.
    """
    def constant_prediction(df):
        # One copy of `val` per incoming row, aligned to the frame's index.
        return pd.Series([val] * len(df), index=df.index)

    return constant_prediction

In [None]:
# Hand-built one-split tree on column 'A' at 0.5 with two leaves
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.LeafNode()

# Leaf values are kept outside the tree: each leaf's hash maps to a constant predictor
leaf_map = {hash(t.left): leaf_count_fn(10),
            hash(t.right): leaf_count_fn(20)}

# Score the toy `data` frame using the tree structure plus the external leaf map
t.predict(data, leaf_map)

In [None]:
# Create a split on a DataFrame
# Searches the single column 'foo' of a ten-row toy frame for the best split
# against a binary target, using the error-rate loss and the good-rate leaf builder.

df = pd.DataFrame({'foo': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})

gtree._single_variable_best_split(
    df,
    'foo',
    pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
    gtree.error_rate_loss,
    gtree.leaf_good_rate_prediction_builder)

In [None]:
# Sanity-check gtree.error_rate_loss against a hand-computed error rate.
threshold = 0.5
truth = pd.Series([1, 0, 1])
predicted = pd.Series([0, 1, 0])

# Arguments follow the (predicted_targets, actual_targets) order given for
# loss_fn in the Terms section of this notebook.
gtree_loss = gtree.error_rate_loss(predicted, truth)

# Fraction of rows whose thresholded prediction disagrees with the truth.
manual_error_rate = 1.0 - ((predicted >= threshold) == truth).mean()

# Display both values: a bare expression in the middle of a cell is never
# shown, so the gtree result was previously discarded.
gtree_loss, manual_error_rate

# Test Split Finding

In [None]:
# Twelve-row toy data set: two numeric features and a binary target.
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

# Greedy training returns the tree structure and, separately, its leaf map.
tree, leaf_map = gtree.train_greedy_tree(df, target, loss_fn=gtree.error_rate_loss)

# Single-argument call form: same output on Python 2, valid syntax on Python 3.
print('\nTree:\n')
tree.prn()

print(leaf_map)

In [None]:
gtree.calculate_leaf_map(tree, df, target)

In [None]:
gtree.random_node(tree)

In [None]:
# Show whatever gtree.get_all_nodes returns for the current tree.
# Parenthesised form keeps the Python 2 output and also parses on Python 3.
print(gtree.get_all_nodes(tree))

In [None]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

In [None]:
gtree._np_single_variable_best_split(df,'B', target, 
                                     gtree.error_rate_loss,
                                     gtree.leaf_good_rate_prediction_builder)

In [None]:
# Greedy training again, this time passing 0.5 for both the feature and the
# row sample rate.
tree, leaf_map = gtree.train_greedy_tree(df, target, loss_fn=gtree.error_rate_loss,
                                         feature_sample_rate=.5,
                                         row_sample_rate=.5)

# Single-argument call form: same output on Python 2, valid syntax on Python 3.
print('\nTree:\n')
tree.prn()

print(leaf_map)

In [None]:
gtree.mate(tree, tree).prn()

In [None]:
#def make_hastie_sample(n_samples):
#
#    features, targets = datasets.make_hastie_10_2(n_samples=n_samples)
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    return features, targets



In [None]:
#def make_kddcup(n_samples):
#
#    features, targets = datasets.fetch_kddcup99(subset='smtp')
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#
#    features = features.sample(n=n_samples)
#
#    return features, targets.loc[features.index]
    

#def make_random_classification(n_samples, n_features=100):
       
#    features, targets = datasets.make_classification(n_samples=n_samples,
#                                                     n_features=n_features,
#                                                     n_informative=8,
#                                                     n_classes=2,
#                                                     n_clusters_per_class=4)
#                                                     
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#        
#    return features, targets.loc[features.index]

# Start the Test Analysis Here

In [8]:

#features, targets = make_hastie_sample(10000)
features, targets = tools.make_random_classification(10000)

In [9]:
features.shape

(10000, 100)

In [10]:
targets.value_counts()

0.0    5007
1.0    4993
Name: target, dtype: int64

In [15]:
gtree.tree_logger.setLevel(logging.INFO)
tree, leaf_map = gtree.train_greedy_tree(features, targets, loss_fn=gtree.cross_entropy_loss, max_depth=8)

02:48:59 tree INFO:Training.  Depth 0 Current Loss: 0.6931 Best Split: feature_45 0.2326 0.6538
02:49:03 tree INFO:Training.  Depth 1 Current Loss: 0.6735 Best Split: feature_28 -0.1936 0.6074
02:49:05 tree INFO:Training.  Depth 2 Current Loss: 0.6437 Best Split: feature_61 -4.7653 0.5756
02:49:07 tree INFO:Training.  Depth 3 Current Loss: 0.2954 Best Split: feature_28 -2.5279 0.2640
02:49:10 tree INFO:Training.  Depth 4 Current Loss: 0.0000 Best Split: feature_0 -0.5067 0.0000
02:49:10 tree INFO:No split improves loss.  Returning
02:49:12 tree INFO:Training.  Depth 4 Current Loss: 0.3723 Best Split: feature_11 -1.6206 0.3278
02:49:12 tree INFO:Training.  Depth 5 Current Loss: 0.0000 Best Split: feature_0 -0.3654 0.0000
02:49:12 tree INFO:No split improves loss.  Returning
02:49:14 tree INFO:Training.  Depth 5 Current Loss: 0.3347 Best Split: feature_20 1.0410 0.3119
02:49:15 tree INFO:Training.  Depth 6 Current Loss: 0.4518 Best Split: feature_51 -0.0126 0.4053
02:49:18 tree INFO:Trai

02:50:25 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:50:27 tree INFO:Training.  Depth 5 Current Loss: 0.6700 Best Split: feature_45 -2.2278 0.5446
02:50:28 tree INFO:Training.  Depth 6 Current Loss: 0.1425 Best Split: feature_3 -7.9260 0.0000
02:50:28 tree INFO:Training.  Depth 7 Current Loss: 0.6931 Best Split: feature_0 -0.7632 0.0000
02:50:28 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:50:28 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:50:29 tree INFO:Training.  Depth 7 Current Loss: 0.0000 Best Split: feature_0 -0.4013 0.0000
02:50:29 tree INFO:No split improves loss.  Returning
02:50:30 tree INFO:Training.  Depth 6 Current Loss: 0.6931 Best Split: feature_3 -1.7826 0.6051
02:50:31 tree INFO:Training.  Depth 7 Current Loss: 0.6077 Best Split: feature_89 1.1838 0.4769
02:50:31 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:50:31 tree INFO:Reached leaf node,

02:51:44 tree INFO:Training.  Depth 6 Current Loss: 0.3541 Best Split: feature_0 0.5892 0.1723
02:51:45 tree INFO:Training.  Depth 7 Current Loss: 0.0000 Best Split: feature_0 -0.4322 0.0000
02:51:45 tree INFO:No split improves loss.  Returning
02:51:45 tree INFO:Training.  Depth 7 Current Loss: 0.6890 Best Split: feature_26 0.0292 0.2458
02:51:45 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:51:45 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:51:46 tree INFO:Training.  Depth 6 Current Loss: 0.4506 Best Split: feature_28 1.2211 0.2998
02:51:47 tree INFO:Training.  Depth 7 Current Loss: 0.2530 Best Split: feature_85 1.5794 0.1093
02:51:47 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:51:47 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:51:47 tree INFO:Training.  Depth 7 Current Loss: 0.6890 Best Split: feature_3 3.5759 0.0000
02:51:47 tree INFO:Reached leaf node, or

02:52:55 tree INFO:Training.  Depth 5 Current Loss: 0.6761 Best Split: feature_51 2.1133 0.5745
02:52:56 tree INFO:Training.  Depth 6 Current Loss: 0.6930 Best Split: feature_20 0.7868 0.5649
02:52:57 tree INFO:Training.  Depth 7 Current Loss: 0.6416 Best Split: feature_79 -0.9998 0.4741
02:52:57 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:52:57 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:52:57 tree INFO:Training.  Depth 7 Current Loss: 0.4362 Best Split: feature_7 1.0310 0.0000
02:52:57 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:52:57 tree INFO:Reached leaf node, or constraints force termination.  Returning
02:52:57 tree INFO:Training.  Depth 6 Current Loss: 0.0000 Best Split: feature_0 -0.9923 0.0000
02:52:57 tree INFO:No split improves loss.  Returning
02:52:58 tree INFO:Training.  Depth 5 Current Loss: 0.0000 Best Split: feature_0 -2.1573 0.0000
02:52:58 tree INFO:No split improves l

In [13]:
#gtree.cross_entropy_loss(targets[features.feature_45 < .2326])

TypeError: cross_entropy_loss() takes exactly 2 arguments (1 given)

In [14]:
#gtree._single_variable_best_split(features, 'feature_45', targets, None, None, None)

(0.23258662223815918, 0.6538109183311462)

In [16]:
tree.prn()

				Leaf(id=0.641402293352)

			feature_28 -2.52788257599

					Leaf(id=0.774885089945)

				feature_11 -1.62058627605

								Leaf(id=0.356149173498)

							feature_61 -8.35889530182

								Leaf(id=0.449175580958)

						feature_51 -0.0125508848578

								Leaf(id=0.388940336614)

							feature_20 -1.07297968864

								Leaf(id=0.729989677067)

					feature_20 1.04102158546

							Leaf(id=0.356992467398)

						feature_71 0.5068551898

								Leaf(id=0.719547409438)

							feature_22 -0.555300176144

								Leaf(id=0.472063271809)

		feature_61 -4.76534843445

						Leaf(id=0.84257447111)

					feature_51 0.670628905296

								Leaf(id=0.930048353626)

							feature_11 1.49652242661

								Leaf(id=0.804369737334)

						feature_61 -2.56818819046

								Leaf(id=0.900585421993)

							feature_5 0.655855298042

								Leaf(id=0.819621134948)

				feature_22 -1.61188137531

								Leaf(id=0.761101720197)

							feature_22 0.0101326731965

								Leaf(id=0.63251056941

In [66]:
set(pd.Series([1, 2, 3]))

{1, 2, 3}

In [71]:
tree.predict(features, leaf_map)

0       0.633333
1       1.000000
2       0.333333
3       0.962466
4       0.900000
5       1.000000
6       0.900000
7       0.060391
8       0.000000
9       0.513514
10      0.060391
11      0.812500
12      0.379310
13      0.020833
14      0.433071
15      1.000000
16      0.994845
17      0.994845
18      0.020833
19      0.106667
20      1.000000
21      0.563218
22      0.513514
23      1.000000
24      0.804348
25      0.000000
26      0.903448
27      0.684211
28      0.106667
29      1.000000
          ...   
9970    0.060391
9971    1.000000
9972    1.000000
9973    0.000000
9974    0.180139
9975    0.962466
9976    0.000000
9977    0.994845
9978    1.000000
9979    0.006494
9980    0.431034
9981    1.000000
9982    0.962466
9983    0.513514
9984    0.046948
9985    0.903448
9986    0.000000
9987    0.433071
9988    0.167665
9989    0.433071
9990    0.043478
9991    0.614286
9992    0.994937
9993    0.812500
9994    0.180139
9995    0.000000
9996    0.000000
9997    0.0909

In [None]:
results = pd.DataFrame({'truth': targets, 'prediction': tree.predict(features, leaf_map)})

In [None]:
1.0 - gtree.error_rate_loss(results.prediction, results.truth) / len(targets)

In [None]:
results.plot(kind='scatter', x='prediction', y='truth')

In [None]:
# Overlay, on one figure, the distribution of predicted scores for each true class.
fig = plt.figure(figsize=(12,8))

# groupby(targets) splits the predictions by the true label of each row
for label, grp in tree.predict(features, leaf_map).groupby(targets):
    # NOTE(review): `normed` is deprecated in newer matplotlib releases in favour of `density` — confirm the installed version before changing
    grp.hist(normed=True, alpha=0.5, label=str(label))
plt.legend(loc='best')
# A trailing None keeps the cell from echoing the legend object as its output
None

# Compare Methods

In [None]:
# Fresh data set for the method comparison (replaces the earlier `features` / `targets`)
features, targets = tools.make_random_classification(5000)

# Hold out 30% of the rows as a validation set, with targets aligned by index
# NOTE(review): sample() is called without random_state, so the split differs on every run
features_validation = features.sample(frac=.3)
targets_validation = targets.loc[features_validation.index]

# Training set = every row whose index was not drawn into the validation set
features = features[~features.index.isin(features_validation.index)]
targets = targets.loc[features.index]

In [None]:
%pdb off

In [None]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, targets,
                                   loss_fn=gtree.cross_entropy_loss,
                                   max_depth=3, min_to_split=10,
                                   num_generations=15, num_survivors=10,
                                   num_children=200, num_seed_trees=5)


In [None]:
# Compute the leaf map for the tree held in `result` from the training rows.
# NOTE(review): other cells in this notebook reference
# gtree.leaf_good_rate_prediction_builder — confirm that
# leaf_good_rate_split_builder is the intended name.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

# Loss of that tree on the held-out validation rows. The parenthesised form
# prints the same on Python 2 and also parses on Python 3.
print(gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation))

In [None]:
generations[-1]['best_of_generation']['tree'].find_leaves(features).value_counts()

In [None]:
# Print every member of the last generation under a banner showing its
# 'loss_testing' value. The single-argument call form prints the same on
# Python 2 and also parses on Python 3.
for gen in generations[-1]['generation']:
    print('--------------------{:.4f}----------------------------'.format(gen['loss_testing']))
    gen['tree'].prn()

In [None]:
# Print every tree of the last generation, separated by a rule.
# The loop variable is deliberately not called `result`: that name holds the
# value returned by gtree.evolve, and iterating with it would silently replace
# it with the last member of the generation.
for member in generations[-1]['generation']:
    # Single-argument call form: same output on Python 2, valid on Python 3.
    print('---------------------------------------------')
    member['tree'].prn()

In [None]:
result = gtree.train_random_trees(features, targets, loss_fn=gtree.error_rate_loss,
                                  max_depth=2,
                                  min_to_split=10,
                                  num_trees=10)

In [None]:
# Compute the leaf map for the tree held in `result` from the training rows.
# NOTE(review): other cells in this notebook reference
# gtree.leaf_good_rate_prediction_builder — confirm that
# leaf_good_rate_split_builder is the intended name.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

# Loss of that tree on the held-out validation rows. The parenthesised form
# prints the same on Python 2 and also parses on Python 3.
print(gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation))

In [None]:
# Import the sklearn module under an alias: binding it to the bare name `tree`
# would overwrite the gtree tree object that other cells keep in `tree`.
from sklearn import tree as sklearn_tree
from sklearn.model_selection import train_test_split

# Depth-2 scikit-learn decision tree fitted on the training rows, for comparison.
clf = sklearn_tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(features, targets)

In [None]:
predictions = pd.Series(clf.predict_proba(features_validation)[:, 1], index=features_validation.index)
gtree.error_rate_loss(predictions, targets_validation)

In [None]:
# sklearn.externals.six is not shipped by every scikit-learn release; fall
# back to the standard library when it is missing.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
from sklearn import tree as sklearn_tree
import pydot

# Render the fitted classifier as Graphviz dot text in an in-memory buffer.
dot_data = StringIO()
sklearn_tree.export_graphviz(clf, out_file=dot_data)

graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Some pydot releases return a list of graphs rather than a single graph.
if isinstance(graph, (list, tuple)):
    graph = graph[0]

# NOTE(review): the file name says "iris", but `clf` was fitted on this
# notebook's `features` / `targets` — consider renaming.
graph.write_pdf("iris.pdf")

In [None]:
%alias_magic t timeit

In [None]:
sel = features[features['feature_3'] < 0].index

In [None]:
sel

In [None]:
%t features.loc[sel]

In [None]:
%t df.reindex_axis(sel, copy=False)

In [None]:
# BC Dataset

In [None]:
# Load the scikit-learn breast-cancer data set (replaces the earlier `features`)
bc_info = datasets.load_breast_cancer()
# No column names are passed, so pandas assigns default integer column labels
features = pd.DataFrame(bc_info['data'])
target = pd.Series(bc_info['target']).dropna()
# Keep only the feature rows whose target survived dropna()
features = features.loc[target.index]

In [None]:
target.value_counts(dropna=False)

In [None]:
%pdb off

In [None]:
gtree.tree_logger.setLevel(logging.WARNING)


In [None]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, target,
                                   loss_fn=gtree.cross_entropy_loss,
                                   max_depth=3, min_to_split=10,
                                   num_generations=30, num_survivors=10,
                                   num_children=50, num_seed_trees=5)

In [None]:
X = np.array([[1, 2, 3], [4, 5, 6]])
Y = X[0:2]

In [None]:
type(X)

In [None]:
id(Y.data)

In [None]:
[1, 2, 3][0:3]

In [22]:
# 3x3 integer matrix holding 1..9 in row-major order, echoed as the cell output.
X = np.arange(1, 10).reshape(3, 3)
X

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [36]:
def _sum_first_three_columns(x):
    """Add columns 0, 1 and 2 of `x` row by row; the result has shape (len(x), 1)."""
    total = x[:, 0] + x[:, 1] + x[:, 2]
    return total.reshape(len(x), 1)


def _negate_first_column(x):
    """Return column 0 of `x` multiplied by -1, shaped (len(x), 1)."""
    return (-1 * x[:, 0]).reshape(len(x), 1)


# Toy lookup table: integer key -> function applied to the rows carrying that key.
fns = {0: _sum_first_three_columns, 1: _negate_first_column}

# Key assigned to each of three rows, stored as a column vector.
hashes = np.array([[0], [1], [0]])
hashes

array([[0],
       [1],
       [0]])

In [37]:
predictions = np.zeros((len(X), 1))
predictions

array([[ 0.],
       [ 0.],
       [ 0.]])

In [61]:
zero = np.zeros((len(X), 1))

# Fill `predictions` one key at a time: pick the rows whose entry in `hashes`
# equals the key, apply that key's function to those rows only, and write the
# result back into the same rows.
flat_hashes = hashes.reshape(len(X))
for key in (0, 1):
    rows = flat_hashes == key
    predictions[rows] = fns[key](X[rows, :])

predictions

array([[  6.],
       [ -4.],
       [ 24.]])

In [60]:
X[hashes==comparison, :]

IndexError: too many indices for array

In [40]:
X[:,0].reshape(3, 1)

array([[1],
       [4],
       [7]])

In [55]:
X[hashes==np.array([1]).reshape((len(X), 1))]

ValueError: cannot reshape array of size 1 into shape (3,1)

In [52]:
hashes==1

array([[False],
       [ True],
       [False]], dtype=bool)

In [54]:
X[np.array([True, False, True]), :]

array([[1, 2, 3],
       [7, 8, 9]])