In [1]:
from __future__ import division

In [2]:
import math
import random
import numpy as np
import pandas as pd

In [3]:
from sklearn import datasets

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline

In [12]:
%load_ext autoreload
%autoreload 1

%aimport gtree

In [13]:
import logging
%config Application.log_level="INFO"
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

## Goals:

### Separate the structure of a tree from the data of a tree.  In other words,
  fitting a tree does two things: It creates the structure of a tree and it
  creates a mapping of each leaf to a value.  Lookup therefore requires both
  finding the leaf node AND using the map to lookup the value.
  
### The loss function optimized by the tree is configurable, as is the leaf prediction function


## Terms:

### Tree
A Tree is an object that takes input data and determines what leaf it ends up in.  Unlike many tree implementations, the Tree itself doesn't store data about the value of a leaf.  That is stored externally.


### loss_fn
A loss_fn is a function that takes the predicted targets for a set of rows and the actual targets for those rows, and returns a single value that determines the "LOSS" or "COST" of that prediction (lower cost/loss is better).

```
def loss_fn(predicted_targets, actual_targets) -> float
```

A loss function must be additive (so, one should not apply a mean as a part of it)

### leaf_prediction_fn
A leaf_prediction_fn is a function which takes the features of the rows that end up in a leaf and returns a Series of the predictions for each of those rows.  It is typically a constant function whose value is either the mean good rate in that leaf (among the actual training targets) or the median target, but it can be anything else.

```
def leaf_prediction_fn(features) -> pd.Series
```


### leaf_prediction_builder
A leaf_prediction_builder is a function which takes the features and actual targets that end up in a TRAINING leaf and returns a leaf_prediction_fn.  This leaf_prediction_fn is used to predict the value of testing rows that end up in the same leaf.

```
def leaf_prediction_builder(features, actual_targets) -> leaf_prediction_fn
```


### leaf_prediction_map
A leaf_prediction_map is a map of leaf ids (e.g. their hash) to the leaf_prediction_fn for that leaf.  One can only use a tree to score data if one has a leaf_prediction_map.  This design allows one to use the same tree as a subset of another tree without having their leaf values become entangled.

--------------

## Test Tree Manipulation Functions

In [7]:
%pdb off

Automatic pdb calling has been turned OFF


In [8]:
# Hand-build two small trees to exercise the tree manipulation helpers below.
# BranchNode args are presumably (feature, threshold, left, right) -- the
# printed output shows "A 0.5" for the first node; confirm against gtree.

# Tree `t`: root on 'A' @ 0.5 whose right child is a second branch on 'B' @ 0.9.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.LeafNode()

# Tree `o`: a single branch on 'C' @ 0.1 with two leaves.
o = gtree.BranchNode('C', 0.1, None, None)
o.left = gtree.LeafNode()
o.right = gtree.LeafNode()

# Print both trees so the results of later manipulations can be compared by eye.
t.prn()
print '\n\n'
o.prn()

	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)




	Leaf(0.97798875491)

C 0.1

	Leaf(0.72512513995)



In [9]:
u = gtree.replace_branch_split(t, t.right, o)
u.prn()
print '\n\n'
t.prn()

	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	C 0.1

		Leaf(0.453760363577)




	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)



In [10]:
v = gtree.replace_node(t, t.left, o)
v.prn()
print '\n\n'
t.prn()

		Leaf(0.97798875491)

	C 0.1

		Leaf(0.72512513995)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)




	Leaf(0.360883544777)

A 0.5

		Leaf(0.500856349064)

	B 0.9

		Leaf(0.453760363577)



In [11]:
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.BranchNode('C', 0.9, None, None)
t.right.right.right = gtree.LeafNode()
t.right.right.left = gtree.LeafNode()

t.prn()
print '\n\n'
gtree.prune(t, 2).prn()
print '\n\n'
t.prn()

	Leaf(0.256080594178)

A 0.5

		Leaf(0.969367371707)

	B 0.9

			Leaf(0.187523824675)

		C 0.9

			Leaf(0.902891676411)




	Leaf(0.256080594178)

A 0.5

		Leaf(0.920887642809)

	B 0.9

		Leaf(0.171126608762)




	Leaf(0.256080594178)

A 0.5

		Leaf(0.969367371707)

	B 0.9

			Leaf(0.187523824675)

		C 0.9

			Leaf(0.902891676411)



In [12]:
data = pd.DataFrame({'A': [0.1, 10, .02],
                     'B': [10, 20, 30]},
                    index=['foo', 'bar', 'baz'])

In [13]:
def leaf_count_fn(val):
    """Build a constant leaf prediction function.

    The returned callable takes a DataFrame and produces a Series that holds
    `val` once per row, sharing the DataFrame's index.
    """
    def predict_constant(df):
        # One copy of `val` for every row that landed in the leaf.
        return pd.Series([val] * len(df), index=df.index)

    return predict_constant

In [14]:
# Single-split tree on 'A' @ 0.5 with two leaves.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode() #'A', 0.5, 10, 20)
t.right = gtree.LeafNode() #'A', 0.5, 100, 0)

# The tree itself holds no leaf values: predictions come from this external
# map, keyed by each leaf's hash, to a constant prediction function.
leaf_map = {hash(t.left): leaf_count_fn(10),
            hash(t.right): leaf_count_fn(20)}

# Per the output below, rows with A under the 0.5 threshold ('foo', 'baz')
# get the left leaf's 10 and 'bar' gets the right leaf's 20.
t.predict(data, leaf_map)

foo    10
bar    20
baz    10
dtype: int64

In [15]:
# Create a split on a DataFrame

# Ten rows with a single feature 'foo' taking the values 1..10.
df = pd.DataFrame({'foo': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})

# Search for the best split on 'foo' against a binary target, scored with
# error_rate_loss. The result shown below is a 2-tuple -- presumably
# (split value, loss at that split); confirm against gtree.
gtree._single_variable_best_split(
    df,
    'foo',
    pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
    gtree.error_rate_loss,
    gtree.leaf_good_rate_split_builder)

(5, 0.19999999999999996)

In [16]:
# Sanity check of the error-rate loss on a case where every prediction is wrong.
threshold = 0.5
truth     = pd.Series([1, 0, 1])
predicted = pd.Series([0, 1, 0])

# Loss functions take (predicted, actual), as in the loss_fn signature above
# and every other error_rate_loss call in this notebook.
# NOTE: this value is not displayed; only the last expression of the cell is.
gtree.error_rate_loss(predicted, truth)


# Hand-computed error rate for comparison: 1 - fraction of thresholded
# predictions that equal the truth.
1.0 - ((predicted >= threshold) == truth).mean() #+ (predicted < threshold) * (1 - truth)

1.0

In [17]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

tree, leaf_map = gtree.train_greedy_tree(df, target, loss_fn=gtree.error_rate_loss)

print '\nTree:\n'
tree.prn()

print leaf_map

03:35:13 tree INFO:Training.  Depth 0 Current Loss: 0.4167 Best Split: B 40.0000 0.2500
03:35:13 tree INFO:Training.  Depth 1 Current Loss: 0.0000 Best Split: A 1.0000 0.0000
03:35:13 tree INFO:No split improves loss.  Returning
03:35:14 tree INFO:Training.  Depth 1 Current Loss: 0.3750 Best Split: A 10.0000 0.1250
03:35:14 tree INFO:Training.  Depth 2 Current Loss: 0.1667 Best Split: A 3.0000 0.1667
03:35:14 tree INFO:No split improves loss.  Returning
03:35:14 tree INFO:Training.  Depth 2 Current Loss: 0.0000 Best Split: A 11.0000 0.0000
03:35:14 tree INFO:No split improves loss.  Returning



Tree:

	Leaf(0.729188252837)

B 40

		Leaf(0.2499323379)

	A 10

		Leaf(0.429783676767)

{2167387241: <function <lambda> at 0x1106be320>, 4259768738: <function <lambda> at 0x110684848>, 1996908948: <function <lambda> at 0x1106be410>}


In [18]:
gtree.calculate_leaf_map(tree, df, target)

{1996908948: <function gtree.<lambda>>,
 2167387241: <function gtree.<lambda>>,
 4259768738: <function gtree.<lambda>>}

In [19]:
gtree.random_node(tree)

<gtree.LeafNode at 0x110665c90>

In [20]:
print gtree.get_all_nodes(tree)

[<gtree.BranchNode object at 0x1106add10>, <gtree.LeafNode object at 0x110665e50>, <gtree.BranchNode object at 0x1106b8450>, <gtree.LeafNode object at 0x110665e50>, <gtree.BranchNode object at 0x1106b8450>, <gtree.LeafNode object at 0x110665c90>, <gtree.LeafNode object at 0x1106c5490>, <gtree.LeafNode object at 0x110665c90>, <gtree.LeafNode object at 0x1106c5490>]


In [21]:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]})
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

tree, leaf_map = gtree.train_greedy_tree(df, target, loss_fn=gtree.error_rate_loss,
                                         feature_sample_rate=.5,
                                         row_sample_rate=.5)

print '\nTree:\n'
tree.prn()

print leaf_map

03:35:19 tree INFO:Training.  Depth 0 Current Loss: 0.4167 Best Split: A 1.0000 0.1667
03:35:19 tree INFO:Reached leaf node, or constraints force termination.  Returning
03:35:19 tree INFO:Training.  Depth 1 Current Loss: 0.4545 Best Split: B 40.0000 0.2727
03:35:19 tree INFO:Training.  Depth 2 Current Loss: 0.0000 Best Split: A 2.0000 0.0000
03:35:19 tree INFO:No split improves loss.  Returning
03:35:19 tree INFO:Training.  Depth 2 Current Loss: 0.3750 Best Split: A 10.0000 0.1250
03:35:19 tree INFO:Training.  Depth 3 Current Loss: 0.1667 Best Split: A 3.0000 0.1667
03:35:19 tree INFO:No split improves loss.  Returning
03:35:19 tree INFO:Training.  Depth 3 Current Loss: 0.0000 Best Split: A 11.0000 0.0000
03:35:19 tree INFO:No split improves loss.  Returning



Tree:

	Leaf(0.251727490486)

A 1

		Leaf(0.930525089142)

	B 40

			Leaf(0.720837462476)

		A 10

			Leaf(0.60678913307)

{1383260795: <function <lambda> at 0x1106a21b8>, 4113978420: <function <lambda> at 0x110684cf8>, 1413170733: <function <lambda> at 0x1106a29b0>, 2691078375: <function <lambda> at 0x1106e51b8>}


In [22]:
gtree.mate(tree, tree).prn()

	Leaf(0.251727490486)

B 40

		Leaf(0.930525089142)

	A 1

			Leaf(0.720837462476)

		A 10

			Leaf(0.60678913307)



In [23]:
def make_hastie_sample(n_samples, random_state=None):
    """Generate the Hastie 10.2 binary classification dataset as pandas objects.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``datasets.make_hastie_10_2``. Pass an int to get the
        same dataset on every run; the default (None) keeps the original
        unseeded behaviour.

    Returns
    -------
    (features, targets) : (pd.DataFrame, pd.Series)
        ``features`` has columns ``feature_0 .. feature_{k-1}``; ``targets``
        is named ``'target'`` and holds 1.0 for positive labels, else 0.0.
    """
    raw_features, raw_targets = datasets.make_hastie_10_2(n_samples=n_samples,
                                                          random_state=random_state)

    column_names = ['feature_{}'.format(i) for i in range(raw_features.shape[1])]
    features = pd.DataFrame(raw_features, columns=column_names)

    # Recode the generated labels to {0.0, 1.0}: anything > 0 is the positive class.
    targets = pd.Series(raw_targets, name='target')
    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
    return features, targets



In [14]:
def make_kddcup(n_samples):
    """Return a random sample of the KDD Cup 99 'smtp' subset as pandas objects.

    Parameters
    ----------
    n_samples : int
        Number of rows to draw (without replacement) from the subset.

    Returns
    -------
    (features, targets) : (pd.DataFrame, pd.Series)
        ``features`` has columns ``feature_0 .. feature_{k-1}``; ``targets``
        is named ``'target'``, is aligned to ``features.index`` and holds
        0.0 for normal traffic and 1.0 for any attack label.
    """
    # fetch_kddcup99 returns a dict-like Bunch by default, so tuple-unpacking
    # it would yield key names rather than arrays; read the attributes instead.
    kddcup = datasets.fetch_kddcup99(subset='smtp')
    raw_features, raw_labels = kddcup.data, kddcup.target

    column_names = ['feature_{}'.format(i) for i in range(raw_features.shape[1])]
    features = pd.DataFrame(raw_features, columns=column_names)

    # Labels are strings (the normal class is 'normal.', bytes in scikit-learn),
    # so a numeric `> 0` test cannot separate the classes; compare to the label.
    targets = pd.Series(raw_labels, name='target')
    targets = targets.map(lambda label: 0.0 if label in (b'normal.', 'normal.') else 1.0)

    features = features.sample(n=n_samples)

    # Keep only the labels of the sampled rows, in the sampled order.
    return features, targets.loc[features.index]
    

def make_random_classification(n_samples, n_features=100, random_state=None):
    """Generate a synthetic two-class problem as pandas objects.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int, optional
        Total number of feature columns (8 of them informative).
    random_state : int, RandomState instance or None, optional
        Forwarded to ``datasets.make_classification``. Pass an int to get the
        same dataset on every run; the default (None) keeps the original
        unseeded behaviour.

    Returns
    -------
    (features, targets) : (pd.DataFrame, pd.Series)
        ``features`` has columns ``feature_0 .. feature_{k-1}``; ``targets``
        is named ``'target'`` and holds 1.0 for class 1, else 0.0.
    """
    raw_features, raw_targets = datasets.make_classification(n_samples=n_samples,
                                                             n_features=n_features,
                                                             n_informative=8,
                                                             n_classes=2,
                                                             n_clusters_per_class=4,
                                                             random_state=random_state)

    column_names = ['feature_{}'.format(i) for i in range(raw_features.shape[1])]
    features = pd.DataFrame(raw_features, columns=column_names)

    # Recode the integer class labels to floats in {0.0, 1.0}.
    targets = pd.Series(raw_targets, name='target')
    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)

    # Both objects are built from the same generated rows, so they already
    # share the default 0..n-1 index; no re-alignment is needed.
    return features, targets

In [15]:

#features, targets = make_hastie_sample(10000)
features, targets = make_random_classification(10000)

In [16]:
features.shape

(10000, 100)

In [17]:
targets.value_counts()

0.0    5012
1.0    4988
Name: target, dtype: int64

In [18]:
gtree.tree_logger.setLevel(logging.INFO)
tree, leaf_map = gtree.train_greedy_tree(features, targets, loss_fn=gtree.error_rate_loss, max_depth=8)

06:54:15 tree INFO:Training.  Depth 0 Current Loss: 0.4988 Best Split: feature_83 0.0698 0.3852
06:56:11 tree INFO:Training.  Depth 1 Current Loss: 0.3886 Best Split: feature_3 -1.8243 0.2763
06:57:17 tree INFO:Training.  Depth 2 Current Loss: 0.2949 Best Split: feature_17 -3.2804 0.2878
06:57:27 tree INFO:Training.  Depth 3 Current Loss: 0.2727 Best Split: feature_83 -2.9895 0.0455
06:57:29 tree INFO:Training.  Depth 4 Current Loss: 0.0000 Best Split: feature_0 0.2749 0.0000
06:57:29 tree INFO:No split improves loss.  Returning
06:57:37 tree INFO:Training.  Depth 4 Current Loss: 0.0588 Best Split: feature_3 -5.1263 0.0000
06:57:37 tree INFO:Reached leaf node, or constraints force termination.  Returning
06:57:45 tree INFO:Training.  Depth 5 Current Loss: 0.0000 Best Split: feature_0 -0.2473 0.0000
06:57:45 tree INFO:No split improves loss.  Returning
06:58:44 tree INFO:Training.  Depth 3 Current Loss: 0.2880 Best Split: feature_32 2.1051 0.2829
06:59:41 tree INFO:Training.  Depth 4 Cu

07:20:42 tree INFO:No split improves loss.  Returning
07:20:44 tree INFO:Training.  Depth 6 Current Loss: 0.2500 Best Split: feature_0 -0.4574 0.0000
07:20:44 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:20:45 tree INFO:Training.  Depth 7 Current Loss: 0.0000 Best Split: feature_0 -0.4382 0.0000
07:20:45 tree INFO:No split improves loss.  Returning
07:20:47 tree INFO:Training.  Depth 5 Current Loss: 0.2000 Best Split: feature_1 -2.3422 0.0000
07:20:47 tree INFO:Reached leaf node, or constraints force termination.  Returning
07:20:49 tree INFO:Training.  Depth 6 Current Loss: 0.0000 Best Split: feature_0 0.6537 0.0000
07:20:49 tree INFO:No split improves loss.  Returning
07:21:42 tree INFO:Training.  Depth 4 Current Loss: 0.1684 Best Split: feature_31 -1.3284 0.1528
07:21:51 tree INFO:Training.  Depth 5 Current Loss: 0.3750 Best Split: feature_37 -1.3136 0.1250
07:21:53 tree INFO:Training.  Depth 6 Current Loss: 0.0000 Best Split: feature_0 -1.7719 0.0000

KeyboardInterrupt: 

In [None]:
tree.prn()

In [None]:
leaf_map

In [None]:
tree.predict(features, leaf_map)

In [None]:
results = pd.DataFrame({'truth': targets, 'prediction': tree.predict(features, leaf_map)})

In [None]:
1.0 - gtree.error_rate_loss(results.prediction, results.truth) / len(targets)

In [None]:
results.plot(kind='scatter', x='prediction', y='truth')

In [None]:
fig = plt.figure(figsize=(12,8))

for label, grp in tree.predict(features, leaf_map).groupby(targets):
    grp.hist(normed=True, alpha=0.5, label=str(label)) #, label=label)
plt.legend(loc='best')
None

# Compare Methods

In [19]:
# Fresh 5000-row synthetic dataset for the method comparison.
features, targets = make_random_classification(5000)

# Hold out a random 30% of the rows for validation.
# NOTE(review): no random_state is passed, so the split differs on every run.
features_validation = features.sample(frac=.3)
targets_validation = targets.loc[features_validation.index]

# Keep only the rows that were not held out as the training set.
features = features[~features.index.isin(features_validation.index)]
targets = targets.loc[features.index]

In [76]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, targets,
                                   loss_fn=gtree.cross_entropy_loss,
                                   max_depth=3, min_to_split=10,
                                   num_generations=15, num_survivors=10,
                                   num_children=200, num_seed_trees=5)


06:26:23 evolution DEBUG:Growing Seed: 1 of 5
  return (-1.0 * truth * np.log(predicted) - (1.0 - truth) * np.log(1.0 - predicted)).mean()
06:27:38 evolution DEBUG:Growing Seed: 2 of 5
06:28:51 evolution DEBUG:Growing Seed: 3 of 5
06:30:04 evolution DEBUG:Growing Seed: 4 of 5
06:31:14 evolution DEBUG:Growing Seed: 5 of 5
06:33:06 evolution DEBUG:Resplitting the data
06:33:06 evolution DEBUG:Mating to create 200 children


Losses: [ 1.  1.  1.  1.  1.]
Probs: [ 0.2  0.2  0.2  0.2  0.2]


06:33:56 evolution DEBUG:Surviving Generation: 0:0.6159, 0:0.6264, 1:0.6273, 1:0.6274, 1:0.6279, 1:0.6289, 1:0.6292, 1:0.6293, 1:0.6305, 1:0.6319
06:33:56 evolution INFO:Generation 0 Training Loss: 0.6157 Hold Out Loss 0.6159

06:33:56 evolution DEBUG:Resplitting the data
06:33:56 evolution DEBUG:Mating to create 200 children


Losses: [ 0.61589022  0.62637169  0.62727087  0.62737486  0.62788282  0.62888887
  0.62921577  0.62931864  0.63051522  0.63191096]
Probs: [ 0.1018594   0.10017202  0.10002858  0.100012    0.09993107  0.09977097
  0.099719    0.09970266  0.09951271  0.09929159]


06:34:49 evolution DEBUG:Surviving Generation: 0:0.6158, 0:0.6262, 2:0.6266, 2:0.6267, 2:0.6274, 1:0.6276, 1:0.6276, 2:0.6276, 1:0.6277, 2:0.6279
06:34:49 evolution INFO:Generation 1 Training Loss: 0.6121 Hold Out Loss 0.6158

06:34:49 evolution DEBUG:Resplitting the data
06:34:49 evolution DEBUG:Mating to create 200 children


Losses: [ 0.61582009  0.62617656  0.62660072  0.62667858  0.62740067  0.62757279
  0.62761866  0.62761866  0.62765344  0.62788676]
Probs: [ 0.10165431  0.09998666  0.09991895  0.09990652  0.09979137  0.09976394
  0.09975663  0.09975663  0.09975109  0.09971392]


06:35:38 evolution DEBUG:Surviving Generation: 0:0.6155, 3:0.6220, 2:0.6247, 1:0.6256, 3:0.6261, 3:0.6262, 0:0.6268, 2:0.6276, 3:0.6277, 2:0.6280
06:35:38 evolution INFO:Generation 2 Training Loss: 0.6141 Hold Out Loss 0.6155

06:35:38 evolution DEBUG:Resplitting the data
06:35:38 evolution DEBUG:Mating to create 200 children


Losses: [ 0.61549619  0.62204708  0.62468999  0.6256173   0.62609945  0.62620565
  0.62684876  0.627618    0.62769249  0.62798275]
Probs: [ 0.10153532  0.10047669  0.10005272  0.09990439  0.09982736  0.0998104
  0.09970775  0.09958511  0.09957325  0.09952702]


06:36:29 evolution DEBUG:Surviving Generation: 0:0.6166, 4:0.6182, 3:0.6196, 2:0.6256, 3:0.6258, 3:0.6264, 2:0.6267, 4:0.6267, 4:0.6269, 1:0.6269
06:36:29 evolution INFO:Generation 3 Training Loss: 0.6320 Hold Out Loss 0.6166

06:36:29 evolution DEBUG:Resplitting the data
06:36:29 evolution DEBUG:Mating to create 200 children


Losses: [ 0.61657143  0.61816234  0.61959602  0.62555587  0.62580548  0.62642467
  0.62667427  0.62667427  0.6268522   0.62686509]
Probs: [ 0.10118251  0.10092484  0.10069319  0.09973592  0.09969603  0.09959714
  0.0995573   0.0995573   0.09952891  0.09952686]


06:37:24 evolution DEBUG:Surviving Generation: 4:0.6108, 0:0.6165, 5:0.6166, 4:0.6173, 5:0.6173, 3:0.6199, 2:0.6232, 3:0.6243, 5:0.6243, 5:0.6243
06:37:24 evolution INFO:Generation 4 Training Loss: 0.5895 Hold Out Loss 0.6108

06:37:24 evolution DEBUG:Resplitting the data
06:37:24 evolution DEBUG:Mating to create 200 children


Losses: [ 0.61081392  0.61645704  0.61662772  0.6173311   0.6173311   0.61985494
  0.62320601  0.62431188  0.62431188  0.62431188]
Probs: [ 0.10140239  0.10048283  0.10045515  0.10034115  0.10034115  0.09993316
  0.09939401  0.09921673  0.09921673  0.09921673]


06:38:17 evolution DEBUG:Surviving Generation: 6:0.6077, 6:0.6087, 5:0.6090, 6:0.6092, 5:0.6096, 4:0.6103, 5:0.6103, 5:0.6127, 6:0.6135, 0:0.6153
06:38:17 evolution INFO:Generation 5 Training Loss: 0.5952 Hold Out Loss 0.6077

06:38:17 evolution DEBUG:Resplitting the data
06:38:17 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60774859  0.60874797  0.60898915  0.60919728  0.60955808  0.61029172
  0.61029172  0.61272052  0.61353475  0.6153361 ]
Probs: [ 0.10047418  0.10030987  0.10027026  0.10023609  0.10017688  0.1000566
  0.1000566   0.09965942  0.09952662  0.09923346]


06:39:11 evolution DEBUG:Surviving Generation: 7:0.6072, 7:0.6081, 6:0.6085, 6:0.6089, 7:0.6089, 7:0.6090, 7:0.6090, 6:0.6092, 7:0.6095, 6:0.6095
06:39:11 evolution INFO:Generation 6 Training Loss: 0.6041 Hold Out Loss 0.6072

06:39:11 evolution DEBUG:Resplitting the data
06:39:11 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60717391  0.60808517  0.60849311  0.60893063  0.60893063  0.60895808
  0.60895808  0.60922146  0.60949042  0.60949158]
Probs: [ 0.10026301  0.10011304  0.10004598  0.0999741   0.0999741   0.09996959
  0.09996959  0.09992635  0.09988221  0.09988202]


06:39:56 evolution DEBUG:Surviving Generation: 7:0.6063, 7:0.6093, 7:0.6096, 8:0.6096, 8:0.6097, 7:0.6101, 8:0.6102, 6:0.6102, 8:0.6102, 8:0.6102
06:39:56 evolution INFO:Generation 7 Training Loss: 0.6038 Hold Out Loss 0.6063

06:39:56 evolution DEBUG:Resplitting the data
06:39:56 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60626269  0.6093378   0.60959582  0.60959582  0.60974607  0.61014252
  0.61016923  0.6101702   0.6101702   0.61017225]
Probs: [ 0.10053833  0.10003239  0.09999006  0.09999006  0.09996541  0.09990042
  0.09989604  0.09989588  0.09989588  0.09989554]


06:40:41 evolution DEBUG:Surviving Generation: 9:0.6061, 7:0.6073, 8:0.6078, 9:0.6081, 9:0.6100, 7:0.6101, 8:0.6101, 9:0.6103, 9:0.6105, 8:0.6105
06:40:41 evolution INFO:Generation 8 Training Loss: 0.5990 Hold Out Loss 0.6061

06:40:41 evolution DEBUG:Resplitting the data
06:40:41 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60612375  0.60725252  0.60778961  0.60811705  0.60998812  0.61010101
  0.61010101  0.61034611  0.61045888  0.61046779]
Probs: [ 0.10048534  0.10029929  0.10021088  0.10015702  0.09984982  0.09983131
  0.09983131  0.09979114  0.09977267  0.09977121]


06:41:36 evolution DEBUG:Surviving Generation: 9:0.6039, 10:0.6039, 10:0.6043, 10:0.6045, 10:0.6053, 10:0.6054, 9:0.6055, 7:0.6055, 10:0.6055, 9:0.6055
06:41:36 evolution INFO:Generation 9 Training Loss: 0.6039 Hold Out Loss 0.6039

06:41:36 evolution DEBUG:Resplitting the data
06:41:36 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60386545  0.60386545  0.60434376  0.60445632  0.6053492   0.60535287
  0.60550124  0.60550252  0.60550252  0.60550252]
Probs: [ 0.10017511  0.10017511  0.10009594  0.10007731  0.0999297   0.0999291
  0.09990459  0.09990438  0.09990438  0.09990438]


06:42:21 evolution DEBUG:Surviving Generation: 11:0.6039, 11:0.6041, 11:0.6052, 7:0.6056, 10:0.6056, 9:0.6056, 11:0.6056, 10:0.6056, 11:0.6056, 9:0.6056
06:42:21 evolution INFO:Generation 10 Training Loss: 0.6068 Hold Out Loss 0.6039

06:42:21 evolution DEBUG:Resplitting the data
06:42:21 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60386025  0.60413913  0.60518875  0.60559677  0.60559677  0.60559677
  0.60559677  0.60559677  0.60559677  0.60560212]
Probs: [ 0.10022769  0.10018152  0.10000793  0.09994053  0.09994053  0.09994053
  0.09994053  0.09994053  0.09994053  0.09993965]


06:43:11 evolution DEBUG:Surviving Generation: 12:0.6028, 11:0.6030, 11:0.6034, 12:0.6036, 11:0.6047, 12:0.6047, 11:0.6048, 7:0.6050, 10:0.6050, 9:0.6050
06:43:11 evolution INFO:Generation 11 Training Loss: 0.6053 Hold Out Loss 0.6028

06:43:11 evolution DEBUG:Resplitting the data
06:43:11 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60277936  0.60299624  0.60336227  0.60361183  0.6047461   0.6047461
  0.60484648  0.60498721  0.60498721  0.60498721]
Probs: [ 0.10023613  0.10020016  0.10013947  0.10009812  0.09991038  0.09991038
  0.09989379  0.09987052  0.09987052  0.09987052]


06:44:02 evolution DEBUG:Surviving Generation: 12:0.6014, 11:0.6015, 13:0.6015, 11:0.6020, 12:0.6027, 13:0.6036, 13:0.6041, 13:0.6041, 11:0.6044, 12:0.6044
06:44:02 evolution INFO:Generation 12 Training Loss: 0.6034 Hold Out Loss 0.6014

06:44:02 evolution DEBUG:Resplitting the data
06:44:02 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60135498  0.60153071  0.60153071  0.60201207  0.6026816   0.60361603
  0.60405094  0.60408974  0.60442964  0.60442964]
Probs: [ 0.10026843  0.10023921  0.10023921  0.10015922  0.10004807  0.09989315
  0.09982112  0.0998147   0.09975845  0.09975845]


06:44:46 evolution DEBUG:Surviving Generation: 12:0.6021, 14:0.6021, 11:0.6026, 13:0.6026, 13:0.6026, 11:0.6027, 13:0.6032, 13:0.6032, 14:0.6034, 14:0.6037
06:44:46 evolution INFO:Generation 13 Training Loss: 0.6069 Hold Out Loss 0.6021

06:44:46 evolution DEBUG:Resplitting the data
06:44:46 evolution DEBUG:Mating to create 200 children


Losses: [ 0.60211002  0.60211002  0.60255051  0.60255051  0.60264382  0.60274604
  0.60318653  0.60319638  0.60338147  0.60373705]
Probs: [ 0.10011801  0.10011801  0.10004488  0.10004488  0.1000294   0.10001244
  0.09993938  0.09993775  0.09990707  0.09984816]


06:45:32 evolution DEBUG:Surviving Generation: 12:0.6016, 14:0.6016, 11:0.6021, 13:0.6021, 13:0.6021, 11:0.6022, 13:0.6026, 14:0.6028, 14:0.6036, 15:0.6036
06:45:32 evolution INFO:Generation 14 Training Loss: 0.6093 Hold Out Loss 0.6016



In [48]:
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

0.297333333333


In [56]:
generations[-1]['best_of_generation']['tree'].find_leaves(features).value_counts()

2078480799    1414
2437144351    1148
2570464104     938
dtype: int64

In [52]:
for gen in generations[-1]['generation']:
    print '--------------------{:.4f}----------------------------'.format(gen['loss_testing'])
    gen['tree'].prn()

--------------------0.6624----------------------------
	Leaf(0.922108366641)

feature_50 -0.488387233513

		Leaf(0.78625028227)

	feature_99 -0.565015852089

		Leaf(0.00848191813269)

--------------------0.6642----------------------------
		Leaf(0.163847666623)

	feature_17 -0.565015852089

		Leaf(0.12723466858)

feature_50 -0.625108536211

		Leaf(0.957803398685)

	feature_47 0.756409780254

		Leaf(0.516671038657)

--------------------0.6650----------------------------
		Leaf(0.802550722777)

	feature_50 -0.565015852089

		Leaf(0.599164400845)

feature_50 1.26675090679

		Leaf(0.998781959327)

	feature_47 1.08799997383

		Leaf(0.470929887087)

--------------------0.6652----------------------------
		Leaf(0.402819908511)

	feature_60 -0.709362631916

		Leaf(0.563559424693)

feature_50 -0.113775250463

		Leaf(0.646840213024)

	feature_47 0.756409780254

		Leaf(0.0183416109128)

--------------------0.6652----------------------------
		Leaf(0.825303177451)

	feature_60 -0.709362631916

		L

In [None]:
for result in generations[-1]['generation']:
    print '---------------------------------------------'
    result['tree'].prn()

In [None]:
result = gtree.train_random_trees(features, targets, loss_fn=gtree.error_rate_loss,
                                  max_depth=2,
                                  min_to_split=10,
                                  num_trees=10)

In [None]:
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

print gtree.error_rate_loss(result['tree'].predict(features_validation, leaf_map), targets_validation)

In [34]:
# Import under an alias: a bare `from sklearn import tree` would overwrite the
# notebook's `tree` variable, which holds the tree trained with gtree.
from sklearn import tree as sklearn_tree
from sklearn.model_selection import train_test_split

# Baseline for comparison: scikit-learn decision tree limited to depth 2,
# fit on the training rows.
clf = sklearn_tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(features, targets)

In [35]:
predictions = pd.Series(clf.predict_proba(features_validation)[:, 1], index=features_validation.index)
gtree.error_rate_loss(predictions, targets_validation)

0.29600000000000004

In [36]:
# sklearn.externals.six is not available in newer scikit-learn releases; fall
# back to the standard library's StringIO when the import fails.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
from sklearn import tree as sklearn_tree
import pydot

# Render the fitted sklearn decision tree to Graphviz dot source.
dot_data = StringIO()
sklearn_tree.export_graphviz(clf, out_file=dot_data)

# Older pydot returns a single graph here; newer releases return a list of
# graphs, so accept either shape.
graphs = pydot.graph_from_dot_data(dot_data.getvalue())
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs

# NOTE(review): the file name says "iris", but clf was fit on the synthetic
# classification data above -- consider renaming the output file.
graph.write_pdf("iris.pdf")

Oct 15 17:03:49  dot[71655] <Error>: The function ‘CGFontGetGlyphPath’ is obsolete and will be removed in an upcoming update. Unfortunately, this application, or a library it uses, is using this obsolete function, and is thereby contributing to an overall degradation of system performance.
Oct 15 17:03:49  dot[71655] <Error>: The function ‘CGFontGetGlyphPaths’ is obsolete and will be removed in an upcoming update. Unfortunately, this application, or a library it uses, is using this obsolete function, and is thereby contributing to an overall degradation of system performance.



True

In [37]:
%alias_magic t timeit

Created `%t` as an alias for `%timeit`.
Created `%%t` as an alias for `%%timeit`.


In [38]:
sel = features[features['feature_3'] < 0].index

In [39]:
sel

Int64Index([   0,    1,    2,    4,    7,    9,   13,   16,   17,   18,
            ...
            4961, 4969, 4973, 4975, 4981, 4982, 4987, 4990, 4994, 4997],
           dtype='int64', length=1755)

In [46]:
%t features.loc[sel]

The slowest run took 10.91 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 513 µs per loop


In [43]:
%t df.reindex_axis(sel, copy=False)

1000 loops, best of 3: 213 µs per loop


In [77]:
# BC Dataset

In [97]:
# Load scikit-learn's breast cancer dataset into pandas objects.
# NOTE(review): this rebinds `features` (previously the synthetic data) and
# uses `target` (singular), unlike `targets` in the earlier sections.
bc_info = datasets.load_breast_cancer()
features = pd.DataFrame(bc_info['data'])
# Drop rows with a missing label; the value_counts output below shows no NaNs remaining.
target = pd.Series(bc_info['target']).dropna()
# Keep only the feature rows whose label survived dropna().
features = features.loc[target.index]

In [104]:
target.value_counts(dropna=False)

1    357
0    212
dtype: int64

In [108]:
%pdb off

Automatic pdb calling has been turned OFF


In [None]:
gtree.tree_logger.setLevel(logging.WARNING)


In [112]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, target,
                                   loss_fn=gtree.cross_entropy_loss,
                                   max_depth=3, min_to_split=10,
                                   num_generations=30, num_survivors=10,
                                   num_children=50, num_seed_trees=5)

04:39:40 evolution DEBUG:Growing Seed: 1 of 5
04:40:09 evolution DEBUG:Growing Seed: 2 of 5
04:40:38 evolution DEBUG:Growing Seed: 3 of 5
04:41:04 evolution DEBUG:Growing Seed: 4 of 5
04:41:33 evolution DEBUG:Growing Seed: 5 of 5
04:42:00 evolution DEBUG:Resplitting the data
04:42:00 evolution DEBUG:Mating to create 50 children
04:42:11 evolution DEBUG:Surviving Generation: 1:0.2075, 1:0.2377, 1:0.2401, 0:0.2492, 1:0.2497, 1:0.2511, 1:0.2529, 1:0.2533, 1:0.2556, 1:0.2606
04:42:11 evolution INFO:Generation 0 Training Loss: 0.1712 Hold Out Loss 0.2075

04:42:11 evolution DEBUG:Resplitting the data
04:42:11 evolution DEBUG:Mating to create 50 children
04:42:26 evolution DEBUG:Surviving Generation: 2:0.2038, 1:0.2081, 2:0.2105, 1:0.2133, 2:0.2174, 2:0.2235, 2:0.2276, 2:0.2303, 2:0.2334, 1:0.2340
04:42:26 evolution INFO:Generation 1 Training Loss: 0.1865 Hold Out Loss 0.2038

04:42:26 evolution DEBUG:Resplitting the data
04:42:26 evolution DEBUG:Mating to create 50 children
04:42:38 evoluti

04:46:42 evolution DEBUG:Resplitting the data
04:46:42 evolution DEBUG:Mating to create 50 children
04:46:54 evolution DEBUG:Surviving Generation: 20:0.1093, 17:0.1121, 19:0.1121, 20:0.1149, 18:0.1182, 16:0.1184, 17:0.1184, 19:0.1184, 18:0.1184, 21:0.1184
04:46:54 evolution INFO:Generation 24 Training Loss: 0.1486 Hold Out Loss 0.1093

04:46:54 evolution DEBUG:Resplitting the data
04:46:54 evolution DEBUG:Mating to create 50 children
04:47:06 evolution DEBUG:Surviving Generation: 17:0.1156, 19:0.1156, 22:0.1223, 22:0.1254, 20:0.1373, 21:0.1373, 21:0.1373, 20:0.1430, 16:0.1457, 17:0.1457
04:47:06 evolution INFO:Generation 25 Training Loss: 0.2164 Hold Out Loss 0.1156

04:47:06 evolution DEBUG:Resplitting the data
04:47:06 evolution DEBUG:Mating to create 50 children
04:47:16 evolution DEBUG:Surviving Generation: 22:0.1122, 20:0.1229, 21:0.1230, 21:0.1230, 20:0.1252, 17:0.1253, 19:0.1254, 16:0.1298, 17:0.1298, 20:0.1305
04:47:16 evolution INFO:Generation 26 Training Loss: 0.1741 Hold Out