In [1]:
from __future__ import division

In [2]:
import sys
sys.path.append('..')

In [3]:
import math
import random
import numpy as np
import pandas as pd

In [4]:
from sklearn import datasets

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
%load_ext autoreload
%autoreload 1

%aimport gtree
%aimport tools

In [7]:
import logging
%config Application.log_level="INFO"
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

## Goals:

### Separate the structure of a tree from the data of a tree.  In other words,
  fitting a tree does two things: It creates the structure of a tree and it
  creates a mapping of each leaf to a value.  Lookup therefore requires both
  finding the leaf node AND using the map to lookup the value.
  
### The loss function optimized by the tree is configurable, as is the leaf prediction function


## Terms:

### Tree
A Tree is an object that takes input data and determines what leaf it ends up in.  Unlike many tree implementations, the Tree itself doesn't store data about the value of a leaf.  That is stored externally.


### loss_fn
A loss_fn is a function that takes the predicted targets for a set of data rows and the actual targets for those rows, and returns a single value that determines the "LOSS" or "COST" of that prediction (lower cost/loss is better)

```
def loss_fn(predicted_targets, actual_targets) -> float
```

A loss function must be additive (so, one should not apply a mean as a part of it)

### leaf_prediction_fn
A leaf_prediction_fn is a function which takes the features of the rows that end up in a leaf and returns a Series of the predictions for each of those rows.  It is typically a constant function whose value is either the mean good rate in that leaf (among the actual training targets) or the median target, but it can be anything else

```
def leaf_prediction_fn(features) -> pd.Series
```


### leaf_prediction_builder
A leaf_prediction_builder is a function which takes the features and actual targets that end up in a TRAINING leaf and returns a leaf_prediction_fn.  This leaf_prediction_fn is used to predict the value of testing rows that end up in the same leaf.

```
def leaf_prediction_builder(features, actual_targets) -> leaf_prediction_fn
```


### leaf_prediction_map
A leaf_prediction_map is a map of leaf ids (e.g. their hash) to the leaf_prediction_fn for that leaf.  One can only use a tree to score data if one has a leaf_prediction_map.  This design allows one to use the same tree as a subset of another tree without having their leaf values become entangled.

--------------

## Test Tree Manipulation Functions

In [None]:
%pdb off

In [None]:
# Build a three-leaf tree by hand: a root BranchNode('A', 0.5) whose right
# child is BranchNode('B', 0.9); every remaining slot gets a fresh LeafNode.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.LeafNode()

# A second, independent two-leaf tree rooted at BranchNode('C', 0.1).
o = gtree.BranchNode('C', 0.1, None, None)
o.left = gtree.LeafNode()
o.right = gtree.LeafNode()

t.prn()
# Single-argument print(...) gives the same output under Python 2 and also
# parses under Python 3.
print('\n\n')
o.prn()

In [None]:
# Call replace_branch_split on `t` with t.right and `o`, print the returned
# tree, then print `t` again so the two can be compared side by side.
u = gtree.replace_branch_split(t, t.right, o)
u.prn()
print('\n\n')  # parenthesized form works on both Python 2 and Python 3
t.prn()

In [None]:
# Call replace_node on `t` with t.left and `o`, print the returned tree, then
# print `t` again so the two can be compared side by side.
v = gtree.replace_node(t, t.left, o)
v.prn()
print('\n\n')  # parenthesized form works on both Python 2 and Python 3
t.prn()

In [None]:
# Rebuild `t` as a chain of three branch nodes down the right-hand side
# ('A' -> 'B' -> 'C'), with LeafNodes in every other position.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.BranchNode('B', 0.9, None, None)
t.right.left = gtree.LeafNode()
t.right.right = gtree.BranchNode('C', 0.9, None, None)
t.right.right.right = gtree.LeafNode()
t.right.right.left = gtree.LeafNode()

# Print `t`, then the result of prune(t, 2), then `t` once more for comparison.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
t.prn()
print('\n\n')
gtree.prune(t, 2).prn()
print('\n\n')
t.prn()

In [None]:
# Three labelled rows with two numeric columns; used as scoring input below.
data = pd.DataFrame(
    {'A': [0.1, 10, .02], 'B': [10, 20, 30]},
    index=['foo', 'bar', 'baz'],
)

In [None]:
class StaticLeaf(object):
    """Leaf predictor that answers with the same stored value for every row."""

    def __init__(self, val):
        # The constant handed back once per input row by predict().
        self.val = val

    def predict(self, df):
        """Return a numpy array holding ``self.val`` repeated ``len(df)`` times."""
        n_rows = len(df)
        return np.array([self.val] * n_rows)

In [None]:
# One BranchNode('A', 0.5) with a bare LeafNode on each side.
t = gtree.BranchNode('A', 0.5, None, None)
t.left = gtree.LeafNode()
t.right = gtree.LeafNode()

# Leaf values are kept outside the tree, keyed by the hash of each leaf node.
leaf_map = dict([
    (hash(t.left), StaticLeaf(10)),
    (hash(t.right), StaticLeaf(20)),
])

t.predict(data, leaf_map)

In [None]:
t

In [None]:
# Create a split on a DataFrame

df = pd.DataFrame({'foo': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])})

gtree._single_variable_best_split(
    df,
    'foo',
    pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1]),
    loss='error_rate',
    leaf_prediction='mean')

In [None]:
threshold = 0.5
truth     = pd.Series([1, 0, 1], dtype=np.float32)
predicted = pd.Series([0, 1, 1], dtype=np.float32)

# Value reported by the library.
# NOTE(review): `truth` is passed first here, while the loss_fn description in
# this notebook lists predicted targets first -- confirm gtree.loss's order.
print(gtree.loss(truth, predicted, type='error_rate'))

# Hand-computed reference: one minus the fraction of rows whose thresholded
# prediction equals the truth.  print(...) with a single argument behaves the
# same on Python 2 and Python 3.
print(1.0 - ((predicted >= threshold) == truth).mean())  #+ (predicted < threshold) * (1 - truth)

# Test Split Finding

In [None]:
# Twelve-row, two-feature toy problem with a binary float32 target.
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110 ]}, dtype=np.float32)
target = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0], dtype=np.float32)

# train_greedy_tree returns the tree and its leaf map as a pair.
tree, leaf_map = gtree.train_greedy_tree(df, target, loss='error_rate')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('\nTree:\n')
tree.prn()

print(leaf_map)

In [None]:
gtree.calculate_leaf_map(tree, df, target)

In [None]:
gtree.random_node(tree)

In [None]:
print gtree.get_all_nodes(tree)

In [None]:
# Same twelve-row toy problem as above, rebuilt so the following cells start
# from a known `df` / `target` pair.
df = pd.DataFrame(
    {
        'A': list(range(1, 13)),
        'B': [10, 20, 50, 30, 40, 50, 60, 50, 70, 90, 100, 110],
    },
    dtype=np.float32,
)
target = pd.Series(
    [0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0],
    dtype=np.float32,
)

In [None]:
gtree._single_variable_best_split(df,'B', target,
                                  loss='error_rate',
                                  leaf_prediction='mean')

In [None]:
# Same greedy fit as before, now passing feature_sample_rate and
# row_sample_rate of 0.5.
tree, leaf_map = gtree.train_greedy_tree(df, target,
                                         loss='error_rate',
                                         feature_sample_rate=.5,
                                         row_sample_rate=.5)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('\nTree:\n')
tree.prn()

print(leaf_map)

In [None]:
gtree.mate(tree, tree).prn()

In [None]:
#def make_hastie_sample(n_samples):
#
#    features, targets = datasets.make_hastie_10_2(n_samples=n_samples)
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    return features, targets



In [None]:
#def make_kddcup(n_samples):
#    
#    features, targets = datasets.fetch_kddcup99(subset='smtp')
#
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#    
#    features = features.sample(n=n_samples)
#    
#    return features, targets.loc[features.index]
    

#def make_random_classification(n_samples, n_features=100):
       
#    features, targets = datasets.make_classification(n_samples=n_samples,
#                                                     n_features=n_features,
#                                                     n_informative=8,
#                                                     n_classes=2,
#                                                     n_clusters_per_class=4)
#                                                     
#    features = pd.DataFrame(features, columns=['feature_{}'.format(i) for i in range(features.shape[1])])
#    targets = pd.Series(targets, name='target')
#    targets = targets.map(lambda x: 1.0 if x > 0 else 0.0)
#        
#    return features, targets.loc[features.index]

# Start the Test Analysis Here

In [None]:
#features, targets = make_hastie_sample(10000)
features, targets = tools.make_random_classification(10000)
features = pd.DataFrame(features, dtype=np.float32)
targets = pd.Series(targets, dtype=np.float32)

In [None]:
features.shape

In [None]:
targets.value_counts()

In [None]:
gtree.tree_logger.setLevel(logging.INFO)
tree, leaf_map = gtree.train_greedy_tree(features, targets,
                                         loss='cross_entropy',
                                         leaf_prediction='logit',
                                         max_depth=3)

In [None]:
#gtree.cross_entropy_loss(targets[features.feature_45 < .2326])

In [None]:
#gtree._single_variable_best_split(features, 'feature_45', targets, None, None, None)

In [None]:
tree.prn()

In [None]:
set(pd.Series([1, 2, 3]))

In [None]:
tree.predict(features, leaf_map)

In [None]:
results = pd.DataFrame({'truth': targets, 'prediction': tree.predict(features, leaf_map)})

In [None]:
1.0 - gtree.error_rate_loss(results.prediction, results.truth) / len(targets)

In [None]:
results.plot(kind='scatter', x='prediction', y='truth')

In [None]:
fig = plt.figure(figsize=(12,8))

# Overlay one histogram of the tree's predictions per distinct target value.
for label, grp in tree.predict(features, leaf_map).groupby(targets):
    # `density=True` is the replacement for the `normed=True` keyword, which
    # matplotlib deprecated and later removed from hist().
    grp.hist(density=True, alpha=0.5, label=str(label))
plt.legend(loc='best')
# Trailing None keeps the cell from echoing the legend object.
None

# Compare Methods

In [None]:
# Fresh 5000-row synthetic problem, coerced to float32.
features, targets = tools.make_random_classification(5000)
features = pd.DataFrame(features, dtype=np.float32)
targets = pd.Series(targets, dtype=np.float32)

# Set aside a random 30% of the rows (and their targets) for validation.
features_validation = features.sample(frac=.3)
targets_validation = targets.loc[features_validation.index]

# Keep only the rows that were not sampled into the validation set.
in_validation = features.index.isin(features_validation.index)
features = features[~in_validation]
targets = targets.loc[features.index]

In [None]:
%pdb off

In [None]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, targets,
                                   loss='cross_entropy',
                                   max_depth=3, min_to_split=10,
                                   num_generations=15, num_survivors=10,
                                   num_children=200, num_seed_trees=5)


In [None]:
# Recompute the leaf map for the evolved tree from the training rows.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

# Score the held-out rows and report the error-rate loss.  Single-argument
# print(...) behaves the same on Python 2 and Python 3.
validation_predictions = result['tree'].predict(features_validation, leaf_map)
print(gtree.error_rate_loss(validation_predictions, targets_validation))

In [None]:
generations[-1]['best_of_generation']['tree'].find_leaves(features).value_counts()

In [None]:
# Print every member of the final generation under a banner showing its
# 'loss_testing' value.  Single-argument print(...) behaves the same on
# Python 2 and Python 3.
for gen in generations[-1]['generation']:
    print('--------------------{:.4f}----------------------------'.format(gen['loss_testing']))
    gen['tree'].prn()

In [None]:
# Print the tree of every member of the final generation.
# The loop variable is deliberately NOT called `result`: that name holds the
# value returned by gtree.evolve, and rebinding it here would make any later
# re-run of a cell that reads result['tree'] pick up the wrong object.
for member in generations[-1]['generation']:
    print('---------------------------------------------')
    member['tree'].prn()

In [None]:
result = gtree.train_random_trees(features, targets, loss_fn=gtree.error_rate_loss,
                                  max_depth=2,
                                  min_to_split=10,
                                  num_trees=10)

In [None]:
# Recompute the leaf map for the tree in `result` from the training rows.
leaf_map = gtree.calculate_leaf_map(result['tree'], features, targets, gtree.leaf_good_rate_split_builder)

# Score the held-out rows and report the error-rate loss.  Single-argument
# print(...) behaves the same on Python 2 and Python 3.
validation_predictions = result['tree'].predict(features_validation, leaf_map)
print(gtree.error_rate_loss(validation_predictions, targets_validation))

In [None]:
# Import the sklearn module under an alias: a bare `from sklearn import tree`
# would rebind the notebook-level name `tree`, which earlier cells use for the
# tree returned by gtree.train_greedy_tree.
from sklearn import tree as sklearn_tree
from sklearn.model_selection import train_test_split

# Depth-2 scikit-learn decision tree fitted on the same training rows.
clf = sklearn_tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(features, targets)

In [None]:
predictions = pd.Series(clf.predict_proba(features_validation)[:, 1], index=features_validation.index)
gtree.error_rate_loss(predictions, targets_validation)

In [None]:
from sklearn import tree as sklearn_tree
import pydot

# With out_file=None, export_graphviz returns the DOT source as a string, so
# no StringIO buffer (and no import from sklearn.externals.six, which newer
# scikit-learn releases no longer ship) is needed.
dot_data = sklearn_tree.export_graphviz(clf, out_file=None)

# Depending on the pydot version, graph_from_dot_data returns either a single
# graph or a list of graphs; normalise to one graph before writing.
graphs = pydot.graph_from_dot_data(dot_data)
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs

# NOTE(review): the output name says "iris" although `clf` was fitted on the
# synthetic classification data -- kept unchanged to avoid moving the file.
graph.write_pdf("iris.pdf")

In [None]:
%alias_magic t timeit

In [None]:
sel = features[features['feature_3'] < 0].index

In [None]:
sel

In [None]:
%t features.loc[sel]

In [None]:
%t df.reindex_axis(sel, copy=False)

# Evolve

In [None]:
# BC Dataset

In [8]:
bc_info = datasets.load_breast_cancer()
features = pd.DataFrame(bc_info['data'], dtype=np.float32)
target = pd.Series(bc_info['target'], dtype=np.float32).dropna()
features = features.loc[target.index]

In [9]:
target.value_counts(dropna=False)

1.0    357
0.0    212
dtype: int64

In [None]:
%pdb off

In [10]:
gtree.tree_logger.setLevel(logging.WARNING)


In [11]:
gtree.tree_logger.setLevel(logging.WARNING)

result, generations = gtree.evolve(features, target,
                                   loss='cross_entropy',
                                   leaf_prediction='logit',
                                   max_depth=3, min_to_split=10,
                                   num_generations=30, num_survivors=10,
                                   num_children=50, num_seed_trees=5)

05:18:02 evolution DEBUG:Growing Seed: 1 of 5
05:18:02 evolution DEBUG:Growing Seed: 2 of 5
05:18:03 evolution DEBUG:Growing Seed: 3 of 5
05:18:03 evolution DEBUG:Growing Seed: 4 of 5
05:18:04 evolution DEBUG:Growing Seed: 5 of 5
05:18:04 evolution DEBUG:Resplitting the data
05:18:04 evolution DEBUG:Mating to create 50 children
05:19:14 evolution DEBUG:Surviving Generation: 1:0.8079, 1:0.8079, 1:0.8079, 1:0.8079, 1:0.8752, 1:0.8752, 0:0.8752, 1:0.8752, 0:0.8752, 1:0.8752
05:19:14 evolution INFO:Generation 0 Training Loss: 0.8253 Hold Out Loss 0.8079

05:19:14 evolution DEBUG:Resplitting the data
05:19:14 evolution DEBUG:Mating to create 50 children
05:20:12 evolution DEBUG:Surviving Generation: 1:0.8752, 2:0.8752, 2:0.8752, 1:0.9425, 2:0.9425, 2:0.9425, 2:0.9425, 2:0.9425, 2:1.0098, 2:1.0099
05:20:12 evolution INFO:Generation 1 Training Loss: 0.6602 Hold Out Loss 0.8752

05:20:12 evolution DEBUG:Resplitting the data
05:20:12 evolution DEBUG:Mating to create 50 children
05:21:53 evoluti

05:44:14 evolution DEBUG:Resplitting the data
05:44:14 evolution DEBUG:Mating to create 50 children
05:45:31 evolution DEBUG:Surviving Generation: 24:0.7405, 24:0.8079, 24:0.8079, 24:0.8079, 24:0.8079, 23:0.8752, 24:0.9425, 23:0.9425, 24:0.9425, 23:0.9426
05:45:31 evolution INFO:Generation 24 Training Loss: 0.6189 Hold Out Loss 0.7405

05:45:31 evolution DEBUG:Resplitting the data
05:45:31 evolution DEBUG:Mating to create 50 children
05:47:03 evolution DEBUG:Surviving Generation: 25:0.4713, 24:0.5386, 24:0.5386, 25:0.6060, 25:0.6732, 25:0.6733, 25:0.7406, 25:0.7406, 24:0.7406, 25:0.7406
05:47:03 evolution INFO:Generation 25 Training Loss: 0.6602 Hold Out Loss 0.4713

05:47:03 evolution DEBUG:Resplitting the data
05:47:03 evolution DEBUG:Mating to create 50 children
05:47:59 evolution DEBUG:Surviving Generation: 26:0.4713, 26:0.4713, 26:0.4713, 25:0.4713, 25:0.4713, 24:0.5386, 25:0.5386, 24:0.5386, 26:0.5386, 25:0.5386
05:47:59 evolution INFO:Generation 26 Training Loss: 0.9491 Hold Out

In [12]:
generations

[{'best_of_generation': {'gen': 1,
   'loss_testing': 1.4810696840286255,
   'loss_training': 0.9902824759483337,
   'tree': <gtree.BranchNode at 0x11175e610>},
  'generation': [{'gen': 1,
    'loss_testing': 1.4810696840286255,
    'loss_training': 0.9902824759483337,
    'tree': <gtree.BranchNode at 0x11175e610>},
   {'gen': 1,
    'loss_testing': 7.203172206878662,
    'loss_training': 7.138035297393799,
    'tree': <gtree.BranchNode at 0x1117a2f50>},
   {'gen': 1,
    'loss_testing': 5.924090385437012,
    'loss_training': 5.776424407958984,
    'tree': <gtree.BranchNode at 0x111768810>},
   {'gen': 1,
    'loss_testing': 7.203156471252441,
    'loss_training': 7.014230251312256,
    'tree': <gtree.BranchNode at 0x1117a2290>},
   {'gen': 1,
    'loss_testing': 2.8950071334838867,
    'loss_training': 2.4758737087249756,
    'tree': <gtree.BranchNode at 0x111754150>},
   {'gen': 1,
    'loss_testing': 1.1444816589355469,
    'loss_training': 0.7014605402946472,
    'tree': <gtree.Br

In [20]:
# For every generation, print a banner with the best member's training and
# testing loss, followed by that member's tree.  Single-argument print(...)
# behaves the same on Python 2 and Python 3.
for gen in generations:
    best = gen['best_of_generation']
    print('=========================={:.4f} {:.4f}==============================\n'.format(
        best['loss_training'],
        best['loss_testing']))
    best['tree'].prn()


			<Leaf>
		17 0.01033
			<Leaf>
	0 14.48000
			<Leaf>
		0 17.02000
			<Leaf>
27 0.13790
		<Leaf>
	23 706.00000
			<Leaf>
		0 15.06000
			<Leaf>

			<Leaf>
		12 2.57700
			<Leaf>
	0 14.99000
			<Leaf>
		1 20.26000
			<Leaf>
27 0.13740
		<Leaf>
	10 0.25300
			<Leaf>
		0 14.22000
			<Leaf>

			<Leaf>
		1 20.26000
			<Leaf>
	0 14.99000
			<Leaf>
		1 20.26000
			<Leaf>
27 0.13740
		<Leaf>
	27 0.13790
			<Leaf>
		23 576.00000
			<Leaf>

			<Leaf>
		13 49.11000
			<Leaf>
	0 14.99000
			<Leaf>
		27 0.13740
			<Leaf>
27 0.13740
		<Leaf>
	27 0.13740
			<Leaf>
		20 12.47000
			<Leaf>

			<Leaf>
		18 0.02124
			<Leaf>
	0 14.99000
			<Leaf>
		18 0.02124
			<Leaf>
27 0.13740
		<Leaf>
	27 0.13790
			<Leaf>
		4 0.08858
			<Leaf>

		<Leaf>
	0 14.99000
			<Leaf>
		16 0.01608
			<Leaf>
27 0.13740
		<Leaf>
	16 0.03850
			<Leaf>
		20 12.47000
			<Leaf>

			<Leaf>
		4 0.08858
			<Leaf>
	0 14.99000
			<Leaf>
		19 0.00353
			<Leaf>
27 0.13740
	<Leaf>

			<Leaf>
		6 0.07390
			<Leaf>
	25 0.15500
		<Leaf>
27 

In [None]:
X = np.array([[1, 2, 3], [4, 5, 6]])
Y = X[0:2]

In [None]:
type(X)

In [None]:
id(Y.data)

In [None]:
[1, 2, 3][0:3]

In [None]:
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
X

In [None]:
def _sum_first_three_columns(x):
    """Return an (n, 1) column holding x[:, 0] + x[:, 1] + x[:, 2] per row."""
    return x[:, [0]] + x[:, [1]] + x[:, [2]]


def _negated_first_column(x):
    """Return an (n, 1) column holding -1 * x[:, 0] per row."""
    return -1 * x[:, [0]]


# Per-leaf prediction functions, keyed by leaf id.
fns = {0: _sum_first_three_columns, 1: _negated_first_column}

# Leaf id assigned to each of the three rows, stored as a column vector.
hashes = np.array([0, 1, 0]).reshape(3, 1)
hashes

In [None]:
predictions = np.zeros((len(X), 1))
predictions

In [None]:
# Flatten the (n, 1) leaf-id column once so it can serve as a row mask.
row_hashes = hashes.reshape(len(X))

# For each leaf id, run that leaf's function on the rows assigned to it and
# write the results into the matching rows of `predictions`.
for i in [0, 1]:
    in_leaf = row_hashes == i
    predictions[in_leaf] = fns[i](X[in_leaf, :])

predictions

In [None]:
# Select the rows of X whose leaf id is 1.  The mask is flattened to 1-D
# because an (n, 1)-shaped boolean array cannot be used to index rows of X.
X[hashes.reshape(len(X)) == 1, :]

In [None]:
X[:,0].reshape(3, 1)

In [None]:
# Rows of X whose leaf id equals 1.  Comparing against the scalar broadcasts
# over `hashes`; the result is flattened so it can index rows of X (a size-1
# array cannot be reshaped to (len(X), 1)).
X[(hashes == 1).reshape(len(X))]

In [None]:
hashes==1

In [None]:
X[np.array([True, False, True]), :]

In [None]:
import numpy as np
import statsmodels.discrete.discrete_model as sm
import statsmodels.tools.tools as sm_tools

X = np.array([[1, 2,  3],
              [2, 7,  5],
              [3, 10, 7],
              [5, 18, 10],
              [-10, 70, 3]             
             ], dtype=np.float64)

y = np.array([[1], [0], [1], [0], [1]], dtype=np.float64)

logit = sm.Logit(y, sm_tools.add_constant(X))
fit = logit.fit_regularized(method='l1', alpha=1.0)
fit.params

In [None]:
logit.predict(fit.params, sm_tools.add_constant(X))

In [None]:
# _fit_liblinear is a private scikit-learn helper; its module was renamed from
# `sklearn.svm.base` to `sklearn.svm._base`, so try the old path first and
# fall back to the new one.
try:
    from sklearn.svm.base import _fit_liblinear
except ImportError:
    from sklearn.svm._base import _fit_liblinear

# L1-penalised fit of the same X / y used in the statsmodels cell; `y` is
# flattened to 1-D with np.ravel before being passed in.
coef_, intercept_, n_iter = _fit_liblinear(
                X, np.ravel(y), C=1.0, fit_intercept=True, intercept_scaling=1.0,
                class_weight=None, penalty='l1', dual=False, verbose=True,
                max_iter=5000, tol=1e-4, random_state=None,
                sample_weight=None)
#n_iter = np.array([n_iter])

(coef_, intercept_, n_iter)

In [None]:
from scipy.special import expit
expit(coef_.dot(X.T) + intercept_)