In [20]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import pylab
####################### Plotting tools
import matplotlib
import matplotlib.pyplot as plt
#import seaborn
####################### data frames
import pandas as pd
from pandas.tools import plotting
####################### Utilities
import urllib
from six.moves import cPickle as pickle
import random 
from csv import reader
from math import sqrt
from math import floor
import os
import numpy as np
import scipy.sparse as sps
from pandas import DataFrame
from IPython.display import display
######################## From scikit-learn
from sklearn import random_projection
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
######################## From home directory
from forest_class import forest
from generalTrees_class import flex_binary_trees, master_trees, getDepth
from tree_utilities import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset - Uniform distribution over hypercube

In [21]:
def sample_uniform_hypercube(n_samples, dimension):
    """
    Returns sample size by dimension
    """
    def map_function(null):
        return np.random.uniform(low=0.0, high=1.0, size=n_samples)
        
    return np.transpose(np.array(list(map(map_function, [None]*dimension))))

## test
#sample_uniform_hypercube(10, 5)
seed = 1
np.random.seed(seed)
dimension = 10
n_samples = 10000
data = sample_uniform_hypercube(n_samples, dimension)
mu = 5
sigma = 2
randmat = sigma*np.random.randn(10)+mu ## random normal
labels = np.dot(data, randmat) ## labels generated by a simple linear model
data_tr = data[:8000,:]
labels_tr = labels[:8000]
data_tt = data[8000:,:]
labels_tt = labels[8000:]
print('In uniform cube, there are %d train and %d test data' %(len(labels_tr), len(labels_tt)))

In uniform cube, there are 8000 train and 2000 test data


In [22]:
######------ Utility functions
def cross_valid_split(n_data, n_folds):
    """
    Given size of the data and number of folds
    Returns n_folds disjoint sets of indices, where indices
    in each fold are chosen u.a.r. without replacement
    """
    data_ind = list(range(n_data))    
    folds = list()
    fold_size = floor(n_data/n_folds)
    for i in range(n_folds):
        if i < n_folds-1:
            fold = list()
            while len(fold) <= fold_size:
                index = random.randrange(len(data_ind))
                fold.append(data_ind.pop(index))
            folds.append(fold)
        else:
            ## assign all remaining data to the last fold
            folds.append(data_ind)
    return folds

def zero_one_loss(labels, predictions):
    correct = 0
    for i in range(len(labels)):
        if labels[i] == predictions[i]:
            correct += 1
    loss = (len(labels)-correct)/float(len(labels))
    return loss

def explained_var_loss(labels, predictions):
    #res_var = np.sum(np.array([diff**2 for diff in labels-predictions]))
    res_var = np.var(np.array(labels)-np.array(predictions))
    tot_var = np.var(np.array(labels))
    
    return 1-res_var/tot_var

def l2_loss(labels, predictions):
    loss = np.linalg.norm(np.array(labels)-np.array(predictions))
    return loss/(len(labels)**(1/2))

def csize_decrease_rate(data, tree):
    """
    This is an unsupervised evaluation, which tries to capture how fast
    the data size of a cell decreases after building a tree
    """
    diam_s,_,_ = data_diameter(data)
    diam_f = 0
    
    if hasattr(tree, 'slave_tree'):
        ## if this is a master tree
        for leaf in traverseLeaves_mtree(tree):
            diam, _, _ = data_diameter(leaf.slave_tree.data[leaf.slave_tree.data_ind, :])
            if diam > diam_f:
                diam_f = diam
            
    else:
        ## if this is normal binary tree
        for leaf in traverseLeaves(tree):
            diam, _, _ = data_diameter(leaf.data[leaf.data_ind,:])
            if diam > diam_f:
                diam_f = diam
        
    return (diam_s/diam_f)/getDepth(tree)
        

def cross_valid_eval(data, labels, n_folds, loss, algorithm, sklearn=False, need_ind=False, **kwargs):
    """
    Given data and labels, a loss function, and a method
    generate a list of cv-losses
    
    """
    ## generate random folds
    folds_ind = cross_valid_split(data.shape[0], n_folds)
    losses = list()
    for fold_ind in folds_ind:
        print("Evaluating the %d-th fold" %(len(losses)+1))
        test_ind = fold_ind
        folds_ind_ = list(folds_ind) # this ensures we are not modifying the original list!
        folds_ind_.remove(fold_ind)
        train_ind = [item for sublist in folds_ind_ for item in sublist] #flatten remaining index set
        ## Further divide the data into train and test
        data_tr = data[train_ind,:]
        labels_tr = labels[train_ind]
        data_tt = data[test_ind,:]
        labels_tt = labels[test_ind]
        # train the algorithm 
        if sklearn:
            alg = algorithm(**kwargs)
            alg.fit(data_tr, labels_tr)
        elif need_ind:
            data_ind = range(len(data_tr))
            alg = algorithm(data_tr, data_indices=data_ind, labels=labels_tr, **kwargs) #init
            alg.train()
        else:
            # RF and master trees don't need to be given an index 
            alg = algorithm(data_tr, labels=labels_tr, **kwargs) #init
            alg.train()
        #c = print_mtree_leaves(alg) ## added should be removed
        #print("There are %d partitions" %c)
        
        # calculate loss on the current fold
        #dp = None
        #if isinstance(alg, DecisionTreeClassifier):
        #    dp = alg.decision_path(data_tr)
        losses.append(loss(labels_tt, alg.predict(data_tt)))
        print('Generated tree with height', getDepth(alg,0))
        #print(labels[0],alg.predict(data_tt)[0])
        del alg
    return losses           

## Base tree evaluation

In [16]:
######## Median Tree ############
proj_design={'name':'projmat','params':{'name':'breiman','sparsity':1,'target_dim':1}}  
split_design = {'name':'median'}
stop_design={'name':'naive'}
kwargs = {'proj_design':proj_design, 'split_design':split_design, 'stop_design':stop_design, 'predict_type':'regress'}
scores_c1 = list()
#scores_c1_test = list()
#params_c1 = list()
data_ind = range(len(data_tr))
alg = flex_binary_trees(data_tr, data_indices=data_ind, labels=labels_tr, **kwargs)
alg.train()
predict_on_train = alg.predict(data_tr)
print('Training loss: %f' %l2_loss(labels_tr, predict_on_train))
predict_on_test = alg.predict(data_tt)
print('Test loss: %f' %l2_loss(labels_tt, predict_on_test))

##
# scores_c1.append(cross_valid_eval(data_tr, labels_tr, 4, l2_loss, flex_binary_trees, 
#                                               need_ind=True, **kwargs))
# scores_c1_test.append(cross_valid_eval(data_tt, labels_tt, 5, zero_one_loss, flex_binary_trees, 
#     #                                          need_ind=True, **kwargs))
#params_c1.append([t_dim])

Training loss: 0.000000
Training loss: 171.673693


In [8]:
scores_c1


[[190.81450214753312,
  187.89859330078139,
  184.2559363676277,
  173.38674069948627]]

In [18]:
######## Median Spill Tree ############
proj_design={'name':'projmat','params':{'name':'breiman','sparsity':1,'target_dim':1}}  
split_design = {'name':'median_spill'}
stop_design={'name':'naive'}
kwargs = {'proj_design':proj_design, 'split_design':split_design, 'stop_design':stop_design, 'predict_type':'regress'}
data_ind = range(len(data_tr))
alg = flex_binary_trees(data_tr, data_indices=data_ind, labels=labels_tr, **kwargs)
alg.train()
predict_on_train = alg.predict(data_tr)
print('Training loss: %f' %l2_loss(labels_tr, predict_on_train))
predict_on_test = alg.predict(data_tt)
print('Test loss: %f' %l2_loss(labels_tt, predict_on_test))
#scores_mspill = list()
#scores_c1_test = list()
#params_c1 = list()

##
# scores_mspill.append(cross_valid_eval(data_tr, labels_tr, 4, l2_loss, flex_binary_trees, 
#                                               need_ind=True, **kwargs))

Training loss: 0.000000
Test loss: 113.065578


In [11]:
scores_mspill

[[107.51614968978102,
  125.00119234989374,
  116.91040888219152,
  118.08578107669975]]

In [19]:
######## Randomized Spill Tree ############
proj_design={'name':'projmat','params':{'name':'dasgupta','target_dim':1}}  
split_design = {'name':'median_spill'}
stop_design={'name':'naive'}
kwargs = {'proj_design':proj_design, 'split_design':split_design, 'stop_design':stop_design, 'predict_type':'regress'}
data_ind = range(len(data_tr))
alg = flex_binary_trees(data_tr, data_indices=data_ind, labels=labels_tr, **kwargs)
alg.train()
predict_on_train = alg.predict(data_tr)
print('Training loss: %f' %l2_loss(labels_tr, predict_on_train))
predict_on_test = alg.predict(data_tt)
print('Test loss: %f' %l2_loss(labels_tt, predict_on_test))
#scores_rpspill = list()
#scores_c1_test = list()
#params_c1 = list()

##
# scores_rpspill.append(cross_valid_eval(data_tr, labels_tr, 4, l2_loss, flex_binary_trees, 
#                                               need_ind=True, **kwargs))

Training loss: 0.000000
Test loss: 103.683036


## Forest evaluations

In [23]:
## Forest level parameters
n_trees_list = [10,50,100]
#n_samples_list = [10, 50, 100]
    

In [26]:
######## Base tree: median Tree ############
proj_design={'name':'projmat','params':{'name':'breiman','sparsity':1,'target_dim':1}}  
split_design = {'name':'median'}
stop_design={'name':'naive'}
####
kwargs = {'tree_design':{"tree":'flex','proj_design':proj_design,'split_design':split_design,'stop_design':stop_design}, 
          'predictor_type':'regress'}
####
for n_trees in n_trees_list:
    kwargs['n_trees'] = n_trees
    fc_estimator = forest(data_tr, labels=labels_tr, **kwargs)
    fc_estimator.train()
    predict_on_train = fc_estimator.predict(data_tr)
    print('Training loss: %f' %l2_loss(labels_tr, predict_on_train))
    predict_on_test = fc_estimator.predict(data_tt)
    print('Test loss: %f' %l2_loss(labels_tt, predict_on_test))
    

Training loss: 0.000000
Test loss: 101.691774
Training loss: 0.000000
Test loss: 85.794812
Training loss: 0.000000
Test loss: 84.410717
