In [52]:
import numpy as np
import pandas as pd
import scipy.io
from scipy.stats import itemfreq

In [79]:
a = [1,1,1,1,2,2,2,2,3,3,4,5,5]
freq = itemfreq(a)[:, 1].astype(float)
freq

array([ 4.,  4.,  2.,  1.,  2.])

In [228]:
class decision_tree:
    
    class node:
        def __init__(self, left, right, split_rule, is_leaf, label):
            self.left = left
            self.right = right
            self.split_rule = split_rule
            self.is_leaf = is_leaf
            self.label = label
    
    def __init__(self, max_depth=1e10):
        self.max_depth = max_depth
        self.root = self.node(None, None, None, 0, None)
        self.root.split_rule = (1, 2)
        
    # utility function for entropy calculation
    def entropy(self, indices):
        p = itemfreq(a)[:, 1].astype(float) / len(indices)
        return -p.dot(np.log2(p))
    
    # calculate entropy the number of instances in each class in known
    def entropy_n(self, all_n):
        p = all_n / (np.sum(all_n)+1e-20)
        return -p.dot(np.log2(p+1e-20))
    
    # calculate the impurity("badness") of the specified split on the input data
    def impurity(self, left_label_hist, right_label_hist):
        Sl = np.sum(left_label_hist)
        Sr = np.sum(right_label_hist)
        return (Sl*self.entropy_n(left_label_hist) + Sr * self.entropy_n(right_label_hist)) / (Sl+Sr)
    
    # find the threshold that best split the data points with a certain feature
    # Note: <= th goes to S_left and > th goes to S_right
    def find_threshold(self, feature, labels):
        all_f = sorted(set(feature)) # sorted in ascending order
        all_l = set(labels) # list unique labels
        
        freq_mat = np.zeros([len(all_f), len(all_l)])
        for i, f in enumerate(all_f):
            for j, l in enumerate(all_l):
                freq_mat[i, j] = len(labels[np.where(labels[np.where(feature==f)]==l)])
        
        # calculate the average of two neighboring values as threshold
        # iterates from min to max
        all_threshold = (np.hstack((all_f[1:], all_f[-1])) + all_f) / 2.
        
        # in the beginning, all goes to the right node
        n_left = np.zeros([len(all_l)])
        n_right = np.sum(freq_mat, axis=0)
        n_left_sum = 0
        min_threshold = all_threshold[0]
        min_H = self.impurity(n_left, n_right)
        # loop through all threshold to find the one with the minimum impurity
        for i, th in enumerate(all_threshold):
            n_left += freq_mat[i, :]
            n_right -= freq_mat[i, :]
            H = self.impurity(n_left, n_right)
            if H < min_H:
                min_H = H
                min_threshold = th
        return min_threshold, min_H
    

    
    # find the best feature and threshold to split data points
    def segmenter(self, data, labels):
        n_feature = data.shape[1]
        min_H = 1e20
        min_th = 0
        min_i = 0
        for i in range(n_feature):
            threshold, H = find_threshold(data[:, i], labels)
            if H < min_H:
                min_H = H
                min_th = threshold
                min_i = i
        return min_i, min_th
        
    # train the decision tree
    def train(data, labels):
        pass
    
    # predict labels of test data
    def predict(data):
        pass

In [238]:
# test find_threshold

test = decision_tree()

feature = np.array([1,1,1,2,2,3,3,2,1,4,6,5,5])
labels = np.array([0,0,0,1,1,1,0,1,0,1,0,1,0])
min_threshold, min_H = test.find_threshold(feature, labels)
print(min_threshold, min_H)

feature = np.array([0,0,0,0,0,0,1,1,1,1,1,1,1])
labels = np.array([0,0,0,0,0,1,1,1,1,1,1,1,1])

min_threshold, min_H = test.find_threshold(feature, labels)
print(min_threshold, min_H)

1.5 0.63574326973
0.5 0.300010348453


In [223]:
all_f = np.array([1, 2, 3, 4])
all_threshold = (np.hstack((all_f[1:], all_f[-1])) + all_f) / 2.
all_threshold+1
np.zeros([3])

array([ 0.,  0.,  0.])

In [41]:
class random_forest:
    def __init__(self, num_of_tree, max_depth=1e10):
        self.max_depth = max_depth
        self.num_of_tree = num_of_tree
        self.trees = np.array([decision_tree(max_depth)] * num_of_tree)
     

In [42]:
test = random_forest(5)
test.trees

array([<__main__.decision_tree object at 0x113a68da0>,
       <__main__.decision_tree object at 0x113a68da0>,
       <__main__.decision_tree object at 0x113a68da0>,
       <__main__.decision_tree object at 0x113a68da0>,
       <__main__.decision_tree object at 0x113a68da0>], dtype=object)

### 1. spam

In [14]:
data = scipy.io.loadmat('./spam_dist/spam_data.mat')
train_X = data['training_data']
train_y = data['training_labels'].ravel()
test_X = data['test_data']

### 2. census

In [117]:
train_data = pd.read_csv('./census_dist/train_data.csv')
test_data = pd.read_csv('./census_dist/test_data.csv')
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,59,Private,307423,9th,5,Never-married,Other-service,Not-in-family,Black,Male,0,0,50,United-States,0
1,32,Private,192965,HS-grad,9,Separated,Sales,Not-in-family,White,Female,0,0,45,United-States,0
2,19,Private,125591,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,40,United-States,0
3,51,Without-pay,124963,Assoc-acdm,12,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,0
4,57,Self-emp-inc,146103,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,15024,0,30,United-States,1


### 3. Titanic

In [116]:
train_data = pd.read_csv('./titanic_dist/titanic_training.csv')
test_data = pd.read_csv('./titanic_dist/titanic_testing_data.csv')
train_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0.0,3.0,male,,0.0,0.0,SOTON/OQ 392086,8.05,,S
1,0.0,1.0,male,22.0,0.0,0.0,PC 17760,135.6333,,C
2,0.0,2.0,male,23.0,0.0,0.0,SC/PARIS 2133,15.0458,,C
3,0.0,2.0,male,42.0,0.0,0.0,211535,13.0,,S
4,0.0,3.0,male,20.0,0.0,0.0,7534,9.8458,,S
