In [36]:
import numpy as np
import scipy.io as spio
import scipy.stats as stats
import math
from sortedcollections import SortedSet

def build_step_for_feature(image_matrix):
    move_step = []

    for i in range(image_matrix.shape[1]):
        a = SortedSet(image_matrix[:,i].tolist())
        move_step.append(a)
    
    return move_step


class Node:
    def __init__(self,feature,threshold,left,right,label):
        self.feature=feature
        self.threshold=threshold
        self.left=left
        self.right=right
        self.label=label

def calculate_max_label_count(labels):
    count_map={}
    for i in range(len(labels)):
        if not(labels[i] in count_map):
            count_map[labels[i]]=1
        else:
            count_map[labels[i]]=count_map[labels[i]]+1
            
     ##check this       
    return max(count_map.values())
        
def calculate_max_label(labels):
    count_map={}
    curr_max=0
    max_label=0
    for i in range(len(labels)):
        if not(labels[i] in count_map):
            count_map[labels[i]]=1
        else:
            count_map[labels[i]]=count_map[labels[i]]+1
        if(count_map[labels[i]]>curr_max):
            curr_max=count_map[labels[i]]
            max_label=labels[i]
     ##check this       
    return max_label

def entropy(labels):
    if labels is None or len(labels)==0:
        return 0;
    return (len(labels)-calculate_max_label_count(labels))*1.0*(1.0/len(labels))
    
def shuffle_and_split(data,partition):
    ran_order = np.arange(len(data['X']))
    np.random.shuffle(ran_order)
    ran_order_training = ran_order[:(int)(len(data['X'])*partition)] 
    ran_order_test = ran_order[(int)(len(data['X'])*partition):] 
    training_data = data['X'][ran_order_training]
    training_label = data['Y'][ran_order_training]
    test_data = data['X'][ran_order_test]
    test_label = data['Y'][ran_order_test]
    return [training_data,training_label,test_data,test_label]

def calculate_optimal_feature_threshold(feature_space,labels):
    opt_f=0
    opt_t=0
    max_uncert_red=0.0
    
    for f in range(feature_space.shape[1]):
        a=(max(feature_space[:,f])-min(feature_space[:,f]))/10
        b=[min(feature_space[:,f])+i*a for i in range(10)]
        for t in b:
            try:
                filter_arr_left = np.array(feature_space[:,f]<t)
                filter_arr_right = np.array(feature_space[:,f]>=t)
                left_labels = np.array(labels)[filter_arr_left].tolist()
                right_labels = np.array(labels)[filter_arr_right].tolist()
            except TypeError:
                print("Error partitioning labels")
          #  left_labels=[]
          #  right_labels=[]
          #  for i in range(len(feature_space)):
          #      if(feature_space[i,f]<t):
          #          left_labels.append(labels[i])
          #      else:
          #          right_labels.append(labels[i])
            if(len(left_labels)==0 or len(right_labels)==0):
                continue
            uncertainty_red = entropy(labels)-(len(left_labels)*1.0/len(labels))*entropy(left_labels) - (len(left_labels)*1.0/len(labels))*entropy(right_labels)
            if(uncertainty_red > max_uncert_red):
                opt_f=f
                opt_t=t
    
    return [opt_f,opt_t]

def build_tree(feature_space,labels,param,live_count):
    global tree_depth
    if len(feature_space)<=param or live_count > 10:
        return Node(None,None,None,None,calculate_max_label(labels))
    
    live_count += 1
    tree_depth = max(tree_depth,live_count)
    print(live_count)
    
    left_space=[]
    left_labels=[]
    right_space=[]
    right_labels=[]
    
    [f,t]=calculate_optimal_feature_threshold(feature_space,labels)
    print(f,t)
    if(f==0 and t== 0):
        return Node(None,None,None,None,calculate_max_label(labels))
    
    for i in range(len(feature_space)):
        if(feature_space[i][f]<t):
            
            left_space.append(feature_space[i])
            left_labels.append(labels[i])
        else:
            right_space.append(feature_space[i])
            right_labels.append(labels[i])
    
    left_node=build_tree(np.array(left_space),left_labels,param,live_count)
    right_node=build_tree(np.array(right_space),right_labels,param,live_count)
    
    return Node(f,t,left_node,right_node,None)
    
    ## find the optimal feature and threshold
    ## split on the optimal feature and threshold
    ## node.left=build_tree(left_space)
    ## node.right=build_tree(right_space)
    ## return node


class DecisionTree:
    
    def __init__(self,num_classes,root):
        self.num_classes=num_classes
        self.root=root

                

    def train_model(self,training_data,labels,param):
        print("------Pre-processing Data-----")
        for i in range(len(training_data)):
            if (i == 0):
                feature_mle_mu = training_data[i]
                self.dim = (training_data[i].shape)[0]
            else:
                feature_mle_mu += training_data[i]
            
        feature_mle_mu = feature_mle_mu * 1.0/len(training_data)
        
        for i in range(len(training_data)):
            x_minus_mu_feature = training_data[i]-feature_mle_mu
            if (i == 0):
                feature_mle_sigma = x_minus_mu_feature**2
            else:
                feature_mle_sigma += x_minus_mu_feature**2
                
        feature_mle_sigma = feature_mle_sigma * 1.0/len(training_data)
        
        keys = np.linspace(0,self.dim-1,self.dim)
        feature_var = list(zip(keys,feature_mle_sigma))
        
        feature_var_sorted = sorted(feature_var, key=lambda f: f[1], reverse = True)
        self.top200 = [int(e[0]) for e in feature_var_sorted[:200]]  
        print("------Done Pre-processing Data -----")
        # slice the data
        training_data = training_data[:,self.top200]

        self.root=build_tree(training_data,labels,param,0)

    def predict(self,x):
        prediction=None
        curr_node=self.root
        while prediction==None:
            if curr_node.label!= None:
                prediction=curr_node.label
            else:
                if(x[curr_node.feature]<curr_node.threshold):
                    curr_node=curr_node.left
                else:
                    curr_node=curr_node.right
        return prediction
                
                
                
                
                
        
    def testing_error(self,test_data,labels):
        misses=0
        test_data = test_data[:,self.top200]
        for i in range(len(test_data)):
            if(self.predict(test_data[i])!=labels[i]):
                misses=misses+1
        return (misses*1.0/len(test_data))
            
    
        ##Steps:
        ##1: return argmax P[X=x/Y=y]*P[Y=y]

        #Partition = 0.7 for 70-30 split
        
        
mat = spio.loadmat('hw1data.mat', squeeze_me=True)
data_split = shuffle_and_split(mat,0.7)
image_matrix = data_split[0]
move_step = build_step_for_feature(image_matrix)
tree_depth = 0
label_array= data_split[1]

decision_tree= DecisionTree(10,None)
#[f,t]= calculate_optimal_feature_threshold(image_matrix[:30],label_array[:30])
decision_tree.train_model(image_matrix,label_array,10)

#Training error
print(decision_tree.testing_error(data_split[0],data_split[1]))
#Test Error
print(decision_tree.testing_error(data_split[2],data_split[3]))

------Pre-processing Data-----
------Done Pre-processing Data -----
1
145 25.5
2
180 25.5
3
183 51.0
4
146 25.5
5
184 144.9
6
184 58.5
7
184 45.9
8
184 40.5
9
184 31.5
10
184 26.1
11
184 21.6
5
184 25.5
6
198 25.5
7
179 229.5
8
179 197.1
9
176 127.5
10
198 20.7
11
198 18.0
10
196 184.5
11
194 227.7
7
198 209.2
8
198 98.4
9
198 54.8
10
198 37.0
10
199 227.7
11
199 129.6
9
198 153.0
10
199 229.5
11
198 118.6
10
198 185.8
11
199 150.3
11
199 75.9
8
198 251.0
9
198 229.0
10
199 229.5
9
198 253.0
10
199 226.8
11
198 251.9
10
192 168.3
11
196 208.8
6
195 111.6
7
195 31.5
8
195 8.1
9
193 51.0
10
191 76.5
11
196 69.9
11
197 226.8
10
197 51.0
11
199 25.5
11
197 194.4
4
198 25.5
5
197 51.0
6
196 229.5
7
198 21.6
8
198 14.4
9
196 91.6
10
199 228.6
11
198 9.0
10
196 161.0
11
199 227.7
11
196 196.5
7
198 21.6
8
196 252.6
9
196 247.8
10
197 30.6
11
199 226.8
10
199 226.8
11
197 36.0
9
198 3.6
10
196 254.0
11
193 111.6
11
198 0.9
6
197 234.8
7
198 22.5
8
197 143.5
9
198 16.2
10
198 13.5
11
198 9.0
9


In [2]:

print(decision_tree.testing_error(image_matrix,label_array))
print(decision_tree.testing_error(image_matrix[6000:],label_array[6000:]))


0.3552
0.501


In [26]:
print(decision_tree.testing_error(image_matrix[0:1200,],label_array[0:1200]))

0.095


In [11]:
decision_tree.train_model(image_matrix[0:1000,],label_array[0:1000],5)

TypeError: train_model() takes 3 positional arguments but 4 were given

In [None]:
import numpy as np
import scipy.io as spio
import scipy.stats as stats
import math
import time
from sortedcollections import SortedSet

def build_step_for_feature(image_matrix):
    move_step = []

    for i in range(image_matrix.shape[1]):
        a = SortedSet(image_matrix[:,i].tolist())
        move_step.append(a)
    
    return move_step

class Node:
    def __init__(self,feature,threshold,left,right,label):
        self.feature=feature
        self.threshold=threshold
        self.left=left
        self.right=right
        self.label=label

def calculate_max_label_count(labels):
    count_map={}
    for i in range(len(labels)):
        if not(labels[i] in count_map):
            count_map[labels[i]]=1
        else:
            count_map[labels[i]]=count_map[labels[i]]+1
            
     ##check this       
    return max(count_map.values())
    
def calculate_max_label(labels):
    count_map={}
    curr_max=0
    max_label=0
    for i in range(len(labels)):
        if not(labels[i] in count_map):
            count_map[labels[i]]=1
        else:
            count_map[labels[i]]=count_map[labels[i]]+1
        if(count_map[labels[i]]>curr_max):
            curr_max=count_map[labels[i]]
            max_label=labels[i]
     ##check this       
    return max_label

def entropy(labels):
    if labels is None or len(labels)==0:
        return 0;
    return (len(labels)-calculate_max_label_count(labels))*1.0*(1.0/len(labels))
    

def calculate_optimal_feature_threshold(feature_space,labels):
    opt_f=0
    opt_t=0
    max_uncert_red=0.0
    
    for f in range(feature_space.shape[1]):
        #print(range(min(feature_space[:,f]),max(feature_space[:,f]),1))
        #for t in range(min(feature_space[:,f]),min(max(feature_space[:,f]),500),10):
        for t in move_step[f]:
            left_labels=[]
            right_labels=[]
            for i in range(len(feature_space)):
                if(feature_space[i,f]<t):
                    left_labels.append(labels[i])
                else:
                    right_labels.append(labels[i])
            if(len(left_labels)==0 or len(right_labels)==0):
                continue
            uncertainty_red = entropy(labels)-(len(left_labels)*1.0/len(labels))*entropy(left_labels) - (len(left_labels)*1.0/len(labels))*entropy(right_labels)
            if(uncertainty_red > max_uncert_red):
                opt_f=f
                opt_t=t
    return [opt_f,opt_t]

def build_tree(feature_space,labels,param,live_count):
    global tree_depth
    if len(feature_space)<=param or live_count > 9:
        return Node(None,None,None,None,calculate_max_label(labels))
    
    live_count += 1
    tree_depth = max(tree_depth,live_count)
    print tree_depth
    
    left_space=[]
    left_labels=[]
    right_space=[]
    right_labels=[]
    
    [f,t]=calculate_optimal_feature_threshold(feature_space,labels)
    print(f,t)
    
    for i in range(len(feature_space)):
        if(feature_space[i][f]<t):
            
            left_space.append(feature_space[i])
            left_labels.append(labels[i])
        else:
            right_space.append(feature_space[i])
            right_labels.append(labels[i])
    
    left_node=build_tree(np.array(left_space),left_labels,param,live_count)
    right_node=build_tree(np.array(right_space),right_labels,param,live_count)
    return Node(f,t,left_node,right_node,None)
    
    ## find the optimal feature and threshold
    ## split on the optimal feature and threshold
    ## node.left=build_tree(left_space)
    ## node.right=build_tree(right_space)
    ## return node


class DecisionTree:
    
    def __init__(self,num_classes,root):
        self.num_classes=num_classes
        self.root=root

                

    def train_model(self,training_data,labels):
        print("------Pre-processing Data-----")
        for i in range(len(training_data)):
            if (i == 0):
                feature_mle_mu = training_data[i]
                self.dim = (training_data[i].shape)[0]
            else:
                feature_mle_mu += training_data[i]
            
        feature_mle_mu = feature_mle_mu * 1.0/len(training_data)
        
        for i in range(len(training_data)):
            x_minus_mu_feature = training_data[i]-feature_mle_mu
            if (i == 0):
                feature_mle_sigma = x_minus_mu_feature**2
            else:
                feature_mle_sigma += x_minus_mu_feature**2
                
        feature_mle_sigma = feature_mle_sigma * 1.0/len(training_data)
        
        keys = np.linspace(0,self.dim-1,self.dim)
        feature_var = list(zip(keys,feature_mle_sigma))
        
        feature_var_sorted = sorted(feature_var, key=lambda f: f[1], reverse = True)
        self.top200 = [int(e[0]) for e in feature_var_sorted[:200]]  
        print("------Done Pre-processing Data -----")
        # slice the data
        training_data = training_data[:,self.top200]
        t0 = time.time()
        self.root=build_tree(training_data,labels,40,0)
        t1 = time.time()
        print ("Total time taken into building the tree: " + str(t1-t0))

    def predict(self,x):
        prediction=None
        curr_node=self.root
        while prediction==None:
            if curr_node.label!= None:
                prediction=curr_node.label
            else:
                if(x[curr_node.feature]<curr_node.threshold):
                    curr_node=curr_node.left
                else:
                    curr_node=curr_node.right
        return prediction
                        
    def testing_error(self,test_data,labels):
        misses=0
        test_data = test_data[:,self.top200]
        for i in range(len(test_data)):
            if(self.predict(test_data[i])!=labels[i]):
                misses=misses+1
        return (misses*1.0/len(test_data))
        ##Steps:
        ##1: return argmax P[X=x/Y=y]*P[Y=y]

#Partition = 0.7 for 70-30 split
def shuffle_and_split(data,partition):
    ran_order = np.arange(len(data['X']))
    np.random.shuffle(ran_order)
    ran_order_training = ran_order[:(int)(len(data['X'])*partition)] 
    ran_order_test = ran_order[(int)(len(data['X'])*partition):] 
    training_data = data['X'][ran_order_training]
    training_label = data['Y'][ran_order_training]
    test_data = data['X'][ran_order_test]
    test_label = data['Y'][ran_order_test]
    return [training_data,training_label,test_data,test_label]
        
mat = spio.loadmat('hw1data.mat', squeeze_me=True)
data_split = shuffle_and_split(mat,0.1)
image_matrix = data_split[0]
move_step = build_step_for_feature(image_matrix)
tree_depth = 0
label_array= data_split[1]

decision_tree= DecisionTree(10,None)
#[f,t]= calculate_optimal_feature_threshold(image_matrix[:30],label_array[:30])
decision_tree.train_model(image_matrix,label_array)

#Training error
print(decision_tree.testing_error(data_split[0],data_split[1]))
#Test Error
print(decision_tree.testing_error(data_split[2],data_split[3]))