In [109]:
import numpy as np
import scipy.io as spio
import scipy.stats as stats
import math


class Node:
    def __init__(self,feature,threshold,left,right,label):
        self.feature=feature
        self.threshold=threshold
        self.left=left
        self.right=right
        self.label=label

def calculate_max_label_count(labels):
    count_map={}
    for i in range(len(labels)):
        if not(labels[i] in count_map):
            count_map[labels[i]]=1
        else:
            count_map[labels[i]]=count_map[labels[i]]+1
            
     ##check this       
    return max(count_map.values())
        
def calculate_max_label(labels):
    count_map={}
    curr_max=0
    max_label=0
    for i in range(len(labels)):
        if not(labels[i] in count_map):
            count_map[labels[i]]=1
        else:
            count_map[labels[i]]=count_map[labels[i]]+1
        if(count_map[labels[i]]>curr_max):
            curr_max=count_map[labels[i]]
            max_label=labels[i]
     ##check this       
    return max_label

def entropy(labels):
    if labels is None or len(labels)==0:
        return 0;
    return (len(labels)-calculate_max_label_count(labels))*1.0*(1.0/len(labels))
    

def calculate_optimal_feature_threshold(feature_space,labels):
    opt_f=0
    opt_t=0
    max_uncert_red=0.0
    
    for f in range(feature_space.shape[1]):
        #print(range(min(feature_space[:,f]),max(feature_space[:,f]),1))
        for t in range(min(feature_space[:,f]),min(max(feature_space[:,f]),500),10):
            left_labels=[]
            right_labels=[]
            for i in range(len(feature_space)):
                if(feature_space[i,f]<t):
                    left_labels.append(labels[i])
                else:
                    right_labels.append(labels[i])
            if(len(left_labels)==0 or len(right_labels)==0):
                continue
            uncertainty_red = entropy(labels)-(len(left_labels)*1.0/len(labels))*entropy(left_labels) - (len(left_labels)*1.0/len(labels))*entropy(right_labels)
            if(uncertainty_red > max_uncert_red):
                opt_f=f
                opt_t=t
    return [opt_f,opt_t]

def build_tree(feature_space,labels,param):
    
    if len(feature_space)<=param:
        return Node(None,None,None,None,calculate_max_label(labels))
    left_space=[]
    left_labels=[]
    right_space=[]
    right_labels=[]
    
    [f,t]=calculate_optimal_feature_threshold(feature_space,labels)
    print(f,t)
    
    for i in range(len(feature_space)):
        if(feature_space[i][f]<t):
            
            left_space.append(feature_space[i])
            left_labels.append(labels[i])
        else:
            right_space.append(feature_space[i])
            right_labels.append(labels[i])
    
    left_node=build_tree(np.array(left_space),left_labels,param)
    right_node=build_tree(np.array(right_space),right_labels,param)
    
    return Node(f,t,left_node,right_node,None)
    
    ## find the optimal feature and threshold
    ## split on the optimal feature and threshold
    ## node.left=build_tree(left_space)
    ## node.right=build_tree(right_space)
    ## return node


class DecisionTree:
    
    def __init__(self,num_classes,root):
        self.num_classes=num_classes
        self.root=root

                

    def train_model(self,training_data,labels):
        print("------Pre-processing Data-----")
        for i in range(len(training_data)):
            if (i == 0):
                feature_mle_mu = training_data[i]
                self.dim = (training_data[i].shape)[0]
            else:
                feature_mle_mu += training_data[i]
            
        feature_mle_mu = feature_mle_mu * 1.0/len(training_data)
        
        for i in range(len(training_data)):
            x_minus_mu_feature = training_data[i]-feature_mle_mu
            if (i == 0):
                feature_mle_sigma = x_minus_mu_feature**2
            else:
                feature_mle_sigma += x_minus_mu_feature**2
                
        feature_mle_sigma = feature_mle_sigma * 1.0/len(training_data)
        
        keys = np.linspace(0,self.dim-1,self.dim)
        feature_var = list(zip(keys,feature_mle_sigma))
        
        feature_var_sorted = sorted(feature_var, key=lambda f: f[1], reverse = True)
        self.top200 = [int(e[0]) for e in feature_var_sorted[:200]]  
        print("------Done Pre-processing Data -----")
        # slice the data
        training_data = training_data[:,self.top200]

        self.root=build_tree(training_data,labels,40)

    def predict(self,x):
        prediction=None
        curr_node=self.root
        while prediction==None:
            if curr_node.label!= None:
                prediction=curr_node.label
            else:
                if(x[curr_node.feature]<t):
                    curr_node=curr_node.left
                else:
                    curr_node=curr_node.right
        return prediction
                
                
                
                
                
        
    def testing_error(self,test_data,labels):
        misses=0
        test_data = test_data[:,self.top200]
        for i in range(len(test_data)):
            if(self.predict(test_data[i])!=labels[i]):
                misses=misses+1
        return (misses*1.0/len(test_data))
            
    
        ##Steps:
        ##1: return argmax P[X=x/Y=y]*P[Y=y]

mat = spio.loadmat('hw1data.mat', squeeze_me=True)
image_matrix=np.asarray(mat['X'], dtype=np.int32)

label_array=np.asarray(mat['Y'], dtype=np.int32)

decision_tree= DecisionTree(10,None)
#[f,t]= calculate_optimal_feature_threshold(image_matrix[:30],label_array[:30])  
decision_tree.train_model(image_matrix[0:2000,],label_array[0:2000])
print(decision_tree.testing_error(image_matrix[1300:1500,],label_array[1300:1500]))




------Pre-processing Data-----
------Done Pre-processing Data -----
199 490
160 10
195 60
176 250
173 50
192 20
195 40
180 20
199 250
195 30
192 10
189 10
187 240
187 160
195 20
199 240
199 140
199 130
199 120
199 40
199 20
197 250
197 210
197 200
197 190
197 170
197 160
197 110
197 60
197 30
197 20
194 250
191 250
190 250
196 200
194 130
196 190
196 120
196 110
194 70
190 240
190 70
196 90
190 30
196 70
185 250
196 10
181 250
186 250
193 150
193 140
186 190
186 150
186 120
186 100
185 240
184 170
183 230
184 70
196 230
196 200
193 250
193 210
199 250
199 210
199 190
199 180
199 170
199 100
199 40
199 30
199 10
196 80
193 200
195 30
193 120
193 60
192 10
196 70
191 250
191 150
191 100
191 40
191 20
190 20
196 250
196 220
196 190
196 180
194 20
196 160
196 150
196 100
196 20
196 10
195 40
195 30
195 20
193 230
193 220
195 10
198 250
198 190
198 120
198 90
198 70
194 10
193 70
199 250
197 60
198 20
196 200
198 10
196 30
199 50
198 210
198 100
198 250
196 200
198 100
195 241
198 90
195 15

In [97]:
a=[]
print(max( max(image_matrix[:300,:].tolist())))
#print(decision_tree.testing_error(image_matrix[:300,],label_array[:300]))


43065
