In [4]:
import numpy as np
import scipy.io as spio
import scipy.stats as stats
import math
import time

class GaussianProbabilisticModel:
    
    def __init__(self):
        self.num_classes = 0
        self.classes = []
        self.mle_mu_map = {}
        self.mle_sigma_map = {}
        self.class_prob_map = {}
        self.dim = 0
        self.topN_features = 200

    def __log_gaussian_pdf(self,x,mu,sigma):
        # calculate log of f(x)~multivariate Normal density
        # sigma is adjusted by 0.1*I to avoid inverting a singular matrix
        
        ndim = len(mu)
        sigma_new = np.matrix(sigma+0.1*np.eye(ndim))
        
        det = np.linalg.det(sigma_new)
        C = (det)**(-0.5)*(2*math.pi)**(-ndim*0.5)
        x_minus_mu_mat = np.matrix(x-mu).T
        result = np.log(C) + float(-0.5*x_minus_mu_mat.T*sigma_new.I*x_minus_mu_mat)

        return result
                
       
    def train_model(self,training_data,labels):
        # Steps:
        # 1: Find the N=200 most significant features, and slice the data based on those features
        # 2: Partition data according to labels
        # 3: Compute MLE of mu and sigma for each label
        
        # Step 1.1 - find the MLE of mu and sigma for each feature over the entire dataset
        for i in range(len(training_data)):
            if (i == 0):
                feature_mle_mu = training_data[i]
                self.dim = (training_data[i].shape)[0]
            else:
                feature_mle_mu += training_data[i]
            
        feature_mle_mu = feature_mle_mu * 1.0/len(training_data)
        
        for i in range(len(training_data)):
            x_minus_mu_feature = training_data[i]-feature_mle_mu
            if (i == 0):
                feature_mle_sigma = x_minus_mu_feature**2
            else:
                feature_mle_sigma += x_minus_mu_feature**2
                
        feature_mle_sigma = feature_mle_sigma * 1.0/len(training_data)
        
        # Step 1.2 - find the N=200 most significant features (features exhibiting the largest variance)
        keys = np.linspace(0,self.dim-1,self.dim)
        feature_var = list(zip(keys,feature_mle_sigma))
        
        feature_var_sorted = sorted(feature_var, key=lambda f: f[1], reverse = True)
        self.topN = [int(e[0]) for e in feature_var_sorted[:self.topN_features]]  
        
        # Step 1.3 - slice the training data on the most significant features
        training_data = training_data[:,self.topN]
        self.train_size = len(training_data)
        nums={}
        
        # Step 2.1 - calculate MLE of mu for each label
        for i in range(len(training_data)):
            if not (labels[i] in self.mle_mu_map):
                self.mle_mu_map[labels[i]] = training_data[i]
                self.classes += [labels[i]]
                nums[labels[i]] = 1
                self.num_classes += 1
                
            else:
                self.mle_mu_map[labels[i]] += training_data[i]
                nums[labels[i]] += 1
                
        for i in self.classes:
            self.mle_mu_map[i]=(1.0/nums[i])*self.mle_mu_map[i]
        
        # Step 2.2 - calculate MLE of sigma for each label
        for i in range(len(training_data)):
            # this is a column vector
            x_minus_mu = np.matrix(training_data[i]-self.mle_mu_map[labels[i]]).T
            
            if not (labels[i] in self.mle_sigma_map):
                self.mle_sigma_map[labels[i]]=(x_minus_mu)*(x_minus_mu.T)
            else:
                self.mle_sigma_map[labels[i]]+=(x_minus_mu)*(x_minus_mu.T)
               
        for i in self.classes:
            self.mle_sigma_map[i]=(1.0/nums[i])*self.mle_sigma_map[i]
            self.class_prob_map[i]=(nums[i]*1.0)/len(training_data)
            
    def predict(self,x):
        # prediction is equal to the maximum likelihood class
        prediction=0
        curr_max=-1e300;
        for i in range(self.num_classes):
            p = self.__log_gaussian_pdf(x,self.mle_mu_map[i],self.mle_sigma_map[i]) + np.log(self.class_prob_map[i])
            if(p>curr_max):
                curr_max=p
                prediction=i
        return prediction
                        
    def test_model(self,test_data,labels):
        misses=0        
        # slice the data based on the most significant features
        test_data = test_data[:,self.topN]
        self.test_size = len(test_data)
        results = {'Test Error': [], 'Training Size': [], 'Testing Size': [], 'Predicted': [], 'Actual': []}
        results['Training Size'] = self.train_size
        results['Testing Size'] = self.test_size

        for i in range(len(test_data)):
            pred = self.predict(test_data[i])
            actual = labels[i]
            
            results['Predicted'] += [pred]
            results['Actual'] += [actual]
            
            if(pred != actual):
                misses=misses+1
                
        results['Test Error'] = misses*1.0/len(test_data)
        return results


mat = spio.loadmat('hw1data.mat', squeeze_me=True)
image_matrix=mat['X']
label_array=mat['Y']   

tstart = time.time()
    
gaussian_model= GaussianProbabilisticModel()
gaussian_model.train_model(image_matrix[:1000,:],label_array[:1000])
print('Training complete')
print(gaussian_model.test_model(image_matrix[1001:1101,:],label_array[1001:1101]))

tend = time.time()
print("Total time taken: " + str(tend-tstart))



Training complete
{'Test Error': 0.21, 'Training Size': 1000, 'Testing Size': 100, 'Predicted': [2, 0, 9, 8, 7, 6, 4, 7, 0, 1, 4, 8, 2, 6, 1, 9, 9, 3, 8, 9, 6, 3, 7, 7, 1, 1, 8, 3, 9, 1, 1, 4, 6, 8, 3, 3, 0, 3, 3, 0, 3, 0, 8, 7, 1, 7, 5, 1, 3, 2, 2, 3, 3, 9, 5, 2, 1, 9, 8, 7, 2, 5, 4, 0, 3, 3, 9, 7, 6, 0, 1, 9, 3, 3, 3, 7, 1, 4, 1, 8, 7, 5, 2, 7, 5, 1, 8, 3, 5, 0, 8, 9, 6, 3, 4, 0, 9, 1, 0, 7], 'Actual': [2, 2, 9, 5, 7, 6, 4, 9, 0, 1, 4, 8, 2, 6, 1, 9, 7, 3, 8, 9, 6, 3, 7, 7, 1, 1, 7, 3, 9, 1, 1, 4, 6, 8, 2, 8, 0, 3, 3, 0, 5, 0, 2, 7, 1, 7, 5, 1, 2, 2, 2, 2, 3, 9, 5, 2, 1, 9, 8, 7, 2, 5, 4, 0, 2, 2, 4, 7, 6, 0, 1, 9, 2, 2, 5, 7, 1, 4, 1, 2, 7, 5, 2, 7, 5, 1, 8, 2, 5, 0, 8, 5, 6, 3, 3, 0, 9, 1, 0, 7]}
Total time taken: 12.373589515686035


In [196]:
def gaussian1(x,mu,sigma):
    # calculate f(x)~multivariate Normal density
    rv=stats.multivariate_normal(mean=mu,cov=sigma)
    return rv.pdf(x)
def gaussian2(x,mu,sigma):
    det = np.linalg.det(sigma)
    C = (det)**(-0.5)*(2*math.pi)**(-len(mu.tolist())*0.5)
    x_minus_mu_mat = np.matrix(x-mu).T
    sigma_mat = np.matrix(sigma)
    return np.log(C) + float(-0.5*x_minus_mu_mat.T*sigma_mat.I*x_minus_mu_mat)
    
x = np.array([1,2,6000])
mu = np.array([0.6,0.7,0.4])
sigma = np.array([[1,0.2,0.5],
                  [0.2,1.5,0.4],
                  [0.5,0.4,3.9]])

print(sigma.flatten())
print(np.log(gaussian1(x,mu,sigma)))
print(gaussian2(x,mu,sigma))

[ 1.   0.2  0.5  0.2  1.5  0.4  0.5  0.4  3.9]
-inf
-5014806.3369


