# Mustererkennung Aufgabe 3
Yu He, Remi

In [1]:
import numpy as np

In [2]:
# function to load the data and preprocess the data
def read_preprocess(filename):
    f = open(filename)
    raw_data = f.readlines()
    
    point_lst = []
    for ele in raw_data:
        point = []
        for value in ele.split():
            point.append(float(value))
        point_lst.append(point)
        
    return np.array(point_lst)

In [3]:
filename = './zip.train'
training_array = read_preprocess(filename)

In [4]:
filename = './zip.test'
testing_array = read_preprocess(filename)

In [5]:
train_digit_three = training_array[:,0] - 3 == 0
train_digit_five = training_array[:,0] - 5 == 0
train_digit_seven = training_array[:,0] - 7 == 0
train_digit_eight = training_array[:,0] - 8 == 0

In [6]:
train_dict = {3:train_digit_three, 5:train_digit_five, 7:train_digit_seven, 8:train_digit_eight}

In [7]:
test_digit_three = testing_array[:,0] - 3 == 0
test_digit_five = testing_array[:,0] - 5 == 0
test_digit_seven = testing_array[:,0] - 7 == 0
test_digit_eight = testing_array[:,0] - 8 == 0

In [8]:
test_dict = {3:test_digit_three, 5:test_digit_five, 7:test_digit_seven, 8:test_digit_eight}

In [9]:
def gen_trainset(digit_one, digit_two, train):
    digit_one_array = train[train_dict[digit_one]]
    digit_two_array = train[train_dict[digit_two]]
    train_set = np.concatenate((digit_one_array, digit_two_array), axis = 0)
    X = train_set[:,1:]
    Y = train_set[:,0]
   
    num_data = X.shape[0]
    bias = np.ones((num_data,1))
    X = np.concatenate((bias, X), axis = 1)

    return X, Y


def gen_testset(digit_one, digit_two, test):
    digit_one_array = test[test_dict[digit_one]]
    digit_two_array = test[test_dict[digit_two]]
    test_set = np.concatenate((digit_one_array, digit_two_array), axis = 0)  
    X = test_set[:,1:]
    Y = test_set[:,0]
    num_data = X.shape[0]
    bias = np.ones((num_data,1))
    X = np.concatenate((bias, X), axis = 1)   
    
    return X,Y


# Gaussian classifier

Function predict_one() takes only one test data point as input. So we need to use loop, if we want to obtain the accuracy of the model using the entire test data set.

Function predict() takes the entire test data set as input. Consider the situation that we have a N * M matrix as the test set. N represents the number of data points, M represents the number of features. So the covariance matrix will be in shape of M * M. The result of $ (X - \mu)*\Sigma^{-1}*(X - \mu)^{T} $ will be a N * N matrix. Each value at the diagnoal of the matrix is the scalar we need for calculating the probability.

function predict_one() was use for only one data point in the test set. If it is used for many data points, loop and list shall be used for accuracy calculation, which also means it will take more time.

In [10]:
class gaussian_classifier:
    def fit(self, X, y):
        self.class_one = np.unique(y)[0]
        self.class_two = np.unique(y)[1]
        
        class_one_X = (X[y==self.class_one])
        class_two_X = (X[y==self.class_two])
        
        self.covariance1 = np.cov(class_one_X, rowvar = False)
        self.covariance2 = np.cov(class_two_X, rowvar = False)
                
        self.mean1 = np.mean(class_one_X, axis = 0)
        self.mean2 = np.mean(class_two_X, axis = 0)
        
    def predict_one(self, x):
        factor1 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance1))
        factor2 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance2))
        
        index1 = -1/2 * np.dot(np.dot(x - self.mean1,np.linalg.pinv(self.covariance1)), (x - self.mean1).T)
        index2 = -1/2 * np.dot(np.dot(x - self.mean2,np.linalg.pinv(self.convariance2)), (x - self.mean2).T)
        
        prob1 = factor1*np.exp(index1)
        prob2 = factor2*np.exp(index2)
        
        if prob1 >= prob2:
            return self.class_one
        else:
            return self.class_two
        
    def predict(self, X):
        factor1 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance1))
        factor2 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance2))
        
        index1 = -1/2 * np.dot(np.dot(X - self.mean1,np.linalg.pinv(self.covariance1)), (X - self.mean1).T)
        index2 = -1/2 * np.dot(np.dot(X - self.mean2,np.linalg.pinv(self.covariance2)), (X - self.mean2).T)
        
        prob1 = factor1*np.exp(np.diag(index1))
        prob2 = factor2*np.exp(np.diag(index2))
        
        self.result = np.zeros((X.shape[0]))
        self.result[prob1 - prob2 >= 0] = self.class_one
        self.result[prob1 - prob2 < 0] = self.class_two
        
    def multifit(self, X,y):
        self.class_one = np.unique(y)[0]
        self.class_two = np.unique(y)[1]
        self.class_three = np.unique(y)[2]
        self.class_four = np.unique(y)[3]
        
        class_one_X = (X[y==self.class_one])
        class_two_X = (X[y==self.class_two])
        class_three_X = (X[y==self.class_three])
        class_four_X = (X[y==self.class_four])
        
        self.covariance1 = np.cov(class_one_X, rowvar = False)
        self.covariance2 = np.cov(class_two_X, rowvar = False)
        self.covariance3 = np.cov(class_three_X, rowvar = False)
        self.covariance4 = np.cov(class_four_X, rowvar = False)

        self.mean1 = np.mean(class_one_X, axis = 0)
        self.mean2 = np.mean(class_two_X, axis = 0)
        self.mean3 = np.mean(class_three_X, axis = 0)
        self.mean4 = np.mean(class_four_X, axis = 0)

    def multipredit(self, X):
        factor1 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance1))
        factor2 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance2))
        factor3 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance3))
        factor4 = 1.0/ np.sqrt(np.linalg.norm(2*np.pi*self.covariance4))

        
        
        index1 = -1/2 * np.dot(np.dot(X - self.mean1,np.linalg.pinv(self.covariance1)), (X - self.mean1).T)
        index2 = -1/2 * np.dot(np.dot(X - self.mean2,np.linalg.pinv(self.covariance2)), (X - self.mean2).T)
        index3 = -1/2 * np.dot(np.dot(X - self.mean3,np.linalg.pinv(self.covariance3)), (X - self.mean3).T)
        index4 = -1/2 * np.dot(np.dot(X - self.mean4,np.linalg.pinv(self.covariance4)), (X - self.mean4).T)
        
        prob1 = factor1*np.exp(np.diag(index1)).reshape(1,X.shape[0])
        prob2 = factor2*np.exp(np.diag(index2)).reshape(1,X.shape[0])
        prob3 = factor2*np.exp(np.diag(index3)).reshape(1,X.shape[0])
        prob4 = factor2*np.exp(np.diag(index4)).reshape(1,X.shape[0])
                
        prob = np.concatenate((prob1, prob2, prob3, prob4), axis = 0)
                
        self.result = np.argmax(prob, axis = 0)
        
        # we need to put self.result[self.result == 0] = self.class_one at the end,
        # because if it is first executed, it will have conflict with digit 8.
        # all the values==3 in matrix result will finally be 8 at last

        self.result[self.result == 1] = self.class_two
        self.result[self.result == 2] = self.class_three
        self.result[self.result == 3] = self.class_four
        self.result[self.result == 0] = self.class_one

    def score(self, y):
        return sum(self.result - y == 0)/len(y)

# digit 3,5

In [11]:
X_train, Y_train = gen_trainset(3,5, training_array)
X_test, Y_test = gen_testset(3,5, testing_array)

In [12]:
gc = gaussian_classifier()

In [13]:
gc.fit(X_train, Y_train)

In [14]:
gc.predict(X_test)

In [15]:
gc.score(Y_test)

0.92331288343558282

# digit 3,7

In [16]:
X_train, Y_train = gen_trainset(3, 7, training_array)
X_test, Y_test = gen_testset(3,7 ,testing_array)
gc = gaussian_classifier()
gc.fit(X_train, Y_train)
gc.predict(X_test)
gc.score(Y_test)

0.92971246006389774

# digit 3, 8

In [17]:
X_train, Y_train = gen_trainset(3, 8, training_array)
X_test, Y_test = gen_testset(3,8 ,testing_array)
gc = gaussian_classifier()
gc.fit(X_train, Y_train)
gc.predict(X_test)
gc.score(Y_test)

0.84337349397590367

# digit 5,7

In [18]:
X_train, Y_train = gen_trainset(5, 7, training_array)
X_test, Y_test = gen_testset(5, 7 ,testing_array)
gc = gaussian_classifier()
gc.fit(X_train, Y_train)
gc.predict(X_test)
gc.score(Y_test)

0.9315960912052117

# digit 5, 8

In [19]:
X_train, Y_train = gen_trainset(5, 8, training_array)
X_test, Y_test = gen_testset(5, 8,testing_array)
gc = gaussian_classifier()
gc.fit(X_train, Y_train)
gc.predict(X_test)
gc.score(Y_test)

0.87423312883435578

# digit 7, 8

In [20]:
X_train, Y_train = gen_trainset(7, 8, training_array)
X_test, Y_test = gen_testset(7, 8,testing_array)
gc = gaussian_classifier()
gc.fit(X_train, Y_train)
gc.predict(X_test)
gc.score(Y_test)

0.83706070287539935

# digit 3,5,7,8

In [21]:
X_train1, Y_train1 = gen_trainset(3,5, training_array)
X_test1, Y_test1 = gen_testset(3,5, testing_array)

In [22]:
X_train2, Y_train2 = gen_trainset(7,8, training_array)
X_test2, Y_test2 = gen_testset(7,8, testing_array)

In [23]:
X_train = np.concatenate((X_train1,X_train2), axis = 0)
X_test = np.concatenate((X_test1, X_test2), axis = 0)
Y_train = np.concatenate((Y_train1, Y_train2), axis = 0)
Y_test = np.concatenate((Y_test1, Y_test2), axis = 0)

In [24]:
gc = gaussian_classifier()
gc.multifit(X_train, Y_train)
gc.multipredit(X_test)
gc.score(Y_test)

0.83881064162754304