# Mustererkennung Aufgabe 2

In [1]:
from collections import Counter
import numpy as np
import pandas as pd

In [2]:
# function to load the data and preprocess the data
def read_preprocess(filename):
    f = open(filename)
    raw_data = f.readlines()
    
    point_lst = []
    for ele in raw_data:
        point = []
        for value in ele.split():
            point.append(float(value))
        point_lst.append(point)
        
    return np.array(point_lst)

In [3]:
filename = './zip.train'
training_array = read_preprocess(filename)

In [4]:
filename = './zip.test'
testing_array = read_preprocess(filename)

# for getting digit 3,5,7,8

In [5]:
train_digit_three = training_array[:,0] - 3 == 0
train_digit_five = training_array[:,0] - 5 == 0
train_digit_seven = training_array[:,0] - 7 == 0
train_digit_eight = training_array[:,0] - 8 == 0

In [6]:
train_dict = {3:train_digit_three, 5:train_digit_five, 7:train_digit_seven, 8:train_digit_eight}

In [7]:
test_digit_three = testing_array[:,0] - 3 == 0
test_digit_five = testing_array[:,0] - 5 == 0
test_digit_seven = testing_array[:,0] - 7 == 0
test_digit_eight = testing_array[:,0] - 8 == 0

In [8]:
test_dict = {3:test_digit_three, 5:test_digit_five, 7:test_digit_seven, 8:test_digit_eight}

In [9]:
def gen_trainset(digit_one, digit_two, train):
    digit_one_array = train[train_dict[digit_one]]
    digit_two_array = train[train_dict[digit_two]]
    train_set = np.concatenate((digit_one_array, digit_two_array), axis = 0)
    X = train_set[:,1:]
    Y = train_set[:,0]
   
    num_data = X.shape[0]
    bias = np.ones((num_data,1))
    X = np.concatenate((bias, X), axis = 1)

    return X, Y


def gen_testset(digit_one, digit_two, test):
    digit_one_array = test[test_dict[digit_one]]
    digit_two_array = test[test_dict[digit_two]]
    test_set = np.concatenate((digit_one_array, digit_two_array), axis = 0)  
    X = test_set[:,1:]
    Y = test_set[:,0]
    num_data = X.shape[0]
    bias = np.ones((num_data,1))
    X = np.concatenate((bias, X), axis = 1)   
    
    return X,Y


# classifier definition

In [10]:
class linear_regression:        
    def fit(self, X,y):
        inverse = np.linalg.inv(np.dot(X.T, X))
        self.W = np.dot(np.dot(inverse, X.T),y)
        
        self.bigger = np.sort(np.unique(y))[1]
        self.smaller = np.sort(np.unique(y))[0]
        self.mean = np.mean(np.unique(y))
        
    def predict(self, X):
        self.predictions = np.dot(X,self.W)
        self.predictions[self.predictions - self.mean >= 0] = self.bigger
        self.predictions[self.predictions - self.mean < 0 ] = self.smaller
        
        return self.predictions
    
    def score(self, Y):
        return np.mean(self.predictions - Y == 0)

# classifier for digit 3,5

In [11]:
X_train, Y_train = gen_trainset(3,5, training_array)

In [12]:
X_test, Y_test = gen_testset(3,5, testing_array)

In [13]:
lr = linear_regression()

In [14]:
lr.fit(X_train, Y_train)

In [15]:
lr.predict(X_test)

array([ 3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,
        3.,  3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,  3.,  3.,  5.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        5.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  5.,  5.,
        5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5

In [16]:
lr.score(Y_test)

0.92944785276073616

# classifier for digit 3,7

In [17]:
X_train, Y_train = gen_trainset(3,7, training_array)
X_test, Y_test = gen_testset(3,7, testing_array)
lr = linear_regression()
lr.fit(X_train, Y_train)
lr.predict(X_test)
lr.score(Y_test)

0.9744408945686901

# classifier for digit 3,8

In [18]:
X_train, Y_train = gen_trainset(3,8, training_array)
X_test, Y_test = gen_testset(3,8, testing_array)
lr = linear_regression()
lr.fit(X_train, Y_train)
lr.predict(X_test)
lr.score(Y_test)

0.95180722891566261

# classifier for digit 5,7

In [19]:
X_train, Y_train = gen_trainset(5,7, training_array)
X_test, Y_test = gen_testset(5,7, testing_array)
lr = linear_regression()
lr.fit(X_train, Y_train)
lr.predict(X_test)
lr.score(Y_test)

0.98371335504885993

# classifier for digit 5,8

In [20]:
X_train, Y_train = gen_trainset(5,8, training_array)
X_test, Y_test = gen_testset(5,8, testing_array)
lr = linear_regression()
lr.fit(X_train, Y_train)
lr.predict(X_test)
lr.score(Y_test)

0.96319018404907975

# classifier for digit 7,8

In [22]:
X_train, Y_train = gen_trainset(7,8, training_array)
X_test, Y_test = gen_testset(7,8, testing_array)
lr = linear_regression()
lr.fit(X_train, Y_train)
lr.predict(X_test)
lr.score(Y_test)

0.46964856230031948