In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
train_digits = pd.read_csv("data/mnist_train.csv")
test_digits = pd.read_csv("data/mnist_test.csv")
#convert pandas Data Frame to Numpy Array
train_array = train_digits.to_numpy()
test_array = test_digits.to_numpy()

In [3]:
def show_digit(sample):
    """
    shows digit
    """
    img = test_array[sample-1, 1:]
    img.shape = (28,28)
    plt.imshow(img, 'gray')

In [4]:
def avg_digit_img(dat, digit):
    """
    compute average intensities of all rows describing the same number
    imshow as output
    """

    list_digit = []
    for i in range(0, dat.shape[0]):
        if dat[i, 0] == digit:
            list_digit.append(i)

    avg = np.zeros((1,784))
    
    for j in range(0, len(list_digit)):
        avg += dat[list_digit[j], 1:]
    
    avg.shape = (28,28)
    avg /= len(list_digit)

    plt.imshow(avg, 'gray')
    plt.colorbar()


In [5]:
def avg_digit_arr(dat, digit):
    """
    computes average intensities of all rows describing the same number
    returns numpy array
    """
   
    list_digit = []
    for i in range(0, dat.shape[0]):
        if dat[i, 0] == digit:
            list_digit.append(i)

    avg = np.zeros((1,784))

    for j in range(0, len(list_digit)):
        avg += dat[list_digit[j], 1:]

    avg /= len(list_digit)

    return avg

In [6]:
def digit_recognition(sample):
    """
    compare sample image with averaged images pixel by pixel
    select digit with lowest difference in intensity (output)
    """
    intensities_list = [] 
    sample_img = test_array[sample-1, 1:]

    for i in range(0,10):
        diff_arr = sample_img - avg_digit_arr(train_array, i)
        
        diff_list = []
        for j in range(0, 784):
            diff_list.append(diff_arr[0, j])

        intensity_sum = 0
        for k in range(0, len(diff_list)):
            diff_list[k] = diff_list[k]**2
            diff_list[k] = np.sqrt(diff_list[k])
            intensity_sum += diff_list[k]

        intensities_list.append(intensity_sum)

    return intensities_list.index(min(intensities_list))

In [7]:
def return_label(sample):
    """
    can be used for validation
    """
    return test_array[sample-1, 0]

In [9]:
avg_list = []

for i in range(0,10):
    avg_list.append(avg_digit_arr(train_array, i))

def digit_recognition_fast(sample):
    """
    faster digit recognition because avg images are stored at the beginning and not calculated each time
    """
    intensities_list = [] 
    sample_img = test_array[sample-1, 1:] 

    for i in range(0,10):
        diff_arr = sample_img - avg_list[i]
        
        diff_list = []
        for j in range(0, 784):
            diff_list.append(diff_arr[0, j])

        intensity_sum = 0
        for k in range(0, len(diff_list)):
            diff_list[k] = diff_list[k]**2
            diff_list[k] = np.sqrt(diff_list[k])
            intensity_sum += diff_list[k]

        intensities_list.append(intensity_sum)

    return intensities_list.index(min(intensities_list))

In [11]:
#Überprüfung digit recognition mit methode vergleich zu avg images

true = 0
false = 0

#for i in range(0, test_array.shape[0]):
if digit_recognition_fast(i+1) == test_array[i, 0]:
    true += 1
else:
    false += 1

print(f'Anzahl richtig erkannter Digits: {true} \n\
Anzahl falsch erkannter Digits: {false} \n\
Richtig: {true/test_array.shape[0]*100} Prozent')
#66,85 Prozent

Anzahl richtig erkannter Digits: 1 
Anzahl falsch erkannter Digits: 0 
Richtig: 0.010001000100010001 Prozent


In [12]:
#plots for project proposal

#plt.hist(train_array[:,0], rwidth = 0.9)
#plt.xlabel('digits')
#plt.ylabel('counts')
#plt.title('training dataset')


In [13]:
def digit_rec_comp(sample):
    """
    nice output for digit recognition comparison
    """
    
    print(f'Prediction: {digit_recognition(sample)}\n\
Real digit: {return_label(sample)}')
    show_digit(sample)

In [14]:
#Z-Transformation
#Druchschnittsberechnung

mean_pixels = np.zeros((1, 784))
for i in range(0, train_array.shape[0]):
    mean_pixels += train_array[i, 1:train_array.shape[1]]
mean_pixels /= train_array.shape[0]

#Standardabweichung
std_pixels = np.zeros((1, 784))
for i in range(1, train_array.shape[1]):
    std_pixels[0, i-1] = np.std(train_array[:, i])

#print(std_pixels)

#Erstellen z-array
z_array = np.zeros(train_array.shape)

for i in range(0, train_array.shape[0]):
    z_array[i, 0] = train_array[i, 0]
    for j in range(1, train_array.shape[1]):
        if std_pixels[0, j-1] != 0:
            z_array[i, j] = (train_array[i, j] - mean_pixels[0, j-1])/std_pixels[0, j-1]

#print(z_array)


In [16]:
#kNN

def kNN(img, k=5):
    counter = [0,0,0,0,0,0,0,0,0,0]
    max_indices = []

    M = img.shape[0]
    N = train_array.shape[0]

    img_dots = (img*img).sum(axis=1).reshape((M,1))*np.ones(shape=(1,N))
    train_array_dots = (train_array*train_array).sum(axis=1)*np.ones(shape=(M,1))
    D_squared =  img_dots + train_array_dots -2*img.dot(train_array.T)
    dist = D_squared.tolist()

    small_dist = sorted(range(len(dist)), key = lambda sub: dist[sub])[:k]

    for j in range(0, len(small_dist)):
        counter[train_array[small_dist[j],0]] += 1

    for k in range(0, len(counter)):
        if counter[k] == max(counter):
            max_indices.append(k)

    if len(max_indices) == 1:
        return max_indices[0]

    else:
        return 10

In [None]:
def distance_matrix(A, B, squared=False):
    """
    Compute all pairwise distances between vectors in A and B.

    Parameters
    ----------
    A : np.array
        shape should be (M, K)
    B : np.array
        shape should be (N, K)

    Returns
    -------
    D : np.array
        A matrix D of shape (M, N).  Each entry in D i,j represnets the
        distance between row i in A and row j in B.

    See also
    --------
    A more generalized version of the distance matrix is available from
    scipy (https://www.scipy.org) using scipy.spatial.distance_matrix,
    which also gives a choice for p-norm.
    """
    M = A.shape[0]
    N = B.shape[0]

    assert A.shape[1] == B.shape[1], f"The number of components for vectors in A \
        {A.shape[1]} does not match that of B {B.shape[1]}!"

    A_dots = (A*A).sum(axis=1).reshape((M,1))*np.ones(shape=(1,N))
    B_dots = (B*B).sum(axis=1)*np.ones(shape=(M,1))
    D_squared =  A_dots + B_dots -2*A.dot(B.T)

    if squared == False:
        zero_mask = np.less(D_squared, 0.0)
        D_squared[zero_mask] = 0.0
        return np.sqrt(D_squared)

    return D_squared