In [1]:
import numpy as np
import struct
from numpy import linalg as LA

In [2]:
def getData(trainOrTest = 'testing'):
    """Read one image/label split from ./Data and pair each image with its label.

    Args:
        trainOrTest: "training" selects the trainSet* files; any other value
            (including the default 'testing') selects the testSet* files.

    Returns:
        A list of (image, label) tuples. Each image is a 2-D uint8 array of
        shape (rows, cols) taken from the image-file header; each label is a
        numpy int8 scalar.

    NOTE(review): the layout read here (big-endian header followed by raw
    bytes) looks like the MNIST IDX format -- confirm against the data source.
    """
    if trainOrTest == "training":
        image_path, label_path = "./Data/trainSetImg", "./Data/trainSetLab"
    else:
        image_path, label_path = "./Data/testSetImg", "./Data/testSetLab"

    # Label file: 8-byte big-endian header (two uint32, not used further),
    # then one signed byte per label.
    with open(label_path, 'rb') as label_file:
        struct.unpack(">II", label_file.read(8))
        labels = np.fromfile(label_file, dtype=np.int8)

    # Image file: 16-byte big-endian header (four uint32; the last two give the
    # image height and width), then one unsigned byte per pixel.
    with open(image_path, 'rb') as image_file:
        header = struct.unpack(">IIII", image_file.read(16))
        pixels = np.fromfile(image_file, dtype=np.uint8)

    height, width = header[2], header[3]
    # The image count comes from the label file, not from the image header.
    images = pixels.reshape(len(labels), height, width)

    return list(zip(images, labels))

In [3]:
# Load both splits once; each is a list of (image, label) pairs.
trainingData = getData(trainOrTest='training')
testingData = getData(trainOrTest='testing')

In [4]:
def getAccuracy(list1, list2):
    """Return the fraction of positions at which two sequences agree.

    Args:
        list1: sequence of predicted values.
        list2: sequence of expected values, compared position by position
            with ``list1`` using ``==``.

    Returns:
        A float in [0, 1]: the number of equal pairs divided by the length.
        Returns 0.0 when both sequences are empty (previously this raised
        ZeroDivisionError). When the lengths differ, the string
        'Error: sizes are not the same' is returned instead of a number;
        this legacy behavior is kept so existing callers are unaffected.
    """
    if len(list1) != len(list2):
        # Legacy contract: signal a size mismatch with a message, not an exception.
        return 'Error: sizes are not the same'
    if len(list1) == 0:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    matches = sum(1 for predicted, expected in zip(list1, list2) if predicted == expected)
    return matches / len(list1)

In [5]:
# Method 1: Always guess 0
def predictZero(data):
    """Baseline predictor: return a list with one 0 per element of ``data``."""
    return [0 for _ in range(len(data))]

# Score the constant-zero baseline against the true test labels.
accuracy = getAccuracy(
    predictZero(testingData),
    [label for _, label in testingData],
)
print(f'Accuracy when always predicting 0: {accuracy}')

Accuracy when always predicting 0: 0.098


In [6]:
# Method 2: "Average darkness" (find the number whose "average darkness" is closest to the test example)

def avgDark(imageAndLab, numClasses=10):
    """Compute the mean total pixel intensity ("darkness") for each class.

    Args:
        imageAndLab: iterable of (image, label) pairs. ``image`` is any
            array-like of numbers; ``label`` is an integer used as an index
            in ``range(numClasses)``.
        numClasses: number of distinct labels; defaults to 10.

    Returns:
        A list of ``numClasses`` floats. Entry ``k`` is the sum of all pixel
        values over every image labelled ``k``, divided by the number of such
        images. A class with no examples gets ``float('inf')`` so that a
        nearest-darkness lookup never selects it (previously this case raised
        ZeroDivisionError).
    """
    totals = [0.0] * numClasses
    counts = [0] * numClasses
    for digit in imageAndLab:
        img = digit[0]
        character = digit[1]
        counts[character] += 1
        # Sum in float64: adding uint8 scalars one at a time can wrap modulo 256
        # (NumPy >= 2 keeps uint8 when mixed with Python ints) and is far slower.
        totals[character] += float(np.sum(img, dtype=np.float64))
    return [
        total / count if count else float('inf')
        for total, count in zip(totals, counts)
    ]

In [7]:
def avgDarkPred(model, testingDigits):
    """Predict a class for each image by nearest average darkness.

    Args:
        model: sequence of per-class average darkness values, indexed by
            class.
        testingDigits: iterable of images (array-likes of numbers).

    Returns:
        A list with one predicted class index per image: the index of the
        ``model`` entry whose value is closest to the image's total pixel sum.
        Ties go to the lowest index.
    """
    predictions = []
    for digit in testingDigits:
        # Sum in float64 so uint8 pixel values cannot wrap around while adding.
        digitDark = float(np.sum(digit, dtype=np.float64))
        darkDiff = [abs(digitDark - classDark) for classDark in model]
        predictions.append(darkDiff.index(min(darkDiff)))
    return predictions

In [8]:
# Fit the average-darkness model on the training pairs: one mean pixel sum per class.
avgDarkModel = avgDark(trainingData)

In [9]:
# Predict on the test images, then compare against the true test labels.
model2Preds = avgDarkPred(avgDarkModel, [image for image, _ in testingData])
model2Acc = getAccuracy(model2Preds, [label for _, label in testingData])
print(f'Accuracy when average darkness: {model2Acc}')

Accuracy when average darkness: 0.223


In [10]:
# Method 3: "Nearest neighbor" (find the single training example that is closest to the test example in Euclidean distance)

In [11]:
# Euclidean distance between two points (or two images of the same shape)
def distance(point1, point2):
    """Return the Euclidean distance between two equally-shaped array-likes.

    Args:
        point1: array-like of numbers.
        point2: array-like of numbers, broadcast-compatible with ``point1``.

    Returns:
        The 2-norm of ``point1 - point2`` taken over all elements.
    """
    # Cast to float64 before subtracting: with unsigned dtypes such as uint8,
    # a negative difference would wrap around (e.g. 0 - 3 -> 253) and corrupt
    # the distance.
    difference = np.asarray(point1, dtype=np.float64) - np.asarray(point2, dtype=np.float64)
    return LA.norm(difference)

In [None]:
def nneighbor(trainingData, testDataImgs):
    """Classify each test image with the label of its nearest training image.

    Args:
        trainingData: sequence of (image, label) pairs; all images must share
            one shape.
        testDataImgs: iterable of images with the same shape as the training
            images.

    Returns:
        A list with one entry per test image: the label of the training
        example at the smallest Euclidean distance. Ties go to the earliest
        training example.

    Raises:
        ValueError: if ``trainingData`` is empty.
    """
    trainingImages = [digit[0] for digit in trainingData]
    if not trainingImages:
        raise ValueError('nneighbor requires at least one training example')

    # One flattened float64 row per training image. The float cast matters:
    # subtracting uint8 pixel arrays directly would wrap around on negative
    # differences.
    trainingMatrix = np.asarray(trainingImages, dtype=np.float64).reshape(len(trainingImages), -1)

    predictedDigits = []
    for image in testDataImgs:
        query = np.asarray(image, dtype=np.float64).reshape(-1)
        # Distance from this image to every training image in one vectorized
        # step (np.vectorize would instead broadcast over individual pixels).
        distances = np.linalg.norm(trainingMatrix - query, axis=1)
        indexOfSmallest = int(np.argmin(distances))
        predictedDigits.append(trainingData[indexOfSmallest][1])
    return predictedDigits

In [None]:
# Run nearest-neighbor on the test images, then compare against the true test labels.
model3Preds = nneighbor(trainingData, [image for image, _ in testingData])
model3Acc = getAccuracy(model3Preds, [label for _, label in testingData])
print(f'Accuracy with nearest neighbor: {model3Acc}')