# Various sklearn models

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import os
from mlxtend.data import loadlocal_mnist
from sklearn.metrics import accuracy_score, f1_score

# Data locations, resolved relative to the notebook's working directory.
feature_dir = os.path.join(os.getcwd(), '..', 'feature_representations')
emnist_dir = os.path.join(os.getcwd(), '..', 'emnist')

# Precomputed feature representation of the training and test splits.
Xtrain = np.load(os.path.join(feature_dir, 'feature_representation_1_train.npy'))
Xtest = np.load(os.path.join(feature_dir, 'feature_representation_1_test.npy'))

# loadlocal_mnist returns (images, labels). The raw images are kept under
# XorigTrain / XorigTest instead of being discarded: the optional downsampling
# cell at the end of the notebook reads those names, and throwing the images
# away here left them undefined after Restart & Run All.
XorigTrain, ytrain = loadlocal_mnist(
    images_path=os.path.join(emnist_dir, 'emnist-letters-train-images-idx3-ubyte'),
    labels_path=os.path.join(emnist_dir, 'emnist-letters-train-labels-idx1-ubyte'))
XorigTest, ytest = loadlocal_mnist(
    images_path=os.path.join(emnist_dir, 'emnist-letters-test-images-idx3-ubyte'),
    labels_path=os.path.join(emnist_dir, 'emnist-letters-test-labels-idx1-ubyte'))

In [6]:
# Sanity check: features and labels of each split should have matching row counts.
tuple(arr.shape for arr in (Xtrain, ytrain, Xtest, ytest))

((124800, 18), (124800,), (20800, 18), (20800,))

I have noticed better performance so far when the last 10 features are removed, i.e. when only the first 8 of the 18 features are kept.

In [8]:
# Keep only the first 8 feature columns in both splits (the same columns must be
# used for training and testing), then display the resulting shapes.
Xtrain, Xtest = Xtrain[:, :8], Xtest[:, :8]
tuple(arr.shape for arr in (Xtrain, ytrain, Xtest, ytest))

((124800, 8), (124800,), (20800, 8), (20800,))

# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each test sample.
k = 3
# fit() returns the estimator itself, so construction and training can be chained.
model = KNeighborsClassifier(n_neighbors=k).fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [14]:
# Score the current predictions (yhat) against the test labels; the F1 score is
# macro-averaged, i.e. the unweighted mean of the per-class F1 scores.
acc, f1 = accuracy_score(ytest, yhat), f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}', f'F1 score is {f1}', sep='\n')

Accuracy is 0.4948076923076923
F1 score is 0.49930297849493993


# Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the fitted
# estimator, so training is chained onto construction.
model = GaussianNB().fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [16]:
# Score the current predictions (yhat) against the test labels; the F1 score is
# macro-averaged, i.e. the unweighted mean of the per-class F1 scores.
acc, f1 = accuracy_score(ytest, yhat), f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}', f'F1 score is {f1}', sep='\n')

Accuracy is 0.13557692307692307
F1 score is 0.10424439913171545


# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap resampling and random feature
# selection at each split), so an unseeded model gives different scores on every
# run. Fixing random_state makes the reported accuracy / F1 reproducible; scores
# from an earlier unseeded run may differ slightly from the seeded ones.
model = RandomForestClassifier(random_state=0)
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [18]:
# Score the current predictions (yhat) against the test labels; the F1 score is
# macro-averaged, i.e. the unweighted mean of the per-class F1 scores.
acc, f1 = accuracy_score(ytest, yhat), f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}', f'F1 score is {f1}', sep='\n')

Accuracy is 0.3870673076923077
F1 score is 0.37035153358816303


# Support Vector Machine

This may take a little while to run...

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings; fit() returns the fitted
# estimator, so training is chained onto construction.
model = SVC().fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [None]:
# Score the current predictions (yhat) against the test labels; the F1 score is
# macro-averaged, i.e. the unweighted mean of the per-class F1 scores.
acc, f1 = accuracy_score(ytest, yhat), f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}', f'F1 score is {f1}', sep='\n')

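#### Optional: play with the feature representation of a flattened, downsampled (8×8) image. Note that this overwrites `Xtrain` and `Xtest`, so re-run the model cells above to evaluate it.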

In [20]:
from PIL import Image


def downsample_images(images, side=28, target=8):
    """Turn flattened square images into flattened, downsampled feature rows.

    Parameters
    ----------
    images : array of shape (n_samples, side * side)
        One flattened image per row.
    side : int
        Edge length of the original square images.
    target : int
        Edge length of the downsampled images.

    Returns
    -------
    numpy.ndarray of shape (n_samples, target * target)
        Row i is the bicubically resized image i, flattened.
    """
    features = np.zeros((images.shape[0], target * target))
    for i in range(images.shape[0]):
        # ndarray.reshape returns a new array rather than reshaping in place, so
        # the result has to be kept. Without the assignment the 1-D row was
        # handed to PIL, which did not see it as a side x side picture.
        picture = images[i, :].reshape(side, side)
        resized = Image.fromarray(picture).resize((target, target), Image.BICUBIC)
        features[i, :] = np.array(resized).flatten()
    return features


# NOTE(review): XorigTrain / XorigTest must hold the raw image arrays (first
# return value of loadlocal_mnist) - confirm the loading cell binds these names.
# The output row count comes from the images themselves, not from whatever
# Xtrain / Xtest currently hold.
Xtrain = downsample_images(XorigTrain)
Xtest = downsample_images(XorigTest)

Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape

((124800, 64), (124800,), (20800, 64), (20800,))