In [None]:
# load the data - download in matlab format from:
# https://www.nist.gov/itl/iad/image-group/emnist-dataset
import scipy.io

mat = scipy.io.loadmat('../Dataset/emnist-letters.mat')
d = mat['dataset']

# Each struct level is unwrapped with [0,0] before its field is read.
train_split = d[0,0]['train'][0,0]
test_split = d[0,0]['test'][0,0]

images, labels = train_split['images'], train_split['labels']
test_images, test_labels = test_split['images'], test_split['labels']

# The letter labels are divided into two 13-label subsets; `which_half`
# selects the subset that later cells keep ('1' -> first list, anything
# else -> second list).
first_half = [1,2,3,6,7,9,10,13,16,19,21,23,24]
second_half = [4,5,8,11,12,14,15,17,18,20,22,25,26]

which_half = '1'
half = first_half if which_half == '1' else second_half


In [None]:
import numpy as np
from sklearn.decomposition import PCA

n = images.shape[0]
# format training set: cast to float and divide by 255
# (presumably mapping 8-bit pixel values into [0, 1] - confirm against the dataset)
images_2 = np.asarray(images[0:n], dtype=np.float64) / 255
labels_2 = np.asarray(labels[0:n], dtype=np.int32).ravel()
# Boolean mask of the samples whose label is in the selected half.
# np.isin is vectorized and returns a real boolean array, unlike a
# per-element Python loop producing a list of bools.
training_ind = np.isin(labels_2, half)

# format testing set (same treatment as the training set)
test_images_2 = np.asarray(test_images, dtype=np.float64) / 255
test_labels_2 = np.asarray(test_labels, dtype=np.int32).ravel()
testing_ind = np.isin(test_labels_2, half)

# perform dimensionality reduction
# NOTE: the PCA is fit on every training image (both halves); the
# half-specific filtering is applied only after the transform below.
ncomps = 25  # We'll reduce from 784 dimensions to 25
pca = PCA(n_components=ncomps)
pca.fit(images_2)
# Those 25 dimensions still explain a lot of the variance
# (parenthesized print works on both Python 2 and Python 3)
print('Explained Variance: ' + str(round(100*sum(pca.explained_variance_ratio_))) + '%')

# Project into PCA space, then keep only the rows belonging to the selected half.
images_pca = pca.transform(images_2)[training_ind]
labels_2 = labels_2[training_ind]
test_images_pca = pca.transform(test_images_2)[testing_ind]
test_labels_2 = test_labels_2[testing_ind]

In [None]:
# save pca matrix - This is the matrix that we will use to perform the transformation in the app
import csv
import sys

mat = pca.components_
pca_mat = mat.tolist()

# The csv module needs a binary file on Python 2 but a text file opened
# with newline='' on Python 3; passing a 'wb' handle to csv.writer on
# Python 3 raises TypeError. Pick the mode that matches the interpreter.
if sys.version_info[0] >= 3:
    f = open('../Models/pca_matrix.csv', 'w', newline='')
else:
    f = open('../Models/pca_matrix.csv', 'wb')

# One CSV row per principal component.
with f:
    writer = csv.writer(f)
    writer.writerows(pca_mat)

In [None]:
# learn the model
import time
from sklearn import svm

# Train on at most the first `nsamps` rows of the filtered training set.
nsamps = 20000

# Start the clock after the imports so only model construction and
# fitting are measured.
start = time.time()

# probability=True enables class-probability estimates, which scikit-learn
# fits with an internal randomized cross-validation. Fixing random_state
# makes the fitted (and later exported) model reproducible across runs.
clf = svm.SVC(probability=True, random_state=0)
clf.fit(images_pca[:nsamps], labels_2[:nsamps])

end = time.time()
print( "Time it took to learn the model: " + str(end - start) + " seconds")

In [None]:
# Predict a label for every test sample and report the wall-clock time taken.
start = time.time()
answers = clf.predict(test_images_pca)
end = time.time()

elapsed = end - start
print( "Time it took to test the model: " + str(elapsed) + " seconds")

In [None]:
import numpy as np

cnt = 0  # not used in the cells visible here; kept so any later reference still resolves

# Element-wise comparison of the true labels with the predictions.
s = np.asarray(test_labels_2) == np.asarray(answers)
incorrect = ~s

# Multiply by a float so the percentage is not truncated by integer
# division on Python 2; the value is unchanged on Python 3.
print("Percent Correct: " + str(100.0 * s.sum() / s.size) + "%")

In [None]:
import coremltools

# Feature names used for the converted model, and where it is written.
input_name = 'convertedPCAValues'
output_name = 'letterIndex'
model_path = '../Models/letters_pca_half_' + which_half + '.mlmodel'

# Convert the fitted scikit-learn classifier into a Core ML model.
coreml_model = coremltools.converters.sklearn.convert(
    clf, input_name, output_feature_names=output_name)

# Model-level metadata.
coreml_model.author = 'Kate Bonnen and Conrad Stoll'
coreml_model.license = 'MIT'
coreml_model.short_description = "Recognize the hand-drawn letter from an input image."

# Human-readable descriptions for the input and output features.
coreml_model.input_description[input_name] = 'The input image alpha values multiplied by a PCA matrix.'
coreml_model.output_description[output_name] = 'Most likely letter index, ranging from 1 to 26.'
coreml_model.output_description['classProbability'] = 'The probability of each letter index.'

coreml_model.save(model_path)