In [103]:
import pickle

import cv2
import numpy as np
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer

In [62]:
def vect_norm_x_image_data(*args):
    norm_vect_data = []
    for index, arg in enumerate(args):
        # vectorize image data by reshaping the axes
        vectorized_shape = (arg.shape[0], arg.shape[1] * arg.shape[2])
        vect_arg = np.array(arg).reshape(vectorized_shape)
        # improve performance by normalizing data
        norm_vect_data.append(Normalizer().transform(vect_arg))
    return norm_vect_data

def unpickle_model(file_name):
    with open(file_name, 'rb') as file:
        return pickle.load(file)

In [111]:
_, (x_test, y_test) = tf.keras.datasets.mnist.load_data(path="mnist.npz")
x_test, = vect_norm_x_image_data(x_test)
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}\n")

x_test shape: (10000, 784)
y_test shape: (10000,)



In [27]:
k_neighbors: KNeighborsClassifier = unpickle_model('kneighbors_digit_model.dat')
y_pred = k_neighbors.predict(x_test)
score = k_neighbors.score(x_test, y_test)
print(f'Score = {score}')
print('Predicted Actual')
print('--------- ------')
for i, prediction in enumerate(y_pred):
    if i == 100:
        break
    print('{0:9} {1:6}'.format(prediction, y_test[i]))

x_test shape: (10000, 784)
y_test shape: (10000,)

Score = 0.9753
Predicted Actual
--------- ------
        7      7
        2      2
        1      1
        0      0
        4      4
        1      1
        4      4
        9      9
        5      5
        9      9
        0      0
        6      6
        9      9
        0      0
        1      1
        5      5
        9      9
        7      7
        3      3
        4      4
        9      9
        6      6
        6      6
        5      5
        4      4
        0      0
        7      7
        4      4
        0      0
        1      1
        3      3
        1      1
        3      3
        4      4
        7      7
        2      2
        7      7
        1      1
        2      2
        1      1
        1      1
        7      7
        4      4
        2      2
        3      3
        5      5
        1      1
        2      2
        4      4
        4      4
        6      6
        3      3
        5      5

In [28]:
random_forest: RandomForestClassifier = unpickle_model('randomforest_digit_model.dat')
y_pred = random_forest.predict(x_test)
score = random_forest.score(x_test, y_test)
print(f'Score = {score}')
print('Predicted Actual')
print('--------- ------')
for i, prediction in enumerate(y_pred):
    if i == 100:
        break
    print('{0:9} {1:6}'.format(prediction, y_test[i]))

Score = 0.9669
Predicted Actual
--------- ------
        7      7
        2      2
        1      1
        0      0
        4      4
        1      1
        4      4
        9      9
        5      5
        9      9
        0      0
        6      6
        9      9
        0      0
        1      1
        5      5
        9      9
        7      7
        3      3
        4      4
        9      9
        6      6
        6      6
        5      5
        4      4
        0      0
        7      7
        4      4
        0      0
        1      1
        3      3
        1      1
        3      3
        4      4
        7      7
        2      2
        7      7
        1      1
        2      2
        1      1
        1      1
        7      7
        4      4
        2      2
        3      3
        5      5
        1      1
        2      2
        4      4
        4      4
        6      6
        3      3
        5      5
        5      5
        6      6
        0      0

In [99]:
from PIL import Image
test_image = "test_3.png"
# 3 for test since worst image; 0 is best image
resized_img = Image.open(test_image).resize((28, 28), resample=0)
resized_img = resized_img.convert('1')  # black and white


In [139]:
def predict(image):
    img = cv2.imread(image, 0)
    img = cv2.resize(img, (28, 28))  # resize: model trained on 28x28
    img = cv2.bitwise_not(img)  # invert black and white

    img = img.reshape(img.shape[0] * img.shape[1])
    img = img/255  # scale 255 to 1
    img = np.array([img])

    pred = random_forest.predict(img)[0]
    pred_prob = random_forest.predict_proba(img)
    print(f'Random forest prediction: {pred}')
    print(f'{pred_prob}\n')

    pred = k_neighbors.predict(img)[0]
    pred_prob = k_neighbors.predict_proba(img)
    print(f'K neighbors prediction: {classes[pred]}')
    print(f'{pred_prob}\n\n')

In [140]:
predict("test_digit.png")
predict("test_3.png")


Image: test_digit.png
Random forest prediction: 1
[[0.01666667 0.23       0.08       0.07166667 0.05833333 0.225
  0.075      0.07333333 0.10166667 0.06833333]]

K neighbors prediction: 1
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


Image: test_3.png
Random forest prediction: 3
[[0.08333333 0.04666667 0.14333333 0.23166667 0.04333333 0.105
  0.025      0.19666667 0.04166667 0.08333333]]

K neighbors prediction: 3
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]




        models = (
            KNeighborsClassifier(
                algorithm='kd_tree',
                leaf_size=10,
                n_jobs=-1,
                n_neighbors=4,
                weights='distance'
            ),
            RandomForestClassifier(
                class_weight='balanced',
                criterion='entropy',
                max_features='log2',
                n_estimators=600,
                n_jobs=-1,
                random_state=cls.random_state,
            ),
        )