In [1]:
import numpy as np
from distances import euclidean_distance, cosine_distance
from nearest_neighbors import KNNClassifier
from cross_validation import kfold, knn_cross_val_score

In [2]:
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', return_X_y=True)
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

### Эксперимент №1

**Условие**: исследовать, какой алгоритм поиска ближайших соседей будет быстрее работать в различных ситуациях. Для каждого объекта тестовой выборки найти 5 его ближайших соседей в обучающей для евклидовой метрики. Для выборки нужно выбрать подмножество признаков, по которому будет считаться расстояние, размера 10, 20, 100 (подмножество признаков выбирается один раз для всех объектов, случайно). Необходимо проверить все алгоритмы поиска ближайших соседей, указанные в спецификации к заданию.

In [14]:
# Генератор случайных номеров признаков
a = np.random.permutation(X.shape[1])[:10]
b = np.random.permutation(X.shape[1])[:20]
c = np.random.permutation(X.shape[1])[:100]

In [15]:
X_tr_a, X_t_a = X_train[:, a], X_test[:, a]
X_tr_b, X_t_b = X_train[:, b], X_test[:, b]
X_tr_c, X_t_c = X_train[:, c], X_test[:, c]

In [16]:
# Время работы алгоритма 'my_own'
import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='my_own', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_a, y_train)
cl.find_kneighbors(X_t_a, return_distance=False)
time = (time.time() - start_time)
print('number of features is 10,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='my_own', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_b, y_train)
cl.find_kneighbors(X_t_b, return_distance=False)
time = (time.time() - start_time)
print('number of features is 20,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='my_own', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_c, y_train)
cl.find_kneighbors(X_t_c, return_distance=False)
time = (time.time() - start_time)
print('number of features is 100,', 'time is',"%s seconds" % time)

number of features is 10, time is 240.01704907417297 seconds
number of features is 20, time is 316.8139147758484 seconds
number of features is 100, time is 346.03232979774475 seconds


In [20]:
# Время работы алгоритма 'brute'
import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='brute', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_a, y_train)
cl.find_kneighbors(X_t_a, return_distance=False)
time = (time.time() - start_time)
print('number of features is 10,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='brute', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_b, y_train)
cl.find_kneighbors(X_t_b, return_distance=False)
time = (time.time() - start_time)
print('number of features is 20,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='brute', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_c, y_train)
cl.find_kneighbors(X_t_c, return_distance=False)
time = (time.time() - start_time)
print('number of features is 100,', 'time is',"%s seconds" % time)

number of features is 10, time is 15.89961290359497 seconds
number of features is 20, time is 15.501868963241577 seconds
number of features is 100, time is 18.60097908973694 seconds


In [13]:
# Время работы алгоритма 'kd_tree'
import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='kd_tree', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_a, y_train)
cl.find_kneighbors(X_t_a, return_distance=False)
time = (time.time() - start_time)
print('number of features is 10,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='kd_tree', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_b, y_train)
cl.find_kneighbors(X_t_b, return_distance=False)
time = (time.time() - start_time)
print('number of features is 20,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='kd_tree', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_c, y_train)
cl.find_kneighbors(X_t_c, return_distance=False)
time = (time.time() - start_time)
print('number of features is 100,', 'time is',"%s seconds" % time)

number of features is 10, time is 2.6283137798309326 seconds
number of features is 20, time is 16.898250818252563 seconds
number of features is 100, time is 154.25878810882568 seconds


In [14]:
# Время работы алгоритма 'ball_tree'
import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='ball_tree', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_a, y_train)
cl.find_kneighbors(X_t_a, return_distance=False)
time = (time.time() - start_time)
print('number of features is 10,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='ball_tree', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_b, y_train)
cl.find_kneighbors(X_t_b, return_distance=False)
time = (time.time() - start_time)
print('number of features is 20,', 'time is',"%s seconds" % time)

import time
start_time = time.time()
cl = KNNClassifier(k=5, strategy='ball_tree', metric='euclidean', weights = False, test_block_size=1)
cl.fit(X_tr_c, y_train)
cl.find_kneighbors(X_t_c, return_distance=False)
time = (time.time() - start_time)
print('number of features is 100,', 'time is',"%s seconds" % time)

number of features is 10, time is 12.218386173248291 seconds
number of features is 20, time is 41.74462270736694 seconds
number of features is 100, time is 155.46788477897644 seconds


### Эксперимент №2

**Условие**: оценить по кросс-валидации с 3 фолдами точность (долю правильно предсказанных ответов) и время работы k ближайших соседей в зависимости от следующих факторов:
* (a) k от 1 до 10 (только влияние на точность).
* (b) Используется евклидова или косинусная метрика.

In [53]:
# (a)
k_lst = [i for i in range(1, 11)]
knn_cross_val_score(X_train, y_train.astype(np.int), k_list=k_lst,
                    score='accuracy', cv=None, k=5,
                    strategy='brute', metric='euclidean',
                    weights=False, test_block_size=1)

{1: array([0.96895, 0.96675, 0.9667 ]),
 2: array([0.9603 , 0.95995, 0.96125]),
 3: array([0.96955, 0.96825, 0.96715]),
 4: array([0.96705, 0.9667 , 0.9671 ]),
 5: array([0.9681, 0.9673, 0.9672]),
 6: array([0.96555, 0.96495, 0.9657 ]),
 7: array([0.9652, 0.965 , 0.9656]),
 8: array([0.96415, 0.9639 , 0.96525]),
 9: array([0.9637 , 0.9632 , 0.96455]),
 10: array([0.96245, 0.9622 , 0.9637 ])}

In [4]:
# (b)
import time
start_time = time.time()
knnClassifier1 = KNNClassifier(k=5, strategy='brute', metric='euclidean',
                               weights=False, test_block_size=1)
knnClassifier1.fit(X_train, y_train)
knnClassifier1.find_kneighbors(X_test, return_distance=False)
time = (time.time() - start_time)
print('euclidean metric,', 'time is', "%s seconds" % time)
print('score --', knn_cross_val_score(X_train, y_train.astype(np.int),
      k_list=[5], score='accuracy', cv=None, k=5, strategy='brute',
      metric='euclidean', weights=False, test_block_size=1).values())

import time
start_time = time.time()
knnClassifier2 = KNNClassifier(k=5, strategy='brute', metric='cosine',
                               weights=False, test_block_size=1)
knnClassifier2.fit(X_train, y_train)
knnClassifier2.find_kneighbors(X_test, return_distance=False)
time = (time.time() - start_time)
print('cosine metric,', 'time is', "%s seconds" % time)
print('score --', knn_cross_val_score(X_train, y_train.astype(np.int),
      k_list=[5], score='accuracy', cv=None, k=5, strategy='brute',
      metric='cosine', weights=False, test_block_size=1).values())

euclidean metric, time is 28.817243099212646 seconds
score -- dict_values([array([0.9681, 0.9673, 0.9672])])
cosine metric, time is 34.93546104431152 seconds
score -- dict_values([array([0.97275, 0.97015, 0.9716 ])])


### Эксперимент №3

**Условие**: сравнить взвешенный метод k ближайших соседей, где голос объекта равен 1/(distance + ε), где ε — 10−5, с методом без весов при тех же фолдах и параметрах.

In [72]:
print('unweighted method: score --', knn_cross_val_score(X_train,
      y_train.astype(np.int),
      k_list=[5], score='accuracy', cv=None, k=5, strategy='brute',
      metric='euclidean', weights=False, test_block_size=1).values())

import time
start_time = time.time()
knnClassifier1 = KNNClassifier(k=5, strategy='brute', metric='euclidean',
                               weights=False, test_block_size=1)
knnClassifier1.fit(X_train, y_train)
knnClassifier1.find_kneighbors(X_test, return_distance=False)
time = (time.time() - start_time)
print('time is', "%s seconds" % time)

print('weighted method: score --', knn_cross_val_score(X_train,
      y_train.astype(np.int),
      k_list=[5], score='accuracy', cv=None, k=5, strategy='brute',
      metric='euclidean', weights=True, test_block_size=1).values())

import time
start_time = time.time()
knnClassifier2 = KNNClassifier(k=5, strategy='brute', metric='euclidean',
                               weights=True, test_block_size=1)
knnClassifier2.fit(X_train, y_train)
knnClassifier2.find_kneighbors(X_test, return_distance=True)
time = (time.time() - start_time)
print('time is', "%s seconds" % time)

unweighted method: score -- dict_values([array([0.9681, 0.9673, 0.9672])])
time is 29.51373314857483 seconds
weighted method: score -- dict_values([array([0.96925, 0.96825, 0.96875])])
time is 30.208980083465576 seconds


### Эксперимент №4

**Условие**: применить лучший алгоритм к исходной обучающей и тестовой выборке. Подсчитать точность. Сравнить с точностью по кросс-валидации с 3 фолдами. Сравнить с указанной в интернете точностью лучших алгоритмов на данной выборке. Построить и проанализировать матрицу ошибок (confusion matrix). Визуализировать несколько объектов из тестовой выборки, на которых были допущены ошибки. Проанализировать и указать их общие черты.

In [23]:
knnClassifier = KNNClassifier(k=3, strategy='brute', metric='cosine',
                               weights=True, test_block_size=1)
knnClassifier.fit(X_train, y_train)
y_pred_train = knnClassifier.predict(X_train)
y_pred_test = knnClassifier.predict(X_test)

n_correct = np.sum(y_pred_train == y_train.astype(np.int))
accuracy = n_correct / y_train.shape[0]
print('train predict accuracy:', accuracy)

n_correct = np.sum(y_pred_test == y_test.astype(np.int))
accuracy = n_correct / y_test.shape[0]
print('test predict accuracy:', accuracy)

train predict accuracy: 1.0
test predict accuracy: 0.9742


In [26]:
print('score --', knn_cross_val_score(X_train,
      y_train.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.97495, 0.9725 , 0.97175])])


#### Точность лучших алгоритмов на данной выборке
На сайте MNIST(http://yann.lecun.com/exdb/mnist/) можно найти список лучших результатов, достигнутых алгоритмами на это наборе данных. Так, худший из записанных результатов достигнут простым линейным классификатором (12% ошибок), а подавляющее большинство лучших результатов получены алгоритмами на основе нейронных сетей. Так, ансамбль из 35 сверточных нейронных сетей в 2012 году сумел получить всего 0.23% ошибок на наборе данных, что является очень хорошим результатом, вполне сравнимым с человеком: classifier - committee of 35 conv.net, 1-20-P-40-P150-10 (elastic distortions).

In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_pred_train.astype(np.str))

array([[5923,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, 6742,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0, 5958,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 6131,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 5842,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0, 5421,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 5918,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 6265,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0, 5851,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 5949]])

In [28]:
confusion_matrix(y_test, y_pred_test.astype(np.str))

array([[ 977,    1,    0,    0,    0,    0,    1,    1,    0,    0],
       [   0, 1130,    3,    1,    0,    0,    1,    0,    0,    0],
       [   8,    1, 1009,    1,    1,    0,    0,    8,    4,    0],
       [   0,    1,    3,  975,    0,   13,    0,    5,    7,    6],
       [   2,    2,    0,    0,  946,    0,    6,    1,    1,   24],
       [   4,    0,    0,    9,    1,  859,    7,    1,    6,    5],
       [   4,    2,    0,    0,    2,    3,  947,    0,    0,    0],
       [   1,   10,    6,    0,    1,    0,    0,  996,    0,   14],
       [   6,    2,    2,   10,    4,    4,    4,    3,  936,    3],
       [   8,    7,    2,    5,    7,    3,    1,    6,    3,  967]])

In [29]:
np.array(np.where((y_pred_test==9) & (y_test=='4')))

array([[ 115,  571,  707,  740,  774,  881, 1178, 1242, 1422, 2130, 2447,
        2771, 2901, 3405, 3490, 4017, 4194, 4598, 4860, 4911, 5936, 8520,
        8527, 9211]])

In [46]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
%config InlineBackend.figure_format = 'pdf'

label = y_test[4911]

# The rest of columns are pixels
pixels = X_test[4911]

# Make those columns into a array of 8-bits pixels
# This array will be of 1D with length 784
# The pixel intensity values are integers from 0 to 255
pixels = np.array(pixels, dtype='uint8')

# Reshape the array into 28 x 28 array (2-dimensional array)
pixels = pixels.reshape((28, 28))

# Plot
plt.title('Label is {label}'.format(label=label))
plt.imshow(pixels, cmap="Greys")
plt.show()

<Figure size 432x288 with 1 Axes>

In [33]:
cl = ['0', '1', '2','3','4','5','6','7','8','9']

In [65]:
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.winter):

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=20)
    plt.yticks(tick_marks, classes, fontsize=20)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", 
                 color="white" if cm[i, j] < thresh else "black", fontsize=20)

    plt.tight_layout()
    plt.xlabel('Predicted label', fontsize=20)

    return plt
cm = confusion_matrix(y_test, y_pred_test.astype(np.str))
fig = plt.figure(figsize=(10, 10))
plot = plot_confusion_matrix(cm, classes=cl, normalize=False, title='Confusion matrix')
plt.savefig('line.pdf')

<Figure size 720x720 with 2 Axes>

### Эксперимент №5

**Условие**: Размножить обучающую выборку с помощью поворотов, смещений и применений гауссовского фильтра. Разрешается использовать библиотеки для работы с изображениями. Подобрать по кросс-валидации с 3 фолдами параметры преобразований. Рассмотреть следующие параметры для преобразований и их комбинации:
* (a) Величина поворота: 5, 10, 15 (в каждую из двух сторон)
* (b) Величина смещения: 1, 2, 3 пикселя (по каждой из двух размерностей)
* (c) Дисперсия фильтра Гаусса: 0.5, 1, 1.5

In [3]:
# (a)
import scipy
from scipy.ndimage import rotate


X_tr_a5 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_a5[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_a5[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), 5, reshape = False, prefilter=False).reshape(784,)
y_tr_2x = np.zeros(120000)
for i in range(60000):
    y_tr_2x[i] = y_train[i]
for i in range(60000, 120000):
    y_tr_2x[i] = y_train[i-60000]
print('score a(5) --', knn_cross_val_score(X_tr_a5, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score a(5) -- dict_values([array([0.99815 , 0.996675, 0.9943  ])])


In [4]:
X_tr_a10 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_a10[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_a10[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), 10, reshape = False, prefilter=False).reshape(784,)
print('score a(10) --', knn_cross_val_score(X_tr_a10, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score a(10) -- dict_values([array([0.985025, 0.98505 , 0.980375])])


In [5]:
X_tr_a15 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_a15[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_a15[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), 15, reshape = False, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_a15, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.972275, 0.9781  , 0.973275])])


In [6]:
X_tr_a5_r = np.zeros((120000, 784))
for i in range(60000):
    X_tr_a5_r[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_a5_r[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), -5, reshape = False, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_a5_r, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.998075, 0.996425, 0.9942  ])])


In [7]:
X_tr_a10_r = np.zeros((120000, 784))
for i in range(60000):
    X_tr_a10_r[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_a10_r[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), -10, reshape = False, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_a10_r, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.983575, 0.9845  , 0.98005 ])])


In [8]:
X_tr_a15_r = np.zeros((120000, 784))
for i in range(60000):
    X_tr_a15_r[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_a15_r[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), -15, reshape = False, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_a15_r, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.96985 , 0.97705 , 0.973425])])


In [9]:
# (b)
from scipy.ndimage import shift


X_tr_b1 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_b1[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_b1[i] = scipy.ndimage.shift(X_train[i-60000].reshape((28, 28)), 1, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_b1, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.96995 , 0.978125, 0.97105 ])])


In [10]:
X_tr_b2 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_b2[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_b2[i] = scipy.ndimage.shift(X_train[i-60000].reshape((28, 28)), 2, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_b2, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.96475, 0.97545, 0.96865])])


In [11]:
X_tr_b3 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_b3[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_b3[i] = scipy.ndimage.shift(X_train[i-60000].reshape((28, 28)), 3, prefilter=False).reshape(784,)
print('score --', knn_cross_val_score(X_tr_b3, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.96465, 0.97505, 0.9686 ])])


In [17]:
# (c)
import scipy


X_tr_c1 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_c1[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_c1[i] = scipy.ndimage.filters.gaussian_filter(X_train[i-60000].reshape((28, 28)), 0.5).reshape(784,)
print('score --', knn_cross_val_score(X_tr_c1, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([1., 1., 1.])])


In [13]:
X_tr_c2 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_c2[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_c2[i] = scipy.ndimage.filters.gaussian_filter(X_train[i-60000].reshape((28, 28)), 1).reshape(784,)
print('score --', knn_cross_val_score(X_tr_c2, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.9985  , 0.993425, 0.986175])])


In [18]:
X_tr_c3 = np.zeros((120000, 784))
for i in range(60000):
    X_tr_c3[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_c3[i] = scipy.ndimage.filters.gaussian_filter(X_train[i-60000].reshape((28, 28)), 1.5).reshape(784,)
print('score --', knn_cross_val_score(X_tr_c3, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())

score -- dict_values([array([0.989   , 0.98075 , 0.967625])])


In [20]:
X_tr_all = np.zeros((120000, 784))
for i in range(60000):
    X_tr_all[i] = X_train[i]
for i in range(60000, 120000):
    X_tr_all[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), 5, reshape = False, prefilter=False).reshape(784,)
    X_tr_all[i] = scipy.ndimage.shift(X_train[i-60000].reshape((28, 28)), 1, prefilter=False).reshape(784,)
    X_tr_all[i] = scipy.ndimage.filters.gaussian_filter(X_train[i-60000].reshape((28, 28)), 0.5).reshape(784,)
print('score --', knn_cross_val_score(X_tr_all, y_tr_2x.astype(np.int),
      k_list=[3], score='accuracy', cv=None, k=3, strategy='brute',
      metric='cosine', weights=True, test_block_size=1).values())  

score -- dict_values([array([1., 1., 1.])])


In [None]:
knnClassifier = KNNClassifier(k=3, strategy='brute', metric='cosine',
                               weights=True, test_block_size=1)
knnClassifier.fit(X_tr_all, y_tr_2x.astype(np.int))
y_pred_test = knnClassifier.predict(X_test)

In [25]:
n_correct = np.sum(y_pred_test == y_test.astype(np.int))
accuracy = n_correct / y_test.shape[0]
print('train predict accuracy:', accuracy)

train predict accuracy: 0.9751


In [26]:
X_tr_all = np.zeros((120000, 784))
for i in range(60000):
    X_tr_all[i] = X_train[i]
for i in range(60000, 80000):
    X_tr_all[i] = scipy.ndimage.rotate(X_train[i-60000].reshape((28, 28)), 5, reshape = False, prefilter=False).reshape(784,)
for i in range(80000, 100000):
    X_tr_all[i] = scipy.ndimage.shift(X_train[i-60000].reshape((28, 28)), 1, prefilter=False).reshape(784,)
for i in range(100000, 120000):
    X_tr_all[i] = scipy.ndimage.filters.gaussian_filter(X_train[i-60000].reshape((28, 28)), 0.5).reshape(784,)

In [28]:
knnClassifier = KNNClassifier(k=3, strategy='brute', metric='cosine',
                               weights=True, test_block_size=1)
knnClassifier.fit(X_tr_all, y_tr_2x.astype(np.int))
y_pred_test = knnClassifier.predict(X_test)
n_correct = np.sum(y_pred_test == y_test.astype(np.int))
accuracy = n_correct / y_test.shape[0]
print('train predict accuracy:', accuracy)

train predict accuracy: 0.9766


In [32]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_test.astype(np.str))

array([[ 978,    0,    0,    0,    0,    0,    0,    1,    1,    0],
       [   0, 1128,    3,    0,    0,    1,    3,    0,    0,    0],
       [   7,    0, 1006,    3,    1,    0,    1,   10,    4,    0],
       [   0,    0,    2,  989,    1,    6,    0,    4,    5,    3],
       [   0,    1,    0,    0,  948,    0,    5,    4,    2,   22],
       [   5,    1,    0,   11,    1,  857,    9,    1,    3,    4],
       [   4,    2,    0,    0,    2,    2,  946,    0,    2,    0],
       [   0,   11,    7,    0,    3,    0,    0, 1001,    0,    6],
       [   5,    0,    3,   11,    3,    7,    3,    4,  935,    3],
       [   3,    4,    1,    4,    3,    3,    1,    9,    3,  978]])

### Эксперимент №6

**Условие**: Реализовать описанный выше алгоритм, основанный на преобразовании объектов тестовой выборки. Проверить то же самое множество параметров, что и в предыдущем пункте. Проанализировать как изменилась матрица ошибок, какие ошибки алгоритма помогает исправить каждое преобразование. Качественно сравнить два подхода (5 и 6 пункты) между собой.
Замечание. В рамках данного эксперимента подразумевается обучение модели на оригинальном датаесете, преобразования объектов тестовой выборки, применение модели к преобразованным копиям изображения из тестовой выборки и получение результата путем голосования среди преобразованных объектов.

In [104]:
X_tests_a = np.zeros((10000, 784))
for i in range(10000):
    X_tests_a[i] = scipy.ndimage.rotate(X_test[i].reshape((28, 28)), 5, reshape = False, prefilter=False).reshape(784,)
X_tests_b = np.zeros((10000, 784))
for i in range(10000):
    X_tests_b[i] = scipy.ndimage.shift(X_test[i].reshape((28, 28)), 1, prefilter=False).reshape(784,)
X_tests_c = np.zeros((10000, 784))
for i in range(10000):
    X_tests_c[i] = scipy.ndimage.filters.gaussian_filter(X_test[i].reshape((28, 28)), 0.5).reshape(784,)

In [105]:
knnClassifier = KNNClassifier(k=3, strategy='brute', metric='cosine',
                               weights=True, test_block_size=1)
knnClassifier.fit(X_train, y_train.astype(int))

In [106]:
y_test_a = knnClassifier.predict(X_tests_a)
y_test_b = knnClassifier.predict(X_tests_b)
y_test_c = knnClassifier.predict(X_tests_c)

In [108]:
Y_T = np.zeros((3, 10000))
Y_T[0], Y_T[1], Y_T[2] = y_test_a, y_test_b, y_test_c

In [109]:
res = []
for i in Y_T.T:
    values, counts = np.unique(i, return_counts=True)
    ind = np.argmax(counts)
    res.append(values[ind])
y_res = np.array(res)

In [111]:
n_correct = np.sum(y_res == y_test.astype(np.int))
accuracy = n_correct / y_test.shape[0]
print('train predict accuracy:', accuracy)

train predict accuracy: 0.9715


In [151]:
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.cubehelix):

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=20)
    plt.yticks(tick_marks, classes, fontsize=20)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", 
                 color="white" if cm[i, j] < thresh else "black", fontsize=20)

    plt.tight_layout()
    plt.xlabel('Predicted label', fontsize=20)

    return plt
cm = confusion_matrix(y_test, y_res.astype(np.int).astype(np.str))
fig = plt.figure(figsize=(10, 10))
plot = plot_confusion_matrix(cm, classes=cl, normalize=False, title='Confusion matrix')
plt.savefig('final.pdf')

<Figure size 720x720 with 2 Axes>