In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml

In [2]:
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist['data'], mnist['target']
y = y.astype(np.uint8)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [3]:
# 1
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
# prepared data
# full_pipeline = Pipeline([
#         ('std_scaler', StandardScaler())
# ])

# X_train_prepared = full_pipeline.fit_transform(X_train.astype(np.float64))
# X_test_prepared = full_pipeline.transform(X_test.astype(np.float64))
# print('End Prepared')
#
# StandardScale을 하면 정확도가 더 낮아짐

# hyper parameter tunning
param_grid = {
    'weights': ['distance'], # ['uniform', 'distance'],
    'n_neighbors': [4]       # [3, 4, 5] 시간 절약
}

print('Start Search')
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=1, n_jobs=-1)
print('End Search, Start Fit')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
#

# model prediction
final_model = grid_search.best_estimator_

final_predictions = final_model.predict(X_test)
#

# print result
print('Test set accuracy: ', metrics.accuracy_score(y_test, final_predictions))

# from sklearn.metrics import confusion_matrix
# print(confusion_matrix(y_test, final_predictions))

from sklearn.metrics import classification_report
print(classification_report(y_test, final_predictions))

# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(knn_clf, X_test_prepared, y_test, cv =5, n_jobs=-1)
# print('Model accuracy: ', np.sqrt(-scores))
#

Start Search
End Search, Start Fit
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 13.0min finished


{'n_neighbors': 4, 'weights': 'distance'}
0.9716166666666666
Test set accuracy:  0.9714
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.97      1.00      0.98      1135
           2       0.98      0.96      0.97      1032
           3       0.97      0.96      0.97      1010
           4       0.98      0.97      0.97       982
           5       0.96      0.97      0.96       892
           6       0.98      0.99      0.98       958
           7       0.96      0.97      0.96      1028
           8       0.99      0.94      0.97       974
           9       0.96      0.96      0.96      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000



In [4]:
import joblib

joblib.dump(final_model, 'mnist_model.pkl')

['mnist_model.pkl']

In [5]:
# 2
from scipy.ndimage.interpolation import shift

def shift_image(image, dx, dy):
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0, mode='constant')
    return shifted_image.reshape([-1])

In [7]:
X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

In [8]:
shuffle_idx = np.random.permutation(len(X_train_augmented))
print(shuffle_idx)
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

[125412  74095 107370 ...  57100   3377 146030]


In [15]:
# a = np.array([1, 2, 3, 4, 5])
# s = np.random.permutation(len(a))
# print(a)
# print(s)
# a_s = a[s]
# print(a_s)

[1 2 3 4 5]
[0 2 4 3 1]
[1 3 5 4 2]


In [16]:
final_model.fit(X_train_augmented, y_train_augmented)

y_pred = final_model.predict(X_test)
print('Test set accuracy: ', metrics.accuracy_score(y_test, final_predictions))

Test set accuracy:  0.9714
