In [1]:
import numpy as np
import os

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import joblib

In [2]:
DATA_PATH = '../data'
IMAGE_SIZE = 784  # 28 * 28

In [3]:
def load_mnist(path, kind='train'):
    """
    'train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
    't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz'
    Before use, you need to download the above four files to the `path` directory and unzip them
    """
    labels_path = os.path.join(path, '%s-labels.idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images.idx3-ubyte' % kind)

    with open(labels_path, 'rb') as label_file:
        labels = np.frombuffer(label_file.read(), dtype=np.uint8, offset=8)

    with open(images_path, 'rb') as image_file:
        images = np.frombuffer(image_file.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), IMAGE_SIZE)

    return images, labels

In [4]:
# Read MNIST dataset
x_train, y_train = load_mnist(DATA_PATH, kind='train')
# x_test, y_test = load_mnist(DATA_PATH, kind='t10k')

In [5]:
# Feature Engineering: Standardization
transfer5 = StandardScaler()
x_train = transfer5.fit_transform(x_train)

In [6]:
# x_train = x_train[:1000]
# y_train = y_train[:1000]

In [7]:
# SVM classifier
svm_model5 = SVC(kernel='poly', max_iter=5000, gamma='scale', probability=True)

In [8]:
# Grid search and cross validation
param_dict = {
    'C': [20, 22, 24, 26, 28, 30, 32],
     # 'kernel': ['linear', 'rbf', 'poly'],
     # 'gamma': ['scale', 'auto']
}
svm_model5 = GridSearchCV(svm_model5, param_dict, n_jobs=-1, cv=2)

In [9]:
# Train model
svm_model5.fit(x_train, y_train)



In [10]:
# Results of hyperparameter tuning on training data
print("Best parameters: \n", svm_model5.best_params_)
print("Best results (results in the validation set): \n", svm_model5.best_score_)
print("Best estimator: \n", svm_model5.best_estimator_)
print("Cross-validation results.: \n", svm_model5.cv_results_)

Best parameters: 
 {'C': 30}
Best results (results in the validation set): 
 0.9724666666666666
Best estimator: 
 SVC(C=30, kernel='poly', max_iter=5000, probability=True)
Cross-validation results.: 
 {'mean_fit_time': array([2173.97616136, 2196.6376009 , 2176.38098454]), 'std_fit_time': array([17.55495226, 13.71304584, 22.25641298]), 'mean_score_time': array([333.80479848, 326.42078912, 333.04039383]), 'std_score_time': array([3.50720131, 5.80744255, 4.49837041]), 'param_C': masked_array(data=[28, 30, 32],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 20}, {'C': 22}, {'C': 24}, {'C': 26}, {'C': 28}, {'C': 30}, {'C': 32}], 'split0_test_score': array([0.9726    , 0.97283333, 0.97283333]), 'split1_test_score': array([0.97213333, 0.9721    , 0.97206667]), 'mean_test_score': array([0.97236667, 0.97246667, 0.97245   ]), 'std_test_score': array([0.00023333, 0.00036667, 0.00038333]), 'rank_test_score': array([3, 1, 2])}


In [11]:
# Save model
joblib.dump(svm_model5, '../models/svm_model5.pkl')
# Save StandardScaler
joblib.dump(transfer5, '../models/transfer5.pkl')

['../models/transfer5.pkl']