In [1]:
import struct
import numpy as np
from sklearn import svm, metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.fromstring(f.read(), dtype=np.uint8).reshape(shape)

In [3]:
raw_train = read_idx("./data/train-images.idx3-ubyte")
train_data = np.reshape(raw_train, (60000, 28*28))
train_label = read_idx("./data/train-labels.idx1-ubyte")

raw_test = read_idx("./data/t10k-images.idx3-ubyte")
test_data = np.reshape(raw_test, (10000, 28*28))
test_label = read_idx("./data/t10k-labels.idx1-ubyte")

  """


In [4]:
## Standardizing the features
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(train_data)
train_std = sc.transform(train_data)
test_std = sc.transform(test_data)

In [5]:
## x = data, y = target

idx = (train_label == 0) |(train_label == 1) |(train_label == 2) |(train_label == 3) |(train_label == 4) | (train_label == 5) | (train_label == 6) | (train_label == 7) | (train_label == 8) | (train_label == 9)
X = train_std[idx] / 255.0
Y = train_label[idx]

In [6]:
clf = svm.SVC(C=5, kernel='linear')
clf.fit(X,Y)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [7]:
pred = clf.predict(test_std)
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_label, pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       980
           1       0.72      0.99      0.83      1135
           2       0.95      0.82      0.88      1032
           3       0.90      0.87      0.89      1010
           4       0.87      0.95      0.91       982
           5       0.95      0.75      0.84       892
           6       0.90      0.97      0.93       958
           7       0.84      0.93      0.89      1028
           8       0.96      0.66      0.78       974
           9       0.95      0.80      0.87      1009

    accuracy                           0.88     10000
   macro avg       0.89      0.87      0.87     10000
weighted avg       0.89      0.88      0.87     10000



In [None]:
# cross validation AND gridsearch
# 파라메터 후보
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              }

results = GridSearchCV(clf, param_grid).fit(X,Y)



In [None]:
# 최종 모델 성능 점검
idx = (test_label == 0) |(test_label == 1) |(test_label == 2) |(test_label == 3) |(test_label == 4) | (test_label == 5) | (test_label == 6) | (test_label == 7) | (test_label == 8) | (test_label == 9)
x_test_grd = test_data[idx] / 255.0
y_true_grd = test_label[idx]
y_pred_grd = result.predict(x_test)

In [None]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=cmap_data)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['class', 'group']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, 100])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
cm = metrics.confusion_matrix(y_true, y_pred_grd)
plot_confusion_matrix(cm, ["0","1","2","3","4","5","6","7","8","9"], normalize=False)