In [1]:
from sklearn.datasets import fetch_openml

# Download (or load from the local scikit-learn cache) version 1 of the
# "mnist_784" dataset from OpenML.
mnist = fetch_openml(name="mnist_784", version=1)

In [2]:
# Inspect which fields the fetched dataset object exposes.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [2]:
# Pull the feature matrix and the label vector out of the dataset object.
X = mnist["data"]
y = mnist["target"]

In [3]:
import numpy as np

# Cast the labels to an unsigned-integer dtype so they can be compared with
# integer digits (e.g. `y_train == 5`) further down.
# NOTE(review): presumably the OpenML targets load as strings — confirm.
y = y.astype(np.uint)

In [4]:
# Positional split: the first 60,000 rows are used for training and everything
# after that for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [6]:
# Binary targets for a "5 vs. not-5" detector: True wherever the label is 5.
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Two-step pipeline: standardise the features, then classify with an
# RBF-kernel SVM (gamma=0.1, C=1000).
rbf_kernel_svm_clf = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", gamma=0.1, C=1000)),
    ]
)

In [8]:
# Small 1,000-sample subset (features plus matching binary labels) so the
# SVM fits quickly.
X_train_1, y_train_5_1 = X_train[:1000], y_train_5[:1000]

In [18]:
# Fit the scaler + RBF SVM pipeline on the 1,000-sample subset, using the
# binary "is this digit a 5?" labels.
rbf_kernel_svm_clf.fit(X_train_1, y_train_5_1)

Pipeline(steps=[('scalar', StandardScaler()),
                ('svm_clf', SVC(C=1000, gamma=0.1))])

In [22]:
# Predict on the full training split. This includes the 1,000 samples the
# pipeline was fitted on, so the result is not a pure held-out evaluation.
predictions = rbf_kernel_svm_clf.predict(X_train)

In [24]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_train_5, y_pred=predictions)

array([[54579,     0],
       [ 5329,    92]], dtype=int64)

In [28]:
# Class balance of the 1,000-sample subset: how many negatives vs. positives.
unique, count = np.unique(y_train_5_1, return_counts=True)
{label: occurrences for label, occurrences in zip(unique, count)}

{False: 908, True: 92}

In [30]:
from sklearn.model_selection import GridSearchCV

# Coarse logarithmic sweep over the SVM's C and gamma. Keys follow the
# pipeline's "<step name>__<parameter>" convention.
param_grid = [
    {
        'svm_clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'svm_clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    },
]

# 3-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
gs_clf = GridSearchCV(
    estimator=rbf_kernel_svm_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)

In [31]:
# Run the grid search on the 1,000-sample subset with its matching binary
# "is this digit a 5?" labels (`y_train_5_1`). The results are later scored
# against `y_train_5`, so the search must be trained on the same binary target.
gs_clf.fit(X_train_1, y_train_5_1)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scalar', StandardScaler()),
                                       ('svm_clf', SVC(C=1000, gamma=0.1))]),
             n_jobs=-1,
             param_grid=[{'svm_clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'svm_clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001]}])

In [38]:
# Mean cross-validated score of the best parameter combination found.
gs_clf.best_score_

0.9639939340538143

In [34]:
# Parameter combination that achieved the best cross-validated score.
gs_clf.best_params_

{'svm_clf__C': 100, 'svm_clf__gamma': 0.001}

In [35]:
# Predict on the full training split with the search's best estimator. This
# again includes the 1,000 samples that were used for the search itself.
predictions = gs_clf.predict(X_train)

In [36]:
# Confusion matrix for the tuned model: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=y_train_5, y_pred=predictions)

array([[54202,   377],
       [ 1505,  3916]], dtype=int64)

In [37]:
from sklearn.metrics import precision_score, recall_score, f1_score
# F1 score (harmonic mean of precision and recall) of the tuned 5-detector.
f1_score(y_true=y_train_5, y_pred=predictions)

0.8062590076178711

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Rebuild the pipeline with gamma=0.001 and C=100. verbose=True makes the
# underlying SVM implementation print progress while fitting.
rbf_kernel_svm_clf = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", gamma=0.001, C=100, verbose=True)),
    ]
)

In [14]:
# Fit on the first 10,000 training samples. Note the targets here are the
# digit labels in `y_train`, not the binary `y_train_5` used earlier.
rbf_kernel_svm_clf.fit(X_train[:10000], y_train[:10000])

[LibSVM]

Pipeline(steps=[('scalar', StandardScaler()),
                ('svm_clf', SVC(C=100, gamma=0.001, verbose=True))])

In [16]:
# Predict on the remaining training samples (index 10,000 onward), which were
# not part of the slice the pipeline was just fitted on.
predictions = rbf_kernel_svm_clf.predict(X_train[10000:])

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Fraction of correctly classified digits on the samples not used for fitting.
accuracy_score(y_true=y_train[10000:], y_pred=predictions)

0.94534

In [19]:
from sklearn.model_selection import GridSearchCV

# Finer search grid that includes the earlier best setting (C=100,
# gamma=0.001). Both lists follow a 1-3-6 progression per decade; the gamma
# list descends from 1e-2 to 1e-4.
param_grid = [
    {
        'svm_clf__C': [1, 3, 6, 10, 30, 60, 100],
        'svm_clf__gamma': [0.01, 0.006, 0.003, 0.001, 0.0006, 0.0003, 0.0001],
    },
]

# 3-fold cross-validated grid search run in parallel; verbose=2 prints
# progress because the search takes a long time.
gs_clf = GridSearchCV(rbf_kernel_svm_clf, param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the search on the first 10,000 training samples with the digit labels.
# This is slow: 49 parameter combinations x 3 folds = 147 SVM fits.
gs_clf.fit(X_train[:10000], y_train[:10000])

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.1min


In [21]:
# Parameter combination that achieved the best cross-validated score in the
# finer search.
gs_clf.best_params_

{'svm_clf__C': 10, 'svm_clf__gamma': 0.0006}

In [22]:
# Mean cross-validated score of the best parameter combination in the finer
# search.
gs_clf.best_score_

0.8439847032661404