# What to do
Train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?

## Load the MNIST dataset

In [1]:
import numpy as np
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata("MNIST original")
X = mnist["data"]
y = mnist["target"]

#train test split
X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

# shuffle the train and test sets*
np.random.seed(42)
rnd_idx = np.random.permutation(60000)
X_train = X_train[rnd_idx]
y_train = y_train[rnd_idx]

## Scale the dataset

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

## Train the model and print its accuracy

In [5]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

lin_clf = LinearSVC(C = 1, random_state = 42)

lin_clf.fit(X_train, y_train)


predictions = lin_clf.predict(X_train)

print('Accuracy on training set: ',accuracy_score(y_train, predictions))

Accuracy on training set:  0.843866666667


# Using a kernel

In [8]:
from sklearn.svm import SVC

svm_clf = SVC(decision_function_shape="ovr")
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])

y_pred = svm_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.94615000000000005

# grid search to find the best hyperparameters

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2)
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=8.85231605842, gamma=0.00176607465048 .........................
[CV] .......... C=8.85231605842, gamma=0.00176607465048, total=   0.9s
[CV] C=8.85231605842, gamma=0.00176607465048 .........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV] .......... C=8.85231605842, gamma=0.00176607465048, total=   0.9s
[CV] C=8.85231605842, gamma=0.00176607465048 .........................
[CV] .......... C=8.85231605842, gamma=0.00176607465048, total=   1.0s
[CV] C=1.82719601047, gamma=0.00636473705545 .........................
[CV] .......... C=1.82719601047, gamma=0.00636473705545, total=   1.0s
[CV] C=1.82719601047, gamma=0.00636473705545 .........................
[CV] .......... C=1.82719601047, gamma=0.00636473705545, total=   1.1s
[CV] C=1.82719601047, gamma=0.00636473705545 .........................
[CV] .......... C=1.82719601047, gamma=0.00636473705545, total=   1.1s
[CV] C=9.87519919377, gamma=0.0513498334519 ..........................
[CV] ........... C=9.87519919377, gamma=0.0513498334519, total=   1.1s
[CV] C=9.87519919377, gamma=0.0513498334519 ..........................
[CV] ........... C=9.87519919377, gamma=0.0513498334519, total=   1.1s
[CV] C=9.87519919377, gamma=0.0513498334519 ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   47.1s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1073cca58>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1073ccfd0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=2)

In [10]:
rnd_search_cv.best_estimator_

SVC(C=8.8523160584230869, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001766074650481071,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [11]:
rnd_search_cv.best_score_

0.85599999999999998

# Final model

In [None]:
rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)

SVC(C=8.8523160584230869, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001766074650481071,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [None]:
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

In [None]:
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)