# MNIST with SVM

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.base import clone

# Activate seaborn's default plot styling for subsequent matplotlib figures.
sns.set()

## Import MNIST dataset

In [2]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the 'mnist_784' dataset from OpenML.
# Tip: mnist.keys() shows which fields the returned object carries.
mnist = fetch_openml('mnist_784', version=1)

# Separate the feature matrix from the label vector.
X = mnist['data']
y = mnist['target']

### Split data into train and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

### Scale the training data set

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then standardise
# the training features with them. The fitted scaler is reused for the test set.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

### Inspect whether the training data contains an even split of digits 0-9

In [5]:
# Tally the training labels per digit class; value_counts() lists the most frequent class first.
pd.Series(y_train).value_counts()

1    6277
7    5811
3    5740
0    5616
2    5559
9    5519
8    5509
6    5500
4    5441
5    5028
dtype: int64

## Grid Search for hyperparameters

### Choose a smaller data set for hyperparameter selection via cross-validation

In [6]:
# Use the first 1000 rows of the training split as a small tuning subset
# (train_test_split shuffles by default, so these rows are a random draw).
X_train_scaled_sample, y_train_sample = X_train_scaled[:1000], y_train[:1000]

# Label frequencies inside the subset, to judge how balanced it is.
pd.Series(y_train_sample).value_counts()

1    112
3    109
4    108
2    106
8    106
7    101
5    100
9     93
6     83
0     82
dtype: int64

In [8]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Polynomial-kernel SVM; max_iter=1000 caps the solver iterations of every fit.
# NOTE(review): best_estimator_ inherits this cap, so any later fit on the full
# training set may stop before convergence - confirm the cap is intended there.
svc_clf = svm.SVC(kernel='poly', max_iter=1000)

# Candidate hyperparameter values; the search tries every combination.
param_grid = [
    {
        'coef0': [0.0, 0.5, 1.0],
        'degree': [1, 3, 5],
        'C': [0.5, 1, 3, 5],
        'gamma': [10, 1, 0.1, 0.01],
    }
]

# Exhaustive search with 3-fold cross-validation on the small tuning subset.
grid_search = GridSearchCV(estimator=svc_clf, param_grid=param_grid, cv=3)
grid_search.fit(X_train_scaled_sample, y_train_sample)

# Report the best mean cross-validation score and the setting that achieved it.
print('Best score: {}'.format(grid_search.best_score_))
print('Parameters for the best estimator', grid_search.best_params_)



Best score: 0.892
Parameters for the best estimator {'C': 0.5, 'coef0': 1.0, 'degree': 3, 'gamma': 0.01}


## Train the best estimator

In [9]:
# Work on an unfitted copy of the winning configuration. Fitting
# grid_search.best_estimator_ directly would overwrite, in place, the model the
# search refit on the tuning subset, leaving grid_search in a state that no
# longer matches its own cross-validation results.
final_model = clone(grid_search.best_estimator_)

# Train on the entire scaled training split; as the last expression of the
# cell, the fitted estimator is displayed.
final_model.fit(X_train_scaled, y_train)



SVC(C=0.5, cache_size=200, class_weight=None, coef0=1.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='poly',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Scale the held-out features with the scaler fitted on the training split,
# then report the model's mean accuracy on the test set.
X_test_scaled = scaler.transform(X_test)
final_model.score(X_test_scaled, y_test)

0.9793571428571428

## Wrap the scaling and classification steps in a pipeline

In [11]:
from sklearn.pipeline import Pipeline

# Chain scaling and classification so the raw (unscaled) features can be passed
# straight in. The classifier step is an unfitted clone of the grid-search
# winner rather than a hand-copied parameter list, so the pipeline always uses
# exactly the hyperparameters the search selected, even if the search is re-run.
svm_clf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', clone(grid_search.best_estimator_)),
])

# Fit both steps on the raw training split; the fitted pipeline is displayed.
svm_clf_pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm_clf', SVC(C=0.5, cache_size=200, class_weight=None, coef0=1.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='poly',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [12]:
# Compare the pipeline's test accuracy with the score of the manually scaled model above.
# The pipeline applies its own scaler, so it receives the raw (unscaled) test features.

svm_clf_pipeline.score(X_test, y_test)

0.9793571428571428