In [1]:
from sklearn.datasets import fetch_openml

# Download the MNIST digits dataset (70,000 samples x 784 pixel features) from OpenML.
# as_frame=True pins the return type to pandas objects: the next cell calls
# Series.sort_values and DataFrame.reindex, which do not exist on NumPy arrays,
# so the notebook should not depend on the library's version-specific default.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

In [2]:
import numpy as np

# Cast the targets to unsigned 8-bit integers and order them by value, then
# re-order the feature rows to follow the same index so that X and y stay
# aligned row for row.
y = mnist.target.astype(np.uint8).sort_values()
X = mnist.data.reindex(y.index)

In [3]:
# Display the index of the sorted labels.
y.index

Int64Index([34999, 56424, 56419, 16705, 56415, 56404, 56397, 56389, 56388,
            56429,
            ...
            13698, 33531, 13695, 13692, 58898, 13687, 42651, 58914, 13678,
            58529],
           dtype='int64', length=70000)

In [4]:
# Display the index of the re-ordered features, for comparison with y.index.
X.index

Int64Index([34999, 56424, 56419, 16705, 56415, 56404, 56397, 56389, 56388,
            56429,
            ...
            13698, 33531, 13695, 13692, 58898, 13687, 42651, 58914, 13678,
            58529],
           dtype='int64', length=70000)

In [5]:
# Positional split: the first 56,000 rows become the training part and the
# remaining rows the test part. X and y are ordered by label at this point,
# so no shuffling happens here.
X_train = X.iloc[:56000]
X_test = X.iloc[56000:]
y_train = y.iloc[:56000]
y_test = y.iloc[56000:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(56000, 784) (56000,)
(14000, 784) (14000,)


In [6]:
# List the distinct labels present in the training part of the positional split.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=uint8)

In [7]:
# List the distinct labels present in the test part of the positional split.
np.unique(y_test)

array([7, 8, 9], dtype=uint8)

In [8]:
from sklearn.model_selection import train_test_split

# Redo the split with train_test_split: 20% of the rows are held out for
# testing, and the fixed random_state makes the partition reproducible.
# This overwrites the X_train / X_test / y_train / y_test defined earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(56000, 784) (56000,)
(14000, 784) (14000,)


In [9]:
# List the distinct labels present in the training set after train_test_split.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8)

In [10]:
# List the distinct labels present in the test set after train_test_split.
np.unique(y_test)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8)

In [11]:
from sklearn.linear_model import SGDClassifier

# Boolean targets for a binary "is this digit a 0?" detector.
y_train_0 = y_train.eq(0)
y_test_0 = y_test.eq(0)

# SGD-trained linear classifier with a fixed seed; fit it on the binary task.
clf = SGDClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train_0)

SGDClassifier(n_jobs=-1, random_state=42)

In [12]:
import pickle

# Score the zero-detector on the training set and on the test set;
# the list is ordered [train, test].
acc_train = clf.score(X_train, y_train_0)
acc_test = clf.score(X_test, y_test_0)
scores = [acc_train, acc_test]
print(scores)

# Persist both scores to disk.
with open('sgd_acc.pkl', mode='wb') as acc_file:
    pickle.dump(scores, acc_file)

[0.9832142857142857, 0.9818571428571429]


In [13]:
from sklearn.model_selection import cross_val_score, cross_val_predict

# 3-fold cross-validated accuracy of the zero-detector on the training set.
train_cross = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train_0,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
print(train_cross)

[0.98842878 0.98805378 0.98799957]


In [14]:
# Persist the per-fold cross-validation accuracies to disk.
with open('sgd_cva.pkl', mode='wb') as cva_file:
    pickle.dump(train_cross, cva_file)

In [15]:
# Second SGD classifier, this time fitted on the original multi-class labels
# rather than the binary zero / not-zero targets.
clf_all = SGDClassifier(n_jobs=-1, random_state=42)
clf_all.fit(X=X_train, y=y_train)

SGDClassifier(n_jobs=-1, random_state=42)

In [16]:
# Out-of-fold predictions from a 3-fold cross-validation run on the test set.
# NOTE(review): cross_val_predict fits fresh clones of the estimator on each
# fold, so the model already fitted on X_train is not what produces these
# predictions — confirm that cross-validating on the test set is intended.
y_test_pred = cross_val_predict(clf_all, X_test, y_test, cv=3, n_jobs=-1)

In [17]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test labels against the cross-validated predictions:
# rows correspond to the true class, columns to the predicted class.
conf_m = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
conf_m

array([[1310,    0,    5,    5,    1,    9,   15,    1,   17,    2],
       [   1, 1514,    9,    4,    3,    6,    7,    3,   37,    8],
       [  23,   17, 1138,   35,   21,   15,   26,   20,   80,   14],
       [   9,    6,   39, 1144,    7,   72,    8,   11,   65,   33],
       [  11,    3,    7,    4, 1209,   11,   15,    4,   26,   76],
       [  21,   11,    9,   67,   29, 1047,   37,    4,   64,   21],
       [  11,    3,   11,    2,   15,   36, 1307,    1,   12,    1],
       [   5,    7,   15,    8,   25,    9,    2, 1210,   14,  148],
       [  15,   31,   23,   39,   22,   76,    5,    5, 1103,   51],
       [   9,    4,    3,   16,   55,   23,    0,   40,   31, 1191]])

In [18]:
# Persist the confusion matrix to disk.
with open('sgd_cmx.pkl', mode='wb') as cmx_file:
    pickle.dump(conf_m, cmx_file)