In [1]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'mnist_784' dataset through scikit-learn's OpenML loader.
mnist = fetch_openml(name='mnist_784', version=1)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [3]:
# Pull the feature table and the labels out of the fetched dataset;
# the labels are cast to unsigned 8-bit integers.
X, y = mnist.data, mnist.target.astype(np.uint8)
print(X.shape, y.shape)

(70000, 784) (70000,)


In [4]:
# Disabled experiment: try sorting the data by label (kept commented out, not executed)
# y = y.sort_values(ascending=True)
# X = X.reindex()

In [5]:
# Hold out 20% of the samples as a test set; the fixed random_state makes
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the resulting shapes: feature sets first, then label sets.
print(f"{X_train.shape} {X_test.shape}")
print(f"{y_train.shape} {y_test.shape}")

(56000, 784) (14000, 784)
(56000,) (14000,)


In [6]:
# Sanity check: list the distinct labels found in each split
# (training labels on the first line, test labels on the second).
print(np.unique(y_train), np.unique(y_test), sep="\n")

[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]


In [7]:
# Boolean targets for a "zero vs. not zero" detector:
# True exactly where the label equals 0, for both splits.
y_train_0, y_test_0 = (y_train == 0), (y_test == 0)

In [8]:
# importing the classifier
from sklearn.linear_model import SGDClassifier

In [9]:
# Fit a binary SGD classifier on the "is it a 0?" targets and print how many
# seconds the fit took.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring durations, so the reading is
# not skewed by system clock adjustments.
start = time.perf_counter()
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_0)
print(time.perf_counter() - start)

5.442911148071289


In [10]:
# 3-fold cross-validated accuracy of the binary classifier on the training
# set; prints the elapsed seconds first, then the per-fold scores.
# n_jobs=-1 asks scikit-learn to run the folds on all available processors.
# Elapsed time is taken with time.perf_counter() (monotonic, meant for
# measuring durations) rather than the adjustable wall clock time.time().
start = time.perf_counter()
score = cross_val_score(sgd_clf, X_train, y_train_0, cv=3, scoring="accuracy", n_jobs=-1)
print(time.perf_counter() - start)
print(score)

10.670475959777832
[0.98687523 0.98762522 0.98649952]


In [11]:
# Cross-validated (3-fold) predictions for every training sample, followed by
# the confusion matrix of the true "is 0" targets against those predictions.
y_train_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_0,
    cv=3,
    n_jobs=-1,
)
print(y_train_pred)
print(confusion_matrix(y_true=y_train_0, y_pred=y_train_pred))

[False False False ... False  True  True]
[[49966   474]
 [  254  5306]]


In [12]:
import pickle

In [13]:
# Accuracy of the fitted binary classifier on the training set and on the
# test set (in that order), pickled as a two-element list to 'sgd_acc.pkl'.
accuracies = [
    sgd_clf.score(features, labels)
    for features, labels in ((X_train, y_train_0), (X_test, y_test_0))
]
with open('sgd_acc.pkl', mode='wb') as f:
    pickle.dump(accuracies, f, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# 3-fold cross-validated accuracy of the binary classifier on the training
# set, pickled to 'sgd_cva.pkl'.
cross_score = cross_val_score(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_0,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
with open('sgd_cva.pkl', mode='wb') as f:
    pickle.dump(cross_score, f, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Fit a second SGD classifier on the original multi-class digit labels
# (y_train, not the binary y_train_0) and print how many seconds the fit took.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring durations, so the reading is
# not skewed by system clock adjustments.
start = time.perf_counter()
sgd_wiel_clf = SGDClassifier(random_state=42,n_jobs=-1)
sgd_wiel_clf.fit(X_train, y_train)
print(time.perf_counter() - start)

70.96468496322632


In [16]:
# Cross-validated (3-fold) predictions of the multi-class classifier, turned
# into a confusion matrix against the true digit labels and pickled to
# 'sgd_cmx.pkl'.
# NOTE: this rebinds y_train_pred, which held the binary predictions before.
y_train_pred = cross_val_predict(
    estimator=sgd_wiel_clf,
    X=X_train,
    y=y_train,
    cv=3,
    n_jobs=-1,
)
matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
with open('sgd_cmx.pkl', mode='wb') as f:
    pickle.dump(matrix, f, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Read back each pickle written by this notebook and print its contents:
# the accuracies, the cross-validation scores, then the confusion matrix.
for pkl_name in ('sgd_acc.pkl', 'sgd_cva.pkl', 'sgd_cmx.pkl'):
    with open(pkl_name, mode='rb') as f:
        print(pickle.load(f))

[0.9918571428571429, 0.9903571428571428]
[0.98687523 0.98762522 0.98649952]
[[5193    2   71   17   10   34   46    4  155   28]
 [   3 6121   49    9   14   17   20    8   30    6]
 [  42  109 4896  113   68   21   93   86  163   19]
 [  37   44  361 4312   26  497   28   86  224   93]
 [  15   31   49    2 5123    8   39   38   88  136]
 [  57   35   65  173  111 4069  127   36  298   69]
 [  56   33  204    3   66  122 4925    4   66    1]
 [  24   48   77   18  133   11    3 5225   44  207]
 [  47  216  705  130  133  446   77   68 3435  211]
 [  25   49   28   58  606   89    0  320  107 4256]]
