In [None]:
# MNIST Dataset

# Import to grab popular ML datasets
from sklearn.datasets import fetch_openml

# Grab MNIST from OpenML.
# as_frame=False forces plain NumPy arrays: newer scikit-learn releases
# otherwise return a pandas DataFrame for this dataset, and the positional
# indexing used further down (X[0], X[:60000], X[train_index]) would then
# look up columns / labels instead of selecting rows.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Last expression of the cell: shows which keys the returned bunch exposes
mnist.keys()

In [None]:
# Datasets grabbed by sklearn often have a similar dictionary structure containing
# 1. DESCR key describing the dataset
# 2. data key containing an array with one row per instance and one column per feature
# 3. target key containing an array with the labels

# Pull the feature matrix and the label vector out of the dataset
X = mnist["data"]
y = mnist["target"]

# Report the dimensions of both arrays, one per line
print(X.shape, y.shape, sep="\n")

# 70,000 images with 784 features

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Take the first sample and reshape its flat feature vector into a
# 28 x 28 array so matplotlib can draw it as an image
some_digit = X[0]
some_digital_image = some_digit.reshape(28,28)

# Draw with the binary colormap, no pixel smoothing, and no axes
plt.imshow(some_digital_image, cmap=mpl.cm.binary, interpolation="nearest")
plt.axis('off')
# plt.show must be *called*; the bare name `plt.show` is a no-op expression
plt.show()
# Label stored for the plotted sample, to compare against the image
print(y[0])

In [None]:
# The labels in y are strings; convert them to unsigned 8-bit integers
import numpy as np
y = y.astype("uint8")


In [11]:
# Split into train and test sets: first 60,000 samples train, the rest test
# Note: the data is already shuffled, so a plain positional split is used
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]



In [12]:
# Classifier used for the "is this digit a 5?" task
from sklearn.linear_model import SGDClassifier

# Binary targets: True where the label is 5, False for every other digit
y_train_5 = y_train == 5
y_test_5 = y_test == 5

# SGDClassifier model
# Note: very good at online learning
# random_state is fixed so repeated runs give the same model
sgd_clf = SGDClassifier(random_state=42)

# Train on the full training set against the binary target
# (kept as the last expression so the cell echoes the fitted estimator)
sgd_clf.fit(X_train, y_train_5)



SGDClassifier(random_state=42)

In [14]:
# Ask the 5-detector about the first image; predict() expects a 2-D batch,
# so the single sample is presented as one row
sgd_clf.predict(some_digit.reshape(1, -1))

# Correctly predicts the number 5

array([ True])

In [16]:
# Evaluate performance of SGDClassifier model at current time
# It is more difficult to assess the accuracy of classifier models than regressor models
# Many different evaluation measures will be shown

# Measuring accuracy using cross-validation
# Similarly used in Chp_2 project
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# In order to gain more control in this cross-validation process the following code implements
# cross-validation manually

# No random_state is passed: without shuffle=True the folds are already
# deterministic, so random_state would have no effect, and recent scikit-learn
# versions raise a ValueError when it is set while shuffle is False.
skfolds = StratifiedKFold(n_splits=3)

# StratifiedKFold performs stratified sampling (expl. chp. 2)
# at each iteration the code creates a clone of the classifier, trains the clone on the training folds, and makes predictions on the
# test fold
# then it counts the number of correct predictions and outputs the ratio of correct predictions
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fresh, unfitted copy so no state leaks between folds
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    # Output ratio of correct predictions
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorised count of matches (avoids a Python-level loop over the array)
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred)) # 0.95035, 0.96035, 0.9604

0.95035
0.96035
0.9604
