In [1]:
# The MNIST dataset is a hand-labeled collection of digit images. Let's train a model to identify images from it.
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) components rather than the raw strings:
# string comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the progress message.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        File extension; also passed to ``plt.savefig`` as the output format.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)



In [None]:

# Pull the dataset for processing. MNIST is a dataset of 70,000 small images of handwritten digits.
from sklearn.datasets import fetch_openml

# as_frame=False asks for array-style data rather than a pandas DataFrame
mnist = fetch_openml(name="mnist_784", version=1, as_frame=False)
mnist.keys()

In [None]:
# Let's look at some of the data to get an idea of what we are working with.
# "data" holds the features and "target" holds the labels.
X, y = mnist["data"], mnist["target"]
X.shape

In [None]:
# Show the dimensions of the label array y
y.shape

In [None]:
# Print the raw feature values of the first sample
print(X[0])

In [None]:
# So: 70,000 pictures with 784 features each (28 x 28 pixels).
# Let's display one of these feature arrays in a way that lets us visualize the image.
# matplotlib (mpl / plt) is already imported and configured in the setup cell at the
# top of the notebook, so it is not imported again here.
some_digit = X[0]
# Reshape the flat 784-value feature vector back into a 28 x 28 grid for display
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")

save_fig("some_digit_plot")
plt.show()


In [None]:
# Looks like a 5; let's see what the label tells us.
y[0]

In [None]:
# Yup, that's a 5. Nice. Another thing to consider is that most ML algorithms expect numbers, so let's cast the labels to integers.

In [None]:
# Cast the labels to unsigned 8-bit integers so they are numeric
y = y.astype(np.uint8)

In [None]:
# Before we start diving into the data, set a test set aside first. Very important!
# The first 60,000 samples are used for training and the remainder for testing.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [None]:
# Let's start with a binary classifier that can distinguish between just two classes: 5 and not-5.
# The targets are the element-wise result of comparing each label with 5.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

In [None]:
# y_train_5 / y_test_5 now flag, per label, whether it equals 5 (they are masks over the labels, not images).
# Check how many training labels we have.
print(len(y_train))

In [None]:
# Nice, that's a pretty big training set.
# Now let's pick a classifier and train it.
# A good place to start is a Stochastic Gradient Descent (SGD) classifier.
# SGD is a good fit because it deals with training instances independently,
# which makes it well suited to online learning. It is also very efficient.
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
# Train the "5 vs. not-5" detector: fit() takes the training features and the
# matching binary targets. The classifier must be fitted before predict() is used.
sgd_clf.fit(X_train, y_train_5)

In [None]:
# We pass random_state because stochastic gradient descent relies on randomness, hence the name.
# some_digit is wrapped in a one-element list so that a single sample is passed to predict().
# NOTE: sgd_clf must be a fitted classifier at this point.
sgd_clf.predict([some_digit])