In [None]:
# MNIST Dataset

# Import to grab popular ML datasets
from sklearn.datasets import fetch_openml

# Grab MNIST (70,000 handwritten digit images) from OpenML.
# as_frame=False requests plain NumPy arrays. Recent scikit-learn versions
# return a pandas DataFrame by default, which breaks the positional indexing
# used further down (X[0], X_train[train_index]).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

In [None]:
# Datasets grabbed by sklearn often share a similar dictionary structure containing
# 1. a DESCR key describing the dataset
# 2. a data key holding an array with one row per instance and one column per feature
# 3. a target key holding an array with the labels

# Pull out the feature matrix and the label vector, then display their shapes
X = mnist["data"]
y = mnist["target"]
for part in (X, y):
    print(part.shape)

# 70,000 images with 784 features

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plot the first image: reshape its flat feature vector into a 28 x 28 grid
# and render it with matplotlib
some_digit = X[0]
some_digital_image = some_digit.reshape(28,28)

plt.imshow(some_digital_image, cmap=mpl.cm.binary, interpolation="nearest")
plt.axis('off')
# plt.show must be *called*; a bare `plt.show` only references the function
# and does nothing
plt.show()

# Label of the plotted image
print(y[0])

In [None]:
# The labels are loaded as strings; cast them to unsigned 8-bit integers
# so that they can be compared with numbers (e.g. y_train == 5) later on
import numpy as np
y = y.astype(np.uint8)


In [None]:
# Split into training and test sets: the first 60,000 instances are used for
# training, everything after that for testing
# Note: All sets are already shuffled
split_at = 60000
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]



In [None]:
# Training classifier
from sklearn.linear_model import SGDClassifier

# Binary targets for a "5-detector": True for 5, False for non-5
y_train_5 = y_train == 5
y_test_5 = y_test == 5

# SGDClassifier model, built with a fixed random_state and fitted on the binary task
# Note: very good at online learning
sgd_clf = SGDClassifier(random_state=42).fit(X_train, y_train_5)
sgd_clf



In [None]:
# Run the 5-detector on some_digit (the first image, X[0]); the instance is
# wrapped in a list so that predict receives a batch containing one sample
sgd_clf.predict([some_digit])

# Correctly predicts the number 5

In [None]:
# Evaluate the performance of the SGDClassifier model as trained so far
# Evaluating a classifier is often trickier than evaluating a regressor
# Several different evaluation measures are shown below

# Measuring accuracy using cross-validation
# Similarly used in Chp_2 project
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# In order to gain more control over the cross-validation process, the following code
# implements cross-validation manually

# StratifiedKFold performs stratified sampling (expl. chp. 2).
# shuffle is left at its default (False), so the folds are deterministic and no
# random_state is needed. Recent scikit-learn versions raise a ValueError when
# random_state is set without shuffle=True.
skfolds = StratifiedKFold(n_splits=3)

# At each iteration the code creates a clone of the classifier, trains the clone on the
# training folds, and makes predictions on the test fold.
# It then counts the correct predictions and prints the ratio of correct predictions.
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fresh, unfitted copy so that no state leaks between folds
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    # Output ratio of correct predictions
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorised count of matches (instead of Python's built-in sum over an array)
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred)) # 0.95035, 0.96035, 0.9604

In [None]:
# Use cross_val_score to gauge accuracy with 3-fold cross-validation
# (same evaluation as the manual loop above)
from sklearn.model_selection import cross_val_score

# See Chp 2 for further information
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
# 0.95035, 0.96035, 0.9604

In [12]:
# Baseline ("dumb") classifier that puts every image in the not-5 category
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts "not 5" (False) for every instance."""

    def fit(self, X, y=None):
        """Learn nothing; return self so the estimator can be chained."""
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X),1), dtype=bool)

never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

# returns 0.91125, 0.90855, 0.90915
# About 10% of the images are 5s, so always guessing 'not-5' is right about 90% of the time
# This is why accuracy is generally not a good performance measure for classifiers,
# especially when some classes are much more frequent than others

array([0.91125, 0.90855, 0.90915])

In [13]:
# Confusion Matrix -- a better way of evaluating classifier performance
# Essentially counts the number of times instances of class A are classified as class B
# e.g. knowing the number of times the classifier confused images of 5s with 3s

# Initial step -- calculate a set of predictions to compare against the actual targets
from sklearn.model_selection import cross_val_predict

# cross_val_predict is similar to cross_val_score:
# it performs k-fold cross-validation but returns the predictions instead of the scores
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)



In [14]:
# Confusion matrix creation: actual targets first, predictions second
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred) # returns [[53892, 687], [1891, 3530]]
# Each row represents an actual class, while each column represents a predicted class
# Row 1 (actual non-5s): true negatives (correctly classified as non-5), then false positives (wrongly classified as 5)
# Row 2 (actual 5s): false negatives (wrongly classified as non-5), then true positives (correctly classified as 5)
# A perfect model would only have true negatives and true positives

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

In [15]:
# Pretend the model has reached perfection by using the true targets as the predictions
y_train_perfect_predictions = y_train_5 # pretending that the model has reached perfection
# Confusion matrix of this 'perfect model': non-zero values only on the main diagonal
confusion_matrix(y_train_5, y_train_perfect_predictions) # Confusion matrix based on a 'perfect model' with our data

array([[54579,     0],
       [    0,  5421]], dtype=int64)

In [17]:
# Precision and recall calculations
from sklearn.metrics import precision_score, recall_score

# Precision score: TP / (TP + FP) -- usually used in conjunction with the recall score
# Recall score:    TP / (TP + FN)
# Printed in that order:
#   0.83708 ...   (precision)
#   0.651171 ...  (recall)
for score_fn in (precision_score, recall_score):
    print(score_fn(y_train_5, y_train_pred))

0.8370879772350012
0.6511713705958311
