In [None]:
import pathlib
import requests

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import dummy, linear_model, feature_selection, metrics
from sklearn import model_selection, pipeline, preprocessing

# Performance metrics

## MNIST Dataset

The original [MNIST](http://yann.lecun.com/exdb/mnist/) dataset consists of 70,000 28x28 grayscale images of handwritten digits in 10 classes (the digits 0-9). There are 60,000 training images and 10,000 test images.

In [None]:
# Root of the project; adjust when running on Colab or Kaggle.
PROJECT_ROOT_DIR = pathlib.Path(".")

# Where the raw/split data and the results of this notebook live.
DATA_DIR = PROJECT_ROOT_DIR / "data" / "mnist"
RESULTS_DIR = PROJECT_ROOT_DIR / "results" / "mnist"

# Create both directories up front (no error if they already exist).
for _directory in (DATA_DIR, RESULTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)


### Download and extract the data (if using Colab or Kaggle!)

In [None]:
URL = "https://github.com/davidrpugh/machine-learning-for-tabular-data/blob/main/data/mnist/mnist.parquet?raw=true"

# Fetch first and fail loudly on an HTTP error: opening the target file before
# the request would truncate it, and writing an unchecked response could leave
# an error page where the parquet file is expected.
response = requests.get(URL, timeout=60)
response.raise_for_status()

with open(DATA_DIR / "mnist.parquet", 'wb') as f:
    f.write(response.content)


In [None]:
%%bash
ls -lh ./data/mnist

## Load the data

We will load the data using the [Pandas](https://pandas.pydata.org/) library. I highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney for anyone interested in learning how to use Pandas.

In [None]:
# Read the full dataset, then separate the "label" column (the target) from
# the remaining columns (the features).
data = pd.read_parquet(DATA_DIR / "mnist.parquet")
target = data["label"]
features = data.drop(columns=["label"])

## Explore the data

In [None]:
# Print a concise summary of the features DataFrame.
features.info()

In [None]:
# Peek at the first few rows of the features.
features.head()

In [None]:
# Peek at the last few rows of the features.
features.tail()

In [None]:
# Descriptive statistics for each feature column.
features.describe()

In [None]:
# Bar chart of how many images carry each label, in label order. Axis labels
# and a title are set so the figure can be read on its own.
_fig, _ax = plt.subplots()
_ = (target.value_counts()
           .sort_index()
           .plot(kind="bar", ax=_ax))
_ax.set_xlabel("Label")
_ax.set_ylabel("Number of images")
_ax.set_title("Images per label")
_ = plt.xticks(rotation=-45)

## Train-Test Split

In [None]:
# Fraction of the dataset held out for testing.
TEST_SIZE = 0.1

# Seeded generator so the split is the same every time this cell is run.
_random_state = np.random.RandomState(42)

# Stratify on the target so both splits keep the same label proportions.
_splits = model_selection.train_test_split(
    features,
    target,
    random_state=_random_state,
    stratify=target,
    test_size=TEST_SIZE,
)
train_features, test_features, train_target, test_target = _splits

In [None]:
# Print a concise summary of the training features.
train_features.info()

In [None]:
# Peek at the first few rows of the training features.
train_features.head()

In [None]:
# Peek at the first few training labels.
train_target.head()

In [None]:
# Descriptive statistics for each training feature column.
train_features.describe()

Again, if you want to, you can write the train and test sets out to disk to avoid having to recreate them later.

In [None]:
# Re-attach each target to its features and persist the split to DATA_DIR
# (the row index is not written).
for _filename, _split_features, _split_target in (
    ("train.parquet", train_features, train_target),
    ("test.parquet", test_features, test_target),
):
    _ = _split_features.join(_split_target).to_parquet(DATA_DIR / _filename, index=False)

## Train a Binary Classifier

In [None]:
# Preprocessing + linear classifier:
#   1. drop features whose variance is zero (they carry no information),
#   2. standardize the remaining features,
#   3. fit a linear model with stochastic gradient descent.
# SGD shuffles the training data, so the estimator is seeded to make the
# scores reported below reproducible from run to run.
ml_pipeline = pipeline.make_pipeline(
    feature_selection.VarianceThreshold(threshold=0.0),
    preprocessing.StandardScaler(),
    linear_model.SGDClassifier(random_state=42)
)

In [None]:
# Display the pipeline and its steps.
ml_pipeline

In [None]:
# Binary task: is the image a 5 (True) or any other digit (False)?
_ = ml_pipeline.fit(X=train_features, y=train_target.eq(5))

## Performance measures

### Measuring accuracy using Cross Validation

In [None]:
# Number of cross-validation folds used throughout the notebook.
CV_FOLDS = 3

# Accuracy of the "is it a 5?" classifier on each held-out fold.
cv_scores = model_selection.cross_val_score(
    estimator=ml_pipeline,
    X=train_features,
    y=train_target.eq(5),
    scoring="accuracy",
    cv=CV_FOLDS,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# One accuracy score per cross-validation fold.
cv_scores

In [None]:
# Average accuracy across the folds.
cv_scores.mean()

In [None]:
# make predictions
cv_predictions = model_selection.cross_val_predict(
    ml_pipeline,
    X=train_features,
    y=(train_target == 5),
    cv=CV_FOLDS,
    n_jobs=-1,
    verbose=1
)


In [None]:
# Accuracy of the out-of-fold predictions against the true "is it a 5?" labels.
cv_accuracy = metrics.accuracy_score(y_true=train_target.eq(5), y_pred=cv_predictions)
print(cv_accuracy)

Cross-validated accuracy score is quite high! But is our model really that good? What is the relevant benchmark?

In [None]:
# Baseline: same preprocessing as the real pipeline, but the final estimator
# always predicts the most frequent class.
_baseline_steps = [
    feature_selection.VarianceThreshold(threshold=0.0),
    preprocessing.StandardScaler(),
    dummy.DummyClassifier(strategy="most_frequent"),
]
dummy_pipeline = pipeline.make_pipeline(*_baseline_steps)

# Cross-validated accuracy of the baseline on the "is it a 5?" task.
dummy_pipeline_scores = model_selection.cross_val_score(
    estimator=dummy_pipeline,
    X=train_features,
    y=train_target.eq(5),
    scoring="accuracy",
    cv=CV_FOLDS,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Mean cross-validated accuracy of the most-frequent-class baseline.
dummy_pipeline_scores.mean()

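Simply guessing "not 5" will be correct about 90% of the time! Why? Only about 10% of the images are 5s, so always predicting the majority class ("not 5") is already right for the remaining ~90%. This example demonstrates why accuracy is not a great metric to use when you have imbalanced datasets.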

### Confusion matrix

In [None]:
# Out-of-fold predictions for the "is it a 5?" task; these feed the confusion
# matrix and the precision/recall metrics below.
_is_five = train_target == 5
cv_predictions = model_selection.cross_val_predict(
    ml_pipeline,
    train_features,
    _is_five,
    n_jobs=-1,
    verbose=1,
    cv=CV_FOLDS,
)

In [None]:
# Show the documentation for metrics.confusion_matrix (IPython help syntax).
metrics.confusion_matrix?

In [None]:
# Raw counts (normalize=None) of true vs. predicted classes.
confusion_matrix = metrics.confusion_matrix(
    y_true=train_target.eq(5),
    y_pred=cv_predictions,
    normalize=None,
)
print(confusion_matrix)

### Precision

In [None]:
# Precision of the out-of-fold predictions for the positive ("is a 5") class.
metrics.precision_score(y_true=train_target.eq(5), y_pred=cv_predictions)

### Recall

In [None]:
# Recall of the out-of-fold predictions for the positive ("is a 5") class.
metrics.recall_score(y_true=train_target.eq(5), y_pred=cv_predictions)

### $F_1$ Score

In [None]:
# F1 score of the out-of-fold predictions for the positive ("is a 5") class.
metrics.f1_score(y_true=train_target.eq(5), y_pred=cv_predictions)

### Classification report

In [None]:
# Text summary of the per-class metrics for the out-of-fold predictions.
_report = metrics.classification_report(y_true=train_target.eq(5), y_pred=cv_predictions)
print(_report)

### Receiver Operating Characteristic (ROC) Area Under the Curve (AUC) Score

In [None]:
# SGDClassifier's default (hinge) loss does not provide predict_proba, so rank
# the samples by their decision_function scores instead. The scores are
# computed out-of-fold so that the ROC AUC is not measured on the same rows
# the model was fitted on.
_decision_scores = model_selection.cross_val_predict(
    ml_pipeline,
    X=train_features,
    y=(train_target == 5),
    cv=CV_FOLDS,
    method="decision_function",
    n_jobs=-1,
    verbose=1
)

# Keep a two-column layout (negative class, positive class) because the cells
# below read the positive-class score as `_scores[:, 1]`.
_scores = np.column_stack([-_decision_scores, _decision_scores])

metrics.roc_auc_score(
    (train_target == 5),
    _scores[:, 1],
)

In [None]:
# False/true positive rates at every score threshold for the "is a 5" class.
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(
    (train_target == 5),
    _scores[:, 1]
)

_fig, _ax = plt.subplots()
# the model in ml_pipeline is an SGDClassifier, so label the curve accordingly
_ax.plot(false_positive_rate, true_positive_rate, label="SGD Classifier")
# a classifier with no skill has TPR == FPR, i.e. the diagonal
_ax.plot([0, 1], [0, 1], "k--", label="Random Classifier")
_ax.set_xlabel("False Positive Rate (Fall Out)")
_ax.set_ylabel("True Positive Rate (Recall)")
_ax.set_title("ROC Curve")
_ax.grid()
# without a legend the curve labels above are never shown
_ = _ax.legend()

In [None]:
# Precision and recall at every score threshold for the "is a 5" class.
precisions, recalls, thresholds = metrics.precision_recall_curve(
    (train_target == 5),
    _scores[:, 1]
)

_fig, _ax = plt.subplots()
# the model in ml_pipeline is an SGDClassifier, so label the curve accordingly
_ax.plot(recalls, precisions, label="SGD Classifier")
_ax.set_xlabel("Recall")
_ax.set_ylabel("Precision")
_ax.set_title("Precision-Recall Curve")
_ax.grid()
# without a legend the curve label above is never shown
_ = _ax.legend()

Since the ROC curve is so similar to the precision/recall (PR) curve, you may wonder how to decide which one to use.

1. Prefer the PR curve whenever the positive class is rare or when you care more about the false positives than the false negatives.
2. Otherwise, use the ROC curve.
