In [None]:
import pathlib
import requests

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model, feature_selection, metrics
from sklearn import model_selection, pipeline, preprocessing

# Multiclass Classification

# MNIST Dataset

The original [MNIST](http://yann.lecun.com/exdb/mnist/) dataset consists of 70,000 grayscale images of handwritten digits, each 28x28 pixels, in 10 classes (the digits 0–9). There are 60,000 training images and 10,000 test images.

In [None]:
# Project root; change this when running on Colab or Kaggle.
PROJECT_ROOT_DIR = pathlib.Path(".")

# Input data and generated results each live in their own "mnist" sub-directory.
DATA_DIR = PROJECT_ROOT_DIR.joinpath("data", "mnist")
RESULTS_DIR = PROJECT_ROOT_DIR.joinpath("results", "mnist")

# Create both directories (and any missing parents); a no-op if they exist.
for _directory in (DATA_DIR, RESULTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)


### Download the data (if using Colab or Kaggle!)

In [None]:
URL = "https://github.com/davidrpugh/machine-learning-for-tabular-data/blob/main/data/mnist/mnist.parquet?raw=true"

# Fetch before touching the destination file, and fail loudly on HTTP errors,
# so a failed download can never leave an empty file or an HTML error page
# behind at mnist.parquet. The timeout bounds the connect/read waits so the
# cell cannot hang forever on a stalled connection.
response = requests.get(URL, timeout=60)
response.raise_for_status()

# Write the downloaded bytes to the data directory in one step.
(DATA_DIR / "mnist.parquet").write_bytes(response.content)


In [None]:
%%bash
# List the contents of ./data/mnist in long format with human-readable sizes.
# NOTE(review): this path is hard-coded; it matches DATA_DIR only while
# PROJECT_ROOT_DIR is "." and the kernel runs from the project root.
ls -lh ./data/mnist

## Load the data

We will load the data using the [Pandas](https://pandas.pydata.org/) library. I highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney to anyone interested in learning how to use Pandas.

In [None]:
# Read the full table, then separate the "label" column (the target) from
# every other column (the features).
data = pd.read_parquet(DATA_DIR / "mnist.parquet")
target = data["label"]
features = data.drop(columns="label")

## Explore the data

In [None]:
# Print a concise summary of the feature table (index, column dtypes, memory usage).
features.info()

In [None]:
# Preview the first rows of the feature table.
features.head()

In [None]:
# Preview the last rows of the feature table.
features.tail()

In [None]:
# Per-column summary statistics of the feature table.
features.describe()

In [None]:
# Bar chart of how many examples carry each label, ordered by label value.
_label_counts = target.value_counts().sort_index()
_ = _label_counts.plot.bar()

# Tilt the tick labels; assigning to `_` hides the returned tick objects.
_ = plt.xticks(rotation=-45)

## Train-Test Split

In [None]:
# IPython help lookup: show the documentation for train_test_split.
model_selection.train_test_split?

In [None]:
# Fraction of the full dataset held out for testing.
TEST_SIZE = 0.1

# Seeded generator so the shuffle, and therefore the split, is repeatable
# whenever this cell is run.
_random_state = np.random.RandomState(42)

# Stratify on the label so both partitions keep the overall class proportions.
_partitions = model_selection.train_test_split(
    features,
    target,
    random_state=_random_state,
    stratify=target,
    test_size=TEST_SIZE,
)
train_features, test_features, train_target, test_target = _partitions

In [None]:
# Print a concise summary of the training features (index, column dtypes, memory usage).
train_features.info()

In [None]:
# Preview the first rows of the training features.
train_features.head()

In [None]:
# Preview the first training labels; their index should line up with train_features.
train_target.head()

In [None]:
# Per-column summary statistics of the training features.
train_features.describe()

Again, if you want to, you can write the train and test sets out to disk to avoid having to recreate them later.

In [None]:
# Persist each split as a single parquet file: the features with the label
# column joined on (by row index); the index itself is not written out.
for _file_name, _split_features, _split_target in (
    ("train.parquet", train_features, train_target),
    ("test.parquet", test_features, test_target),
):
    _split_table = _split_features.join(_split_target)
    _ = _split_table.to_parquet(DATA_DIR / _file_name, index=False)

## Multiclass Classification


In [None]:
# IPython help lookup: show the documentation for LogisticRegression.
linear_model.LogisticRegression?

In [None]:
# Three-step pipeline: drop constant features -> rescale to [-1, 1] ->
# logistic regression classifier.
ml_pipeline = pipeline.make_pipeline(
    # threshold=0.0 removes only zero-variance (constant) columns
    feature_selection.VarianceThreshold(threshold=0.0),
    # rescale every remaining feature to the closed range [-1, 1]
    # (the upper bound was previously the undefined name `rootr1`, which
    # raised NameError as soon as this cell ran)
    preprocessing.MinMaxScaler(feature_range=(-1, 1)),
    # raise the solver's iteration cap to 1000 to give it room to converge
    linear_model.LogisticRegression(
        max_iter=1000
    ),
    # report the elapsed time of each step while fitting
    verbose=True
)

In [None]:
# Fit every pipeline step on the training split; assigning the result to `_`
# keeps the returned object from being echoed as the cell output.
_ = ml_pipeline.fit(train_features, train_target)

## Error analysis using the training dataset

In [None]:
# Predict on the same data the pipeline was fit on, then print the
# classification report for those predictions.
train_predictions = ml_pipeline.predict(train_features)
_report = metrics.classification_report(y_true=train_target, y_pred=train_predictions)
print(_report)

In [None]:
# Confusion matrix of raw counts on the training data
# (rows: true label, columns: predicted label).
_ = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=train_target,
    y_pred=train_predictions,
)

In [None]:
# Same matrix, normalized over the true labels: each row sums to 100%, and
# cells are rendered as whole percentages.
_ = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=train_target,
    y_pred=train_predictions,
    values_format=".0%",
    normalize="true",
)

In [None]:
# Errors only: misclassified samples get weight True (1), correct ones
# False (0), so correct predictions drop out of the matrix. Normalizing over
# the true labels shows, per true class, how its errors are distributed.
_is_error = train_target != train_predictions
_ = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=train_target,
    y_pred=train_predictions,
    sample_weight=_is_error,
    values_format=".0%",
    normalize="true",
)

In [None]:
# Errors only again (weight True for misclassified samples, False otherwise),
# this time normalized over the predicted labels: each column shows which
# true classes a given wrong prediction came from.
_is_error = train_target != train_predictions
_ = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=train_target,
    y_pred=train_predictions,
    sample_weight=_is_error,
    values_format=".0%",
    normalize="pred",
)

## Assess using the test dataset

In [None]:
# Predict on the held-out test split and print the classification report
# for those predictions.
test_predictions = ml_pipeline.predict(test_features)
_report = metrics.classification_report(y_true=test_target, y_pred=test_predictions)
print(_report)