In [None]:
import pathlib
import requests

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import decomposition, ensemble, metrics
from sklearn import model_selection, pipeline, preprocessing, utils

# MNIST Dataset


### Load the data

We will load the data using the [Pandas](https://pandas.pydata.org/) library. I highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney to anyone interested in learning how to use Pandas.

In [None]:
%%bash
# List the sample_data directory to confirm the MNIST CSV read below is present.
ls ./sample_data

In [None]:
# The file has no header row, so name the columns explicitly:
# "label" first, followed by p0 .. p783.
_column_names = ["label", *(f"p{i}" for i in range(784))]
train_df = pd.read_csv(
    "./sample_data/mnist_train_small.csv",
    header=None,
    names=_column_names,
)

# Separate the target column from the remaining feature columns.
train_target = train_df["label"]
train_features_df = train_df.drop(columns="label")

In [None]:
# Print a summary of the feature frame (columns, dtypes, memory usage).
train_features_df.info()

In [None]:
# Peek at the first few target labels.
train_target.head()

## Principal Components Analysis (PCA)

In [None]:
# IPython help syntax: display the docstring for PCA.
decomposition.PCA?

In [None]:
# PCA settings: keep 153 components, with whitening disabled.
_pca_hyperparameters = dict(n_components=153, whiten=False)

# Fit on the training features and project them in a single call.
feature_extractor = decomposition.PCA(**_pca_hyperparameters)
extracted_train_features_df = feature_extractor.fit_transform(
    train_features_df
)

In [None]:
# Dimensions of the transformed training data returned by fit_transform.
extracted_train_features_df.shape

In [None]:
# Mean of each extracted component, taken across the training samples.
extracted_train_features_df.mean(axis=0)

In [None]:
# Standard deviation of each extracted component, taken across the training samples.
extracted_train_features_df.std(axis=0)

In [None]:
# Scatter the training samples on their first two extracted components,
# coloured by label.
fig, ax = plt.subplots(1, 1)
_ = ax.scatter(
    extracted_train_features_df[:, 0],
    extracted_train_features_df[:, 1],
    c=train_target,
    alpha=0.05,  # heavy transparency so overlapping points stay readable
)
_ = ax.set_xlabel("Component 1", fontsize=15)
_ = ax.set_ylabel("Component 2", fontsize=15)
# Title with the estimator's class name; passing the class object itself
# would render as "<class '...'>".
_ = ax.set_title(type(feature_extractor).__name__)
_ = ax.grid(True)

### Exercise: To whiten, or not to whiten?

Take a close look at the docstring for the `decomposition.PCA` algorithm. What happens if you set `n_components` to a number between 0 and 1 (e.g., `n_components=0.95`)? Why might you want to do this? What does setting `whiten=True` do to the output of the algorithm? Re-run the PCA algorithm above setting `whiten=True` to confirm your answer. Why might you want to set `whiten=True`?

### Solution

In [None]:
# hyper-parameters: a fractional n_components asks PCA to keep enough
# components to reach that share of explained variance; whiten=True
# turns on whitening of the output.
_pca_hyperparameters = {
    "n_components": 0.95,
    "whiten": True,
}

feature_extractor = decomposition.PCA(**_pca_hyperparameters)
# Fit on the training features loaded earlier in the notebook. The name used
# here previously (`useful_train_features_df`) is never defined, so the cell
# failed with a NameError on a fresh kernel.
extracted_train_features = feature_extractor.fit_transform(train_features_df)

In [None]:
# Per-component standard deviation, to inspect the effect of whiten=True.
extracted_train_features.std(axis=0)

## Randomized PCA

In [None]:
# hyper-parameters: 128 components via the randomized SVD solver, with a
# fixed seed.
_pca_hyperparameters = dict(
    n_components=128,
    svd_solver="randomized",
    random_state=42,
)

# Fit on the training features and project them in one call.
feature_extractor = decomposition.PCA(**_pca_hyperparameters)
extracted_train_features_df = feature_extractor.fit_transform(train_features_df)

## Choosing the number of components

In [None]:
# Pipeline: PCA for dimensionality reduction, then an extra-trees classifier.
# The grid below gives n_components as a fraction of explained variance, which
# PCA only supports with svd_solver="full". The "randomized" solver requires an
# integer component count and raises a ValueError for 0.9 / 0.95 / 0.99, so
# every candidate fit would fail.
ml_pipeline = pipeline.make_pipeline(
    decomposition.PCA(svd_solver="full"),
    ensemble.ExtraTreesClassifier(n_jobs=-1, random_state=42)
)

# Keys follow make_pipeline's auto-generated step names
# (lower-cased class name) plus "__<parameter>".
_param_grid = {
    "pca__n_components": [0.9, 0.95, 0.99],
    "extratreesclassifier__n_estimators": [10, 100, 1000]
}

# 3 x 3 candidates, each evaluated with 2-fold cross-validation.
grid_search_cv = model_selection.GridSearchCV(
    ml_pipeline,
    _param_grid,
    cv=2,
    n_jobs=-1,
)

In [None]:
# Run the grid search; assigning to `_` keeps the returned object out of the cell output.
_ = grid_search_cv.fit(train_features_df, train_target)

In [None]:
# Show the hyper-parameter combination selected by the grid search.
print(grid_search_cv.best_params_)

## PCA for Compression

In [None]:
# Map the reduced representation back into the original feature space.
reconstructed_train_features_df = feature_extractor.inverse_transform(
    extracted_train_features_df
)

## Incremental PCA

In [None]:
n_batches = 100
feature_extractor = decomposition.IncrementalPCA(n_components=128)

# Feed the training set to the estimator one batch of rows at a time.
# Split row positions rather than the DataFrame itself, so every batch is a
# plain `.iloc` slice of the original frame.
for batch_rows in np.array_split(np.arange(len(train_features_df)), n_batches):
    feature_extractor.partial_fit(train_features_df.iloc[batch_rows])

# Only project here: calling fit_transform would refit the estimator from
# scratch and discard the incremental fit built by the partial_fit calls.
extracted_train_features_df = feature_extractor.transform(train_features_df)

### Exercise

Train a logistic regression classifier using the MNIST dataset and assess its performance. Next, create a pipeline with PCA and your logistic regression classifier and assess its performance. Does PCA improve your results?

### Solution

### Exercise

Train a tree-based classifier using the MNIST dataset and assess its performance. Next, create a pipeline with PCA and your tree-based classifier and assess its performance. Does PCA improve your results?

### Solution