In [None]:
# Setup: fetch the course repository and work from this lesson's folder.
! git clone https://github.com/data-psl/lectures2021
import sys
# Make the lesson folder importable.
sys.path.append('lectures2021/notebooks/02_sklearn')
# Move into the lesson folder so relative paths such as "solutions/..." resolve from there.
# NOTE(review): both paths are relative to the directory the clone ran in, so this
# cell presumably only works as intended on its first run in a session -- confirm.
%cd 'lectures2021/notebooks/02_sklearn'

In this notebook we define a Pipeline estimator that first applies a `PCA` and then classifies the reduced representation using a `LogisticRegression`.

We will use the `Pipeline` object and the `GridSearchCV` object, as well as a little `pandas` at the end.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

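Now that you have all the useful objects in hand, define a pipeline that first applies a `PCA` transform and then a `LogisticRegression`. Name it `pipe`, with steps called `pca` and `logistic`: the cells below rely on these names.
You may look at the `Pipeline` documentation of scikit-learn.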

In [None]:
# %load "solutions/06-Pipeline-1.py"
# Copy and paste it in Colab (%load does not work, sorry)

The steps of the Pipeline are named. Their parameters can be inspected using `estimator.get_params()`, and modified using `estimator.set_params()`.

In [None]:
# Display the parameters exposed by the pipeline.
pipe.get_params()

In [None]:
# Set `n_components` on the step named `pca`, using the `<step>__<parameter>` naming.
pipe.set_params(pca__n_components=10)

## Performing a grid-search on multiple parameters

We want to find out which dimensionality reduction to use in the `PCA`, along with which regularization strength to use in the `LogisticRegression`.

How do we do that? We use a `GridSearchCV` with multiple parameters.

In [None]:
# Load the digits dataset as a (data, target) pair.
X_digits, y_digits = datasets.load_digits(return_X_y=True)

# Hyper-parameter grid: parameters of pipeline steps are addressed with the
# `<step>__<parameter>` naming convention.
param_grid = dict(
    pca__n_components=[5, 15, 30, 45, 64],
    logistic__C=np.logspace(-4, 4, 4),
)
search = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=-1)

In [None]:
# Display the grid-search object (it has not been fitted yet at this point).
search

In [None]:
# Run the cross-validated grid search over the combinations in `param_grid`.
search.fit(X_digits, y_digits)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# Fit a fresh, untruncated PCA for the spectrum plot.
# `pca` is not defined in any visible cell, and the pipeline's own PCA step was
# restricted earlier with `pipe.set_params(pca__n_components=10)`. A new `PCA()`
# keeps all components, so the curve can cover every value tried in the grid.
pca = PCA().fit(X_digits)

Let's see how the explained variance ratio varies with the number of PCA components.


In [None]:
# PCA spectrum: explained variance ratio of each component, with a vertical
# marker at the number of components retained by the grid search.
fig, ax0 = plt.subplots(figsize=(6, 6), nrows=1)

ax0.plot(
    np.arange(pca.n_components_) + 1,
    pca.explained_variance_ratio_,
    '+',
    linewidth=4,
)
ax0.axvline(
    search.best_estimator_.named_steps['pca'].n_components,
    label='n_components chosen',
    linestyle=':',
)
ax0.legend(loc='upper center', prop={'size': 12})

ax0.set_ylabel('PCA explained variance ratio')
ax0.set_xlabel('n_components')


### Exploring grid-search

To explore the grid search results, we may convert the `cv_results_` attribute into a DataFrame.

**Question**: how many results should appear?

In [None]:
# Raw cross-validation results stored on the fitted search object.
search.cv_results_

In [None]:
# Wrap the raw `cv_results_` in a DataFrame for easier inspection.
results = pd.DataFrame(search.cv_results_)
results

Let's index these results by the hyper-parameters

In [None]:
# Index the results by the two searched hyper-parameters.
# The frame is rebuilt from `search.cv_results_` instead of from `results`
# itself so that this cell can be re-run: calling `set_index` again on the
# already-indexed frame would raise a KeyError (the columns are gone).
results = (
    pd.DataFrame(search.cv_results_)
    .set_index(['param_pca__n_components', 'param_logistic__C'])
    .sort_index()
)
results

In [None]:
# Take the best C for each `n_components`
best_clfs = results.reset_index().groupby('param_pca__n_components').apply(
    lambda g: g.nlargest(1, 'mean_test_score'))

In [None]:
# Display the best row kept for each `n_components` value.
best_clfs

In [None]:
# Best mean test score per `n_components`, with the score's standard
# deviation drawn as error bars.
fig, ax1 = plt.subplots(figsize=(6, 6), nrows=1)

best_clfs.plot(
    ax=ax1,
    x='param_pca__n_components',
    y='mean_test_score',
    yerr='std_test_score',
    legend=False,
)
ax1.set_xlabel('n_components')
ax1.set_ylabel('Classification accuracy (val)')
ax1.set_xlim(-1, 70)

fig.tight_layout()
plt.show()

## Performing a grid-search on multiple parameters

We want to find out which dimensionality reduction to use in the `PCA`, along with which regularization strength to use in the `LogisticRegression`.

How do we do that? We use a `GridSearchCV` with multiple parameters.

In [None]:
# Load the digits dataset as a (data, target) pair.
X_digits, y_digits = datasets.load_digits(return_X_y=True)

# Hyper-parameter grid: parameters of pipeline steps are addressed with the
# `<step>__<parameter>` naming convention.
param_grid = dict(
    pca__n_components=[5, 15, 30, 45, 64],
    logistic__C=np.logspace(-4, 4, 4),
)
search = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=-1)

In [None]:
# Display the grid-search object (it has not been fitted yet at this point).
search

In [None]:
# Run the cross-validated grid search over the combinations in `param_grid`.
search.fit(X_digits, y_digits)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# Plot the PCA spectrum
# Fit a fresh, untruncated PCA for that plot.
# `pca` is not defined in any visible cell, and the pipeline's own PCA step was
# restricted earlier with `pipe.set_params(pca__n_components=10)`. A new `PCA()`
# keeps all components, so the curve can cover every value tried in the grid.
pca = PCA().fit(X_digits)

Let's see how the explained variance ratio varies with the number of PCA components.


In [None]:
# PCA spectrum: explained variance ratio of each component, with a vertical
# marker at the number of components retained by the grid search.
fig, ax0 = plt.subplots(figsize=(6, 6), nrows=1)

ax0.plot(
    np.arange(pca.n_components_) + 1,
    pca.explained_variance_ratio_,
    '+',
    linewidth=4,
)
ax0.axvline(
    search.best_estimator_.named_steps['pca'].n_components,
    label='n_components chosen',
    linestyle=':',
)
ax0.legend(loc='upper center', prop={'size': 12})

ax0.set_ylabel('PCA explained variance ratio')
ax0.set_xlabel('n_components')


### Exploring grid-search

To explore the grid search results, we may convert the `cv_results_` attribute into a DataFrame.

**Question**: how many results should appear?

In [None]:
# Raw cross-validation results stored on the fitted search object.
search.cv_results_

In [None]:
# Wrap the raw `cv_results_` in a DataFrame for easier inspection.
results = pd.DataFrame(search.cv_results_)
results

Let's index these results by the hyper-parameters

In [None]:
# Index the results by the two searched hyper-parameters.
# The frame is rebuilt from `search.cv_results_` instead of from `results`
# itself so that this cell can be re-run: calling `set_index` again on the
# already-indexed frame would raise a KeyError (the columns are gone).
results = (
    pd.DataFrame(search.cv_results_)
    .set_index(['param_pca__n_components', 'param_logistic__C'])
    .sort_index()
)
results

In [None]:
# Take the best C for each `n_components`
best_clfs = results.reset_index().groupby('param_pca__n_components').apply(
    lambda g: g.nlargest(1, 'mean_test_score'))

In [None]:
# Display the best row kept for each `n_components` value.
best_clfs

In [None]:
# Best mean test score per `n_components`, with the score's standard
# deviation drawn as error bars.
fig, ax1 = plt.subplots(figsize=(6, 6), nrows=1)

best_clfs.plot(
    ax=ax1,
    x='param_pca__n_components',
    y='mean_test_score',
    yerr='std_test_score',
    legend=False,
)
ax1.set_xlabel('n_components')
ax1.set_ylabel('Classification accuracy (val)')
ax1.set_xlim(-1, 70)

fig.tight_layout()
plt.show()