# Hyperparameter Tuning with Cross-validation

### Reference
- [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
- [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
- [LeaveOneGroupOut](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneGroupOut.html)
- [Custom Estimator](https://scikit-learn.org/stable/developers/develop.html)

### Import libraries

In [1]:
import pickle
from urllib.request import urlopen

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Data Load

In [2]:
def load_data(local=True):
    """
    For colab, use local = False.
    It takes a few minutes to download.
    """
    if local:
        DATA_PATH = 'data/train.pkl'
        with open(DATA_PATH, 'rb') as f:
            df = pickle.load(f)
    else:
        DATA_URL = 'https://gitlab.com/machine-learning-course1/ml-project-samsung-2023-summer/-/raw/master/data/train.pkl?inline=false'
        df = pickle.load(urlopen(DATA_URL))

    X        = np.stack(df['Data'])
    Y        = np.stack(df['Motion'])
    Subjects = np.stack(df['Subject'])

    return X, Y, Subjects

In [3]:
X, Y, Subjects = load_data(local=True)

print(f'X: {X.shape}  Y: {Y.shape}  Subjects: {Subjects.shape}')
print(f'Motions: {sorted(set(Y))}')
print(f'Subjects: {sorted(set(Subjects))}')

X: (13940, 500, 3)  Y: (13940,)  Subjects: (13940,)
Motions: ['Nordic walking', 'ascending stairs', 'cycling', 'descending stairs', 'ironing', 'lying', 'rope jumping', 'running', 'sitting', 'standing', 'vacuum cleaning', 'walking']
Subjects: [3, 4, 5, 6, 7, 8, 9]


### Feature Extraction

In [4]:
def extract_features(X_sample: np.ndarray) -> np.ndarray:
    """
    Extract features from a single sample

    Parameters
    ----------
    X_sample : array of shape (500, 3)
        100Hz * 5 seconds => 500
        3 axis (x, y, z)  => 3

    Returns
    -------
    features : array with (p,) shape
        Extracted features from X_sample
    """
    assert X_sample.shape == (500, 3)

    # Extract time domain features
    X_time = X_sample
    mean = np.mean(X_time, axis=0)
    std = np.std(X_time, axis=0)

    # Extract frequency domain features
    X_freq = np.abs(np.fft.fft(X_sample, axis=0))[1:]
    dominant_freq = np.argmax(X_freq)

    # Concatenate features
    features = np.hstack([
        mean,
        std,
        dominant_freq,
    ])

    assert features.ndim == 1
    return features

In [5]:
# Extract features for multiple samples
X_features = np.array([
    extract_features(X_sample)
    for X_sample in X
])
print(X_features.shape)

(13940, 7)


### Example `KNNClassifier`

In [6]:
# Example KNNClassifier
KNNClassifier = Pipeline([
    ('scaler', StandardScaler()),
    ('preprocessor', PCA()),
    ('classifier', KNeighborsClassifier()),
])

### List hyperparameters of the `KNNClassifier`

In [7]:
KNNClassifier.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('preprocessor', PCA()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'scaler': StandardScaler(),
 'preprocessor': PCA(),
 'classifier': KNeighborsClassifier(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'preprocessor__copy': True,
 'preprocessor__iterated_power': 'auto',
 'preprocessor__n_components': None,
 'preprocessor__n_oversamples': 10,
 'preprocessor__power_iteration_normalizer': 'auto',
 'preprocessor__random_state': None,
 'preprocessor__svd_solver': 'auto',
 'preprocessor__tol': 0.0,
 'preprocessor__whiten': False,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

### Tune `n_neighbors` of `KNNClassifier`

In [8]:
param_grid = { # Hyperparameter search space
    'classifier__n_neighbors': [5, 10, 15],
}

clf = GridSearchCV(
    KNNClassifier,
    param_grid=param_grid,  # Hyperparameter search space
    cv=LeaveOneGroupOut(),  # Subject-wise cross-validation
    scoring='f1_macro',     # Scoring with macro F1 score
    n_jobs=-1,              # Enable multiprocessing
    verbose=10,             # Print processing logs
    refit=False,           # Do not refit the cls with best params
)
clf.fit(X_features, Y, groups=Subjects)
print(f'Best Parameters: {clf.best_params_}')

Fitting 7 folds for each of 3 candidates, totalling 21 fits


[CV 7/7; 1/3] START classifier__n_neighbors=5...................................
[CV 2/7; 1/3] START classifier__n_neighbors=5...................................
[CV 2/7; 2/3] START classifier__n_neighbors=10..................................
[CV 3/7; 2/3] START classifier__n_neighbors=10..................................
[CV 5/7; 2/3] START classifier__n_neighbors=10..................................
[CV 7/7; 1/3] END ....classifier__n_neighbors=5;, score=0.141 total time=   0.0s
[CV 7/7; 2/3] START classifier__n_neighbors=10..................................
[CV 4/7; 2/3] START classifier__n_neighbors=10..................................
[CV 6/7; 2/3] START classifier__n_neighbors=10..................................
[CV 2/7; 3/3] START classifier__n_neighbors=15..................................
[CV 7/7; 2/3] END ...classifier__n_neighbors=10;, score=0.204 total time=   0.0s
[CV 4/7; 1/3] START classifier__n_neighbors=5...................................
[CV 6/7; 1/3] START classifi