# Feature Selection

### Import libraries

In [None]:
import pickle
from urllib.request import urlopen

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneGroupOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from tqdm.notebook import tnrange

### Data Load

In [None]:
def load_data(local=True):
    """
    Load the training set and return its columns as stacked arrays.

    For colab, use local = False.
    It takes a few minutes to download.

    Parameters
    ----------
    local : bool, default=True
        If True, unpickle ``data/train.pkl`` (relative to the current
        working directory). Otherwise download the pickle from the
        course GitLab repository.

    Returns
    -------
    X : np.ndarray
        ``np.stack`` of the entries stored under ``'Data'``.
    Y : np.ndarray
        ``np.stack`` of the entries stored under ``'Motion'``.
    Subjects : np.ndarray
        ``np.stack`` of the entries stored under ``'Subject'``.
    """
    if local:
        DATA_PATH = 'data/train.pkl'
        with open(DATA_PATH, 'rb') as f:
            df = pickle.load(f)
    else:
        DATA_URL = 'https://gitlab.com/machine-learning-course1/ml-project-samsung-2023-summer/-/raw/master/data/train.pkl?inline=false'
        # NOTE: unpickling can execute arbitrary code embedded in the payload,
        # so this branch is only safe while the remote file is trusted.
        # The `with` block closes the HTTP response once the pickle is read.
        with urlopen(DATA_URL) as response:
            df = pickle.load(response)

    X        = np.stack(df['Data'])
    Y        = np.stack(df['Motion'])
    Subjects = np.stack(df['Subject'])

    return X, Y, Subjects

In [None]:
# Load the training set from the local pickle (use local=False on colab).
X, Y, Subjects = load_data(local=True)

# Sanity check: array shapes, then the distinct motion labels and subject ids.
print(f'X: {X.shape}  Y: {Y.shape}  Subjects: {Subjects.shape}')
print(f'Motions: {sorted(set(Y))}')
print(f'Subjects: {sorted(set(Subjects))}')

X: (13940, 500, 3)  Y: (13940,)  Subjects: (13940,)
Motions: ['Nordic walking', 'ascending stairs', 'cycling', 'descending stairs', 'ironing', 'lying', 'rope jumping', 'running', 'sitting', 'standing', 'vacuum cleaning', 'walking']
Subjects: [3, 4, 5, 6, 7, 8, 9]


### Feature Extraction

In [None]:
def extract_features(X_sample: np.ndarray) -> np.ndarray:
    """
    Extract features from a single sample

    Parameters
    ----------
    X_sample : array of shape (500, 3)
        100Hz * 5 seconds => 500
        3 axis (x, y, z)  => 3

    Returns
    -------
    features : array with (p,) shape
        Extracted features from X_sample (p = 7), in this order:
        per-axis mean (3), per-axis standard deviation (3) and the
        dominant frequency bin (1). The dominant bin is the FFT bin
        number of the strongest non-DC component over all three axes;
        bin ``k`` corresponds to ``k * 100 / 500 = 0.2 * k`` Hz.
    """
    assert X_sample.shape == (500, 3)

    # Extract time domain features
    X_time = X_sample
    mean = np.mean(X_time, axis=0)
    std = np.std(X_time, axis=0)

    # Extract frequency domain features
    # The input is real, so rfft keeps only the non-redundant half of the
    # spectrum (the other half mirrors it). [1:] drops the DC bin, which is
    # just the sum of the samples and is already captured by `mean`.
    X_freq = np.abs(np.fft.rfft(X_sample, axis=0))[1:]
    # argmax without `axis` indexes the flattened (bins, axes) array, so the
    # flat position is converted back to a row to obtain a frequency bin
    # rather than a bin/axis mixture.
    peak_row, _ = np.unravel_index(np.argmax(X_freq), X_freq.shape)
    dominant_freq = peak_row + 1  # +1 undoes the shift from dropping DC

    # Concatenate features
    features = np.hstack([
        mean,
        std,
        dominant_freq,
    ])

    assert features.ndim == 1
    return features

In [None]:
# Run the extractor over every sample in X; each call contributes one row,
# so the result is a 2-D matrix with one column per feature.
feature_rows = [extract_features(window) for window in X]
X_features = np.array(feature_rows)
num_features = X_features.shape[1]
print(X_features.shape)

(13940, 7)


### Feature Selector

In [None]:
def select_features(features, indices=None):
    """
    Select a subset of feature columns.

    Parameters
    ----------
    features : array of shape (n_samples, n_features)
        Feature matrix to select columns from.
    indices : list of int, optional
        Column indices to keep, in the order they should appear in the
        output. If None, every column is kept in its original order.

    Returns
    -------
    selected : array of shape (n_samples, len(indices))
        The requested columns of ``features``.
    """
    if indices is None:
        indices = list(range(features.shape[1]))
    return features[:, indices]


# Wrap select_features as a transformer so it can be used as a pipeline step.
feature_selector = FunctionTransformer(select_features)

In [None]:
# List the transformer's parameters; `kw_args` is the one set later to pass
# `indices` through to select_features.
feature_selector.get_params()

{'accept_sparse': False,
 'check_inverse': True,
 'feature_names_out': None,
 'func': <function __main__.select_features(features, indices=None)>,
 'inv_kw_args': None,
 'inverse_func': None,
 'kw_args': None,
 'validate': False}

In [None]:
# No kw_args set yet, so select_features runs with indices=None and the
# column count is unchanged.
X_selected = feature_selector.transform(X_features)
print(X_selected.shape)

(13940, 7)


In [None]:
# Pass `indices` to select_features via kw_args: keep columns 1, 3 and 5 only.
feature_selector.set_params(
    kw_args={'indices': [1, 3, 5]}
)
X_selected = feature_selector.transform(X_features)
print(X_selected.shape)

(13940, 3)


### Example `KNNClassifier` with Feature Selection

In [None]:
# Example KNNClassifier
# Steps run in order: column selection -> standardization -> PCA -> k-NN.
# The step names are the prefixes used in hyperparameter keys such as
# `feature_selector__kw_args` and `classifier__n_neighbors`.
KNNClassifier = Pipeline([
    ('feature_selector', FunctionTransformer(select_features)),
    ('scaler', StandardScaler()),
    ('preprocessor', PCA()),
    ('classifier', KNeighborsClassifier(n_neighbors=10)),
])

### List hyperparameters of the `KNNClassifier`

In [None]:
# Show every tunable parameter of the pipeline; nested step parameters are
# listed as `<step name>__<parameter>`.
KNNClassifier.get_params()

{'memory': None,
 'steps': [('feature_selector',
   FunctionTransformer(func=<function select_features at 0x7f9fe1223700>)),
  ('scaler', StandardScaler()),
  ('preprocessor', PCA()),
  ('classifier', KNeighborsClassifier(n_neighbors=10))],
 'verbose': False,
 'feature_selector': FunctionTransformer(func=<function select_features at 0x7f9fe1223700>),
 'scaler': StandardScaler(),
 'preprocessor': PCA(),
 'classifier': KNeighborsClassifier(n_neighbors=10),
 'feature_selector__accept_sparse': False,
 'feature_selector__check_inverse': True,
 'feature_selector__feature_names_out': None,
 'feature_selector__func': <function __main__.select_features(features, indices=None)>,
 'feature_selector__inv_kw_args': None,
 'feature_selector__inverse_func': None,
 'feature_selector__kw_args': None,
 'feature_selector__validate': False,
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'preprocessor__copy': True,
 'preprocessor__iterated_power': 'auto',
 'preprocessor__n_c

### Feature Selection with Cross-validation

In [None]:
# Greedy forward selection: in round k, try adding each unused feature to the
# current subset and keep the one with the best mean cross-validated score.
features = []                                    # chosen indices, in pick order
remaining_features = list(range(num_features))   # indices not chosen yet
score_dict = {}      # k -> best mean CV score with k features
features_dict = {}   # k -> feature indices selected after round k

for k in tnrange(1, num_features + 1):
    # -1 acts as a sentinel below any valid F1 score.
    best_score = -1
    for feature in remaining_features:
        candidate_indices = [*features, feature]
        KNNClassifier.set_params(
            feature_selector__kw_args={'indices': candidate_indices},
        )
        # One fold per subject: train on the others, test on the held-out one.
        fold_scores = cross_val_score(
            KNNClassifier,
            X_features, Y,
            groups=Subjects,
            cv=LeaveOneGroupOut(),
            scoring='f1_macro',
            n_jobs=-1,
        )
        score = fold_scores.mean()
        # Strict comparison: on ties the earliest candidate is kept.
        if score > best_score:
            best_score, best_feature = score, feature

    # Move the round's winner from the candidate pool to the chosen subset.
    remaining_features.remove(best_feature)
    features.append(best_feature)
    assert len(features) == k

    # Record a snapshot for reporting; copy so later rounds do not mutate it.
    score_dict[k] = best_score
    features_dict[k] = list(features)

  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Report, for each subset size k, the best score and the features chosen.
for k in range(1, num_features + 1):
    print(f'k={k}  F1 score: {score_dict[k]:.4f}  Selected features: {features_dict[k]}')

k=1  F1 score: 0.2697  Selected features: [6]
k=2  F1 score: 0.3505  Selected features: [6, 4]
k=3  F1 score: 0.4225  Selected features: [6, 4, 0]
k=4  F1 score: 0.4647  Selected features: [6, 4, 0, 5]
k=5  F1 score: 0.4969  Selected features: [6, 4, 0, 5, 2]
k=6  F1 score: 0.4943  Selected features: [6, 4, 0, 5, 2, 3]
k=7  F1 score: 0.4500  Selected features: [6, 4, 0, 5, 2, 3, 1]


### Tune `indices` and `n_neighbors`

In [None]:
# Jointly tune the feature subset and the number of neighbours.
# The two candidate index lists are the k=5 and k=6 subsets printed by the
# forward-selection step.
param_grid = { # Hyperparameter search space
    'feature_selector__kw_args': [
        {'indices': [6, 4, 0, 5, 2]},
        {'indices': [6, 4, 0, 5, 2, 3]},
    ],
    'classifier__n_neighbors': [5, 10, 15],
}

clf = GridSearchCV(
    KNNClassifier,
    param_grid=param_grid,  # Hyperparameter search space
    cv=LeaveOneGroupOut(),  # Subject-wise cross-validation
    scoring='f1_macro',     # Scoring with macro F1 score
    n_jobs=-1,              # Enable multiprocessing
    verbose=10,             # Print processing logs
    refit=False,            # Do not refit the clf with best params
)
# `groups` is forwarded to LeaveOneGroupOut so each fold holds out one subject.
clf.fit(X_features, Y, groups=Subjects)
print(f'Best Parameters: {clf.best_params_}')

Fitting 7 folds for each of 6 candidates, totalling 42 fits
[CV 1/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 2/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 3/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 4/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 5/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 6/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 7/7; 1/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2]}
[CV 1/7; 2/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2, 3]}
[CV 2/7; 2/6] START classifier__n_neighbors=5, feature_selector__kw_args={'indices': [6, 4, 0, 5, 2, 3]}
[CV 3/7; 2/6] ST