### Online Feature selection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from river import (
    stats,
    metrics,
    stream,
    feature_selection
)

import os
import sys
sys.path.append('../')
from vibrodiagnostics import (
    selection,
    models
)

#### Settings
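`OPTION` is a triple `(target, axis, metric)`: target is 0 for Fault or 1 for Anomaly; axis is 0 = az, 1 = A (ax, ay, az), 2 = bz, 3 = B (bx, by, bz); metric is 0 = Correlation, 1 = F score, 2 = MI (mutual information).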

- 0, 0, 0 = Fault, az, Correlation
- 0, 0, 1 = Fault, az, F score
- 0, 0, 2 = Fault, az, MI

- 0, 2, 0 = Fault, bz, Correlation
- 0, 2, 1 = Fault, bz, F score
- 0, 2, 2 = Fault, bz, MI

- 0, 1, 0 = Fault, A, Correlation
- 0, 1, 1 = Fault, A, F score
- 0, 1, 2 = Fault, A, MI

- 0, 3, 0 = Fault, B, Correlation
- 0, 3, 1 = Fault, B, F score
- 0, 3, 2 = Fault, B, MI

In [None]:
# Selected configuration as (target index, axis index, metric index).
OPTION = (0, 0, 0)

# Lookup tables; each one is indexed by an entry of OPTION below.
TARGETS = ['fault', 'anomaly']
AXIS_ALL = [
    ['az'],
    ['ax', 'ay', 'az'],
    ['bz'],
    ['bx', 'by', 'bz'],
]
# TITLES and METRICS are parallel lists: both are indexed by OPTION[2].
TITLES = ['Correlation', 'F score', 'Mutual information']
METRICS = [
    selection.Correlation,
    selection.FisherScore,
    selection.MutualInformation,
]

# Resolve the chosen indices into the concrete values used by later cells.
TARGET = TARGETS[OPTION[0]]
AXIS = AXIS_ALL[OPTION[1]]
TITLE = TITLES[OPTION[2]]
METRIC = METRICS[OPTION[2]]

In [None]:
# Relative root under which the dataset archive and feature files are looked up.
PATH_PREFIX = '../../datasets/'
# Directory that every feature-file path below is joined onto.
FEATURES_PATH = os.path.join(PATH_PREFIX, 'features_data')


def _features_file(name):
    """Return `name` joined onto FEATURES_PATH."""
    return os.path.join(FEATURES_PATH, name)


DATASET_PATH = os.path.join(PATH_PREFIX, 'MAFAULDA.zip')
MAFAULDA_METADATA = _features_file('mafaulda_metadata.csv')

# Feature files whose names come from the TIME/FREQ constants in `selection`.
TD_FEATURES = _features_file(selection.TIME_FEATURES_PATH)
FD_FEATURES = _features_file(selection.FREQ_FEATURES_PATH)
TD_FD_FEATURES = _features_file(selection.TIME_AND_FREQ_FEATURES_PATH)

# Feature files whose names come from the TSFEL_* constants in `selection`.
TSFEL_FEATURES = _features_file(selection.TSFEL_FEATURES_PATH)
TSFEL_TD_FEATURES = _features_file(selection.TSFEL_TIME_FEATURES_PATH)
TSFEL_FD_FEATURES = _features_file(selection.TSFEL_FREQ_FEATURES_PATH)
TSFEL_SD_FEATURES = _features_file(selection.TSFEL_STAT_FEATURES_PATH)

In [None]:
def plot_best_features(X, y, title, metric, n=10):
    """Plot how the selector's per-feature scores evolve over a stream.

    Observations from ``stream.iter_pandas(X, y)`` are fed one at a time into
    a ``feature_selection.SelectKBest`` built on ``metric()``. After every
    observation the absolute value of each leaderboard entry is recorded.
    Only the ``n`` features with the largest absolute score after the last
    observation are kept, plotted and returned.

    Args:
        X: Features handed to ``stream.iter_pandas``.
        y: Targets handed to ``stream.iter_pandas`` together with ``X``.
        title: Text used as the y-axis label of the plot.
        metric: Zero-argument callable creating the similarity object; it is
            also compared with ``selection.FisherScore`` to decide whether a
            fixed y-range is applied.
        n: Number of top-scoring features to keep.

    Returns:
        DataFrame with one row per streamed observation and one column per
        kept feature, holding absolute scores.
    """
    # NOTE(review): k is fixed to 2 while only `leaderboard` is read below, so
    # k presumably has no effect on the result - confirm against river docs.
    selector = feature_selection.SelectKBest(similarity=metric(), k=2)

    # Snapshot of absolute leaderboard scores after each observation.
    history = []
    for xs, ys in stream.iter_pandas(X, y):
        selector.learn_one(xs, ys)
        history.append({
            name: abs(score)
            for name, score in selector.leaderboard.items()
        })

    # Names of the n features with the highest absolute score at the end.
    final_scores = {
        name: abs(score) for name, score in selector.leaderboard.items()
    }
    ranked_names = sorted(final_scores, key=final_scores.get, reverse=True)
    top_names = set(ranked_names[:n])

    # Drop every other feature from each snapshot before building the frame.
    feature_set = pd.DataFrame.from_records([
        {name: score for name, score in snapshot.items() if name in top_names}
        for snapshot in history
    ])

    plot_options = {
        'figsize': (15, 6),
        'grid': True,
        'xlabel': 'Observation',
        'ylabel': title,
    }
    if metric == selection.FisherScore:
        # Fisher score plots use a fixed y-range of 0..20.
        plot_options['ylim'] = (0, 20)
    feature_set.plot(**plot_options)

    return feature_set

TSFEL generated features in all domains

In [None]:
# Build X, y from the TSFEL_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    TSFEL_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)

TSFEL generated features in time domain

In [None]:
# Build X, y from the TSFEL_TD_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    TSFEL_TD_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)

TSFEL generated features in frequency (spectral) domain

In [None]:
# Build X, y from the TSFEL_FD_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    TSFEL_FD_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)

TSFEL generated features in statistical domain

In [None]:
# Build X, y from the TSFEL_SD_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    TSFEL_SD_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)

Custom features in all domains

In [None]:
# Build X, y from the TD_FD_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    TD_FD_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)

Custom features in time domain

In [None]:
# Build X, y from the TD_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    TD_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)

Custom features in frequency domain

In [None]:
# Build X, y from the FD_FEATURES file for the configured axes and target.
X, y = models.features_subset(
    FD_FEATURES, selection.FAULT_CLASSES, AXIS, TARGET
)
feature_set = plot_best_features(X, y, title=TITLE, metric=METRIC)
plt.show()

# Turn the last recorded row into a one-column table, highest score first.
final_scores = feature_set.tail(1).reset_index(drop=True).transpose()
final_scores.rename(columns={0: TITLE}).sort_values(by=TITLE, ascending=False)