## TODO: Online features

In [None]:
# Target selector passed to models.features_subset; options: 'fault', 'anomaly'.
VAR_TARGET = 'fault'
# Value forwarded as `anomaly_severity` to models.features_subset;
# options: 0.5, 0.7, 0.9.
ANOMALY_SEVERITY = 0.5

In [None]:
import os
import sys
sys.path.append('../')
from vibrodiagnostics import selection, models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean
from river import (
    stats,
    metrics,
    stream,
    feature_selection
)


# Dataset root (relative to the working directory) and the folder that holds
# the extracted feature files.
PATH_PREFIX = '../../datasets/'
FEATURES_PATH = os.path.join(PATH_PREFIX, 'features_data')

# Custom feature files: time domain, frequency domain, and both combined.
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)
TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)

# TSFEL feature files: all features, then the temporal, spectral and
# statistical subsets.
TSFEL_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_FEATURES_PATH)
TSFEL_TD_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_TIME_FEATURES_PATH)
TSFEL_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_FREQ_FEATURES_PATH)
TSFEL_SD_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_STAT_FEATURES_PATH)

# Every feature file analysed in this notebook.
FEATURE_SETS = [
    TD_FEATURES,
    FD_FEATURES,
    TD_FD_FEATURES,
    TSFEL_FEATURES,
    TSFEL_TD_FEATURES,
    TSFEL_FD_FEATURES,
    TSFEL_SD_FEATURES,
]

In [None]:
def calculate_scores(x: pd.DataFrame, y: pd.DataFrame, metric) -> pd.DataFrame:
    """Score every feature in ``x`` against ``y`` with an online selector.

    The samples are streamed one at a time into a ``SelectKBest`` selector
    built around a fresh ``metric()`` instance; the selector's leaderboard
    is then read out as a table.

    Args:
        x: Feature table, one row per sample.
        y: Target values aligned with the rows of ``x``.
        metric: Zero-argument callable that creates the similarity measure.

    Returns:
        DataFrame indexed by ``feature`` with a single ``score`` column,
        sorted from the highest score to the lowest.
    """
    selector = feature_selection.SelectKBest(similarity=metric(), k=2)
    for sample, target in stream.iter_pandas(x, y):
        selector.learn_one(sample, target)

    # Final score of each feature after the whole stream has been seen.
    board = pd.DataFrame(
        list(selector.leaderboard.items()),
        columns=['feature', 'score'],
    )
    board = board.set_index('feature')
    return board.sort_values(by='score', ascending=False)


def feature_ranking(feature_set: str, window_size=None) -> pd.DataFrame:
    """Rank the features of one feature set under three relevance metrics.

    For each metric (correlation, Fisher score, mutual information) every
    feature is scored against the target separately on the ``ax``, ``ay``
    and ``az`` axes. The per-axis scores are converted to ranks (highest
    score gets rank 1), combined across the axes with a geometric mean
    (rank product) and finally dense-ranked.

    Args:
        feature_set: Feature file path handed to ``models.features_subset``.
        window_size: Integer window size. When given, only columns whose
            name ends with ``_<window_size>`` are kept and that suffix is
            stripped from the names. Any non-integer value (including
            ``None``) keeps all columns.

    Returns:
        DataFrame indexed by feature name with the columns ``corr_rank``,
        ``f_rank`` and ``mi_rank``; lower values mean higher-ranked features.
    """
    axes = ('ax', 'ay', 'az')
    metric_classes = {
        'corr_rank': selection.Correlation,
        'f_rank': selection.FisherScore,
        'mi_rank': selection.MutualInformation,
    }

    # The data depend only on the axis, not on the metric, so load and filter
    # them once per axis instead of once per (metric, axis) pair.
    # NOTE(review): assumes models.features_subset returns the same data for
    # the same arguments - confirm it has no randomness.
    data_by_axis = {}
    for axis in axes:
        x, y = models.features_subset(
            feature_set, selection.FAULT_CLASSES, [axis],
            VAR_TARGET, anomaly_severity=ANOMALY_SEVERITY
        )
        # Accept numpy integers too; a plain `int` check silently skips them.
        if isinstance(window_size, (int, np.integer)):
            x = x.loc[:, x.columns.str.endswith(f'_{window_size}')]
            x.columns = x.columns.str.extract(r'([\w\_]+)_(\w+)$')[0]
        data_by_axis[axis] = (x, y)

    metric_ranks = pd.DataFrame()
    for metric_name, metric in metric_classes.items():
        feature_ranks = pd.DataFrame()

        for axis, (x, y) in data_by_axis.items():
            scores = calculate_scores(x, y, metric).reset_index()
            # Drop the two-letter axis prefix so the same feature lines up
            # across the three axes.
            scores['feature'] = scores['feature'].str.extract(r'([a-z]{2})_([\w\_\-]+)')[1]
            feature_ranks[axis] = scores.set_index('feature')['score']

        axis_ranks = feature_ranks.rank(axis='rows', method='first', ascending=False)
        rank_product = axis_ranks.apply(gmean, axis=1).sort_values()      # Rank product
        metric_ranks[metric_name] = rank_product.rank(method='dense')

    return metric_ranks


def corr_among_features(feature_set, axis, window_size=None, min_corr=None):
    """List the absolute pairwise correlations between features on one axis.

    Args:
        feature_set: Feature file path handed to ``models.features_subset``.
        axis: Axis name (e.g. ``'az'``) whose features are compared.
        window_size: Integer window size. When given, only columns whose
            name ends with ``_<window_size>`` are kept and that suffix is
            stripped from the names. Any non-integer value (including
            ``None``) keeps all columns.
        min_corr: Optional threshold; when given, only pairs whose absolute
            correlation is strictly greater than it are returned.

    Returns:
        DataFrame with the columns ``feature_1``, ``feature_2`` and ``corr``,
        sorted by ``corr`` in descending order. Self-pairs are excluded;
        every pair appears twice, once in each order.
    """
    x, _ = models.features_subset(
        feature_set, selection.FAULT_CLASSES, [axis],
        VAR_TARGET, anomaly_severity=ANOMALY_SEVERITY
    )
    # Accept numpy integers too; a plain `int` check silently skips them.
    if isinstance(window_size, (int, np.integer)):
        x = x.loc[:, x.columns.str.endswith(f'_{window_size}')]
        x.columns = x.columns.str.extract(r'([\w\_]+)_(\w+)$')[0]

    pairs = (
        x.corr().abs().stack()
         .rename_axis(['feature_1', 'feature_2'])
         .reset_index(name='corr')
    )
    # Drop the diagonal and renumber so row labels count only real pairs.
    pairs = pairs[pairs['feature_1'] != pairs['feature_2']].reset_index(drop=True)
    correlations = pairs.sort_values(by='corr', ascending=False)

    # The threshold is opt-in so existing callers keep getting the full table.
    if min_corr is not None:
        correlations = correlations[correlations['corr'] > min_corr]
    return correlations


def plot_ranks(metric_ranks):
    """Plot the three per-metric rank curves of every feature in one figure.

    Args:
        metric_ranks: DataFrame indexed by feature name with the columns
            ``corr_rank``, ``f_rank`` and ``mi_rank``.
    """
    curves = (
        ('corr_rank', 'green', 'Correlation Rank'),
        ('f_rank', 'blue', 'F score Rank'),
        ('mi_rank', 'purple', 'Mutual Information Rank'),
    )

    _, ax = plt.subplots(figsize=(15, 5))
    features = metric_ranks.index
    for column, color, label in curves:
        ax.plot(features, metric_ranks[column], color=color, label=label)

    # Fix the tick positions before restyling the labels.
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.legend()
    ax.grid()
    plt.show()


def ensamble_feature_ranking(metric_ranks):
    """Merge the per-metric ranks of each feature into one ensemble rank.

    Args:
        metric_ranks: DataFrame with one row per feature and one column per
            ranking metric.

    Returns:
        Single-column DataFrame (``rank``) holding the geometric mean of each
        row, sorted in ascending order.
    """
    # Rank product: geometric mean of a feature's ranks across the metrics.
    rank_product = metric_ranks.apply(gmean, axis=1)
    ordered = rank_product.sort_values()
    return ordered.to_frame(name='rank')

### Feature set #1: Custom features all

In [None]:
# Strongest absolute correlations between the combined time+frequency
# features on the 'az' axis (each pair is listed in both orders).
c = corr_among_features(TD_FD_FEATURES, 'az')
c.head(20)

In [None]:
# Per-metric ranks (correlation, Fisher score, mutual information) of the
# combined time+frequency features, aggregated over the ax/ay/az axes.
metric_ranks = feature_ranking(TD_FD_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks

### Feature set #2: Custom features time domain

In [None]:
# Strongest absolute correlations between the time-domain features on the
# 'az' axis (each pair is listed in both orders).
c = corr_among_features(TD_FEATURES, 'az')
c.head(20)

In [None]:
# Per-metric ranks of the time-domain features, aggregated over ax/ay/az.
metric_ranks = feature_ranking(TD_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks

### Feature set #3: Custom features frequency domain

In [None]:
# Strongest absolute correlations between the frequency-domain features on
# the 'az' axis (each pair is listed in both orders).
c = corr_among_features(FD_FEATURES, 'az')
c.head(20)

In [None]:
# Per-metric ranks of the frequency-domain features (all window sizes),
# aggregated over ax/ay/az.
metric_ranks = feature_ranking(FD_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks

##### Choose window size

In [None]:
# Candidate window sizes: 64, 256, 1024, 4096 and 16384.
window_sizes = (2**6, 2**8, 2**10, 2**12, 2**14)

# Rank the frequency-domain features using only the columns whose name ends
# with `_<win_len>`.
win_len = window_sizes[0]
print('Window size:', win_len)
metric_ranks = feature_ranking(FD_FEATURES, win_len)
metric_ranks

In [None]:
# Ensemble rank (geometric mean of the metric ranks) for the current window.
print('Window size:', win_len)
ranks = ensamble_feature_ranking(metric_ranks)
ranks

In [None]:
# Strongest 'az' feature correlations for the current window size
# (each pair is listed in both orders).
corr_among_features(FD_FEATURES, 'az', win_len).head(10)

In [None]:
# Same ranking restricted to the second window size (256).
win_len = window_sizes[1]
print('Window size:', win_len)
metric_ranks = feature_ranking(FD_FEATURES, win_len)
metric_ranks

In [None]:
# Ensemble rank (geometric mean of the metric ranks) for the current window.
print('Window size:', win_len)
ranks = ensamble_feature_ranking(metric_ranks)
ranks

In [None]:
# Strongest 'az' feature correlations for the current window size
# (each pair is listed in both orders).
corr_among_features(FD_FEATURES, 'az', win_len).head(10)

In [None]:
# Same ranking restricted to the third window size (1024).
win_len = window_sizes[2]
print('Window size:', win_len)
metric_ranks = feature_ranking(FD_FEATURES, win_len)
metric_ranks

In [None]:
# Ensemble rank (geometric mean of the metric ranks) for the current window.
print('Window size:', win_len)
ranks = ensamble_feature_ranking(metric_ranks)
ranks

In [None]:
# Same ranking restricted to the fourth window size (4096).
win_len = window_sizes[3]
print('Window size:', win_len)
metric_ranks = feature_ranking(FD_FEATURES, win_len)
metric_ranks

In [None]:
# Ensemble rank (geometric mean of the metric ranks) for the current window.
print('Window size:', win_len)
ranks = ensamble_feature_ranking(metric_ranks)
ranks

In [None]:
# Same ranking restricted to the fifth window size (16384).
win_len = window_sizes[4]
print('Window size:', win_len)
metric_ranks = feature_ranking(FD_FEATURES, win_len)
metric_ranks

In [None]:
# Ensemble rank (geometric mean of the metric ranks) for the current window.
print('Window size:', win_len)
ranks = ensamble_feature_ranking(metric_ranks)
ranks

### Feature set #4: TSFEL features all

In [None]:
# Strongest absolute correlations between all TSFEL features on the 'az'
# axis (each pair is listed in both orders).
c = corr_among_features(TSFEL_FEATURES, 'az')
c.head(30)

In [None]:
# Per-metric ranks of all TSFEL features, aggregated over ax/ay/az.
metric_ranks = feature_ranking(TSFEL_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks

### Feature set #5: TSFEL features temporal domain

In [None]:
# Strongest absolute correlations between the TSFEL temporal features on the
# 'az' axis (each pair is listed in both orders).
c = corr_among_features(TSFEL_TD_FEATURES, 'az')
c.head(20)

In [None]:
# Per-metric ranks of the TSFEL temporal features, aggregated over ax/ay/az.
metric_ranks = feature_ranking(TSFEL_TD_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks

### Feature set #6: TSFEL features spectral domain

In [None]:
# Strongest absolute correlations between the TSFEL spectral features on the
# 'az' axis (each pair is listed in both orders).
c = corr_among_features(TSFEL_FD_FEATURES, 'az')
c.head(20)

In [None]:
# Per-metric ranks of the TSFEL spectral features, aggregated over ax/ay/az.
metric_ranks = feature_ranking(TSFEL_FD_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks

### Feature set #7: TSFEL features statistical domain

In [None]:
# Strongest absolute correlations between the TSFEL statistical features on
# the 'az' axis (each pair is listed in both orders).
c = corr_among_features(TSFEL_SD_FEATURES, 'az')
c.head(20)

In [None]:
# Per-metric ranks of the TSFEL statistical features, aggregated over
# ax/ay/az.
metric_ranks = feature_ranking(TSFEL_SD_FEATURES)
metric_ranks

In [None]:
# Plot the three rank curves held in the current `metric_ranks`.
plot_ranks(metric_ranks)

In [None]:
# Ensemble rank: geometric mean of the three metric ranks, smallest first.
ranks = ensamble_feature_ranking(metric_ranks)
ranks