## Batch features

Exports (RPM limit = False / True):
```
    - SHAFT, A, fault
    - SHAFT, B, fault
    - BEARINGS, A, fault
    - BEARINGS, B, fault

    - SHAFT, A, anomaly, 0.6
    - SHAFT, B, anomaly, 0.6
    - BEARINGS, A, anomaly, 0.6
    - BEARINGS, B, anomaly, 0.6

    - SHAFT, A, anomaly, 0.9
    - SHAFT, B, anomaly, 0.9
    - BEARINGS, A, anomaly, 0.9
    - BEARINGS, B, anomaly, 0.9
```

In [1]:
PLACEMENTS = [['ax', 'ay', 'az'], ['bx', 'by', 'bz']]
TARGETS = ['fault', 'anomaly']
ANOMALY_SEVERITIES = [0.6, 0.9]
SHAFT_FAULTS = {'normal': 'N', 'imbalance': 'I', 'horizontal-misalignment': 'HM', 'vertical-misalignment': 'VM'}
BEARING_FAULTS = {'overhang-cage_fault': 'O-Cage', 'underhang-cage_fault': 'U-Cage',
                  'underhang-ball_fault': 'U-Ball', 'overhang-ball_fault': 'O-Ball',
                  'underhang-outer_race': 'U-Race', 'overhang-ball_fault': 'O-Race'}
FAULT_TYPES = [SHAFT_FAULTS, BEARING_FAULTS]


TRAIN_SIZE = 0.8
ONLINE = False


PLACE = PLACEMENTS[0]
VAR_TARGET = TARGETS[1]
ANOMALY_SEVERITY = ANOMALY_SEVERITIES[0]       # If it is anomaly

FAULT_CLASSES = FAULT_TYPES[0]
RPM_LIMIT = True                         # False, True
BALANCE = True                           # False, True

In [2]:
import os
import itertools
from collections import Counter
import sys
sys.path.append('../')
from vibrodiagnostics import selection, models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean, spearmanr, kendalltau

from typing import Callable
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA


PATH_PREFIX = '../../datasets/'
FEATURES_PATH =  os.path.join(PATH_PREFIX, 'features_data')

TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

METRICS_ONLINE = (selection.Correlation, selection.FisherScore, selection.MutualInformation)
METRICS_OFFLINE = (selection.corr_classif, f_classif, mutual_info_classif)

Offline feature selection

In [7]:
def calculate_scores(x: pd.DataFrame, y: pd.DataFrame, metric: Callable) -> pd.DataFrame:
    scores = metric(x, y)
    if isinstance(scores, tuple):
        scores = scores[0]
    leaderboard = (
        pd.DataFrame(zip(x.columns, scores), columns=['feature', 'score'])
        .set_index('feature')
        .sort_values(by='score', ascending=False)
    )
    return leaderboard


def features_subset_offline(filename, classes, axis, label, train_size, anomaly_severity, balance, rpm_limit):
    features = pd.read_csv(filename).fillna(0)
    features = features[features['fault'].isin(classes)]
    if rpm_limit:
        RPM = 2900
        RPM_RANGE = 500
        features = features[features['rpm'].between(RPM - RPM_RANGE, RPM + RPM_RANGE, inclusive='both')]
    
    features = models.fault_labeling(features, classes, anomaly_severity)

    columns = features.columns.str.startswith(tuple(axis))
    X = features[features.columns[columns]]
    y = features[label].astype('category')
    print(y)

    # Balance dataset & Normalize dataset (later)
    if balance:
        oversample = RandomOverSampler(sampling_strategy='not majority', random_state=10)
        X, y = oversample.fit_resample(X, y)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=10
    )
    return X_train, y_train, X_test, y_test


def axis_to_magnitudes(feature_set: str, axis: list, window_size: int) -> tuple:
    x, y, _, _ = features_subset_offline(
        feature_set,
        FAULT_CLASSES,
        axis,
        VAR_TARGET,
        TRAIN_SIZE,
        ANOMALY_SEVERITY,
        BALANCE,
        RPM_LIMIT
    )
    if isinstance(window_size, int):
        x = x.loc[:,x.columns.str.endswith(f'_{window_size}')]
        x.columns = x.columns.str.extract(r'([\w\_]+)_(\w+)$')[0]

    feature_names = x.columns.str.extract(r'([a-z]{2})_([\w\_\-]+)')[1].unique()

    result = pd.DataFrame()
    for name in feature_names:
        vector_dims = [f'{dim}_{name}' for dim in axis]
        result[name] = x[vector_dims].apply(np.linalg.norm, axis=1)

    return result, y


def feature_ranking(feature_set: str, window_size=None):
    axis = PLACE
    metric_ranks = pd.DataFrame()

    for metric_name, metric in zip(('corr', 'f_stat', 'mi'), METRICS_OFFLINE):
        x, y = axis_to_magnitudes(feature_set, axis, window_size)
        scores = calculate_scores(x, y, metric)
        metric_ranks[metric_name] = scores

    return metric_ranks


def corr_among_features(feature_set, axis, window_size=None):
    x, y = axis_to_magnitudes(feature_set, axis, window_size)

    correlations = [
        {'feature_1': k[0], 'feature_2': k[1], 'corr': v}
        for k, v in x.corr().abs().stack().to_dict().items()
        if k[0] != k[1]
    ]
    correlations = pd.DataFrame.from_records(correlations).sort_values(by='corr', ascending=False)
    correlations[correlations['corr'] > 0.7]
    return correlations


def plot_scores(metric_ranks, n=None):
    fig, ax = plt.subplots(1, 3, figsize=(20, 5))
    for i, col in enumerate(('corr', 'f_stat', 'mi')):
        scores = metric_ranks[col].sort_values(ascending=False)
        if n is not None:
            scores = scores.iloc[:n]
        ax[i].bar(scores.index, scores)

    for i, col_name in enumerate(('Correlation', 'F statistic', 'Mutual information')):
        ax[i].set_xticks(ax[i].get_xticks())
        ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=45, ha='right')
        ax[i].grid()
        ax[i].set_xlabel('Feature')
        ax[i].set_ylabel(col_name)


def ensemble_feature_ranking(scores: pd.DataFrame):
    ranks = scores.rank(axis='rows', method='first', ascending=False)
    return ranks.apply(gmean, axis=1).sort_values().to_frame(name='rank') # Rank product


def rank_correlation(scores: pd.DataFrame):
    # spearman's rho vs kendall's tau
    ranks = scores.rank(axis='rows', method='first', ascending=False)
    correlations = []
    for a, b in itertools.combinations(ranks.columns, r=2):
        coef, pval = spearmanr(ranks[a], ranks[b])
        # coef, pval = kendalltau(ranks[a], ranks[b])
        correlations.append({'A': a, 'B': b, 'Spearman': coef, 'P-value': pval})

    return pd.DataFrame.from_records(correlations)

Online feature selection

In [4]:
def calculate_scores_online(x: pd.DataFrame, y: pd.DataFrame, metric) -> pd.DataFrame:
    selector = feature_selection.SelectKBest(similarity=metric(), k=2)
    for xs, ys in stream.iter_pandas(x, y):
        selector.learn_one(xs, ys)
        
    leaderboard = (
        pd.DataFrame(selector.leaderboard.items(), columns=['feature', 'score'])
          .set_index('feature')
          .sort_values(by='score', ascending=False)
    )
    return leaderboard


def features_subset_online(filename, classes, axis, label, train_size, anomaly_severity, balance, rpm_limit):
    features = pd.read_csv(filename)
    features = features[features['fault'].isin(classes)]
    if rpm_limit:
        RPM = 2900
        RPM_RANGE = 500
        features = features[features['rpm'].between(RPM - RPM_RANGE, RPM + RPM_RANGE, inclusive='both')]
    
    features = models.fault_labeling(features, classes, anomaly_severity)

    groups = [
        df.sample(frac=1, random_state=10)
        for i, df in (
            features.sort_values(by='severity_level').groupby('severity_level')
        )
    ]
    features = pd.concat(groups).reset_index(drop=True)

    columns = features.columns.str.startswith(tuple(axis))
    X = features[features.columns[columns]]
    y = features[label].astype('category')
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=10
    )   
    return (
        X_train.sort_index(), y_train.sort_index(),
        X_test.sort_index(), y_test.sort_index()
    )

def plot_online_best_features(X, y, title, metric, n=10):
    selector = feature_selection.SelectKBest(similarity=metric(), k=2)

    best = []
    for xs, ys in stream.iter_pandas(X, y):
        selector.learn_one(xs, ys)
        best.append({k: abs(v) for k, v in selector.leaderboard.items()})

    # Get only n best featues to plot
    n_top_names = [(k, abs(v)) for k, v in selector.leaderboard.items()]
    n_top_names = sorted(n_top_names, key=lambda x: x[1], reverse=True)[:n]
    n_top_names = set(map(lambda x: x[0], n_top_names))
    best = [
        {k: v for k, v in step.items() if k in n_top_names}
        for step in best
    ]

    feature_set = pd.DataFrame.from_records(best)
    kwargs = dict(figsize=(15, 6), grid=True, xlabel='Observation', ylabel=title)
    if metric == selection.FisherScore:
        kwargs['ylim'] = (0, 20) 
    feature_set.plot(**kwargs)

    return feature_set

# X, y = models.features_subset(TD_FEATURES, FAULT_CLASSES, AXIS, TARGET)
# feature_set = plot_best_features(X, y, TITLE, METRIC)
# plt.show()
# (feature_set.tail(1)
#     .reset_index(drop=True)
#     .T.rename(columns={0: TITLE})
#     .sort_values(by=TITLE, ascending=False))

Feature EDA

In [5]:
def show_effect_of_normalization(feature_set: str, axis: list, metric: Callable):
    x, y = axis_to_magnitudes(feature_set, axis, None)
    scores = calculate_scores(x, y, metric)
    
    features_normalized = selection.normalize_features(x, x.columns)
    scores_norm = calculate_scores(features_normalized, y, metric)
    
    fig, ax = plt.subplots(1, 2, figsize=(20, 5))
    scores.head(15).plot.bar(figsize=(10, 4), grid=True, xlabel='Feature', ylabel='Metric', legend=False, title='Unnormalized', ax=ax[0])
    scores_norm.head(15).plot.bar(figsize=(10, 4), grid=True, xlabel='Feature', ylabel='Metric', legend=False, title='Normalized', ax=ax[1])
    plt.show()


def scatter_3best_features(x, y, best_features, scaler_class, target):
    columns = best_features[:3]
    data = x[columns].copy()
    scaler = scaler_class()
    data[columns] = scaler.fit_transform(data)
    if target == 'fault':
        models.cross_cuts_3d(data, y)
    elif target == 'anomaly':
        models.cross_cuts_3d_anomalies(data, y)


def pca_feature_importance(X, n=10):
    # Absolute values of the Eigenvectors' components corresponding to the k largest Eigenvalues.
    model = PCA(n_components=3).fit(X)
    X_pc = model.transform(X)

    columns = list(X.columns)
    percentages = [(100 * (np.flip(np.sort(np.abs(pc))) / np.sum(np.abs(pc))))[:n] for pc in model.components_]
    most_important = [np.flip(np.argsort(np.abs(pc)))[:n] for pc in model.components_]

    for i, pc in enumerate(most_important):
        print(f'PC{i+1} ({model.explained_variance_ratio_[i] * 100:.4f} %)')
        print([columns[x] for x in pc])
        print(percentages[i])

### Show effect of normalization on metrics
Result: normalization has no effect

In [8]:
show_effect_of_normalization(TD_FEATURES, PLACE, METRICS_OFFLINE[0])

145     False
146     False
147     False
148     False
149     False
        ...  
9730     True
9731     True
9732     True
9733     True
9734     True
Name: anomaly, Length: 1500, dtype: category
Categories (2, bool): [False, True]


ValueError: Input y contains NaN.

In [None]:
show_effect_of_normalization(TD_FEATURES, PLACE, METRICS_OFFLINE[1])

In [None]:
show_effect_of_normalization(TD_FEATURES, PLACE, METRICS_OFFLINE[2])

### Show class imbalance

In [None]:
def plot_class_imbalance(feature_set, axis):
    _, y = axis_to_magnitudes(feature_set, axis, None)
    counter = Counter(y)
    for k, v in counter.items():
        per = v / len(y) * 100
        print(f'Class={k}, n={v} ({per:.3f}%%)')
    
    plt.bar(counter.keys(), counter.values())
    plt.show()

plot_class_imbalance(TD_FEATURES, PLACE)

In [None]:
plot_class_imbalance(FD_FEATURES, PLACE)

---
## Feature set #1: Custom features time domain

In [None]:
c = corr_among_features(TD_FEATURES, PLACE)
c.head(30)

In [None]:
metric_scores = feature_ranking(TD_FEATURES)
metric_scores

In [None]:
plot_scores(metric_scores)

In [None]:
rank_correlation(metric_scores)

In [None]:
ranks = ensemble_feature_ranking(metric_scores)
ranks

In [None]:
x, y = axis_to_magnitudes(TD_FEATURES, PLACE, None)
scatter_3best_features(x, y, list(ranks.index), MinMaxScaler, VAR_TARGET)
plt.show()

In [None]:
x, y = axis_to_magnitudes(TD_FEATURES, PLACE, None)
scatter_3best_features(x, y, list(ranks.index), StandardScaler, VAR_TARGET)
plt.show()

In [None]:
pca_feature_importance(x)

---
## Feature set #2: Custom features frequency domain

In [None]:
c = corr_among_features(FD_FEATURES, PLACE)
c.head(20)

In [None]:
metric_scores = feature_ranking(FD_FEATURES)
metric_scores

In [None]:
plot_scores(metric_scores, n=15)

In [None]:
rank_correlation(metric_scores)

In [None]:
ranks = ensemble_feature_ranking(metric_scores)
ranks.head(10)

## Frequency domain: window size = 64

In [None]:
window_sizes = (2**6, 2**8, 2**10, 2**12, 2**14)

win_len = window_sizes[0]
print('Window size:', win_len)
metric_scores = feature_ranking(FD_FEATURES, win_len)
metric_scores

In [None]:
plot_scores(metric_scores)

In [None]:
print('Window size:', win_len)
ranks = ensemble_feature_ranking(metric_scores)
ranks

In [None]:
corr_among_features(FD_FEATURES, PLACE, win_len).head(10)

---
## Frequency domain: window size = 256

In [None]:
win_len = window_sizes[1]
print('Window size:', win_len)
metric_ranks = feature_ranking(FD_FEATURES, win_len)
metric_ranks

In [None]:
plot_scores(metric_ranks)

In [None]:
rank_correlation(metric_scores)

In [None]:
ranks = ensemble_feature_ranking(metric_scores)
ranks

In [None]:
corr_among_features(FD_FEATURES, PLACE, win_len).head(20)

---
## Frequency domain: window size = 1024

In [None]:
win_len = window_sizes[2]
print('Window size:', win_len)
metric_scores = feature_ranking(FD_FEATURES, win_len)
metric_scores

In [None]:
plot_scores(metric_scores)

In [None]:
rank_correlation(metric_scores)

In [None]:
ranks = ensemble_feature_ranking(metric_scores)
ranks

---
## Frequency domain: window size = 4096

In [None]:
win_len = window_sizes[3]
print('Window size:', win_len)
metric_scores = feature_ranking(FD_FEATURES, win_len)
metric_scores

In [None]:
plot_scores(metric_scores)

In [None]:
rank_correlation(metric_scores)

In [None]:
ranks = ensemble_feature_ranking(metric_scores)
ranks

---

## Frequency domain: window size = 16384

In [None]:
win_len = window_sizes[4]
print('Window size:', win_len)
metric_scores = feature_ranking(FD_FEATURES, win_len)
metric_scores

In [None]:
plot_scores(metric_scores)

In [None]:
rank_correlation(metric_scores)

In [None]:
ranks = ensemble_feature_ranking(metric_scores)
ranks

#### Evolution of ranks depending in window size (scatter)

In [None]:
feature_window_ranks = pd.DataFrame()
for win in window_sizes:
    scores = feature_ranking(FD_FEATURES, win)
    ranks = ensemble_feature_ranking(scores)
    feature_window_ranks[win] = ranks

feature_window_ranks.sort_values(by=1024)