In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple, Callable
import tsfel
import numpy as np
from sklearn.feature_selection import mutual_info_classif, f_classif
from scipy.stats import gmean

import sys
sys.path.append('../')
from vibrodiagnostics import datasets, selection, discovery
from sklearn.preprocessing import MinMaxScaler


# Number of parts each recording is split into (passed as `parts=` to the extractors).
PARTS = 5

# Folder holding the raw recordings; extracted feature tables go to its `features` subfolder.
DIR = '../../inspections/datacentres/shc3/'
FEATURES_PATH = os.path.join(DIR, 'features')

# Recordings to process, as paths inside DIR.
FILENAMES = [
    os.path.join(DIR, recording)
    for recording in (
        'k3_1.tsv', 'k3_2.tsv', 'k3_3.tsv',     # Bad
        'k5_1.tsv', 'k5_2.tsv', 'k5_3.tsv',     # Good
    )
]

In [None]:
def beaglebone_measurement(filename: str, fs: int=2500) -> Tuple[str, pd.DataFrame, int, pd.Index]:
    """Load one three-axis recording and convert raw ADC counts to acceleration.

    The file is read as tab-separated values without a header; its columns are
    named ``x``, ``y`` and ``z``. Counts are converted to millivolts
    (1800 mV over 2**12 levels), then to m/s^2 using 180 mV/g, and each axis
    is mean-centred.

    Args:
        filename: Path of the ``.tsv`` recording.
        fs: Sampling frequency in Hz, used to build the time index.

    Returns:
        A 4-tuple ``(name, ts, fs, columns)``: the base name of ``filename``,
        the converted samples indexed by time ``t`` (sample number / ``fs``),
        the sampling frequency, and the feature columns of ``ts``.
    """
    g = 9.81                    # m/s^2 per g
    adc_range_mv = 1800         # Beaglebone Black ADC input range (VIN 1.8 V)
    adc_levels = 2**12          # 12-bit ADC
    sensitivity_mv_per_g = 180  # ADXL335 sensitivity used by the original conversion
    columns = ['x', 'y', 'z']

    raw = pd.read_csv(filename, delimiter='\t', index_col=False, header=None, names=columns)

    # ADC counts -> mV -> m/s^2, then remove the constant offset of every axis.
    ts = (raw * (adc_range_mv / adc_levels)) / sensitivity_mv_per_g * g
    ts = ts - ts.mean()

    # Time axis in seconds: sample number divided by the sampling frequency.
    ts.index = pd.Index(ts.index * (1 / fs), name='t')
    return (os.path.basename(filename), ts, fs, ts.columns)  # last is feature columns


def features_time_domain(filename: str, loader: Callable, parts: int=None) -> pd.DataFrame:
    """Extract time-domain features from every part of one recording.

    Args:
        filename: Path of the recording, passed unchanged to ``loader``.
        loader: Callable returning ``(name, ts, fs, feature_columns)`` for a
            filename (e.g. ``beaglebone_measurement``).
        parts: Forwarded to ``discovery.split_dataframe`` as the number of parts.

    Returns:
        One row per part. The ``name`` column is ``'<name>.part.<i>'``; the
        remaining columns are whatever ``discovery.time_features_calc``
        yields for each feature column.
    """
    print(f'Processing: {filename}')
    name, ts, _fs_hz, columns = loader(filename)

    # Use the feature columns reported by the loader instead of assuming
    # x/y/z, so loaders with other channel names work as well.
    columns = list(columns)
    segments = discovery.split_dataframe(ts, parts)
    segments = discovery.detrending_filter(segments, columns)

    rows = []
    for i, segment in enumerate(segments):
        # (column name, [value]) pairs; dict() turns them into a one-row frame.
        fvector = [('name', [f'{name}.part.{i}'])]
        for col in columns:
            fvector.extend(discovery.time_features_calc(segment, col))
        rows.append(pd.DataFrame(dict(fvector)))

    return pd.concat(rows).reset_index(drop=True)


def features_frequency_domain(filename: str, loader: Callable, parts: int=None) -> pd.DataFrame:
    """Extract frequency-domain features from every part of one recording.

    Features are computed once per window size in ``WINDOW_SIZES`` and per
    feature column by ``discovery.frequency_features_calc``.

    Args:
        filename: Path of the recording, passed unchanged to ``loader``.
        loader: Callable returning ``(name, ts, fs, feature_columns)`` for a
            filename (e.g. ``beaglebone_measurement``).
        parts: Forwarded to ``discovery.split_dataframe`` as the number of parts.

    Returns:
        One row per part. The ``name`` column is ``'<name>.part.<i>'``; the
        remaining columns are whatever ``discovery.frequency_features_calc``
        yields for each (window size, feature column) pair.
    """
    # Calculate FFT with Welch method in 4 different Hann window sizes
    # NOTE(review): the original also declared OVERLAP = 0.5 but never passed
    # it on; the overlap is presumably fixed inside `discovery` - confirm.
    WINDOW_SIZES = (2**6, 2**8, 2**10, 2**12)

    print(f'Processing: {filename}')
    name, ts, _fs_hz, columns = loader(filename)

    # Use the feature columns reported by the loader instead of assuming
    # x/y/z, so loaders with other channel names work as well.
    columns = list(columns)
    segments = discovery.split_dataframe(ts, parts)
    segments = discovery.detrending_filter(segments, columns)

    rows = []
    for i, segment in enumerate(segments):
        # (column name, [value]) pairs; dict() turns them into a one-row frame.
        fvector = [('name', [f'{name}.part.{i}'])]
        for window in WINDOW_SIZES:
            for col in columns:
                fvector.extend(discovery.frequency_features_calc(segment, col, window))
        rows.append(pd.DataFrame(dict(fvector)))

    return pd.concat(rows).reset_index(drop=True)

Time-domain features

In [None]:
FEATURES_FILENAME = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)

# `DataFrame.to_csv` does not create missing folders; make sure the target
# directory exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(FEATURES_FILENAME), exist_ok=True)

# Extract the time-domain features of every recording part and store them as CSV.
features = datasets.import_files_split(
    FILENAMES, features_time_domain, beaglebone_measurement, parts=PARTS
)
features.to_csv(FEATURES_FILENAME, index=False)
features.head(10)

Frequency-domain features

In [None]:
FEATURES_FILENAME = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

# `DataFrame.to_csv` does not create missing folders; make sure the target
# directory exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(FEATURES_FILENAME), exist_ok=True)

# Extract the frequency-domain features of every recording part and store them as CSV.
features = datasets.import_files_split(
    FILENAMES, features_frequency_domain, beaglebone_measurement, parts=PARTS
)
features.to_csv(FEATURES_FILENAME, index=False)
features.head(10)

Merge time- and frequency-domain features

In [None]:
# Resolve the two per-domain feature tables and the merged output inside FEATURES_PATH.
time_domain_filename, freq_domain_filename, merged_filename = (
    os.path.join(FEATURES_PATH, basename)
    for basename in (
        selection.TIME_FEATURES_PATH,
        selection.FREQ_FEATURES_PATH,
        selection.TIME_AND_FREQ_FEATURES_PATH,
    )
)

# Combine both tables, store the result as CSV and preview the first rows.
result = selection.me_merge_feature_domains(time_domain_filename, freq_domain_filename)
result.to_csv(merged_filename, index=False)
result.head(5)

#### Exploratory data analysis (EDA) of time- and frequency-domain features

In [None]:
def _boxplot_grid(n_panels: int, n_cols: int = 6, min_rows: int = 2) -> Tuple[int, int]:
    """Return a ``(rows, cols)`` subplot grid with room for ``n_panels`` panels."""
    # Keeps the original 2x6 grid for up to 12 panels and adds rows beyond that.
    return max(min_rows, -(-n_panels // n_cols)), n_cols


def _show_boxplots(values: pd.DataFrame, units: pd.Series) -> None:
    """Plot all columns of ``values`` together, then each column split by AC unit."""
    values.boxplot(figsize=(15, 5))
    plt.show()
    values.assign(ac_unit=units).boxplot(
        figsize=(20, 5),
        layout=_boxplot_grid(values.shape[1]),
        by='ac_unit',
        sharey=False,
    )
    plt.show()


def boxplot_features(dataset):
    """Plot per-feature magnitudes of a stored feature table and return them scaled.

    The table ``dataset`` is read from ``FEATURES_PATH``. For the
    frequency-domain table only the columns of the 2**12 window are kept and
    their window suffix is stripped. For every feature, the x/y/z columns are
    combined into one Euclidean magnitude per row. Box plots are shown for the
    raw and for the min-max-scaled magnitudes, each overall and split by AC unit.

    Args:
        dataset: File name of the feature table inside ``FEATURES_PATH``.

    Returns:
        ``(result_scaled, ac_unit)``: the min-max-scaled magnitudes (one column
        per feature) and the categorical AC-unit label of every row, taken
        from the ``k<digits>`` token of the ``name`` column.
    """
    features = pd.read_csv(os.path.join(FEATURES_PATH, dataset))
    names = features['name']

    # Choose window size
    if dataset == selection.FREQ_FEATURES_PATH:
        window_size = 2**12
        in_window = features.columns.str.endswith(f'_{window_size}')
        # Explicit copy: the selection gets new column names and a new column below.
        features = features.loc[:, in_window].copy()
        features.columns = features.columns.str.extract(r'([\w\_]+)_(\w+)$')[0]

    # expand=False yields a Series, so a plain column is assigned.
    features['ac_unit'] = names.str.extract(r'(k\d+)', expand=False).astype('category')

    # Feature names are the part after the axis prefix; columns without such
    # a prefix (e.g. 'name') do not match and are dropped.
    axis = ['x', 'y', 'z']
    stems = features.columns.drop('ac_unit').str.extract(r'[a-z]+_(\w+)', expand=False)
    feature_names = np.unique(stems.dropna())

    # Compute magnitude of feature vectors
    result = pd.DataFrame(
        {
            name: np.linalg.norm(
                features[[f'{dim}_{name}' for dim in axis]].to_numpy(dtype=float),
                axis=1,
            )
            for name in feature_names
        },
        index=features.index,
    )

    # Show boxplots, split by AC unit
    _show_boxplots(result, features['ac_unit'])

    # MinMax scaled result
    result_scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(result),
        columns=result.columns,
        index=result.index,
    )
    _show_boxplots(result_scaled, features['ac_unit'])
    return result_scaled, features['ac_unit']

In [None]:
# Time-domain table: show the box plots and keep the scaled magnitudes (X_t)
# and the AC-unit labels (Y_t) for the ranking cells further down.
X_t, Y_t = boxplot_features(selection.TIME_FEATURES_PATH)

In [None]:
# Frequency-domain table: show the box plots and keep the scaled magnitudes (X_f)
# and the AC-unit labels (Y_f) for the ranking cells further down.
X_f, Y_f = boxplot_features(selection.FREQ_FEATURES_PATH)

### Feature ranks

In [None]:
# Seed for the mutual-information estimate. With the default random_state=None
# the scores, and therefore the feature ranks, can change from run to run.
MI_RANDOM_STATE = 0


def _mutual_info_seeded(X, y):
    """Score features with ``mutual_info_classif`` using a fixed ``random_state``."""
    return mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)


# Scoring functions in the order batch_feature_ranking labels them: 'corr', 'f_stat', 'mi'.
METRICS_OFFLINE = (selection.corr_classif, f_classif, _mutual_info_seeded)

def batch_feature_ranking(
    X: pd.DataFrame, Y: pd.DataFrame, metrics: dict = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Score every feature with several metrics and combine the ranks.

    Args:
        X: Feature table, one column per feature.
        Y: Class labels passed to every metric together with ``X``.
        metrics: Optional mapping ``{column name: metric(X, Y)}``. Each metric
            returns one score per column of ``X``, or a tuple whose first
            item holds the scores. Defaults to ``METRICS_OFFLINE`` labelled
            ``'corr'``, ``'f_stat'`` and ``'mi'``.

    Returns:
        ``(scores, ranks)``. ``scores`` has one row per feature and one column
        per metric, sorted by the first metric in descending order. ``ranks``
        has a single ``rank`` column with the geometric mean of the
        per-metric ranks (1 = highest score), sorted ascending.

    Raises:
        ValueError: If ``metrics`` is an empty mapping.
    """
    if metrics is None:
        metrics = dict(zip(('corr', 'f_stat', 'mi'), METRICS_OFFLINE))
    if not metrics:
        raise ValueError('at least one scoring metric is required')

    # Independent scores, one column per metric, in the column order of X.
    metric_scores = pd.DataFrame(index=pd.Index(X.columns, name='feature'))
    for metric_name, metric in metrics.items():
        scores = metric(X, Y)
        if isinstance(scores, tuple):
            scores = scores[0]  # keep only the first item of a tuple result
        # list() pairs scores with the columns of X by position.
        metric_scores[metric_name] = list(scores)

    # Present the table with the best features of the first metric on top.
    first_metric = next(iter(metrics))
    metric_scores = metric_scores.sort_values(by=first_metric, ascending=False)

    # Rank 1 is the highest score of a metric; ties keep their table order.
    ranks = metric_scores.rank(axis=0, method='first', ascending=False)
    overall = pd.Series(gmean(ranks.to_numpy(dtype=float), axis=1), index=ranks.index)
    return metric_scores, overall.sort_values().to_frame(name='rank')

In [None]:
# Score and rank the time-domain features; the cell displays the per-metric scores.
# NOTE(review): `scores` and `ranks` are rebound by the frequency-domain cell
# further down, so run the cells in order.
scores, ranks = batch_feature_ranking(X_t, Y_t)
scores

In [None]:
# Combined rank table from the preceding batch_feature_ranking call (time-domain features).
ranks

In [None]:
# Score and rank the frequency-domain features; the cell displays the per-metric scores.
# NOTE(review): this rebinds the `scores` and `ranks` set by the time-domain cell above.
scores, ranks = batch_feature_ranking(X_f, Y_f)
scores

In [None]:
# Combined rank table from the preceding batch_feature_ranking call (frequency-domain features).
ranks