# Feature Extraction

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb

import os
from itertools import pairwise
from typing import Callable, List, Tuple
from zipfile import ZipFile
from datetime import datetime

from tqdm.notebook import tqdm
from multiprocessing.pool import ThreadPool

from scipy.stats import entropy
from scipy.signal import find_peaks, butter, lfilter, windows, welch
from scipy.fft import rfft
from scipy.interpolate import interp1d
from tsfel import feature_extraction as ft

Settings

In [None]:
# Root folder of the raw archives and the folder receiving the feature tables
path_root = '../../datasets/'
path_features = os.path.join(path_root, 'features')

# Index into `datasets` selecting the archive to process (0 or 1)
idx = 1
datasets = ['MAFAULDA.zip', 'FluidPump.zip']
# Lower-case archive name without its extension, e.g. 'mafaulda'
names = [archive.split('.')[0].lower() for archive in datasets]
# Segments per recording, giving 5 second segments (5/5, 60/5)
parts = [1, 12]

# Options of the selected dataset, read by the extraction cells below
opt = dict(
    name=names[idx],
    dataset=os.path.join(path_root, datasets[idx]),
    temporal_features=os.path.join(path_features, f'{names[idx]}_temporal.csv'),
    spectral_features=os.path.join(path_features, f'{names[idx]}_spectral.csv'),
    parts=parts[idx],
)

Calculation of custom features

In [None]:
def energy(Pxx: np.array) -> float:
    """Return the sum of the squared elements of ``Pxx``."""
    squared = Pxx ** 2
    return np.sum(squared)


def negentropy(x: np.array) -> float:
    """Return the negated entropy of the squared, mean-normalized samples.

    The samples are squared and divided by the mean of the squares before
    being passed to ``scipy.stats.entropy``; the sign of the result is
    flipped. An empty input yields ``nan``.
    """
    if not len(x):
        return np.nan

    power = x ** 2
    normalized = power / np.mean(power)
    return -entropy(normalized)


def signal_to_noise(x: np.array) -> float:
    """Return the ratio of the mean of ``x`` to its standard deviation.

    Parameters:
        x: Samples (here: spectrum values) to evaluate.

    Returns:
        ``mean(x) / std(x)`` as a plain float, or ``0.0`` when the standard
        deviation is zero (constant input).
    """
    # https://dsp.stackexchange.com/questions/76291/how-to-extract-noise-from-a-signal-in-order-to-get-both-noise-power-and-signal-p
    # https://www.geeksforgeeks.org/signal-to-noise-ratio-formula/
    # https://saturncloud.io/blog/calculating-signaltonoise-ratio-in-python-with-scipy-v11/
    sd = np.std(x)
    # Branch before dividing: np.where would still evaluate m / sd for
    # sd == 0 (division warning) and would return a 0-d array, not a float.
    if sd == 0:
        return 0.0
    return float(np.mean(x) / sd)


def spectral_roll_off_frequency(f: np.array, Pxx: np.array, percentage: float) -> float:
    """Roll-off: frequency below which a given share of the energy lies.

    The squared spectrum is accumulated bin by bin and the frequency of the
    first bin whose cumulative energy reaches ``percentage * energy(Pxx)`` is
    returned. ``percentage`` is used as a plain multiplier of the total
    energy (callers in this file pass fractions such as 0.05 and 0.85).
    If no bin reaches the threshold, ``np.argmax`` yields 0 and ``f[0]`` is
    returned.
    """
    cumulative_energy = np.cumsum(Pxx**2)
    threshold = percentage * energy(Pxx)
    first_bin = np.argmax(cumulative_energy >= threshold)
    return f[first_bin]


def temporal_variation(dataset: pd.DataFrame, axis: str, window: int) -> list:
    """Temporal variation of succesive spectra (stationarity)
    """
    overlap = 0.5
    step = int(window * overlap)
    v = dataset[axis].to_numpy()
    spectra = [
        np.absolute(rfft(v[i:i+window] * windows.hann(window)))
        for i in range(0, len(v) - window, step)
    ]
    fluxes = [
        1 - np.corrcoef(psd1, psd2) for psd1, psd2 in pairwise(spectra)
    ]
    return fluxes


def envelope_signal(f: np.array, Pxx: np.array) -> np.array:
    """Return the envelope of the spectrum ``Pxx`` evaluated at ``f``.

    The local maxima of ``Pxx`` are joined by a quadratic interpolant that
    is extrapolated beyond the outermost peaks; negative values are set to
    zero. When the interpolant cannot be built (``interp1d`` raises
    ``ValueError``), an empty list is returned.
    """
    peak_idx = find_peaks(Pxx)[0]      # peaks = mms_peak_finder(Pxx)
    try:
        interpolant = interp1d(f[peak_idx], Pxx[peak_idx], kind='quadratic', fill_value='extrapolate')
    except ValueError:
        return []

    values = interpolant(f)
    # The interpolant may undershoot below zero between the peaks
    return np.where(values < 0, 0, values)


def spectral_transform(dataset: pd.DataFrame, axis: str, window: int, fs: int) -> Tuple[np.array, np.array]:
    """Estimate the one-sided spectrum of one column with Welch's method.

    Segments of ``window`` samples overlap by half, are Hann-weighted and
    have their mean removed; the segment spectra are averaged and scaled as
    a power spectrum (``scaling='spectrum'``).

    Returns the frequency bins and the spectrum values.
    """
    samples = dataset[axis].to_numpy()
    welch_options = dict(
        fs=fs,
        window='hann',
        nperseg=window,
        noverlap=int(window * 0.5),
        scaling='spectrum',
        average='mean',
        detrend='constant',
        return_onesided=True,
    )
    freqs, power = welch(samples, **welch_options)
    return freqs, power

Feature extraction by domain

In [None]:
def time_features_calc(df: pd.DataFrame, col: str, fs: int, window: int) -> List[Tuple[str, pd.DataFrame]]:
    """Compute the time-domain features of column ``col``.

    ``fs`` and ``window`` are accepted for signature parity with
    ``frequency_features_calc`` but are not used here.

    Returns a list of ``(f'{col}_{feature}', value)`` pairs.
    """
    x = df[col]

    # Quantities shared by several shape factors
    magnitude = np.absolute(x)
    peak = np.max(magnitude)
    mean_magnitude = np.mean(magnitude)
    rms = ft.rms(x)

    features = {
        'zerocross': ft.zero_cross(x) / len(x),
        'pp': [ft.pk_pk_distance(x)],
        # Mean absolute difference of successive samples
        'aac': np.mean(np.absolute(np.diff(x))),
        'rms': [rms],
        'skewness': [ft.skewness(x)],
        'kurtosis': [ft.kurtosis(x)],
        'shape': [rms / mean_magnitude],
        'crest': [peak / rms],
        'impulse': [peak / mean_magnitude],
        'clearance': [peak / (np.mean(np.sqrt(magnitude)) ** 2)],
    }
    return [(f'{col}_{name}', value) for name, value in features.items()]

def frequency_features_calc(df: pd.DataFrame, col: str, fs: int, window: int) -> List[Tuple[str, pd.DataFrame]]:
    """Compute the frequency-domain features of column ``col``.

    The features are derived from the Welch spectrum of the column, the
    flux between successive short-time spectra and the envelope of the
    spectrum.

    Returns a list of ``(f'{col}_{feature}', [value])`` pairs.
    """
    freqs, power = spectral_transform(df, col, window, fs)
    fluxes = temporal_variation(df, col, window)
    envelope_spectrum = envelope_signal(freqs, power)

    features = {
        'centroid': [np.average(freqs, weights=power)],
        'std': [ft.calc_std(power)],
        'skewness': [ft.skewness(power)],
        'kurtosis': [ft.kurtosis(power)],
        'roll_on': [spectral_roll_off_frequency(freqs, power, 0.05)],
        'roll_off': [spectral_roll_off_frequency(freqs, power, 0.85)],
        'flux': [np.mean(fluxes)],
        'noisiness': [signal_to_noise(power)],
        'energy': [energy(power)],
        'entropy': [entropy(power / np.sum(power))],
        'negentropy': [negentropy(envelope_spectrum)],
    }
    return [(f'{col}_{name}', value) for name, value in features.items()]

Load dataset from ZIP archive

In [None]:
# Feature extraction (Generic)
def list_files(dataset: ZipFile) -> List[str]:
    """Return the sorted names of all ``.csv`` and ``.tsv`` archive members."""
    table_suffixes = ('.csv', '.tsv')
    return sorted(
        member.filename
        for member in dataset.infolist()
        if member.filename.endswith(table_suffixes)
    )


def load_files_split(dataset: ZipFile, func: Callable, parts: int, cores: int = 4):
    """Run ``func`` on every table of the archive and stack the results.

    Parameters:
        dataset: Open archive holding the recordings.
        func: Called as ``func(dataset, filename, parts)``; must return a
            DataFrame.
        parts: Number of segments per recording, forwarded to ``func``.
        cores: Number of worker threads.

    Returns:
        The concatenation of all per-file results, in sorted filename order.
    """
    filenames = list_files(dataset)

    def extract(name: str) -> pd.DataFrame:
        return func(dataset, name, parts)

    # Calling .get() right after each apply_async() waits for that task
    # before the next one is submitted, which makes the pool pointless.
    # imap() keeps the workers busy and still yields results in input order;
    # the context manager releases the worker threads afterwards.
    # NOTE(review): workers read from the shared ZipFile concurrently —
    # confirm this is acceptable for the Python version in use.
    with ThreadPool(processes=cores) as pool:
        frames = list(tqdm(pool.imap(extract, filenames), total=len(filenames)))

    return pd.concat(frames)


def split_dataframe(dataframe: pd.DataFrame, parts: int = None) -> List[pd.DataFrame]:
    """Cut a table into ``parts`` consecutive chunks of equal length.

    Parameters:
        dataframe: Table to split.
        parts: Number of chunks; ``None`` disables splitting.

    Returns:
        ``[dataframe]`` itself when ``parts`` is ``None``; otherwise
        ``parts`` chunks of ``len(dataframe) // parts`` rows each, with
        their index reset. Rows left over at the end are discarded.

    Raises:
        ValueError: If the table has fewer rows than ``parts``.
    """
    if parts is None:
        return [dataframe]

    step = len(dataframe) // parts
    if step == 0:
        raise ValueError(
            f'Cannot split {len(dataframe)} rows into {parts} parts'
        )

    # Stop at step * parts: iterating up to len(dataframe) can yield more
    # than `parts` full chunks (e.g. 9 rows, parts=5 -> step 1 -> 9 chunks).
    return [
        dataframe.iloc[start:start + step].reset_index(drop=True)
        for start in range(0, step * parts, step)
    ]


def detrending_filter(dataframes: List[pd.DataFrame], columns: List[str]) -> List[pd.DataFrame]:
    """Subtract the column mean from ``columns`` of every table, in place.

    The tables are modified directly and the same list is returned.
    """
    def center(series):
        return series - series.mean()

    for frame in dataframes:
        frame[columns] = frame[columns].apply(center)
    return dataframes

MaFaulDa dataset signal preprocessing

In [None]:
# Feature extraction mafaulda
# Columns from which features are extracted
mafaulda_columns = ['ax', 'ay', 'az', 'bx', 'by', 'bz']
# Column names assigned to the header-less CSV files, in file order
mafaulda_all_columns = ['tachometer', *mafaulda_columns, 'mic']
# Sampling frequency in Hz and window length (samples) for spectral analysis
mafaulda_fs_hz = 50_000
mafaulda_spectral_window = 2 ** 15

In [None]:
def mafaulda_parse_filename(filename: str) -> Tuple[str, str, str]:
    """Derive ``(fault, severity, seq)`` from an archive member path.

    Three layouts are recognised by the first path element:
    - ``overhang`` / ``underhang``: ``<place>/<fault>/<severity>/<seq>``,
      the fault is reported as ``'<place>-<fault>'``;
    - ``normal``: ``normal/<seq>``, the severity is reported as ``'0'``;
    - anything else: exactly ``<fault>/<severity>/<seq>``.
    """
    segments = filename.split('/')
    top_level = segments[0].strip()

    if top_level in ('overhang', 'underhang'):
        return f'{segments[0]}-{segments[1]}', segments[2], segments[3]

    if top_level == 'normal':
        return segments[0], '0', segments[1]

    # Remaining faults must consist of exactly three path elements
    fault, severity, seq = segments
    return fault, severity, seq


def rpm_calc(tachometer: pd.Series) -> float:
    """Estimate the rotation speed from the pulses of a tachometer signal.

    Peaks with a prominence of at least 3 and a width of at least 50
    samples are located; 60 divided by the mean spacing of their index
    values is returned (revolutions per minute when the index is in
    seconds and there is one pulse per revolution — presumably the case
    here, confirm against the data).
    """
    timestamps = tachometer.index.to_numpy()
    pulse_idx = find_peaks(tachometer.to_numpy(), prominence=3, width=50)[0]
    mean_period = np.diff(timestamps[pulse_idx]).mean()
    return 60 / mean_period


def mafaulda_csv_import(dataset: ZipFile, filename: str) -> pd.DataFrame:
    """Load one MAFAULDA recording from the archive.

    Parameters:
        dataset: Open archive holding the recordings.
        filename: Name of the archive member to read.

    Returns:
        The recording indexed by time ``t`` (sample number divided by the
        sampling frequency), with the original row number in column
        ``index`` and two constant columns added: ``rpm`` (from the
        tachometer column) and ``key`` (the member name).
    """
    # Close the archive member as soon as it has been parsed
    with dataset.open(filename) as member:
        ts = pd.read_csv(member, names=mafaulda_all_columns)

    T = 1 / mafaulda_fs_hz
    ts = (
        ts
        # reset_index() keeps the row number as column 'index'
        .reset_index()
        .assign(t = lambda x: x.index * T)
        .set_index('t')
        .assign(rpm = lambda x: rpm_calc(x.tachometer))
    )
    return ts.assign(key=filename)


def mafaulda_lowpass_filter(
        data: pd.Series,
        cutoff: int = mafaulda_fs_hz // 5,
        fs: int = mafaulda_fs_hz,
        order: int = 5) -> pd.Series:
    """Low-pass filter a signal with a Butterworth filter.

    The filter of the given ``order`` and ``cutoff`` frequency is designed
    for sampling frequency ``fs`` and applied with ``lfilter``. The result
    keeps the index of ``data``.
    """
    numerator, denominator = butter(order, cutoff, fs=fs, btype='lowpass')
    filtered = lfilter(numerator, denominator, data.to_numpy())
    return pd.Series(data=filtered, index=data.index)


def lowpass_filter_extract(dataframes: List[pd.DataFrame], columns: List[str]) -> List[pd.DataFrame]:
    """Low-pass filter ``columns`` of every table, in place.

    Each selected column is passed through ``mafaulda_lowpass_filter`` with
    its default settings. The tables are modified directly and the same
    list is returned.
    """
    for frame in dataframes:
        filtered = frame[columns].apply(mafaulda_lowpass_filter)
        frame[columns] = filtered
    return dataframes


def mafaulda_features_by_domain(
        features_calc: Callable,
        dataset: ZipFile,
        filename: str, 
        parts: int = None) -> pd.DataFrame:
    """Extract the features of one MAFAULDA recording.

    The recording is loaded, split into ``parts`` segments, detrended and
    low-pass filtered. ``features_calc`` is then called for every segment
    and every signal column as ``features_calc(df, col, fs, window)``.

    Returns one row per segment holding the labels parsed from the file
    name (``fault``, ``severity``, ``seq``), the mean ``rpm`` and the
    computed features.
    """
    recording = mafaulda_csv_import(dataset, filename)
    fault, severity, seq = mafaulda_parse_filename(filename)

    # Preprocessing chain: split -> remove mean -> low-pass
    segments = split_dataframe(recording, parts)
    segments = detrending_filter(segments, mafaulda_columns)
    segments = lowpass_filter_extract(segments, mafaulda_columns)

    rows = []
    for part_no, segment in enumerate(segments):
        row = {
            'fault': [fault],
            'severity': [severity],
            'seq': [f'{seq}.part.{part_no}'],
            'rpm': [segment['rpm'].mean()],
        }
        for axis in mafaulda_columns:
            row.update(
                features_calc(segment, axis, mafaulda_fs_hz, mafaulda_spectral_window)
            )
        rows.append(pd.DataFrame(row))

    return pd.concat(rows).reset_index(drop=True)

Fluid pumps dataset signal preprocessing

In [None]:
# Fluid pumps extraction
# Columns from which features are extracted
fluidpump_columns = ['x', 'y', 'z']
# Column names assigned to the tab-separated files, in file order
fluidpump_all_columns = ['t', *fluidpump_columns]
# Sampling frequency in Hz and window length (samples) for spectral analysis
fluidpump_fs_hz = 26_866
fluidpump_spectral_window = 2 ** 14

In [None]:
def fluidpump_csv_import(dataset: ZipFile, filename: str) -> pd.DataFrame:
    """Load one fluid pump recording from the archive.

    Parameters:
        dataset: Open archive holding the recordings.
        filename: Name of the tab-separated archive member to read.

    Returns:
        The recording with the signal columns multiplied by ``g / 1000``
        (presumably converting milli-g to m/s^2 — confirm against the
        sensor documentation), column ``t`` replaced by the sample number
        divided by the sampling frequency, and a constant column ``key``
        holding the member name.
    """
    # Close the archive member as soon as it has been parsed
    with dataset.open(filename) as member:
        ts = pd.read_csv(
            member,
            delimiter='\t',
            index_col=False,
            header=0,               # the header row is replaced by `names`
            names=fluidpump_all_columns
        )
    g = 9.80665
    columns = fluidpump_columns
    ts[columns] = ts[columns].apply(lambda x: g * (x / 1000))

    T = 1 / fluidpump_fs_hz
    ts = (
        ts
        .assign(t = lambda x: x.index * T)
        .assign(key=filename)
    )
    return ts


def fluidpump_features_by_domain(
        features_calc: Callable,
        dataset: ZipFile,
        filename: str,
        parts: int = None
    ) -> pd.DataFrame:
    """Extract the features of one fluid pump recording.

    The recording is loaded, split into ``parts`` segments and detrended.
    ``features_calc`` is then called for every segment and every signal
    column as ``features_calc(df, col, fs, window)``.

    Parameters:
        features_calc: Feature function of one domain.
        dataset: Open archive holding the recordings.
        filename: Archive member name; its last five path elements are
            read as ``<place>/<date>/<device>/<position>/<seq>.<ext>``.
        parts: Number of segments per recording; ``None`` keeps it whole.

    Returns:
        One row per segment holding the metadata parsed from the path and
        the computed features.
    """
    # print(f'Processing: {filename}')
    fs = fluidpump_fs_hz
    columns = fluidpump_columns
    window = fluidpump_spectral_window

    ts = fluidpump_csv_import(dataset, filename)
    dataframe = split_dataframe(ts, parts)
    dataframe = detrending_filter(dataframe, columns)

    # Member names inside a ZIP archive always use '/' as the separator,
    # whatever the host OS; os.path.sep would not split them on Windows.
    header = filename.split('/')
    metadata = [
        ('place', [header[-5]]),
        ('date', [datetime.fromisoformat(header[-4]).date()]),
        ('device', [header[-3]]),
        ('position', [header[-2]]),
        ('seq', [int(header[-1].split('.')[0])])
    ]

    result = []
    for df in dataframe:
        # Every segment starts from its own copy of the shared metadata
        fvector = metadata.copy()
        for col in columns:
            fvector.extend(features_calc(df, col, fs, window))
        result.append(pd.DataFrame(dict(fvector)))

    return pd.concat(result).reset_index(drop=True)

Feature extraction procedure and save to files

In [None]:
def features_time_domain(dataset: ZipFile, filename: str, parts: int) -> pd.DataFrame:
    """Extract the time-domain features of one file of the selected dataset.

    The dataset-specific extractor is chosen by ``opt['name']``.

    Raises:
        ValueError: If ``opt['name']`` is not a supported dataset.
    """
    name = opt['name']
    if name == 'mafaulda':
        return mafaulda_features_by_domain(time_features_calc, dataset, filename, parts)
    if name == 'fluidpump':
        return fluidpump_features_by_domain(time_features_calc, dataset, filename, parts)
    # Fail here instead of returning None, which would only surface later
    # as an unrelated error when the results are concatenated.
    raise ValueError(f'Unsupported dataset: {name!r}')


def features_frequency_domain(dataset: ZipFile, filename: str, parts: int) -> pd.DataFrame:
    """Extract the frequency-domain features of one file of the selected dataset.

    The dataset-specific extractor is chosen by ``opt['name']``.

    Raises:
        ValueError: If ``opt['name']`` is not a supported dataset.
    """
    name = opt['name']
    if name == 'mafaulda':
        return mafaulda_features_by_domain(frequency_features_calc, dataset, filename, parts)
    if name == 'fluidpump':
        return fluidpump_features_by_domain(frequency_features_calc, dataset, filename, parts)
    # Fail here instead of returning None, which would only surface later
    # as an unrelated error when the results are concatenated.
    raise ValueError(f'Unsupported dataset: {name!r}')

In [None]:
# Temporal features
# Create the output folder up front so a missing directory does not make
# to_csv fail after the whole extraction has already run.
output_dir = os.path.dirname(opt['temporal_features'])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# The context manager closes the archive once every file has been processed
with ZipFile(opt['dataset']) as archive:
    features = load_files_split(
        dataset=archive,
        func=features_time_domain,
        parts=opt['parts']
    )
features.to_csv(opt['temporal_features'], index=False)
features

In [None]:
# Spectral features
# Create the output folder up front so a missing directory does not make
# to_csv fail after the whole extraction has already run.
output_dir = os.path.dirname(opt['spectral_features'])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# The context manager closes the archive once every file has been processed
with ZipFile(opt['dataset']) as archive:
    features = load_files_split(
        dataset=archive,
        func=features_frequency_domain,
        parts=opt['parts']
    )
features.to_csv(opt['spectral_features'], index=False)
features