In [None]:
import os
from zipfile import ZipFile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tsfel

import sys
sys.path.append('../')
from vibrodiagnostics import (
    mafaulda,
    selection,
    discovery,
    models
)

# Location of the zipped MAFAULDA dataset, relative to this notebook.
MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'
# Directory where the metadata index and the extracted feature tables are written.
FEATURES_PATH =  '../../datasets/features_data/'
# Full path of the metadata CSV; the file name itself comes from the selection module.
MAFAULDA_METADATA = os.path.join(FEATURES_PATH, selection.MAFAULDA_METADATA)

Extract metadata about the files in the whole dataset

In [None]:
# Build the per-file metadata index from the dataset archive.
file_index = mafaulda.dataset_index(MAFAULDA_PATH)

# Make sure the output directory exists so the export also works on a fresh checkout.
os.makedirs(FEATURES_PATH, exist_ok=True)
file_index.to_csv(MAFAULDA_METADATA, index=False)

Load files for whole dataset

In [None]:
# Load the metadata index written above, keyed by the archive member name.
files = pd.read_csv(MAFAULDA_METADATA, index_col='filename')
# Number of recordings per fault label.
files['fault'].value_counts()

In [None]:
# Show the metadata index loaded in the previous cell.
files

In [None]:
def _extract_segment_features(zip_file, filename, parts, feature_items):
    """Load one recording, split and filter it, and build a feature frame per part.

    Shared pipeline behind ``features_time_domain`` and
    ``features_frequency_domain``; only the feature calculation differs.

    Parameters
    ----------
    zip_file : ZipFile
        Opened dataset archive, forwarded to ``mafaulda.csv_import``.
    filename : str
        Archive member to process. It is also handed to
        ``mafaulda.parse_filename`` to obtain ``(fault, severity, seq)``.
    parts : int or None
        Forwarded unchanged to ``discovery.split_dataframe``.
    feature_items : callable
        Called as ``feature_items(segment, columns)`` for every part; must
        return an iterable of ``(column_name, values)`` pairs that are appended
        after the label columns.

    Returns
    -------
    pd.DataFrame
        The per-part frames concatenated with a fresh ``0..n-1`` index. Every
        frame starts with the columns ``fault``, ``severity``, ``seq`` and
        ``rpm`` (mean of the part's ``rpm`` column).
    """
    print(f'Processing: {filename}')

    columns = mafaulda.COLUMNS
    ts = mafaulda.csv_import(zip_file, filename)
    fault, severity, seq = mafaulda.parse_filename(filename)

    segments = discovery.split_dataframe(ts, parts)
    segments = discovery.detrending_filter(segments, columns)
    segments = discovery.lowpass_filter_extract(segments, columns)

    frames = []
    for i, segment in enumerate(segments):
        fvector = [
            ('fault', [fault]),
            ('severity', [severity]),
            # The part number keeps rows from the same recording distinguishable.
            ('seq', [f'{seq}.part.{i}']),
            ('rpm', [segment['rpm'].mean()])
        ]
        fvector.extend(feature_items(segment, columns))
        frames.append(pd.DataFrame(dict(fvector)))

    return pd.concat(frames).reset_index(drop=True)


def features_time_domain(zip_file: ZipFile, filename: str, parts: int=None) -> pd.DataFrame:
    """Extract time-domain features for every part of one recording.

    For each part, ``discovery.time_features_calc`` is evaluated once per
    column in ``mafaulda.COLUMNS``. See ``_extract_segment_features`` for the
    loading/filtering steps and the layout of the returned frame.
    """
    def time_items(segment, columns):
        items = []
        for col in columns:
            items.extend(discovery.time_features_calc(segment, col))
        return items

    return _extract_segment_features(zip_file, filename, parts, time_items)


def features_frequency_domain(zip_file: ZipFile, filename: str, parts: int=None) -> pd.DataFrame:
    """Extract frequency-domain features for every part of one recording.

    For each part, ``discovery.frequency_features_calc`` is evaluated for every
    window size in ``WINDOW_SIZES`` and every column in ``mafaulda.COLUMNS``
    (window size is the outer loop). See ``_extract_segment_features`` for the
    loading/filtering steps and the layout of the returned frame.
    """
    # Five window lengths handed to discovery.frequency_features_calc.
    # NOTE(review): the original comment describes these as Hann window sizes for
    # a Welch spectrum estimate - confirm in discovery.frequency_features_calc.
    WINDOW_SIZES = (2**6, 2**8, 2**10, 2**12, 2**14)

    def frequency_items(segment, columns):
        items = []
        for window in WINDOW_SIZES:
            for col in columns:
                items.extend(discovery.frequency_features_calc(segment, col, window))
        return items

    return _extract_segment_features(zip_file, filename, parts, frequency_items)

Export features for all files

In [None]:
# Archive handle shared by the feature-extraction cells below.
# It stays open on purpose because later cells read from it.
dataset = ZipFile(MAFAULDA_PATH)

# Every archive member listed in the metadata index, in index order.
filenames = files.index.tolist()
filenames[:10]

Time domain features

In [None]:
# Output path of the time-domain feature table.
FEATURES_FILENAME = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)

# Run features_time_domain over every file (parts=5 is forwarded to the extractor)
# and persist the combined table.
features = mafaulda.import_files_split(dataset, filenames, features_time_domain, parts=5)
features.to_csv(FEATURES_FILENAME, index=False)
features.head(10)

Frequency domain features

In [None]:
# Output path of the frequency-domain feature table.
# NOTE: this rebinds FEATURES_FILENAME and `features` from the time-domain cell.
FEATURES_FILENAME = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

# Run features_frequency_domain over every file (parts=5 is forwarded to the extractor)
# and persist the combined table.
features = mafaulda.import_files_split(dataset, filenames, features_frequency_domain, parts=5)
features.to_csv(FEATURES_FILENAME, index=False)
features.head(10)

Merge time and frequency domain features

In [None]:
# Paths of the two per-domain feature tables and of the merged output,
# all located inside FEATURES_PATH.
time_domain_filename, freq_domain_filename, merged_filename = (
    os.path.join(FEATURES_PATH, name)
    for name in (
        selection.TIME_FEATURES_PATH,
        selection.FREQ_FEATURES_PATH,
        selection.TIME_AND_FREQ_FEATURES_PATH,
    )
)

# Combine both tables and store the result next to its inputs.
result = selection.merge_feature_domains(time_domain_filename, freq_domain_filename)
result.to_csv(merged_filename, index=False)

result.head(5)

In [None]:
Exploratory data analysis (EDA) of features

In [None]:
# Centre and half-width of the RPM band used for the "*_rpm" selections below.
RPM = 2500
RPM_RANGE = 500

# Mapping from the fault label stored in the metadata to a short plot label.
SHAFT_FAULTS = {'normal': 'N', 'imbalance': 'I', 'horizontal-misalignment': 'HM', 'vertical-misalignment': 'VM'}
# Every key must be unique: a repeated key silently overwrites the earlier entry.
# NOTE(review): 'overhang-outer_race' mirrors 'underhang-outer_race' - confirm it
# matches the label produced for overhang outer-race recordings.
BEARING_FAULTS = {'overhang-cage_fault': 'O-Cage', 'underhang-cage_fault': 'U-Cage',
                  'underhang-ball_fault': 'U-Ball', 'overhang-ball_fault': 'O-Ball',
                  'underhang-outer_race': 'U-Race', 'overhang-outer_race': 'O-Race'}

# Row masks over the metadata index; isin() is given the mapping keys explicitly.
is_shaft_fault = files['fault'].isin(list(SHAFT_FAULTS))
is_bearing_fault = files['fault'].isin(list(BEARING_FAULTS))
in_rpm_band = files['rpm'].between(RPM - RPM_RANGE, RPM + RPM_RANGE, inclusive='both')

# Shaft faults: all recordings, and only those inside the RPM band.
shaft_fault_all = files[is_shaft_fault].copy()
shaft_fault_rpm = files[is_shaft_fault & in_rpm_band].copy()

# Bearing faults: all recordings, and only those inside the RPM band.
bearing_fault_all = files[is_bearing_fault].copy()
bearing_fault_rpm = files[is_bearing_fault & in_rpm_band].copy()

Frequency spectrum comparison of faults in low and high RPM

In [None]:
def plot_frequency_spectrum(dataset, file, axis, p, window=8192, dB=False, label=None, db_ref=1e-6):
    """Plot the spectrum of one recording onto an existing matplotlib Axes.

    Parameters
    ----------
    dataset : ZipFile
        Opened dataset archive, forwarded to ``mafaulda.csv_import``.
    file : str
        Archive member to load.
    axis
        Second argument of ``discovery.spectral_transform`` (callers in this
        notebook pass the string ``'ax'``; presumably a signal column name -
        confirm in ``discovery``).
    p : matplotlib Axes
        Axes that receives the labels, the curve and the grid.
    window : int
        Third argument of ``discovery.spectral_transform``.
    dB : bool
        When true, plot ``20 * log10(pxx / db_ref)`` instead of the raw values.
    label : str, optional
        Legend label of the plotted curve.
    db_ref : float
        Reference value for the dB conversion. The default of 1e-6 follows the
        ISO 1683 reference for vibration acceleration (1 um/s^2).
        NOTE(review): confirm this matches the reference used elsewhere in the
        project.
    """
    # Use the module-qualified loader, the same way the feature extractors do.
    ts = mafaulda.csv_import(dataset, file)
    f, pxx = discovery.spectral_transform(ts, axis, window)
    p.set_xlabel('Frequency [Hz]')
    if dB:
        p.set_ylabel('Amplitude [dB]')
        pxx = 20 * np.log10(pxx / db_ref)
    else:
        p.set_ylabel('Amplitude [m/s^2]')
    p.plot(f, pxx, label=label)
    p.grid(True)


def plot_rpm_comparison(files, fault, dB):
    """Overlay the spectra of the slowest and the fastest recording in ``files``.

    Parameters
    ----------
    files : pd.DataFrame
        Metadata rows indexed by archive member name, with an ``rpm`` column.
        All rows whose ``rpm`` equals the minimum or the maximum are plotted.
    fault : str
        Used as the figure title.
    dB : bool
        Forwarded to ``plot_frequency_spectrum``.
    """
    rpm = files['rpm']
    table = files[(rpm == rpm.min()) | (rpm == rpm.max())]

    # The archive is opened per call; the context manager releases the file
    # handle even when loading or plotting raises.
    with ZipFile(MAFAULDA_PATH) as dataset:
        fig, ax = plt.subplots(1, 1, figsize=(15, 3))
        ax.set_title(f'{fault}')
        for filename, series in table.iterrows():
            # 'ax' here is the signal selector passed to the spectrum helper,
            # not the matplotlib Axes object of the same name.
            plot_frequency_spectrum(dataset, filename, 'ax', ax, dB=dB, label=f'{series["rpm"]:.2f}')

    ax.set_xlim(0, 1000)
    ax.legend(loc="upper right")
    fig.tight_layout()
    plt.show()

Shaft faults (scale in m/s^2): frequency spectra at the lowest and highest RPM

In [None]:
# Re-read the metadata CSV, then relabel it using the SHAFT_FAULTS mapping.
files = pd.read_csv(MAFAULDA_METADATA, index_col='filename')
# NOTE(review): `fault_labeling` is neither defined nor imported in the visible
# notebook, so this raises NameError on a fresh kernel. Presumably it lives in one
# of the vibrodiagnostics modules - confirm which one and qualify the call.
# NOTE(review): the meaning of 0.8 is not visible here - confirm against fault_labeling.
files = fault_labeling(files, SHAFT_FAULTS, 0.8, debug=True)
files.head(5)

In [None]:
# One figure per (fault label, severity level) pair.
for fault, level in (('N', 0), ('I', 1), ('VM', 1), ('HM', 1)):
    is_selected = (files['fault'] == fault) & (files['severity_level'] == level)
    sources = files.loc[is_selected]
    plot_rpm_comparison(sources, fault, dB=False)

In [None]:
Bearing faults (scale in m/s^2): frequency spectra at the lowest and highest RPM

In [None]:
# Re-read the metadata CSV, then relabel it using the BEARING_FAULTS mapping.
files = pd.read_csv(MAFAULDA_METADATA, index_col='filename')
# NOTE(review): `fault_labeling` is neither defined nor imported in the visible
# notebook, so this raises NameError on a fresh kernel. Presumably it lives in one
# of the vibrodiagnostics modules - confirm which one and qualify the call.
# NOTE(review): the meaning of 0.8 is not visible here - confirm against fault_labeling.
files = fault_labeling(files, BEARING_FAULTS, 0.8, debug=True)
files.head(5)

In [None]:
# One figure per bearing-fault label, always at severity level 1.
level = 1
for fault in BEARING_FAULTS.values():
    is_selected = (files['fault'] == fault) & (files['severity_level'] == level)
    sources = files.loc[is_selected]
    plot_rpm_comparison(sources, fault, dB=False)