# Figures for IIT SRC article

In [None]:
import numpy as np
import os
import pandas as pd
from zipfile import ZipFile
import matplotlib.pyplot as plt
from scipy.signal import find_peaks, butter, lfilter, windows, welch
from typing import Tuple
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from adjustText import adjust_text


def spectral_transform(dataset: pd.DataFrame, axis: str, window: int, fs: int) -> Tuple[np.ndarray, np.ndarray]:
    """Estimate the one-sided power spectrum of one column with Welch's method.

    Segments use a Hann window, 50 % overlap, mean removal per segment and
    ``scaling='spectrum'``.

    Args:
        dataset: Table holding the signal.
        axis: Name of the column to analyse.
        window: Requested segment length in samples.
        fs: Sampling frequency of the signal.

    Returns:
        Tuple ``(f, pxx)`` of sample frequencies and spectrum values.
    """
    overlap = 0.5

    v = dataset[axis].to_numpy()
    # welch() shortens nperseg to the signal length on its own, but the overlap
    # was previously derived from the *requested* window, so a signal shorter
    # than half the window failed with "noverlap must be less than nperseg".
    # Clamp the segment first and derive the overlap from the clamped value.
    nperseg = min(window, len(v)) if len(v) else window
    noverlap = int(nperseg * overlap)

    f, pxx = welch(
        v,
        fs=fs,
        window='hann',
        nperseg=nperseg,
        noverlap=noverlap,
        scaling='spectrum',
        average='mean',
        detrend='constant',
        return_onesided=True
    )
    return f, pxx

In [None]:
# Sampling frequency; the CSV import uses 1 / mafaulda_fs_hz as the sample period.
mafaulda_fs_hz = 50000
# Column names assigned to a recording CSV, in file order.
mafaulda_all_columns = ['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic']
# Everything between the tachometer and the microphone column.
mafaulda_columns = mafaulda_all_columns[1:-1]


def rpm_calc(tachometer: pd.Series) -> float:
    """Estimate rotational speed from the spacing of tachometer pulses.

    Pulses are located with ``find_peaks`` (prominence of at least 3 and a
    width of at least 50 samples); the result is 60 divided by the mean
    spacing of the pulse positions taken from the series index.

    Args:
        tachometer: Tachometer signal. Assumes the index holds time in
            seconds, so that the result is in revolutions per minute.

    Returns:
        Estimated speed, or NaN when fewer than two pulses are found.
    """
    t = tachometer.index.to_numpy()
    y = tachometer.to_numpy()
    peaks, _ = find_peaks(y, prominence=3, width=50)
    # One pulse (or none) gives no interval; say so explicitly instead of
    # taking the mean of an empty array, which emits a RuntimeWarning.
    if len(peaks) < 2:
        return float('nan')
    interval = np.diff(t[peaks]).mean()
    return float(60 / interval)

def mafaulda_lowpass_filter(
        data: pd.Series,
        cutoff: int = mafaulda_fs_hz // 5,
        fs: int = mafaulda_fs_hz,
        order: int = 5) -> pd.Series:
    """Low-pass filter a signal with a Butterworth filter.

    The filter is applied in a single forward pass (``lfilter``).

    Args:
        data: Signal to filter.
        cutoff: Cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Order of the Butterworth filter.

    Returns:
        Filtered signal carrying the index of ``data``.
    """
    numerator, denominator = butter(N=order, Wn=cutoff, btype='lowpass', fs=fs)
    filtered = lfilter(numerator, denominator, data.to_numpy())
    return pd.Series(data=filtered, index=data.index)

def mafaulda_csv_import(dataset: ZipFile, filename: str) -> pd.DataFrame:
    """Load one MaFaulDa recording from the archive and precondition it.

    The recording is indexed by time ``t`` (row number / ``mafaulda_fs_hz``),
    gets an ``rpm`` column computed from the raw tachometer, and all signal
    columns are mean-centred and low-pass filtered.

    Args:
        dataset: Opened archive with the recordings.
        filename: Path of the CSV member inside the archive.

    Returns:
        Time-indexed table with the signal columns plus ``index``, ``rpm``
        and ``key`` (the member path).
    """
    columns = mafaulda_all_columns
    # Close the archive member as soon as it is parsed; it used to stay open.
    with dataset.open(filename) as member:
        ts = pd.read_csv(member, names=columns)
    T = 1 / mafaulda_fs_hz
    ts = (
        ts
        # NOTE: reset_index() leaves the old row number behind as an 'index'
        # column. It is kept so the returned columns stay exactly as before.
        .reset_index()
        .assign(t = lambda x: x.index * T)
        .set_index('t')
        # Speed is estimated before detrending and filtering alter the pulses.
        .assign(rpm = lambda x: rpm_calc(x.tachometer))
    )
    # Detrending
    ts[columns] = ts[columns].apply(lambda x: x - x.mean())
    # Low pass filter
    ts[columns] = ts[columns].apply(mafaulda_lowpass_filter)
    return ts.assign(key=filename)

In [None]:
# Sampling frequency; the TSV import uses 1 / fluidpump_fs_hz as the sample period.
fluidpump_fs_hz = 26866
# Column names assigned to a recording TSV, in file order.
fluidpump_all_columns = ['t', 'x', 'y', 'z']
# The signal columns: everything after the time column.
fluidpump_columns = fluidpump_all_columns[1:]

def fluidpump_csv_import(dataset: ZipFile, filename: str) -> pd.DataFrame:
    """Load a 5 s excerpt of one fluid-pump recording from the archive.

    The signal columns are multiplied by ``9.80665 / 1000``, the time axis is
    rebuilt from the row number and ``fluidpump_fs_hz``, the interval from
    10 to 15 on that axis is cut out and each signal column is mean-centred.

    Args:
        dataset: Opened archive with the recordings.
        filename: Path of the TSV member inside the archive.

    Returns:
        Table indexed by ``t`` with the columns ``x``, ``y`` and ``z``.
    """
    # Close the archive member as soon as it is parsed; it used to stay open.
    with dataset.open(filename) as member:
        ts = pd.read_csv(
            member,
            delimiter='\t',
            index_col=False,
            header=0,
            names=fluidpump_all_columns
        )
    g = 9.80665
    columns = fluidpump_columns
    # Presumably converts milli-g readings to m/s^2 - verify against the logger.
    ts[columns] = ts[columns].apply(lambda x: g * (x / 1000))

    # The recorded time column is replaced by a uniform axis.
    T = 1 / fluidpump_fs_hz
    ts = ts.assign(t = lambda x: x.index * T).set_index('t')

    start = 10
    duration = 5
    # Copy the excerpt: assigning columns on a .loc slice of the full recording
    # is chained assignment (SettingWithCopyWarning, undefined write target).
    ts = ts.loc[start:start + duration].copy()
    # Detrending
    ts[columns] = ts[columns].apply(lambda x: x - x.mean())
    return ts

#### Signal analysis
- 1 plot (6 subplots): MaFaulDa Welch spectrum of each fault (1 s, 2**14-sample Hann window) - largest severity, 2500 rpm
- 1 plot (6 subplots): custom dataset - spectrum of each measurement place recorded on the same day (5 s segment)

In [None]:
# MaFaulDa - worst severity at 2500 rpm
def plot_psd(ts, axname, fs, name, ax, window=2**14, xlim=2, ylim=0.1, index=None):
    """Draw the Welch spectrum of one signal column into one subplot.

    Args:
        ts: Table holding the signal.
        axname: Name of the column to analyse.
        fs: Sampling frequency of the signal.
        name: Title of the subplot.
        ax: Indexable collection of matplotlib axes.
        window: Welch segment length in samples.
        xlim: Upper limit of the frequency axis (frequencies are plotted
            divided by 1000).
        ylim: Upper limit of the vertical axis.
        index: Position of the target subplot in ``ax``. When omitted, the
            module-level loop variable ``i`` is used, which is how the
            function always behaved; prefer passing ``index`` explicitly.
    """
    freqs, pxx = spectral_transform(ts, axname, window, fs)
    freqs = freqs / 1000

    # Legacy fallback: callers that do not pass ``index`` rely on a global
    # ``i`` being set by their enclosing ``for i, ... in enumerate(...)`` loop.
    panel = ax[i if index is None else index]

    panel.plot(freqs, pxx, color='darkblue')
    panel.fill_between(freqs, pxx, color='lightblue', alpha=0.3)
    panel.grid(True)
    panel.set_xlabel('Frequency [kHz]')
    panel.set_ylabel('Amplitude [m/s\u00B2]')
    panel.set_xlim(0, xlim)
    panel.set_ylim(0, ylim)
    panel.set_title(name)


axname = 'ay'
filenames = [
    'normal/43.6224.csv',
    'horizontal-misalignment/2.0mm/42.5984.csv',
    'imbalance/35g/43.6224.csv',
    'underhang/cage_fault/35g/43.4176.csv',
    'underhang/ball_fault/35g/41.1648.csv',
    'underhang/outer_race/35g/43.4176.csv'
]
# The archive is only needed while the recordings are read; the context
# manager closes it afterwards instead of leaving the handle open.
with ZipFile('../datasets/MAFAULDA.zip') as dataset:
    fig, ax = plt.subplots(len(filenames), 1, figsize=(10, 10))
    # NOTE: plot_psd selects the subplot through the module-level loop
    # variable `i`, so the loop variable must keep this name.
    for i, name in enumerate(filenames):
        ts = mafaulda_csv_import(dataset, name)
        plot_psd(ts, axname, mafaulda_fs_hz, name, ax, xlim=2, ylim=0.1)

fig.tight_layout()
plt.show()

In [None]:
# Machines - recordings taken on the same day
axname = 'z'
filenames = [
    'compressor/2024-02-20/K3/001/1.tsv',
    'compressor/2024-02-20/K3/002/1.tsv',
    'pump/2024-02-27/KSB-1/MTR001/1.tsv',
    'pump/2024-02-27/KSB-1/MTR002/1.tsv',
    'pump/2024-02-27/KSB-1/PMP003/1.tsv',
    'pump/2024-02-27/KSB-1/PMP004/1.tsv',
]

# The archive is only needed while the recordings are read; the context
# manager closes it afterwards instead of leaving the handle open.
with ZipFile('../datasets/FluidPump.zip') as dataset:
    fig, ax = plt.subplots(len(filenames), 1, figsize=(10, 10))
    # NOTE: plot_psd selects the subplot through the module-level loop
    # variable `i`, so the loop variable must keep this name.
    for i, name in enumerate(filenames):
        ts = fluidpump_csv_import(dataset, name)
        # TODO: change name (the subplot title is currently the archive path)
        plot_psd(ts, axname, fluidpump_fs_hz, name, ax, xlim=5, ylim=0.5)

fig.tight_layout()
plt.show()

In [None]:
axname = 'z'
filenames = [
    'compressor/2024-02-20/K5/001/1.tsv',
    'compressor/2024-02-20/K5/002/1.tsv',
    'pump/2024-02-27/KSB-7/MTR001/1.tsv',
    'pump/2024-02-27/KSB-7/MTR002/1.tsv',
    'pump/2024-02-27/KSB-7/PMP003/1.tsv',
    'pump/2024-02-27/KSB-7/PMP004/1.tsv'
]

# The archive is only needed while the recordings are read; the context
# manager closes it afterwards instead of leaving the handle open.
with ZipFile('../datasets/FluidPump.zip') as dataset:
    fig, ax = plt.subplots(len(filenames), 1, figsize=(10, 10))
    # NOTE: plot_psd selects the subplot through the module-level loop
    # variable `i`, so the loop variable must keep this name.
    for i, name in enumerate(filenames):
        ts = fluidpump_csv_import(dataset, name)
        # TODO: change name (the subplot title is currently the archive path)
        plot_psd(ts, axname, fluidpump_fs_hz, name, ax, xlim=5, ylim=0.5)

fig.tight_layout()
plt.show()

#### Feature analysis:
MaFaulDa (3) and custom (4)
- 1 table (how many recordings each fault has)
- 1 plot (2 lines: TD, FD) - number of PCs vs. explained variance
- 1 plot (2 subplots: TD, FD) - loading plot (PC2)
- 1 figure (4 subplots) custom: all machines, pumps, compressors, motors

##### 1. Explained variance by PCA components and loading plots

In [None]:
def get_features_list(domains):
    """List domain-qualified feature names found in the feature tables.

    For every table, the part of each column name after the first lowercase
    ``prefix_`` is collected once; columns that do not match that pattern
    are ignored.

    Args:
        domains: Mapping of domain name to the CSV source of its table.

    Returns:
        List of ``"<domain>_<feature>"`` strings in order of first appearance.
    """
    collected = []
    for domain_name, source in domains.items():
        header = pd.read_csv(source).columns
        suffixes = header.str.extract(r'([a-z]+)_([a-z\_\-]+)')[1].unique()
        for suffix in suffixes:
            # Columns without a "prefix_feature" shape come back as missing.
            if pd.isnull(suffix):
                continue
            collected.append(f'{domain_name}_{suffix.strip("_")}')

    return collected


def load_whole_dataset(dataset: str, domain: str, axis: tuple, label_cols: list = None):
    """Load a feature table and reduce per-axis features to vector magnitudes.

    Every feature ``name`` is replaced by the Euclidean norm of its per-axis
    columns ``<axis>_<name>``.

    Args:
        dataset: CSV source of the feature table.
        domain: Domain name. Kept for interface compatibility; feature names
            are now taken directly from the table header.
        axis: Column prefixes of the axes to combine, e.g. ``('ax', 'ay', 'az')``.
        label_cols: Columns to return as labels, or None for no labels.

    Returns:
        Tuple ``(X, Y)``: magnitudes with one column per feature, and the label
        columns (an empty DataFrame when ``label_cols`` is None).

    Raises:
        KeyError: If a feature lacks a column for one of the axes.
    """
    # The table is read once. It used to be parsed a second time just to list
    # the feature names, which also ruled out non-rewindable sources.
    features = pd.read_csv(dataset)

    axis_mask = features.columns.str.startswith(axis)
    X = features.loc[:, axis_mask]
    Y = features[label_cols] if label_cols is not None else pd.DataFrame()

    # Same header pattern as get_features_list(), without prefixing the domain
    # and stripping it again; that round trip truncated hyphenated names.
    suffixes = features.columns.str.extract(r'([a-z]+)_([a-z\_\-]+)')[1].unique()
    names = [suffix.strip('_') for suffix in suffixes if not pd.isnull(suffix)]

    # Build all columns at once with a vectorised norm instead of inserting
    # them one by one through a row-wise apply.
    magnitudes = {
        name: np.linalg.norm(
            X[[f'{dim}_{name}' for dim in axis]].to_numpy(), axis=1
        )
        for name in names
    }
    return pd.DataFrame(magnitudes, index=X.index), Y


def explained_variance(X, n_components=10):
    """Return the explained-variance ratio of the leading principal components.

    Features are min-max scaled before the PCA is fitted.

    Args:
        X: Feature table (rows are samples, columns are features).
        n_components: Maximum number of components to keep. It is capped at
            the number of rows and columns of ``X``, because PCA rejects a
            larger request.

    Returns:
        Array with the explained-variance ratio of each kept component.
    """
    x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)
    # Small subsets (few recordings or few features) cannot yield 10 components.
    n_components = min(n_components, *X.shape)
    pca = PCA(n_components=n_components)
    pca.fit(x_scaled)
    return pca.explained_variance_ratio_


def get_principal_components(X):
    """Return the first two principal axes of the min-max scaled features.

    Args:
        X: Feature table (rows are samples, columns are features).

    Returns:
        The fitted ``components_`` array: one row per component, one column
        per feature.
    """
    scaled_values = MinMaxScaler().fit_transform(X)
    x_scaled = pd.DataFrame(scaled_values, columns=X.columns)
    model = PCA(n_components=2)
    model.fit(x_scaled)
    return model.components_


def plot_cumulative_explained_variance(td_variance, fd_variance):
    """Plot cumulative explained variance against the number of components.

    Args:
        td_variance: Explained-variance ratios drawn as 'Temporal features'.
        fd_variance: Explained-variance ratios drawn as 'Spectral features'.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    curves = (
        (td_variance, 'Temporal features'),
        (fd_variance, 'Spectral features'),
    )
    for ratios, legend_label in curves:
        component_counts = np.arange(1, len(ratios) + 1)
        # Ratios are fractions; the axis is labelled in percent.
        cumulative_percent = 100 * np.cumsum(ratios)
        ax.plot(component_counts, cumulative_percent, marker='s', label=legend_label)

    ax.set_xlabel('Number of principal components')
    ax.set_ylabel('Explained variance [%]')
    ax.grid()
    ax.legend()
    plt.show()


def loading_plot(loadings, feature_names, bottom, top):
    """Draw a PC1/PC2 loading plot with one labelled arrow per feature.

    Args:
        loadings: Indexable whose first two entries hold the PC1 and PC2
            loadings of every feature.
        feature_names: Label of each feature, in the order of the loadings.
        bottom: Lower limit applied to both axes.
        top: Upper limit applied to both axes.
    """
    pc1, pc2 = loadings[0], loadings[1]

    labels = []
    for position, feature in enumerate(feature_names):
        dx, dy = pc1[position], pc2[position]
        # Arrow from the origin to the feature's (PC1, PC2) loading.
        plt.arrow(0, 0, dx, dy, color='r', head_width=0.01)
        labels.append(plt.text(dx, dy, feature))

    # Shift labels vertically only, to reduce overlaps.
    adjust_text(labels, only_move={'points':'y', 'texts':'y'})
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.xlim(bottom, top)
    plt.ylim(bottom, top)
    plt.grid()
    plt.show()

MaFaulDa

In [None]:
# Feature tables of the MaFaulDa dataset: temporal (TD) and spectral (FD).
FEATURES_PATH = '../datasets/features'
dataset = dict(
    td_features=os.path.join(FEATURES_PATH, 'mafaulda_temporal.csv'),
    fd_features=os.path.join(FEATURES_PATH, 'mafaulda_spectral.csv'),
    axis=('ax', 'ay', 'az'),
    labels=['fault', 'severity'],
)
# Features (X) and labels (Y) of both domains are stored next to their settings.
dataset['X_td'], dataset['Y_td'] = load_whole_dataset(
    dataset['td_features'], 'temporal', dataset['axis'], dataset['labels']
)
dataset['X_fd'], dataset['Y_fd'] = load_whole_dataset(
    dataset['fd_features'], 'spectral', dataset['axis'], dataset['labels']
)

In [None]:
# Cumulative explained variance of the temporal (TD) and spectral (FD) features
td_variance, fd_variance = (
    explained_variance(dataset[key]) for key in ('X_td', 'X_fd')
)
plot_cumulative_explained_variance(td_variance, fd_variance)

In [None]:
# Loading plots of the first two principal components (TD first, then FD)
td_pc, fd_pc = (
    get_principal_components(dataset[key]) for key in ('X_td', 'X_fd')
)
loading_plot(td_pc, dataset['X_td'].columns, -0.5, 1)
loading_plot(fd_pc, dataset['X_fd'].columns, -0.5, 1)

Fluid pumps

In [None]:
# Pump dataset (all devices, each type - pump, motor, compressor):
# temporal (TD) and spectral (FD) feature tables.
FEATURES_PATH = '../datasets/features'
dataset = dict(
    td_features=os.path.join(FEATURES_PATH, 'fluidpump_temporal.csv'),
    fd_features=os.path.join(FEATURES_PATH, 'fluidpump_spectral.csv'),
    axis=('x', 'y', 'z'),
    labels=['device', 'position'],
)
# Features (X) and labels (Y) of both domains are stored next to their settings.
dataset['X_td'], dataset['Y_td'] = load_whole_dataset(
    dataset['td_features'], 'temporal', dataset['axis'], dataset['labels']
)
dataset['X_fd'], dataset['Y_fd'] = load_whole_dataset(
    dataset['fd_features'], 'spectral', dataset['axis'], dataset['labels']
)

In [None]:
# Cumulative explained variance of the temporal (TD) and spectral (FD) features
td_variance, fd_variance = (
    explained_variance(dataset[key]) for key in ('X_td', 'X_fd')
)
plot_cumulative_explained_variance(td_variance, fd_variance)

In [None]:
# Loading plots of the first two principal components (TD first, then FD)
td_pc, fd_pc = (
    get_principal_components(dataset[key]) for key in ('X_td', 'X_fd')
)
loading_plot(td_pc, dataset['X_td'].columns, -0.5, 1)
loading_plot(fd_pc, dataset['X_fd'].columns, -0.5, 1)

In [None]:
# Split by machine
# `Y`, `X_td` and `X_fd` were used below without ever being defined, so the
# cell stopped with a NameError. Bind them to the loaded tables first.
# NOTE(review): labels come from the temporal table; this assumes the temporal
# and spectral tables share the same row index - confirm.
Y = dataset['Y_td']
X_td = dataset['X_td']
X_fd = dataset['X_fd']

ksb = Y['device'].isin(['KSB-1', 'KSB-7'])
compressor = Y[Y['device'].isin(['K3', 'K5'])].index
pump = Y[ksb & Y['position'].isin(['PMP003', 'PMP004'])].index
motor = Y[ksb & Y['position'].isin(['MTR001', 'MTR002'])].index

# The same pair of figures (explained variance, loading plots) per machine type.
for rows in (compressor, pump, motor):
    td_variance = explained_variance(X_td.loc[rows])
    fd_variance = explained_variance(X_fd.loc[rows])
    plot_cumulative_explained_variance(td_variance, fd_variance)

    td_pc = get_principal_components(X_td.loc[rows])
    fd_pc = get_principal_components(X_fd.loc[rows])
    loading_plot(td_pc, X_td.columns, -0.5, 1)
    loading_plot(fd_pc, X_fd.columns, -0.5, 1)


##### 1A. Class labels count

##### 2. Scatter plots of labels
- 1 figure (5 subplots) scatter: MaFaulDa, all machines, pumps, compressors, motors

Classification accuracy (choice of k and feature count, 5-fold cross-validation)

For MaFaulDa and the custom dataset (which classes - all or just one machine)
- 1 plot, all features (2 subplots: TD, FD)
	- Each subplot is a boxplot (k = 3, 5, 7)

All models (exhaustive) - draw rank, corr, f-stat, mi as horizontal lines
	- 3 plots (2, 3, 4 features)
		- Each plot has 2 boxplot subplots (TD, FD) - k-neighbours vs. accuracy of all models


Compare the accuracies of the best models in each category for a given number of features and k:
- 1 plot - bar chart - rainbow colours - one x group (TD), second x group (FD)