# EDA - Exploratory analysis of pumps dataset
Collected vibration dataset of water pumps and compressors

In [None]:
import os
import re
from typing import List, Tuple
from datetime import datetime, date
from zipfile import ZipFile

import numpy as np
import pandas as pd
from scipy.signal import windows, welch
import matplotlib.pylab as plt
from tqdm.notebook import tqdm

import sys
sys.path.append('../')
from vibrodiagnostics import pumps, extraction
plt.rcParams.update({'font.size': 14})

Paths in the file system to pump dataset and CSV files with extracted features (pumps.ipynb)

In [None]:
# Dataset root, relative to the notebook's working directory.
PATH = '../datasets'
# Folder with the extracted feature tables and the raw recordings archive.
FEATURES_PATH = os.path.join(PATH, 'features')
DATASET_PATH = os.path.join(PATH, 'FluidPump.zip')
# One feature table per domain key ('TD' and 'FD').
FEATURES = {
    domain: os.path.join(FEATURES_PATH, f'PUMPS_{domain}.csv')
    for domain in ('TD', 'FD')
}

Set analysed sensor axis and FFT window size.

In [None]:
# Sensor axes that can be analysed; the first one is the axis used by default below.
POSSIBLE_DIRECTIONS = list('xyz')
AXIS = POSSIBLE_DIRECTIONS[0]
# Number of samples per FFT window (2**14).
FFT_WINDOW = 1 << 14

Side by side Welch's frequency spectra at the same places on different days

In [None]:
# Load every recording from the dataset archive.
# `all_metadata` collects the descriptive fields only; `waveforms` holds the same
# fields plus the imported time series under the key 'ts'.
waveforms = []
all_metadata = []

# The context manager closes the archive once all members have been read.
with ZipFile(DATASET_PATH) as source:
    filenames = extraction.list_files(source)

    for name in tqdm(filenames):
        ts = pumps.csv_import(source, name)
        # ZIP member names use '/' on every platform, so splitting on os.path.sep
        # fails on Windows; accept both separators instead.
        header = re.split(r'[\\/]', name)
        # The last four path components are <date>/<device>/<position>/<seq>.<ext>.
        metadata = {
            'date': datetime.fromisoformat(header[-4]).date(),
            'device': header[-3],
            'position': header[-2],
            'seq': int(header[-1].split('.')[0]),
        }

        all_metadata.append(metadata)
        waveforms.append({**metadata, 'ts': ts})

In [None]:
# Turn the per-file metadata records (date, device, position, seq) into a table.
# NOTE(review): the name is rebound from a list of dicts to a DataFrame, so this cell
# is meant to run once after the loading cell — confirm before re-running it alone.
all_metadata = pd.DataFrame.from_records(all_metadata)
all_metadata

#### Plot one spectrum in all axes

In [None]:
# Recording to inspect: every key must equal the corresponding field of a waveform record.
query = {
    'date': date(2024, 2, 27),
    'device': 'KSB7',
    'position': 'PMP003',
    'seq': 1
}
# Upper limit of the frequency axis in the spectrum plots [kHz].
F_LIMIT_KHZ = 2
axis = pumps.BEARINGS_COLUMNS
Fs = pumps.SAMPLING_RATE

matches = [
    record for record in waveforms
    if all(record[key] == value for key, value in query.items())
]
# Fail with an explicit message instead of an anonymous "list index out of range".
if not matches:
    raise IndexError(f'No recording matches the query: {query}')

# Use the first match; set_index returns a new frame, the stored record is untouched.
ts = matches[0]['ts'].set_index('t')
ts

#### Histograms

In [None]:
# Distribution of the sample values of the selected recording, one panel per column
# (the 1x3 layout presumes three signal columns — TODO confirm).
ax = ts.hist(
    bins=100,
    layout=(1, 3),
    figsize=(20, 4),
    grid=True,
    edgecolor='black',
    linewidth=0.5,
)
plt.show()

#### Waveform in full length

In [None]:
# Whole recording, one subplot per column of `ts`.
ax = ts.plot(subplots=True, grid=True, figsize=(20, 8))
for i, axname in enumerate(axis):
    ax[i].set(
        xlabel='Time [s]',
        ylabel=f'Amplitude ({axname}) [m/s\u00b2]',
    )
plt.show()

#### Waveform in detail
- starting `T_WAVEFORM` seconds into the recording
- showing an interval of `TIME` seconds

In [None]:
T_WAVEFORM = 10
TIME = 0.1

# Convert the start time and the interval length to sample positions.
first_sample = int(T_WAVEFORM * Fs)
last_sample = int(T_WAVEFORM * Fs + Fs * TIME)

detail = ts[axis].iloc[first_sample:last_sample]
ax = detail.plot(figsize=(20, 8), grid=True, subplots=True)

for i, axname in enumerate(axis):
    ax[i].set(
        xlabel='Time [s]',
        ylabel=f'Amplitude ({axname}) [m/s^2]',
    )
plt.show()

In [None]:
# Spectrum of the selected recording, one panel per axis, limited to F_LIMIT_KHZ.
fig, ax = plt.subplots(3, 1, figsize=(20, 10))
ts_range = ts[:]
#ts_range = ts.loc[:10]

for i, axname in enumerate(axis):
    panel = ax[i]
    # spectral_transform is a project helper; it is used here as returning
    # (frequencies, amplitudes) for one column.
    freqs, pxx = extraction.spectral_transform(ts_range[axname], FFT_WINDOW, Fs)
    freqs /= 1000  # the frequency axis is labelled in kHz
    panel.plot(freqs, pxx, color='darkblue')
    panel.fill_between(freqs, pxx, color='lightblue', alpha=0.3)
    panel.grid(True)
    panel.set(
        xlim=(0, F_LIMIT_KHZ),
        xlabel='Frequency [kHz]',
        ylabel='Amplitude [m/s\u00B2]',
    )

#### Histograms based on the machine

In [None]:
# Lookup table: device -> measurement position -> machine label.
# The first three devices share the four MTR*/PMP* positions and are numbered 1..3
# (labels M<n> / P<n>); the K3 and K5 devices have two positions each (labels C<n>).
machines = {
    **{
        device: {
            'MTR001': f'M{number}',
            'MTR002': f'M{number}',
            'PMP003': f'P{number}',
            'PMP004': f'P{number}',
        }
        for number, device in enumerate(('KSB1', 'KSB7', 'Sigma'), start=1)
    },
    **{
        device: {
            '001': f'C{number}',
            '002': f'C{number}',
        }
        for number, device in enumerate(('K3', 'K5'), start=1)
    },
}

# One row per recording (metadata plus the 'ts' frame), passed through the project
# helper pumps.get_classes together with the `machines` lookup table.
machine_waveform = (
    pd.DataFrame.from_records(waveforms)
    .pipe(pumps.get_classes, machines, keep=True)
)
machine_waveform

In [None]:
# Histogram of all samples per machine label, one panel per sensor direction.
for name, group in machine_waveform.groupby(by='label', observed=False):
    print(name)
    # Stack the time series of every recording that carries this label.
    # Kept under its own name: rebinding the notebook-level `axis` (the list of
    # column names) to a DataFrame would break re-running the earlier cells.
    label_samples = pd.concat(list(group['ts']))
    print(len(label_samples))
    ax = label_samples[POSSIBLE_DIRECTIONS].hist(
        figsize=(20, 4),
        grid=True,
        bins=100,
        layout=(1, 3),
        edgecolor='black',
        linewidth=0.5
    )
    # hist() with layout=(1, 3) yields a 2-D grid of axes; label the single row.
    for i in range(len(POSSIBLE_DIRECTIONS)):
        ax[0][i].set_xlabel('Amplitude [m/s\u00B2]')
    plt.show()

#### Frequency spectrum of one window

In [None]:
# NOTE(review): disabled draft, kept for reference only. If re-enabled, be aware that
# it reads `axis`, `Fs` and `F_LIMIT_KHZ` from notebook globals and uses `name` in the
# title although `name` is not a parameter of the function.
# def window_frequency_spectrum(ts: pd.DataFrame, t: float, nfft: int = FFT_WINDOW, dB=False):
#     fig, ax = plt.subplots(len(axis), 1, figsize=(20, 10))
#     i = 0
#     for axname in axis:
#         signal = ts[axname].loc[t:t+nfft/Fs].to_numpy()
#         n = len(signal)
#         pxx = np.abs(np.fft.rfft(signal * windows.hann(n)) / n)
#         if dB is True:
#             pxx = 20 * np.log10(pxx / 0.000001)  # 0 dB reference = 1 um/s^2
#         freqs = np.fft.fftfreq(n, d=1/Fs)[:len(pxx)]

#         ax[i].plot(freqs, pxx, color='darkblue')
#         ax[i].fill_between(freqs, pxx, color='lightblue', alpha=0.3)
#         ax[i].grid(True)
#         ax[i].set_xlabel('Frequency [Hz]')
#         ax[i].set_ylabel('Amplitude [m/s\u00B2]')
#         ax[i].set_xlim(0, F_LIMIT_KHZ * 1000)
#         ax[i].set_title(f'{axname.upper()}, {name}')
#         i += 1
#     plt.tight_layout()
#     plt.show()


# window_frequency_spectrum(ts, t=10, dB=False)

In [None]:
def spectogram(x, nfft, fs, xlim=None, ylim=None, overlap=0.66, vmin=-70):
    """Draw a magnitude spectrogram of one signal on a new figure.

    The figure is created but not shown; call ``plt.show()`` afterwards.

    Args:
        x: One-dimensional sequence of samples.
        nfft: Number of samples per FFT segment.
        fs: Sampling frequency of ``x``; the axes are labelled in seconds and Hz.
        xlim: Optional ``(min, max)`` limits of the time axis.
        ylim: Optional ``(min, max)`` limits of the frequency axis.
        overlap: Fraction of a segment shared with the next one, ``0 <= overlap < 1``.
        vmin: Lower bound of the colour scale of the dB-scaled image.

    Returns:
        Tuple ``(freqs, t, pxx)`` with the frequencies, segment times and spectrum
        exactly as returned by ``Axes.specgram``.

    Raises:
        ValueError: If ``overlap`` is outside ``[0, 1)``.
    """
    # noverlap has to stay below NFFT, so reject fractions outside [0, 1) up front.
    if not 0 <= overlap < 1:
        raise ValueError(f'overlap must be in [0, 1), got {overlap!r}')

    fig, ax = plt.subplots(figsize=(15, 4))
    # Draw on the axes created above instead of relying on pyplot's "current axes".
    pxx, freqs, t, im = ax.specgram(
        x, NFFT=nfft, Fs=fs,
        detrend='mean',
        mode='magnitude',
        scale='dB',
        vmin=vmin,
        cmap=plt.get_cmap('inferno'),
        noverlap=int(nfft * overlap)
    )
    fig.colorbar(im, aspect=20, pad=0.04)
    ax.set_xlabel('Time [s]')
    ax.set_ylabel('Frequency [Hz]')
    if xlim is not None:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)
    return freqs, t, pxx

# Spectrogram of the selected recording for each sensor direction (FFT segment of 2**12 samples).
for axname in ('x', 'y', 'z'):
    print(axname)
    spectogram(ts[axname], nfft=2**12, fs=pumps.SAMPLING_RATE)
    plt.show()

In [None]:
def side_by_side(
        machines: list,
        plot_type: str = 'psd',
        axis: str = 'x',
        device: str | None = None,
        position: str | None = None,
        figsize: tuple | None = None,
        xlim: tuple | None = 5,
        ylim: tuple | None = None):

    if device is not None:
        machines = filter(lambda m: re.match(device, m['device']) is not None, machines)
    if position is not None:
        machines = filter(lambda m: re.match(position, m['position']) is not None, machines)

    machines = list(machines)
    rows = set([(c['date'], c['seq']) for c in machines])
    columns = set([(c['device'], c['position']) for c in machines])
    rows = len(rows)

    if figsize is None:
        figsize = (15, 20)
    fig, ax = plt.subplots(rows, len(columns), figsize=figsize)
    if rows == 1:
        ax = [ax]

    for j, col in enumerate(sorted(list(columns))):
        device, position = col
        placements = list(filter(lambda m: m['device'] == device and m['position'] == position, machines))
        
        for i, row in enumerate(sorted(placements, key=lambda m: (m['date'], m['seq']))):
            date, seq = row['date'], row['seq']
            ts = row['ts']
            try:
                subplot = ax[i][j]
            except:
                continue
        
            if plot_type == 'psd':
                freqs, pxx = extraction.spectral_transform(ts[axis], FFT_WINDOW, Fs)
                subplot.plot(freqs / 1000, pxx, color='darkblue', linewidth=0.5)

                ax[-1][j].set_xlabel('Frequency [kHz]')
                ax[i][0].set_ylabel('Amplitude [m/s\u00B2]')
    
            elif plot_type == 'psd_db':
                freqs, pxx = extraction.spectral_transform(ts[axis], FFT_WINDOW, Fs)
                pxx = dB = 20 * np.log10(pxx / 0.000001)    # 1 dB = 1 um/s^2   
                subplot.plot(freqs / 1000, pxx, color='darkblue', linewidth=0.5)

                subplot.set_xlim(0, 5)
                ax[-1][j].set_xlabel('Frequency [kHz]')
                ax[i][0].set_ylabel('Amplitude [dB]')

            elif plot_type == 'hist':
                subplot.hist(ts[axis], color='darkblue', bins=100, edgecolor='black', linewidth=0.5)

                ax[-1][j].set_xlabel('Amplitude [m/s\u00B2]')
                ax[i][0].set_ylabel('Count samples')
            
            if xlim is not None:
                subplot.set_xlim(0, xlim)
            if ylim is not None:
                subplot.set_ylim(0, ylim)

            subplot.grid(True)
            subplot.set_title(f'{device}, {position} | {date}, {seq}.')
                
    plt.tight_layout()
    plt.show()

#### Wideband frequency spectrum

In [None]:
# Devices named K<digits>, numeric positions; amplitude axis capped at 1.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'K\d+',
    position=r'\d+',
    figsize=(15, 20),
    ylim=1,
)

In [None]:
# Sigma device, all positions; default axis limits.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'Sigma',
    position=r'.*',
    figsize=(15, 5),
)

In [None]:
# KSB devices, MTR positions; amplitude axis capped at 0.7.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'KSB\d+',
    position=r'MTR\d+',
    figsize=(20, 15),
    ylim=0.7,
)

In [None]:
# KSB devices, PMP positions; default axis limits.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'KSB\d+',
    position=r'PMP\d+',
    figsize=(20, 15),
)

#### Frequency spectra - Low frequencies

In [None]:
# Devices named K<digits>, numeric positions; frequency axis up to 0.8, amplitude up to 1.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'K\d+',
    position=r'\d+',
    figsize=(15, 20),
    xlim=0.8,
    ylim=1,
)

In [None]:
# Sigma device, all positions; frequency axis up to 1, amplitude up to 20.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'Sigma',
    position=r'.*',
    figsize=(15, 5),
    xlim=1,
    ylim=20,
)

In [None]:
# KSB devices, PMP positions; frequency axis up to 1, amplitude up to 0.1.
side_by_side(
    waveforms,
    plot_type='psd',
    axis=AXIS,
    device=r'KSB\d+',
    position=r'PMP\d+',
    figsize=(20, 15),
    xlim=1,
    ylim=0.1,
)

### Histograms of time domain

In [None]:
# Devices named K<digits>, numeric positions; xlim=None keeps the histogram's own x range.
side_by_side(
    waveforms,
    plot_type='hist',
    axis=AXIS,
    device=r'K\d+',
    position=r'\d+',
    figsize=(20, 15),
    xlim=None,
)

In [None]:
# Sigma device, all positions; xlim=None keeps the histogram's own x range.
side_by_side(
    waveforms,
    plot_type='hist',
    axis=AXIS,
    device=r'Sigma',
    position=r'.*',
    figsize=(20, 5),
    xlim=None,
)

In [None]:
# KSB devices, MTR positions; xlim=None keeps the histogram's own x range.
side_by_side(
    waveforms,
    plot_type='hist',
    axis=AXIS,
    device=r'KSB\d+',
    position=r'MTR\d+',
    figsize=(20, 15),
    xlim=None,
)

In [None]:
# KSB devices, PMP positions; xlim=None keeps the histogram's own x range.
side_by_side(
    waveforms,
    plot_type='hist',
    axis=AXIS,
    device=r'KSB\d+',
    position=r'PMP\d+',
    figsize=(20, 15),
    xlim=None,
)

### Water pump slow down and speed up

In [None]:
# Spectrograms of the KSB1 slow-down recording, one figure per bearing column,
# zoomed to 60-90 s and 0-120 Hz.
nfft = FFT_WINDOW
filename = os.path.join(PATH, 'ksb-cloud/misc-fluid-pump/KSB1-slow-down/1.tsv')
# Cell-specific names: reusing `ts` / `axis` here would overwrite the notebook-level
# recording and column list that the earlier cells depend on.
ts_slow_down = pumps.csv_import(None, filename)
for axname in pumps.BEARINGS_COLUMNS:
    spectogram(ts_slow_down[axname], nfft, pumps.SAMPLING_RATE, xlim=(60, 90), ylim=(0, 120))
    plt.show()

The pump KSB-7 is turned off. The old Sigma pump next to KSB-7 is still turned on.

In [None]:
# Spectrograms of the first KSB7 speed-up recording, one figure per bearing column,
# with a shorter FFT segment (2**11 samples) and the frequency axis limited to 0-3000 Hz.
filename = os.path.join(PATH, 'ksb-cloud/misc-fluid-pump/KSB7-speed-up/1.tsv')
nfft = 2 ** 11
# Cell-specific names: reusing `ts` / `axis` here would overwrite the notebook-level
# recording and column list that the earlier cells depend on.
ts_speed_up = pumps.csv_import(None, filename)
for axname in pumps.BEARINGS_COLUMNS:
    spectogram(ts_speed_up[axname], nfft, pumps.SAMPLING_RATE, ylim=(0, 3000))
    plt.show()

The pump KSB-7 is speeding up. Sigma pump is turned off and KSB-7 is turned on.

In [None]:
# Spectrograms of the second KSB7 speed-up recording, one figure per bearing column,
# zoomed to 2-18 s and 0-3000 Hz; the column name is printed above each figure.
filename = os.path.join(PATH, 'ksb-cloud/misc-fluid-pump/KSB7-speed-up/2.tsv')
nfft = 2 ** 11
# Cell-specific names: reusing `ts` / `axis` here would overwrite the notebook-level
# recording and column list that the earlier cells depend on.
ts_speed_up = pumps.csv_import(None, filename)
for axname in pumps.BEARINGS_COLUMNS:
    print(axname)
    spectogram(ts_speed_up[axname], nfft, pumps.SAMPLING_RATE, xlim=(2, 18), ylim=(0, 3000))
    plt.show()