# Fluid pumps dataset EDA

In [None]:
import os
from typing import List, Tuple
from datetime import datetime, date
from zipfile import ZipFile

import numpy as np
import pandas as pd
from scipy.signal import windows
import matplotlib.pylab as plt
from scipy.signal import welch

In [None]:
# Locations of the raw dataset archive and of the derived feature tables.
path_root = '../../datasets/'
dataset = 'FluidPump.zip'
path_features = os.path.join(path_root, 'features')
# Lower-cased archive stem; used as the prefix of the feature file names.
name = dataset.partition('.')[0].lower()

opt = dict(
    dataset=os.path.join(path_root, dataset),
    temporal_features=os.path.join(path_features, name + '_temporal.csv'),
    spectral_features=os.path.join(path_features, name + '_spectral.csv'),
)

#### Side-by-side Welch spectra from the same places on different days

In [None]:
# Signal channels of a recording, and the full column layout applied to the
# raw files ('t' followed by the three channels).
fluidpump_columns = ['x', 'y', 'z']
fluidpump_all_columns = ['t'] + fluidpump_columns
# Sampling frequency in Hz; used to rebuild the time axis on import.
fluidpump_fs_hz = 26866

def list_files(dataset: ZipFile) -> List[str]:
    """Return the sorted names of all ``.csv`` / ``.tsv`` members of *dataset*.

    Members with any other suffix (including directory entries) are ignored.
    """
    wanted_suffixes = ('.csv', '.tsv')
    return sorted(
        member.filename
        for member in dataset.infolist()
        if member.filename.endswith(wanted_suffixes)
    )

def fluidpump_csv_import(dataset: ZipFile, filename: str) -> pd.DataFrame:
    """Load one recording from the archive as a time-indexed DataFrame.

    The member is parsed as tab-separated text. Its first row is treated as
    a header and replaced by ``fluidpump_all_columns``. The channel columns
    are rescaled by ``g / 1000``, and the 't' column is overwritten with
    ``sample_number / fluidpump_fs_hz`` before it becomes the index.

    Args:
        dataset: Open archive containing the recording.
        filename: Name of the member to read.

    Returns:
        DataFrame indexed by 't' with the columns in ``fluidpump_columns``.
    """
    # Release the archive member as soon as it is parsed; pandas does not
    # close a handle it did not open itself.
    with dataset.open(filename) as member:
        ts = pd.read_csv(
            member,
            delimiter='\t',
            index_col=False,
            header=0,
            names=fluidpump_all_columns
        )

    # Standard gravity; presumably the raw samples are in milli-g, which this
    # converts to m/s^2 -- TODO confirm against the sensor documentation.
    g = 9.80665
    columns = fluidpump_columns
    ts[columns] = g * (ts[columns] / 1000)

    # Uniform time axis derived from the row number and the sampling period.
    T = 1 / fluidpump_fs_hz
    ts = ts.assign(t=lambda x: x.index * T)
    ts.set_index('t', inplace=True)
    return ts

In [None]:
# Read every recording once. `waveforms` keeps the metadata together with
# the samples (key 'ts'); `all_metadata` keeps the metadata only.
waveforms = []
all_metadata = []
with ZipFile(opt['dataset']) as source:
    filenames = list_files(source)

    # The loop variable is deliberately not called `name`, so the dataset
    # name defined in the configuration cell is not overwritten.
    for filename in filenames:
        ts = fluidpump_csv_import(source, filename)
        # ZIP member names always use '/' as separator, on every platform.
        # The trailing components are read as place/date/device/position/seq.ext
        header = filename.split('/')
        metadata = {
            'place': header[-5],
            'date': datetime.fromisoformat(header[-4]).date(),
            'device': header[-3],
            'position': header[-2],
            'seq': int(header[-1].split('.')[0]),
        }

        all_metadata.append(metadata)
        waveforms.append({**metadata, 'ts': ts})

In [None]:
# Turn the list of per-recording metadata dicts into a table (one row per
# recording) and display it. The name `all_metadata` is rebound to the table.
all_metadata = pd.DataFrame.from_records(all_metadata)
all_metadata

In [None]:
def spectral_transform(dataset: pd.DataFrame, axis: str, window: int, fs: int) -> Tuple[np.ndarray, np.ndarray]:
    """Estimate the spectrum of one channel with Welch's method.

    Uses Hann-windowed segments with 50 % overlap, mean removal per segment,
    mean averaging and ``scaling='spectrum'``.

    Args:
        dataset: Recording holding the channel as a column.
        axis: Name of the column to analyse.
        window: Requested segment length in samples. It is reduced to the
            signal length when the recording is shorter than one segment.
        fs: Sampling frequency in Hz.

    Returns:
        Tuple ``(f, pxx)``: the one-sided frequency bins and the spectrum
        estimate for each bin.
    """
    overlap = 0.5
    v = dataset[axis].to_numpy()

    # welch() shortens nperseg for short inputs but leaves an explicit
    # noverlap untouched, and then fails with "noverlap must be less than
    # nperseg". Clamp the segment first so the overlap always fits.
    nperseg = min(window, len(v)) if len(v) else window
    step = int(nperseg * overlap)

    f, pxx = welch(
        v,
        fs=fs,
        window='hann',
        nperseg=nperseg,
        noverlap=step,
        scaling='spectrum',
        average='mean',
        detrend='constant',
        return_onesided=True
    )
    return f, pxx


def side_by_side(
        waveforms: list,
        location: str,
        plot_type: str = 'psd', # psd, psd_db, hist
        axis: str = 'x',
        filt_pos: str | None = None,
        figsize: tuple | None = None,
        ylim: float | None = None):
    """Draw a grid of plots comparing all recordings taken at one place.

    Each grid column is one (device, position) pair. Within a column the
    recordings are stacked top to bottom, ordered by (date, seq). Rows are
    therefore filled per column and are not aligned by date across columns.

    Args:
        waveforms: Recording dicts with the keys 'place', 'date', 'device',
            'position', 'seq' and 'ts' (the samples as a DataFrame).
        location: Only recordings whose 'place' equals this value are shown.
        plot_type: 'psd' (Welch spectrum), 'psd_db' (the same on a dB scale)
            or 'hist' (histogram of the samples).
        axis: Column of the recording to plot.
        filt_pos: If given, keep only recordings whose 'position' contains
            this substring.
        figsize: Figure size; defaults to (15, 20).
        ylim: If given, upper y-axis limit (the lower limit is fixed at 0).

    Raises:
        ValueError: If `plot_type` is unknown or no recording matches.
    """
    if plot_type not in ('psd', 'psd_db', 'hist'):
        raise ValueError(f'unknown plot_type: {plot_type!r}')

    machines = [m for m in waveforms if m['place'] == location]
    if filt_pos is not None:
        machines = [m for m in machines if filt_pos in m['position']]
    if not machines:
        # plt.subplots() would otherwise fail with an unrelated message.
        raise ValueError(
            f'no recordings for place {location!r} with position filter {filt_pos!r}'
        )

    rows = len({(m['date'], m['seq']) for m in machines})
    columns = sorted({(m['device'], m['position']) for m in machines})

    if figsize is None:
        figsize = (15, 20)
    # squeeze=False keeps `ax` two-dimensional for a single row or column,
    # so ax[i][j] is always valid and no exception has to be swallowed.
    fig, ax = plt.subplots(
        rows, len(columns), figsize=figsize, sharey=True, squeeze=False
    )

    for j, (device, position) in enumerate(columns):
        placements = sorted(
            (m for m in machines
             if m['device'] == device and m['position'] == position),
            key=lambda m: (m['date'], m['seq'])
        )

        for i, row in enumerate(placements):
            if i >= rows:
                # More recordings in this column than grid rows (duplicate
                # (date, seq) keys); there is no subplot left for them.
                break
            day, seq = row['date'], row['seq']
            ts = row['ts']
            subplot = ax[i][j]

            if plot_type in ('psd', 'psd_db'):
                freqs, pxx = spectral_transform(ts, axis, 2**14, fluidpump_fs_hz)
                if plot_type == 'psd_db':
                    # Level relative to 1e-6 (i.e. 1 um/s^2).
                    # NOTE(review): spectral_transform uses
                    # scaling='spectrum'; confirm 20*log10 (rather than
                    # 10*log10) is the intended conversion.
                    pxx = 20 * np.log10(pxx / 0.000001)
                    unit = 'dB'
                else:
                    unit = 'm/s\u00B2'
                subplot.plot(freqs / 1000, pxx, color='darkblue', linewidth=0.5)

                subplot.set_xlim(0, 5)
                ax[-1][j].set_xlabel('Frequency [kHz]')
                ax[i][0].set_ylabel(f'Amplitude [{unit}]')

            else:  # 'hist'
                subplot.hist(ts[axis], color='darkblue', bins=100, edgecolor='black', linewidth=0.5)

                ax[-1][j].set_xlabel('Amplitude [m/s\u00B2]')
                ax[i][0].set_ylabel('Count samples')

            if ylim is not None:
                subplot.set_ylim(0, ylim)

            subplot.grid(True)
            subplot.set_title(f'{device}, {position} | {day}, {seq}.')

    plt.tight_layout()
    plt.show()

#### Plot one recording along all axes

In [None]:
# Complete metadata of the single recording examined in the next cells.
query = dict(
    place='pump',
    date=date(2024, 2, 27),
    device='KSB7',
    position='PMP003',
    seq=1,
)
# Upper frequency limit of the spectrum plots, in kHz.
F_LIMIT_KHZ = 2
axis = fluidpump_columns
Fs = fluidpump_fs_hz

# Keep the recordings whose metadata agrees with every field of the query,
# then take the samples of the first match.
signal = filter(
    lambda s: all(s[field] == wanted for field, wanted in query.items()),
    waveforms
)
ts = list(signal)[0]['ts']
ts

Histograms

In [None]:
# One 100-bin histogram per column of the selected recording, laid out in a
# single row of three panels.
ax = ts.hist(figsize=(20, 4), grid=True, bins=100, layout=(1, 3), edgecolor='black', linewidth=0.5)
plt.show()

Waveform in full length

In [None]:
# One stacked panel per channel, covering the entire recording.
ax = ts.plot(subplots=True, grid=True, figsize=(20, 8))
for i, axname in enumerate(axis):
    ax[i].set(xlabel='Time [s]', ylabel=f'Amplitude ({axname}) [m/s^2]')
plt.show()

Waveform in detail: an excerpt starting at T_WAVEFORM s and lasting TIME s

In [None]:
T_WAVEFORM = 10
TIME = 0.1

# Sample positions bounding the excerpt: it starts T_WAVEFORM seconds into
# the recording and spans TIME seconds.
first_sample = int(T_WAVEFORM*Fs)
last_sample = int(T_WAVEFORM*Fs+Fs*TIME)
ax = ts[axis].iloc[first_sample:last_sample].plot(
    subplots=True, grid=True, figsize=(20, 10)
)

for i, axname in enumerate(axis):
    ax[i].set(xlabel='Time [s]', ylabel=f'Amplitude ({axname}) [m/s^2]')
plt.show()

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(20, 10))
# The whole recording is analysed; slice by time label instead
# (e.g. ts.loc[:10]) to restrict the analysis to a part of it.
ts_range = ts[:]

for i, axname in enumerate(axis):
    freqs, pxx = spectral_transform(ts_range, axname, 2**14, Fs)
    freqs /= 1000  # Hz -> kHz
    panel = ax[i]
    panel.plot(freqs, pxx, color='darkblue')
    panel.fill_between(freqs, pxx, color='lightblue', alpha=0.3)
    panel.grid(True)
    panel.set_xlim(0, F_LIMIT_KHZ)
    panel.set(xlabel='Frequency [kHz]', ylabel='Amplitude [m/s\u00B2]')

Frequency spectrum of one window

In [None]:
def window_frequency_spectrum(ts: pd.DataFrame, t: float, nfft: int = 2**14, dB=False):
    """Plot the magnitude spectrum of a single Hann-weighted window per channel.

    For every channel in the notebook-level `axis` list, the samples with
    time labels between `t` and `t + nfft/Fs` are transformed with a real FFT
    and shown up to `F_LIMIT_KHZ`.

    Args:
        ts: Time-indexed recording.
        t: Start of the window, as a label of the time index.
        nfft: Nominal window length in samples. The label-based slice
            includes both end points, so the actual length can differ
            by one sample.
        dB: When exactly ``True``, show levels relative to 1e-6 on a dB scale.
    """
    # squeeze=False keeps the axes array indexable for a single channel too.
    fig, ax = plt.subplots(len(axis), 1, figsize=(20, 10), squeeze=False)
    unit = 'dB' if dB is True else 'm/s\u00B2'

    for panel, axname in zip(ax[:, 0], axis):
        signal = ts[axname].loc[t:t+nfft/Fs].to_numpy()
        n = len(signal)
        pxx = np.abs(np.fft.rfft(signal * windows.hann(n)) / n)
        if dB is True:
            pxx = 20 * np.log10(pxx / 0.000001)  # reference level: 1 um/s^2
        # rfftfreq matches the bins returned by rfft. Truncating fftfreq
        # instead yields a negative Nyquist frequency for even n.
        freqs = np.fft.rfftfreq(n, d=1/Fs)

        panel.plot(freqs, pxx, color='darkblue')
        panel.fill_between(freqs, pxx, color='lightblue', alpha=0.3)
        panel.grid(True)
        panel.set_xlabel('Frequency [Hz]')
        panel.set_ylabel(f'Amplitude [{unit}]')
        panel.set_xlim(0, F_LIMIT_KHZ * 1000)
        # NOTE(review): `name` is a notebook-level variable and is not tied
        # to the recording passed in as `ts` -- confirm the intended title.
        panel.set_title(f'{axname.upper()}, {name}')
    plt.tight_layout()
    plt.show()


window_frequency_spectrum(ts, t=10, dB=False)

In [None]:
def spectogram(x, axname, nfft):
    """Draw a dB-scaled magnitude spectrogram of *x* with a colour bar.

    The sampling rate is taken from the notebook-level `Fs`. `axname` is
    used as the plot title and `nfft` as the FFT length.

    Returns:
        Tuple ``(freqs, pxx)`` as reported by ``plt.specgram``.
    """
    fig, ax = plt.subplots(figsize=(15, 4))
    spectrum, frequencies, _times, image = plt.specgram(
        x,
        NFFT=nfft,
        Fs=Fs,
        detrend='mean',
        mode='magnitude',
        scale='dB',
        cmap=plt.get_cmap('inferno'),
        vmin=-60,
    )
    fig.colorbar(image, aspect=20, pad=0.04)
    ax.set(xlabel='Time [s]', ylabel='Frequency [Hz]', title=f'{axname}')
    return frequencies, spectrum


# One spectrogram figure per channel of the selected recording.
for axname in axis:
    freqs, Pxx = spectogram(ts[axname], axname, nfft=2**12)
    plt.show()

### Choose measurement direction

In [None]:
# Channel passed to every side_by_side() comparison grid below.
AXIS = 'z'          # x, y, z

### Frequency spectra

In [None]:
# Welch spectra of all 'compressor' recordings; y-axis limited to 0..1.
side_by_side(waveforms, 'compressor', 'psd', AXIS, figsize=(15, 20), ylim=1)

In [None]:
# Welch spectra of 'pump' recordings whose position contains 'MTR'; y-axis limited to 0..0.3.
side_by_side(waveforms, 'pump', 'psd', AXIS, filt_pos='MTR', figsize=(20, 15), ylim=0.3)

In [None]:
# Welch spectra of 'pump' recordings whose position contains 'PMP'; y-axis limited to 0..0.3.
side_by_side(waveforms, 'pump', 'psd', AXIS, filt_pos='PMP', figsize=(20, 15), ylim=0.3)

### Decibel frequency spectra

In [None]:
# dB-scaled Welch spectra of all 'compressor' recordings.
side_by_side(waveforms, 'compressor', 'psd_db', AXIS, figsize=(15, 20))

In [None]:
# dB-scaled Welch spectra of 'pump' recordings whose position contains 'MTR'.
side_by_side(waveforms, 'pump', 'psd_db', AXIS, filt_pos='MTR', figsize=(20, 15))

In [None]:
# dB-scaled Welch spectra of 'pump' recordings whose position contains 'PMP'.
side_by_side(waveforms, 'pump', 'psd_db', AXIS, filt_pos='PMP', figsize=(15, 10))

### Histograms of time domain

In [None]:
# Sample histograms of all 'compressor' recordings.
side_by_side(waveforms, 'compressor', 'hist', AXIS, figsize=(15, 20))

In [None]:
# Sample histograms of 'pump' recordings whose position contains 'MTR'.
side_by_side(waveforms, 'pump', 'hist', AXIS, filt_pos='MTR', figsize=(20, 15))

In [None]:
# Sample histograms of 'pump' recordings whose position contains 'PMP'.
side_by_side(waveforms, 'pump', 'hist', AXIS, filt_pos='PMP', figsize=(20, 15))

### Slow down and speed up

In [None]:
def spectogram(x, nfft, fs, xlim=None, ylim=(0, 125)):
    """Draw a dB-scaled magnitude spectrogram of *x*, zoomed to low frequencies.

    Consecutive FFT windows overlap by ``int(nfft * 0.66)`` samples.

    Args:
        x: Samples of one channel.
        nfft: FFT length in samples.
        fs: Sampling frequency in Hz.
        xlim: Optional ``(start, end)`` time range in seconds to display.
        ylim: ``(low, high)`` frequency range in Hz to display. The default
            matches the range that was previously hard-coded.

    Returns:
        Tuple ``(freqs, t, pxx)`` as reported by ``plt.specgram``.
    """
    fig, ax = plt.subplots(figsize=(20, 5))
    cmap = plt.get_cmap('inferno')
    pxx, freqs, t, im = plt.specgram(
        x, NFFT=nfft, Fs=fs,
        detrend='mean',
        mode='magnitude',
        scale='dB',
        vmin=-70,
        cmap=cmap,
        noverlap=int(nfft * 0.66)
    )
    fig.colorbar(im, aspect=20, pad=0.04)
    ax.set_xlabel('Time [s]')
    ax.set_ylabel('Frequency [Hz]')
    if xlim is not None:
        ax.set_xlim(*xlim)
    # A single y-limit call: the former set_ylim(0, 5000) was always
    # overridden by the later set_ylim(0, 125) and had no effect.
    ax.set_ylim(*ylim)
    return freqs, t, pxx


nfft = 2**14
# Open the archive only for the duration of the read so it is closed again.
with ZipFile(opt['dataset']) as source:
    ts = fluidpump_csv_import(source, 'misc/2024-03-26/misc/KSB1-slow-down/1.tsv')

# One spectrogram per channel, showing seconds 50..100 of the recording.
# The loop variable is not called `axis`, so the notebook-level `axis` list
# used by earlier cells is not overwritten.
for axname in fluidpump_columns:
    freqs, t, pxx = spectogram(ts[axname], nfft, fluidpump_fs_hz, xlim=(50, 100))
    plt.show()

In [None]:
# Open the archive only for the duration of the read so it is closed again.
with ZipFile(opt['dataset']) as source:
    ts = fluidpump_csv_import(source, 'misc/2024-03-26/misc/KSB7-speed-up/1.tsv')

# One spectrogram per channel over the whole recording. The loop variable is
# not called `axis`, so the notebook-level `axis` list is not overwritten.
for axname in fluidpump_columns:
    spectogram(ts[axname], 2**11, fluidpump_fs_hz)
    plt.show()

In [None]:
# Open the archive only for the duration of the read so it is closed again.
with ZipFile(opt['dataset']) as source:
    ts = fluidpump_csv_import(source, 'misc/2024-03-26/misc/KSB7-speed-up/2.tsv')

# One spectrogram per channel, showing the first 20 seconds. The loop
# variable is not called `axis`, so the notebook-level `axis` list is not
# overwritten.
for axname in fluidpump_columns:
    spectogram(ts[axname], 2**11, fluidpump_fs_hz, xlim=(0, 20))
    plt.show()