In [None]:
# https://www02.smt.ufrj.br/~offshore/mfs/page_01.html
import numpy as np
from zipfile import ZipFile
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
from scipy.signal import find_peaks
import seaborn as sns

# Open the dataset archive once; later cells read individual CSV members from it
# via `zip_file.open(...)`.
zip_file = ZipFile('../../datasets/MAFAULDA.zip')


In [None]:
# Sampling frequency in Hz; used below to convert sample indices into seconds.
FS_HZ = 50000

def res_calc(fs, window):
    """Print the time/frequency resolution implied by an analysis window.

    Parameters
    ----------
    fs : number
        Sampling frequency in Hz.
    window : number
        Window length in samples.

    Prints the window size, the duration of one window in milliseconds and
    the spacing of the frequency bins in Hz. Returns nothing.
    """
    print(*('Window size:', window))
    print('Heinsenberg rectangle')
    time_step_ms = window / fs * 1000
    print(*('Time step:', time_step_ms, 'ms'))
    frequency_step_hz = fs / window
    print(*('Frequency step:', frequency_step_hz, 'Hz'))

# Every CSV member of the archive, in sorted order.
filenames = sorted(
    member_name
    for member_name in zip_file.namelist()
    if member_name.endswith('.csv')
)

# One horizontal-misalignment recording, used as the running example below.
misalign = pd.read_csv(
    zip_file.open('horizontal-misalignment/1.5mm/16.7936.csv'),
    names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
)

pprint(filenames)

### Calculate RPM from square wave tachometer signal

In [None]:
def preprocess(sequence):
    """Add time, vibration-magnitude and smoothed-RPM columns to one recording.

    Parameters
    ----------
    sequence : pandas.DataFrame
        Raw recording with the columns ``tachometer``, ``ax``, ``ay``, ``az``,
        ``bx``, ``by`` and ``bz``. The index is treated as the sample number.

    Returns
    -------
    pandas.DataFrame
        New frame with ``mag_a``, ``mag_b``, ``rev`` and ``rpm`` added. Rows
        containing NaN (notably the leading rows without a smoothed RPM value)
        are dropped and the index is replaced by the time ``t`` in seconds,
        restarted at zero.

    Raises
    ------
    ValueError
        If the tachometer signal contains no falling edge.
    """

    def _rpm_at_edges(x):
        # The time between two consecutive falling edges is one revolution.
        edge_times = x.loc[x.rev, 't']
        return 60 / edge_times.diff()

    def _smoothed_rpm(x):
        edge_idx = x.index[x.rev.to_numpy()].to_numpy()
        if edge_idx.size == 0:
            raise ValueError('no tachometer falling edge found; cannot compute RPM')
        # Window = longest gap (in samples) between consecutive edges, so every
        # window holds at least one measured revolution. The wrap-around term
        # introduced by np.roll is negative and never wins the max.
        window = int((edge_idx - np.roll(edge_idx, 1)).max())
        # Rolling median smooths out outliers (robust filter).
        return x.rpm.ffill().rolling(window).median()

    return (
        sequence
        .assign(t=lambda x: x.index * (1 / FS_HZ))
        # Euclidean norm of all three axes. np.hypot only takes two inputs; a
        # third positional argument is its `out` buffer, so passing the y axis
        # again dropped the z axis and made the y column the output target.
        .assign(mag_a=lambda x: np.sqrt(x.ax ** 2 + x.ay ** 2 + x.az ** 2))
        .assign(mag_b=lambda x: np.sqrt(x.bx ** 2 + x.by ** 2 + x.bz ** 2))
        # A drop of at least 3 units to the next sample marks a falling edge
        # of the square-wave tachometer signal.
        .assign(rev=lambda x: x.tachometer.diff(-1) >= 3)
        .assign(rpm=_rpm_at_edges)
        .assign(rpm=_smoothed_rpm)
        .dropna()
        .reset_index(drop=True)
        .assign(t=lambda x: x.index * (1 / FS_HZ))
        .set_index('t')
    )

# Run the preprocessing pipeline on the example misalignment recording.
df = preprocess(misalign)

In [None]:
# Summary statistics of every column of the preprocessed recording.
df.describe()

In [None]:
# Smoothed shaft speed over the whole recording.
g = df.rpm.plot(figsize=(5, 3), title='RPM', xlabel='Time [s]', ylabel='RPM')

In [None]:
# Tachometer trace up to t = 0.5 (label-based slice on the time index).
df.loc[:0.5, 'tachometer'].plot(figsize=(5, 3), title='Tachometer', xlabel='Time [s]', ylabel='Impuls')

###  Histograms

In [None]:
# One histogram per column of the preprocessed recording.
g = df.hist(figsize=(20, 10))

In [None]:
# Kernel density estimate of the ax / ay / az channels on shared axes.
g = df[['ax', 'ay', 'az']].plot.density(figsize=(8, 5))

In [None]:
# Zoom into a 2048-row window of the ax / ay / az channels, starting at row x.
x = 1024
g = (df[['ax', 'ay', 'az']]
 .iloc[x:x+2048]
 .plot(subplots=True, figsize=(20, 10), grid=True)
)

In [None]:
# Plot the `mag_a` and `mag_b` magnitude columns over the whole recording.
g = (df[['mag_a', 'mag_b']]
 .plot(subplots=True, figsize=(20, 10), grid=True)
)

###  Q-Q plot and Kolmogorov-Smirnov test for normality

In [None]:
import statsmodels.api as sm
from scipy.stats import kstest

figure, axes = plt.subplots(2, 3, figsize=(10, 5))

# Compare each channel against a normal distribution with the channel's own
# mean and standard deviation. Testing the raw samples against the *standard*
# normal N(0, 1) rejects any signal whose offset or scale differs from 0 / 1,
# whatever its shape.
# NOTE(review): estimating the parameters from the same samples makes the
# Kolmogorov-Smirnov p-value optimistic; treat it as indicative only.
for i, col in enumerate(['ax', 'ay', 'az', 'bx', 'by', 'bz']):
    samples = df[col]
    pvalue = kstest(samples, 'norm', args=(samples.mean(), samples.std())).pvalue
    print(col, 'Normality test p-value: ', pvalue, '(<0.05 is not normal)')
    panel = axes[i // 3, i % 3]
    # fit=True standardises the samples with the fitted loc/scale so that the
    # 45-degree reference line is meaningful.
    sm.qqplot(samples, fit=True, line='45', ax=panel)
    panel.set_title(col)

plt.tight_layout()
plt.show()

### Frequency analysis (Spectrogram FFT)

In [None]:
fig, ax = plt.subplots(4, 1, figsize=(20, 15))

# RESOLUTION is the target frequency step in Hz; the window length follows from it.
RESOLUTION = 8
WINDOW = FS_HZ // RESOLUTION

res_calc(FS_HZ, WINDOW)

# One magnitude spectrogram per subplot, all with identical settings
# (Hamming window, 50 % overlap). Every subplot is labelled; previously the
# label loop stopped after the third of the four subplots.
for panel, column in zip(ax, ['ax', 'ay', 'az', 'mag_b']):
    pxx, freq, t, cax = panel.specgram(
        df[column],
        Fs=FS_HZ,
        mode='magnitude',
        window=np.hamming(WINDOW),
        NFFT=WINDOW,
        noverlap=WINDOW // 2,
    )
    panel.set_title(column)
    panel.set_xlabel('Time [s]')
    panel.set_ylabel('Frequency [Hz]')

#ax[0].set_ylim(700, 1200)
# NOTE: the colour bar is built from the last spectrogram's image only.
g = plt.colorbar(cax, ax=ax)

Do not use the magnitude - it is not oscillatory. The individual axes have different responses, but the signal in each axis is **stationary**.

### Orbitals from RMS

In [None]:
n = 100
ts = 1000
fig, ax = plt.subplots(1, 3, figsize=(20, 5))


def _rolling_rms(column):
    """Rolling RMS over ``n`` samples of the first ``ts`` rows of ``df[column]``."""
    return df[column].iloc[:ts].rolling(n).apply(
        lambda chunk: np.sqrt((chunk ** 2).mean())
    )


ax_rms, ay_rms, az_rms = (_rolling_rms(name) for name in ('ax', 'ay', 'az'))

# One scatter panel per pair of axes: (x, y), (x, z), (y, z).
panels = [
    ('x', 'y', ax_rms, ay_rms),
    ('x', 'z', ax_rms, az_rms),
    ('y', 'z', ay_rms, az_rms),
]
for panel, (x_label, y_label, x_values, y_values) in zip(ax, panels):
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.scatter(x_values, y_values)

### Time-domain features

In [None]:
# Preprocess, offset the mean, merge everything into one frame with a column holding the file name as key, then group by over all of them
# Calculate rms ratio with rpm
from multiprocessing.pool import ThreadPool
# Worker threads used by the loading cells below.
pool = ThreadPool(processes=4)

def csv_import(filename):
    """Read one recording from the archive and tag it with its file name.

    Parameters
    ----------
    filename : str
        Name of a CSV member inside ``zip_file``.

    Returns
    -------
    pandas.DataFrame
        The eight signal columns plus ``key`` (the member name) and ``t``
        (row index divided by ``FS_HZ``).

    NOTE: a later cell of this notebook redefines ``csv_import`` with a
    different return value; re-run this cell before using this version again.
    """
    # Close the archive member as soon as it is parsed instead of leaking the
    # handle until garbage collection.
    with zip_file.open(filename) as handle:
        frame = pd.read_csv(
            handle,
            names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
        )
    return (
        frame
        .assign(key=filename)
        .assign(t=lambda x: x.index * (1 / FS_HZ))
    )

# `apply_async(...).get()` inside a comprehension blocks on each file before the
# next one is submitted, so the pool never runs more than one task at a time.
# `pool.map` hands all files to the workers and returns the frames in input order.
normal_cond = pd.concat(
    pool.map(
        csv_import,
        [name for name in filenames if name.startswith('normal')],
    )
)

In [None]:
# Time domain features

from scipy.stats import skew, kurtosis
import seaborn as sns

col = 'ax'


def rms(x):
    """Root mean square of ``x``."""
    return np.sqrt((x ** 2).mean())


# Group the chosen channel by source file once and evaluate every statistic
# on that same grouping; one row per file, one column per statistic.
_per_file = normal_cond.groupby(by='key')[col]
td_featues = pd.concat(
    [
        _per_file.mean().rename('mean'),
        _per_file.std().rename('std'),
        _per_file.apply(skew).rename('skew'),
        _per_file.apply(kurtosis).rename('kurtosis'),
        _per_file.apply(rms).rename('rms'),
    ],
    axis=1,
)
td_featues.head(10)

In [None]:
# Pairwise scatter plots of the time-domain features.
sns.pairplot(td_featues)

In [None]:
# Correlation matrix of the time-domain features, annotated with the coefficients.
sns.heatmap(td_featues.corr(), annot=True)

In [None]:
# Compare featues between two different faults (or at least classes of severity of one fault)
# imbalance/6g/33.9968.csv

from multiprocessing.pool import ThreadPool
from tqdm.notebook import tqdm
# NOTE(review): an earlier cell already creates a pool under this name; this
# rebinding replaces it, and no close() of the previous pool is visible here.
pool = ThreadPool(processes=4)


def csv_import(filename):
    """Compute time-domain features of the ``ax`` channel of one recording.

    NOTE: this replaces the ``csv_import`` defined in an earlier cell, which
    returned the raw samples instead of features.

    Parameters
    ----------
    filename : str
        Archive member name of the form ``<fault>/<load>g/<file>.csv``; the
        second path component is parsed as the integer load and the third is
        kept as the recording name.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``load``, ``no``, ``mean``, ``std``,
        ``skew``, ``kurtosis``, ``rms`` and ``amplitude`` (largest absolute
        sample value).
    """
    col = 'ax'
    info = filename.split('/')

    # Close the archive member once parsed instead of leaking the handle.
    with zip_file.open(filename) as handle:
        frame = pd.read_csv(
            handle,
            names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
        )
    frame = frame.assign(load=int(info[1].strip(' g')), no=info[2])

    # Group once and reuse the grouping for every statistic.
    grouped = frame.groupby(by=['load', 'no'])[col]
    return pd.concat(
        [
            grouped.mean().rename('mean'),
            grouped.std().rename('std'),
            grouped.apply(skew).rename('skew'),
            grouped.apply(kurtosis).rename('kurtosis'),
            # RMS is computed inline so the function does not depend on a
            # helper defined in another cell.
            grouped.apply(lambda x: np.sqrt((x ** 2).mean())).rename('rms'),
            grouped.apply(lambda x: x.abs().max()).rename('amplitude'),
        ],
        axis=1,
    ).reset_index()


imbalance_files = [name for name in filenames if name.startswith('imbalance')]

# `pool.imap` keeps all workers busy and yields results in input order, so the
# progress bar still advances file by file. Calling `.get()` right after each
# `apply_async` waited for every file before submitting the next one.
imbalance = pd.concat(list(tqdm(
    pool.imap(csv_import, imbalance_files),
    total=len(imbalance_files),
)))

In [None]:
#imbalance.to_csv('imbalance_features.csv')
# Renumber the rows of the concatenated per-file frames. Rebinding with
# drop=True keeps the cell re-runnable: the in-place variant inserted an
# `index` column and raised on a second run because that column already existed.
imbalance = imbalance.reset_index(drop=True)

In [None]:
# RMS vs. kurtosis of each recording, coloured by load (scatter only, no regression line).
sns.lmplot(x='rms', y='kurtosis', data=imbalance, fit_reg=False, hue='load', legend=False)
plt.legend(loc='lower right')
plt.show()

In [None]:
# RMS vs. peak amplitude of each recording, coloured by load (scatter only, no regression line).
sns.lmplot(x='rms', y='amplitude', data=imbalance, fit_reg=False, hue='load', legend=False)
plt.legend(loc='lower right')
plt.show()

In [None]:
# 3-D scatter of three time-domain features, one point per recording.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(projection='3d')
ax.scatter(imbalance['mean'], imbalance['std'], imbalance['rms'])

ax.set_xlabel('Mean')
ax.set_ylabel('Standard deviation')
# The z data is the `rms` column, so label it as such (it used to read 'Amplitude').
ax.set_zlabel('RMS')

plt.show()

In [None]:
# TODO: normalize (min-max, standard scaler), transform (log transform), remove outliers

# Spectral features - peak 1 vs peak 2 (amplitude)
# Compare different faults

In [None]:
from multiprocessing.pool import ThreadPool
from scipy.signal import welch
from tqdm.notebook import tqdm
from scipy.fft import rfft


# NOTE(review): earlier cells already create a pool under this name; this
# rebinding replaces it, and no close() of the previous pool is visible here.
pool = ThreadPool(processes=4)


def fft_csv_import(filename, window=4096, overlap=0.5, fs=50000, is_welch=False):
    """Compute spectra of the ``ax`` channel of one recording.

    NOTE: a later cell of this notebook redefines ``fft_csv_import`` with an
    extra ``axis`` parameter in second position; prefer keyword arguments.

    Parameters
    ----------
    filename : str
        Archive member name of the form ``<fault>/<load>g/<file>.csv``.
    window : int
        Segment length in samples.
    overlap : float
        Fraction of each segment shared with the next one, ``0 <= overlap < 1``.
    fs : number
        Sampling frequency in Hz.
    is_welch : bool
        If true, return one Welch-averaged spectrum (``scaling='spectrum'``);
        otherwise return one Hamming-windowed FFT magnitude per segment.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum, one column per frequency bin (labelled with the
        bin frequency truncated to an integer), indexed by ``(load, no)``.

    Raises
    ------
    ValueError
        If ``overlap`` is outside ``[0, 1)``.
    """
    if not 0 <= overlap < 1:
        raise ValueError('overlap must be in the range [0, 1)')
    noverlap = int(window * overlap)
    # Hop between segment starts. The previous `window * overlap` was only
    # correct for overlap == 0.5.
    step = window - noverlap

    info = filename.split('/')
    load = int(info[1].strip(' g'))

    # Close the archive member once parsed instead of leaking the handle.
    with zip_file.open(filename) as handle:
        frame = pd.read_csv(
            handle,
            names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
        )
    v = frame['ax'].to_numpy()

    if not is_welch:
        taper = np.hamming(window)  # hoisted: identical for every segment
        # `+ 1` keeps the last full segment when the signal length lines up
        # exactly with the segment grid.
        spectra = [
            np.abs(rfft(v[start:start + window] * taper))
            for start in range(0, len(v) - window + 1, step)
        ]
        # Must be an ndarray: the column labels below call `.astype`.
        freqs = np.arange(window // 2 + 1) * (fs / window)
    else:
        freqs, averaged = welch(
            v, fs, 'hamming',
            nperseg=window, noverlap=noverlap,
            scaling='spectrum', average='mean',
        )
        spectra = [averaged]

    return (
        pd.DataFrame(data=spectra, columns=freqs.astype(int))
        .assign(load=load, no=info[2])
        .set_index(['load', 'no'])
    )


imbalance_files = [name for name in filenames if name.startswith('imbalance')]

# Spectrum of a single recording, averaged with Welch's method (is_welch=True).
FS = 50000
WINDOW = 2**14
res_calc(FS, WINDOW)
spectra = fft_csv_import('imbalance/10g/56.9344.csv', fs=FS, window=WINDOW, overlap=0.5, is_welch=True)
spectra.head(10)

In [None]:
# First spectrum, restricted to column labels up to 500 (label-based slice).
spectra.iloc[0].loc[:500].plot(legend=False)

In [None]:
# Natural log of the first spectrum; note this slice is positional (first 200 bins).
np.log(spectra.iloc[0]).iloc[:200].plot(legend=False, grid=True)

In [None]:
FS = 50000
WINDOW = 2**13

# `pool.imap` keeps all workers busy and yields results in input order;
# calling `.get()` right after each `apply_async` processed one file at a time.
# Keyword arguments are used because `fft_csv_import` is redefined later in the
# notebook with a different positional order.
imbalancePSD = pd.concat(list(tqdm(
    pool.imap(
        lambda name: fft_csv_import(name, window=WINDOW, overlap=0.5, fs=FS, is_welch=True),
        imbalance_files,
    ),
    total=len(imbalance_files),
)))

In [None]:
# Display the combined spectra frame.
imbalancePSD

In [None]:
# Spectrum of the recording indexed (10, '56.9344.csv'), column labels up to 500.
imbalancePSD.T[(10, '56.9344.csv')].loc[:500].plot()

In [None]:
# Detect spectral peaks of one recording and mark them on its spectrum.
bins = imbalancePSD.T[(10, '56.9344.csv')]
# `peaks` holds positions into `bins`; they are mapped back to labels below.
peaks, properties = find_peaks(bins, prominence=0.01)
plt.plot(bins.index, bins)
plt.scatter(bins.index[peaks], bins[bins.index[peaks]], color='r')
plt.xlim(0, 1000)
plt.xlabel('Frequency [Hz]')
plt.ylabel('Amplitude')

In [None]:
# Extract peaks:
# load, no, peak_frequency, peak_amplitude
MAX_FREQ = 1000

frames = []
for index, bins in imbalancePSD.iterrows():
    # Label-based slice: keep the bins whose frequency label is <= MAX_FREQ.
    # A plain `bins[:MAX_FREQ]` on an integer-labelled Series is ambiguous
    # between "first MAX_FREQ bins" and "labels up to MAX_FREQ".
    band = bins.loc[:MAX_FREQ]
    peaks, properties = find_peaks(band.to_numpy(), prominence=0.02)
    frames.append(pd.DataFrame({
        'load': index[0],
        'no': index[1],
        'f': band.index[peaks],
        'y': band.to_numpy()[peaks],
    }))

# Within each load, strongest peaks first (ties broken by lower frequency).
harmonics = (
    pd.concat(frames, ignore_index=True)
      .sort_values(by=['load', 'y', 'f'], ascending=[True, False, True])
)

# Strongest and second-strongest peak per load, paired explicitly on `load`.
# `groupby(...).nth()` followed by an index join behaves differently across
# pandas versions, so the rank is computed directly instead.
# NOTE(review): this keeps the original grouping by load only; grouping by
# ['load', 'no'] would give one point per recording - confirm the intent.
rank = harmonics.groupby('load').cumcount()
f0 = harmonics[rank == 0].set_index('load')
f1 = harmonics[rank == 1].set_index('load')

peak_features = (
    f0.join(f1, lsuffix='_f0', rsuffix='_f1')
    .reset_index()
    # Plotting cells refer to the load as `load_f0`.
    .assign(load_f0=lambda x: x['load'])
)

sns.lmplot(x='y_f0', y='y_f1', data=peak_features, fit_reg=False, hue='load_f0', legend=False)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Frequency of the strongest peak vs. frequency of the second-strongest peak.
sns.lmplot(x='f_f0', y='f_f1', data=peak_features, fit_reg=False, hue='load_f0', legend=False)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Spectral statistics
from scipy.stats import skew, kurtosis

def rms(x):
    """Root mean square of ``x``."""
    return np.sqrt((x ** 2).mean())


def spectral_centroid(x):
    """Average of the index labels of ``x`` weighted by its values."""
    return np.average(x.index, weights=x)


# Statistics evaluated on each row (one spectrum) of `imbalancePSD`.
_row_statistics = {
    'skew': skew,
    'kurtosis': kurtosis,
    'rms': rms,
    'centroid': spectral_centroid,
}

fd_features = pd.concat(
    [
        imbalancePSD.mean(axis=1).rename('mean'),
        imbalancePSD.std(axis=1).rename('std'),
    ]
    + [
        imbalancePSD.apply(statistic, axis=1).rename(label)
        for label, statistic in _row_statistics.items()
    ],
    axis=1,
)
fd_features.head(10)

In [None]:
# Pairwise scatter plots of the spectral features.
sns.pairplot(fd_features)

In [None]:
# `reset_index()` turns the (load, no) index levels into columns named `load`
# and `no`; there is no `load_f0` column in this frame, so colour by `load`.
fd_plain = fd_features.reset_index()
sns.lmplot(x='rms', y='kurtosis', data=fd_plain, fit_reg=False, hue='load', legend=False)
plt.legend(loc='lower right')
plt.show()

In [None]:
# 3-D scatter of three spectral features, one point per row of fd_plain.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(projection='3d')
ax.scatter(fd_plain['kurtosis'], fd_plain['centroid'], fd_plain['rms'])

ax.set_xlabel('Kurtosis')
ax.set_ylabel('Centroid')
ax.set_zlabel('RMS')

In [None]:
# Compare different faults

## PSD of severe faults by axis (Welch) - up to 2 kHz (120 000 RPM)

In [None]:
# fft_import_csv chosen files
# subplots all spectra
# compare stats

FS = 50000
WINDOW = 2**13
# Recordings to compare: one file per condition (the path prefix names the condition).
fault_files = [
    'horizontal-misalignment/2.0mm/60.8256.csv',
    'vertical-misalignment/1.90mm/61.44.csv',
    'imbalance/35g/56.7296.csv',
    'normal/61.44.csv',
    'overhang/ball_fault/35g/32.1536.csv',
    'overhang/cage_fault/35g/54.0672.csv',
    'overhang/outer_race/35g/53.4528.csv',
    'underhang/ball_fault/35g/50.7904.csv',
    'underhang/cage_fault/35g/56.5248.csv',
    'underhang/outer_race/35g/58.9824.csv'
]

def fft_csv_import(filename, axis='ax', window=4096, overlap=0.5, fs=50000, is_welch=False):
    """Compute spectra of one channel of a recording plus its median RPM.

    NOTE: this replaces the ``fft_csv_import`` of an earlier cell, which had no
    ``axis`` parameter and indexed its result by ``(load, no)``.

    Parameters
    ----------
    filename : str
        Name of a CSV member inside ``zip_file``.
    axis : str
        Signal column to analyse, e.g. ``'ax'``.
    window : int
        Segment length in samples.
    overlap : float
        Fraction of each segment shared with the next one, ``0 <= overlap < 1``.
    fs : number
        Sampling frequency in Hz.
    is_welch : bool
        If true, return one Welch-averaged spectrum (``scaling='spectrum'``);
        otherwise return one Hamming-windowed FFT magnitude per segment.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum, one column per frequency bin (labelled with the
        bin frequency truncated to an integer) plus ``rpm`` (median of the
        smoothed tachometer-derived speed; NaN if no falling edge is found),
        indexed by ``name``.

    Raises
    ------
    ValueError
        If ``overlap`` is outside ``[0, 1)``.
    """
    if not 0 <= overlap < 1:
        raise ValueError('overlap must be in the range [0, 1)')
    noverlap = int(window * overlap)
    # Hop between segment starts. The previous `window * overlap` was only
    # correct for overlap == 0.5.
    step = window - noverlap

    # Close the archive member once parsed instead of leaking the handle.
    with zip_file.open(filename) as handle:
        frame = pd.read_csv(
            handle,
            names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
        )
    v = frame[axis].to_numpy()

    # Calculate rpm: a drop of at least 3 units to the next sample marks a
    # falling edge of the tachometer; the time between edges is one revolution.
    t = pd.Series(frame.index * (1 / fs), index=frame.index)
    rev = frame['tachometer'].diff(-1) >= 3
    rev_idx = frame.index[rev.to_numpy()].to_numpy()
    if rev_idx.size:
        rpm_at_edges = (60 / t[rev].diff()).reindex(frame.index)
        # Rolling-median window: largest sample distance spanned by 10 edges.
        smoothing = int((rev_idx - np.roll(rev_idx, 10)).max())
        rpm = rpm_at_edges.ffill().rolling(smoothing).median().median()
    else:
        rpm = float('nan')

    if not is_welch:
        taper = np.hamming(window)  # hoisted: identical for every segment
        # `+ 1` keeps the last full segment when the signal length lines up
        # exactly with the segment grid.
        spectra = [
            np.abs(rfft(v[start:start + window] * taper))
            for start in range(0, len(v) - window + 1, step)
        ]
        # Must be an ndarray: the column labels below call `.astype`.
        freqs = np.arange(window // 2 + 1) * (fs / window)
    else:
        freqs, averaged = welch(
            v, fs, 'hamming',
            nperseg=window, noverlap=noverlap,
            scaling='spectrum', average='mean',
        )
        spectra = [averaged]

    return (
        pd.DataFrame(data=spectra, columns=freqs.astype(int))
        .assign(name=filename, rpm=rpm)
        .set_index(['name'])
    )

# fft_csv_import('normal/61.44.csv', 'az', WINDOW, 0.5, FS, True)

### Measurement place A - import worst faults and compare each axis's PSD

In [None]:
# One frame of Welch spectra per axis of measurement place A (ax / ay / az).
# `pool.imap` keeps all workers busy and yields results in input order;
# calling `.get()` right after each `apply_async` processed one file at a time.
faultPSD_X, faultPSD_Y, faultPSD_Z = (
    pd.concat(list(tqdm(
        pool.imap(
            lambda name, axis=axis: fft_csv_import(
                name, axis=axis, window=WINDOW, overlap=0.5, fs=FS, is_welch=True
            ),
            fault_files,
        ),
        total=len(fault_files),
        desc=axis,
    )))
    for axis in ('ax', 'ay', 'az')
)

In [None]:
f_cutoff = 2000

# Frequency bins as rows, recordings as columns, limited to bins below f_cutoff.
faultPSD_X_v = faultPSD_X.drop(columns='rpm')
_by_frequency = faultPSD_X_v.T
x_psd = _by_frequency.loc[_by_frequency.index < f_cutoff]

axis = x_psd.plot(
    subplots=True,
    figsize=(20, 15),
    xlabel='Frequency [Hz]',
    ylabel='Amplitude',
)

# Graph RPM: rotating frequency in red, its multiples 2 .. n-1 in orange.
n = 6
for ax, rpm in zip(axis, faultPSD_X['rpm']):
    f0 = rpm / 60
    for i in range(1, n):
        ax.axvline(x=f0 * i, color='red' if i == 1 else 'orange')

In [None]:
# Spectra from faultPSD_Y below f_cutoff (set in the previous cell), one subplot per recording.
faultPSD_Y_v = faultPSD_Y.drop('rpm', axis=1)
y_psd = faultPSD_Y_v.T[faultPSD_Y_v.T.index < f_cutoff]
p = y_psd.plot(subplots=True, figsize=(20, 15))

In [None]:
# Spectra from faultPSD_Z below f_cutoff (set in an earlier cell), one subplot per recording.
faultPSD_Z_v = faultPSD_Z.drop('rpm', axis=1)
z_psd = faultPSD_Z_v.T[faultPSD_Z_v.T.index < f_cutoff]
p = z_psd.plot(subplots=True, figsize=(20, 15))

### Measurement place B - import worst faults and compare each axis's PSD

In [None]:
# One frame of Welch spectra per axis of measurement place B. The channels of
# place B are bx / by / bz; this cell previously re-read ax / ay / az and so
# repeated the place-A results.
# `pool.imap` keeps all workers busy and yields results in input order.
faultPSD_B_X, faultPSD_B_Y, faultPSD_B_Z = (
    pd.concat(list(tqdm(
        pool.imap(
            lambda name, axis=axis: fft_csv_import(
                name, axis=axis, window=WINDOW, overlap=0.5, fs=FS, is_welch=True
            ),
            fault_files,
        ),
        total=len(fault_files),
        desc=axis,
    )))
    for axis in ('bx', 'by', 'bz')
)

In [None]:
f_cutoff = 2000
faultPSD_B_X_v = faultPSD_B_X.drop('rpm', axis=1)
x_psd = faultPSD_B_X_v.T[faultPSD_B_X_v.T.index < f_cutoff]
p = x_psd.plot(subplots=True, figsize=(20, 15))

In [None]:
# Spectra from faultPSD_B_Y below f_cutoff (set in the previous cell), one subplot per recording.
faultPSD_B_Y_v = faultPSD_B_Y.drop('rpm', axis=1)
y_psd = faultPSD_B_Y_v.T[faultPSD_B_Y_v.T.index < f_cutoff]
p = y_psd.plot(subplots=True, figsize=(20, 15))

In [None]:
# Spectra from faultPSD_B_Z below f_cutoff (set in an earlier cell), one subplot per recording.
faultPSD_B_Z_v = faultPSD_B_Z.drop('rpm', axis=1)
z_psd = faultPSD_B_Z_v.T[faultPSD_B_Z_v.T.index < f_cutoff]
p = z_psd.plot(subplots=True, figsize=(20, 15))

In [None]:
# Compare (rms, skewness) between different clusters (faults)


### Choose fault and graph different severities

In [None]:
normal_cond_filenames = [name for name in filenames if name.startswith('normal')]

# One frame of Welch spectra per axis, in the order ax, ay, az.
# `pool.imap` keeps all workers busy and yields results in input order;
# calling `.get()` right after each `apply_async` processed one file at a time.
normal_cond_PSD = []
for axis in ('ax', 'ay', 'az'):
    frame = pd.concat(list(tqdm(
        pool.imap(
            lambda name, axis=axis: fft_csv_import(
                name, axis=axis, window=WINDOW, overlap=0.5, fs=FS, is_welch=True
            ),
            normal_cond_filenames,
        ),
        total=len(normal_cond_filenames),
        desc=axis,
    )))
    normal_cond_PSD.append(frame)

In [None]:
# Overlay the spectra in normal_cond_PSD[0] (the first axis loaded) for bins labelled below 2000.
normal_cond_PSD_v = normal_cond_PSD[0].drop('rpm', axis=1)
(normal_cond_PSD_v
 .T[normal_cond_PSD_v.T.index < 2000]
 .plot(figsize=(20, 8), legend=False))

In [None]:
# Pairwise correlation between the spectra of the individual recordings.
normal_cond_PSD_v.T.corr()

In [None]:
# The same correlation matrix as a heat map (no cell annotations).
sns.heatmap(normal_cond_PSD_v.T.corr(), annot=False)