In [None]:
# https://www02.smt.ufrj.br/~offshore/mfs/page_01.html
import numpy as np
from zipfile import ZipFile
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

# The archive is kept open on purpose: later cells read further recordings
# from it through `zip_file`.
zip_file = ZipFile('../../datasets/MAFAULDA.zip')

# Sampling rate in Hz; used to convert sample indices to seconds and passed
# to the spectrogram as `Fs`.
FS_HZ = 50000

# Paths of all CSV members of the archive, in sorted order.
filenames = sorted(
    member.filename
    for member in zip_file.infolist()
    if member.filename.endswith('.csv')
)

# Load a single recording; the member handle is closed as soon as it is parsed.
with zip_file.open('horizontal-misalignment/1.5mm/16.7936.csv') as recording:
    misalign = pd.read_csv(
        recording,
        names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
    )

pprint(filenames)

### Calculate RPM from square wave tachometer signal

In [None]:
def preprocess(sequence, fs_hz=None, edge_threshold=3):
    """Add time, vibration-magnitude and smoothed RPM columns to one recording.

    Parameters
    ----------
    sequence : pandas.DataFrame
        Recording with the columns ``tachometer``, ``ax``, ``ay``, ``az``,
        ``bx``, ``by`` and ``bz``. The index is multiplied by the sample
        period to obtain time, so it is expected to be the default
        0..n-1 sample counter.
    fs_hz : float, optional
        Sampling frequency in Hz. Defaults to the notebook-level ``FS_HZ``.
    edge_threshold : float, optional
        Minimum drop of the tachometer value from one sample to the next
        that is counted as a revolution marker.

    Returns
    -------
    pandas.DataFrame
        New frame with the added columns ``mag_a``, ``mag_b``, ``rev`` and
        ``rpm``; rows containing NaN are dropped and the result is indexed
        by ``t`` (seconds), restarting at 0 after the drop.

    Raises
    ------
    ValueError
        If fewer than two tachometer edges are found, because no revolution
        period can be measured then.
    """
    if fs_hz is None:
        fs_hz = FS_HZ
    dt = 1 / fs_hz

    def _magnitude(frame, sensor):
        # Euclidean norm over all three axes. np.hypot only accepts two
        # inputs; a third positional argument is its `out` buffer.
        return np.sqrt(
            frame[sensor + 'x'] ** 2
            + frame[sensor + 'y'] ** 2
            + frame[sensor + 'z'] ** 2
        )

    def _smoothed_rpm(frame):
        # Timestamps of the detected tachometer edges, one per revolution.
        edges = frame.loc[frame['rev'], 't']
        if len(edges) < 2:
            raise ValueError(
                'at least two tachometer edges are required to compute RPM'
            )
        # 60 / revolution period, defined only on the edge rows.
        rpm = (60 / edges.diff()).reindex(frame.index)
        # Longest edge-to-edge distance in samples: the median window then
        # always spans at least one full revolution.
        window = int(np.diff(edges.index.values).max())
        # Hold the last measured value between edges, then smooth out
        # outliers with a robust (median) filter.
        return rpm.ffill().rolling(window).median()

    return (
        sequence
        .assign(t=lambda x: x.index * dt)
        .assign(mag_a=lambda x: _magnitude(x, 'a'))
        .assign(mag_b=lambda x: _magnitude(x, 'b'))
        .assign(rev=lambda x: (x.tachometer - x.tachometer.shift(-1)) >= edge_threshold)
        .assign(rpm=_smoothed_rpm)
        .dropna()
        .reset_index(drop=True)
        .assign(t=lambda x: x.index * dt)
        .set_index('t')
    )

# Working frame for the following cells: the preprocessed `misalign` recording.
df = preprocess(misalign)

In [None]:
# Summary statistics of the preprocessed recording.
df.describe()

In [None]:
# Smoothed rotational speed over the whole preprocessed recording.
g = df['rpm'].plot(
    title='RPM',
    xlabel='Time [s]',
    ylabel='RPM',
    figsize=(5, 3),
)

In [None]:
# Tachometer signal up to t = 0.5 (label-based slice on the time index).
df.loc[:0.5, 'tachometer'].plot(
    title='Tachometer',
    xlabel='Time [s]',
    ylabel='Impuls',
    figsize=(5, 3),
)

###  Histograms

In [None]:
# One histogram per column of the preprocessed recording.
g = df.hist(figsize=(20, 10))

In [None]:
# Kernel density estimates of the ax, ay and az columns on shared axes.
g = df[['ax', 'ay', 'az']].plot.density(figsize=(8, 5))

In [None]:
# Zoom into 2048 consecutive samples starting at row `x`, one subplot per axis.
x = 1024
g = df.iloc[x:x + 2048][['ax', 'ay', 'az']].plot(
    grid=True,
    subplots=True,
    figsize=(20, 10),
)

In [None]:
# Magnitude columns of both sensors over the whole recording, one subplot each.
g = df.loc[:, ['mag_a', 'mag_b']].plot(
    grid=True,
    subplots=True,
    figsize=(20, 10),
)

###  Q-Q plot and Kolmogorov-Smirnov test for normality

In [None]:
import statsmodels.api as sm
from scipy.stats import kstest

figure, axes = plt.subplots(2, 3, figsize=(10, 5))

for i, col in enumerate(['ax', 'ay', 'az', 'bx', 'by', 'bz']):
    sample = df[col]
    # kstest(..., 'norm') compares against the *standard* normal N(0, 1)
    # unless loc/scale are given, so data with any other mean or spread would
    # be rejected regardless of its shape. Test against a normal with the
    # sample's own location and scale instead.
    # Caveat: because the parameters are estimated from the same sample, the
    # reported p-value is on the optimistic side.
    p_value = kstest(sample, 'norm', args=(sample.mean(), sample.std())).pvalue
    print(col, 'Normality test p-value: ', p_value, '(<0.05 is not normal)')

    panel = axes[i // 3, i % 3]
    # fit=True standardises the sample with fitted loc/scale, which is what
    # the 45-degree reference line assumes.
    sm.qqplot(sample, line='45', fit=True, ax=panel)
    panel.set_title(col)

plt.tight_layout()
plt.show()

### Frequency analysis (Spectrogram FFT)

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(20, 8))

# Frequency bin width in Hz; the FFT window length follows from it.
RESOLUTION = 8
WINDOW = FS_HZ // RESOLUTION
print('Window size:', WINDOW)
print('Heisenberg rectangle')
print('Time step:', WINDOW / FS_HZ * 1000, 'ms')
print('Frequency step:', RESOLUTION, 'Hz')

# Same spectrogram settings for both signals. Each panel gets its own
# colorbar because the two images are scaled independently; one shared bar
# built from the last image would mislabel the first panel.
for panel, signal in zip(ax, (df.az, df.mag_b)):
    pxx, freq, t, cax = panel.specgram(
        signal,
        Fs=FS_HZ,
        mode='magnitude',
        window=np.hamming(WINDOW),
        NFFT=WINDOW,
        noverlap=WINDOW // 2,
    )
    panel.set_title(signal.name)
    panel.set_xlabel('Time [s]')
    panel.set_ylabel('Frequency [Hz]')
    # To zoom into a frequency band: panel.set_ylim(700, 1200)
    g = plt.colorbar(cax, ax=panel)

### Orbitals from RMS

In [None]:
n = 100    # rolling window length in samples
ts = 1000  # number of leading samples taken from the recording
fig, ax = plt.subplots(1, 3, figsize=(20, 5))

# Rolling RMS of each axis of sensor "a" over the first `ts` samples.
ax_rms, ay_rms, az_rms = (
    df[axis].iloc[:ts].rolling(n).apply(lambda x: np.sqrt((x ** 2).mean()))
    for axis in ('ax', 'ay', 'az')
)

# Label the three panels: x-y, x-z and y-z.
for panel, (horizontal, vertical) in zip(ax, (('x', 'y'), ('x', 'z'), ('y', 'z'))):
    panel.set_xlabel(horizontal)
    panel.set_ylabel(vertical)

ax[0].scatter(ax_rms, ay_rms)
ax[1].scatter(ax_rms, az_rms)
ax[2].scatter(ay_rms, az_rms)

### Load normal-condition recordings

In [None]:
# Plan: preprocess, remove the mean offset, merge everything into one frame with a
# `key` column holding the file name, then group by that key across all recordings.
# Calculate rms ratio with rpm
from multiprocessing.pool import ThreadPool
# Thread pool used below to import the CSV recordings.
pool = ThreadPool(processes=4)

def csv_import(filename, archive=None, fs_hz=None):
    """Read one recording from the archive and tag it with its file name.

    NOTE: a later cell of this notebook defines another ``csv_import`` that
    shadows this one once that cell has run.

    Parameters
    ----------
    filename : str
        Path of the CSV member inside the archive.
    archive : zipfile.ZipFile, optional
        Archive to read from. Defaults to the notebook-level ``zip_file``.
    fs_hz : float, optional
        Sampling frequency in Hz used to build the time column. Defaults to
        the notebook-level ``FS_HZ``.

    Returns
    -------
    pandas.DataFrame
        The samples with the columns ``tachometer``, ``ax``, ``ay``, ``az``,
        ``bx``, ``by``, ``bz``, ``mic``, plus ``key`` (the file name) and
        ``t`` (row index divided by the sampling frequency).
    """
    if archive is None:
        archive = zip_file
    if fs_hz is None:
        fs_hz = FS_HZ

    # Close the member handle as soon as the CSV has been parsed.
    with archive.open(filename) as handle:
        frame = pd.read_csv(
            handle,
            names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
        )

    return frame.assign(
        key=filename,
        t=lambda x: x.index * (1 / fs_hz),
    )

# Import all "normal" recordings on the thread pool. pool.map submits every
# file up front and returns the frames in input order; calling
# apply_async(...).get() per file would wait for each import before the next
# one is even submitted, leaving the other workers idle.
normal_cond = pd.concat(
    pool.map(
        csv_import,
        [name for name in filenames if name.startswith('normal')],
    )
)

In [None]:
# Time domain features

from scipy.stats import skew, kurtosis
import seaborn as sns

col = 'ax'


def rms(x):
    """Root mean square of the values in ``x``."""
    return np.sqrt((x ** 2).mean())


# Group the chosen column once per recording (`key`) and derive every
# statistic from that same grouping.
per_recording = normal_cond.groupby(by='key')[col]

td_featues = pd.concat(
    [
        per_recording.mean().rename('mean'),
        per_recording.std().rename('std'),
        per_recording.apply(skew).rename('skew'),
        per_recording.apply(kurtosis).rename('kurtosis'),
        per_recording.apply(rms).rename('rms'),
    ],
    axis=1,
)
td_featues.head(10)

In [None]:
# Pairwise scatter plots of the time-domain features, one point per recording.
sns.pairplot(td_featues)

In [None]:
# Correlation matrix of the time-domain features, annotated with the coefficients.
sns.heatmap(td_featues.corr(), annot=True)

In [None]:
# Compare features between two different faults (or at least classes of severity of one fault)
# Example archive member: imbalance/6g/33.9968.csv

from multiprocessing.pool import ThreadPool
from tqdm.notebook import tqdm
# NOTE(review): this rebinds `pool` to a new ThreadPool; the pool created in the
# earlier cell is never closed — confirm whether it should be reused instead.
pool = ThreadPool(processes=4)



def csv_import(filename, archive=None, col='ax'):
    """Summarise one imbalance recording as a single row of time-domain features.

    NOTE: this definition shadows the ``csv_import`` defined in an earlier
    cell, which returns the raw samples instead of features.

    Parameters
    ----------
    filename : str
        Archive path of the form ``<fault>/<load>g/<recording>.csv``, e.g.
        ``imbalance/6g/33.9968.csv``. The second path component (with the
        ``g`` suffix stripped) becomes ``load``, the third one ``no``.
    archive : zipfile.ZipFile, optional
        Archive to read from. Defaults to the notebook-level ``zip_file``.
    col : str, optional
        Signal column the features are computed from.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``load``, ``no``, ``mean``, ``std``,
        ``skew``, ``kurtosis``, ``rms`` and ``amplitude``.
    """
    if archive is None:
        archive = zip_file

    # Close the member handle as soon as the CSV has been parsed.
    with archive.open(filename) as handle:
        frame = pd.read_csv(
            handle,
            names=['tachometer', 'ax', 'ay', 'az', 'bx', 'by', 'bz', 'mic'],
        )

    info = filename.split('/')
    signal = frame[col]

    # A file holds exactly one (load, recording) pair, so the features are
    # computed directly on the column instead of through a one-group groupby.
    return pd.DataFrame([{
        'load': int(info[1].strip(' g')),
        'no': info[2],
        'mean': signal.mean(),
        'std': signal.std(),
        'skew': skew(signal),
        'kurtosis': kurtosis(signal),
        'rms': np.sqrt((signal ** 2).mean()),
        # Largest absolute excursion from zero.
        'amplitude': signal.abs().max(),
    }])


imbalance_files = [name for name in filenames if name.startswith('imbalance')]

# pool.imap yields results in input order while the worker threads keep
# importing ahead, so the progress bar advances per finished file. Calling
# apply_async(...).get() per file would process the files strictly one by one.
# `total` is passed explicitly because the imap iterator has no length.
imbalance = pd.concat(list(
    tqdm(pool.imap(csv_import, imbalance_files), total=len(imbalance_files))
))

In [None]:
#imbalance.to_csv('imbalance_features.csv')
# Renumber the rows of the concatenated per-file frames. drop=True discards
# the old index instead of inserting it as an extra column, and rebinding
# (rather than inplace=True) keeps this cell safe to re-run.
imbalance = imbalance.reset_index(drop=True)

In [None]:
# Scatter of mean vs. standard deviation per recording, coloured by load
# (fit_reg=False turns off the regression line, so only the points are drawn).
sns.lmplot(x='mean', y='std', data=imbalance, fit_reg=False, hue='load', legend=False)
# seaborn's own legend is disabled above; draw a matplotlib legend instead.
plt.legend(loc='lower right')
plt.show()

In [None]:
# Scatter of RMS vs. amplitude per recording, coloured by load
# (fit_reg=False turns off the regression line, so only the points are drawn).
sns.lmplot(x='rms', y='amplitude', data=imbalance, fit_reg=False, hue='load', legend=False)
# seaborn's own legend is disabled above; draw a matplotlib legend instead.
plt.legend(loc='lower right')
plt.show()

In [None]:
# 3D view of three time-domain features of the imbalance recordings.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(projection='3d')
ax.scatter(imbalance['mean'], imbalance['std'], imbalance['rms'])

ax.set_xlabel('Mean')
ax.set_ylabel('Standard deviation')
# The z values come from the 'rms' column, so the axis is labelled RMS
# (it was labelled "Amplitude" while plotting RMS).
ax.set_zlabel('RMS')

plt.show()

In [None]:
# TODO: normalize (min-max, standard scaler), transform (log transform), remove outliers

# Spectral features - peak 1 vs peak 2 (amplitude)
# Compare different faults