# Feature extraction

In [None]:
from zipfile import ZipFile
import pandas as pd
import numpy as np

import padasip as pa
from sklearn.decomposition import PCA

import pywt
import ewtpy
import warnings

import mafaulda
import feature_discovery as fdiscovery

import matplotlib.pylab as plt


MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'

## Preprocessing
1. DC Blocker
2. Downsampling (with antialiasing filter)
3. Adaptive noise cancellation (ANC) — skipped, because no reference noise signal is available

In [None]:
# Plot frequency response of filters
# fdiscovery.dc_blocker([], cutoff=2, plot=True)
# fdiscovery.lowpass_filter([], cutoff=10000, order=4, plot=True)

In [None]:
# Target sampling rate after downsampling (passed to fdiscovery.downsample as
# fs_reduced; presumably in Hz, like mafaulda.FS_HZ).
FS_DOWN = 20000

# Load the first recording of the archive. The context manager closes the ZIP
# handle as soon as the data has been read instead of leaking it.
with ZipFile(MAFAULDA_PATH) as zip_file:
    files = mafaulda.get_mafaulda_files(zip_file)
    ts_source = mafaulda.csv_import(zip_file, files[0])

# Preprocess every channel: DC blocker first, then downsampling.
ts = {}
for col in mafaulda.COLUMNS:
    x = fdiscovery.dc_blocker(ts_source[col], cutoff=1)
    x = fdiscovery.downsample(x, fs_reduced=FS_DOWN)
    # x = ts_source[col]  (Original)

    # Adaptive noise cancelling is disabled: no `noise` reference is available.
    # anc = pa.filters.FilterNLMS(n=4, mu=0.1, w="random")
    # y, e, w = anc.run(x, noise)
    ts[col] = x

n_original = len(ts_source['ax'])
duration = n_original / mafaulda.FS_HZ
n_resampled = len(ts['ax'])

# n_resampled samples cover [0, duration), i.e. sample k sits at
# k * duration / n_resampled. Including the end point would stretch the step
# to duration / (n_resampled - 1), so it is excluded.
ts['t'] = np.linspace(0, duration, n_resampled, endpoint=False)
ts['key'] = ts_source['key'][0]
ts = pd.DataFrame(ts).set_index('t')


def spectogram(x):
    """Draw a magnitude spectrogram of ``x`` in dB on a new 20x4 figure.

    The spectrogram uses 256-point FFT segments and the module-level
    ``FS_DOWN`` as sampling frequency. Nothing is returned; the figure is
    left as the current pyplot figure.
    """
    specgram_options = {
        'NFFT': 256,
        'Fs': FS_DOWN,
        'mode': 'magnitude',
        'scale': 'dB',
    }
    plt.figure(figsize=(20, 4))
    # The returned spectrum, frequencies, times and image are not needed here.
    plt.specgram(x, **specgram_options)
    plt.xlabel('Time [s]')
    plt.ylabel('Frequency [Hz]')

# Plot the preprocessed 'ax' channel against the time index 't'.
ts['ax'].plot(figsize=(20, 5))
# Optional: magnitude spectrogram of the same channel.
#spectogram(ts['ax'])

## Time domain features

In [None]:
# Compute time-domain features for the files listed by get_mafaulda_files.
# The archive is opened in a `with` block so its file handle is released as
# soon as the import has finished.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    td_features = mafaulda.import_files(dataset, files, fdiscovery.features_time_domain)
# Kept as the last expression so the notebook echoes the table.
td_features

In [None]:
# Save the time-domain feature table as CSV in the working directory (row index omitted).
td_features.to_csv('td_features_no_filter.csv', index=False)

#### PCA on the time-domain signal of a single axis (`ax`)

In [None]:
# Build the data matrix from the 'ax' channel of one fault condition
# (files under 'horizontal-misalignment/0.5mm'). The archive is closed as soon
# as the matrix has been loaded.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    files = [f for f in files if f.startswith('horizontal-misalignment/0.5mm')]
    # print(len(files))
    X = mafaulda.load_dataset_matrix(dataset, files, 'ax')

# Project onto the first four principal components.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Variance explained per component, followed by the total.
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))

# Left: scatter of the first two components. Right: explained variance ratios.
fig, ax = plt.subplots(1, 2, figsize=(20, 4))
ax[0].scatter(result.T[0], result.T[1], s=1)
ax[1].bar(
    np.arange(len(pca.explained_variance_ratio_)),
    height=pca.explained_variance_ratio_
)
plt.show()

## Frequency domain features
See also: https://librosa.org/doc/0.10.1/feature.html

### Extract frequency domain features

In [None]:
OVERLAP = 0.5
# Window lengths 2**8, 2**10, 2**12, 2**14 and 2**16 samples.
WINDOW_SIZES = tuple(2 ** exponent for exponent in range(8, 17, 2))

# Call mafaulda.resolution_calc for each window length at the native sampling
# rate (presumably it prints the resulting resolution — confirm), with an
# empty line after each call.
for w in WINDOW_SIZES:
    mafaulda.resolution_calc(mafaulda.FS_HZ, w)
    print()

In [None]:
# Compute frequency-domain features for the files listed by get_mafaulda_files.
# The archive is opened in a `with` block so its file handle is released as
# soon as the import has finished.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    fd_features = mafaulda.import_files(dataset, files, fdiscovery.features_frequency_domain)
# Kept as the last expression so the notebook echoes the first rows.
fd_features.head()

In [None]:
# Print a summary of the frequency-domain feature table.
fd_features.info()

In [None]:
# Save the frequency-domain feature table as CSV in the working directory (row index omitted).
fd_features.to_csv('fd_features_no_filter.csv', index=False)

## Time-frequency domain features

#### Features:
- Energy
- Energy ratios
- Kurtosis
- Negentropy

#### Transforms:
- Discrete wavelet transform
- Wavelet packet decomposition (Meyer wavelet, Fejér-Korovkin wavelet)
- Empirical wavelet transform

### Wavelet packet decomposition

In [None]:
# Compute wavelet-domain features for the files listed by get_mafaulda_files.
# The archive is opened in a `with` block so its file handle is released as
# soon as the import has finished.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    wp_features = mafaulda.import_files(dataset, files, fdiscovery.features_wavelet_domain)
# Kept as the last expression so the notebook echoes the first rows.
wp_features.head()

In [None]:
# Save the wavelet-packet feature table as CSV in the working directory (row index omitted).
wp_features.to_csv('wpd_features_no_filter.csv', index=False)

#### PCA on WPD for energy

In [None]:
# Reload the stored wavelet-packet features.
# NOTE(review): the export cell writes 'wpd_features_no_filter.csv' to the
# working directory, while this cell reads it from
# '../../datasets/features_data/' — confirm the file is copied there.
wp_features = pd.read_csv('../../datasets/features_data/wpd_features_no_filter.csv')

# Every column that is not a descriptive label is used as a PCA input.
metadata_columns = {'fault', 'severity', 'seq', 'rpm', 'axis', 'feature'}
columns = [name for name in wp_features.columns if name not in metadata_columns]

# Keep only the rows holding the 'energy' feature.
# Optional extra filter: & (wp_features['fault'] == 'horizontal-misalignment')
matrix = wp_features.loc[wp_features['feature'] == 'energy', columns].to_numpy()

# TODO: norm: MinMaxScaler

pca = PCA(n_components=2)
result = pca.fit_transform(matrix)
print(pca.explained_variance_ratio_)

# https://stackoverflow.com/questions/61282874/pca-matrix-with-sklearn
# print(pca.components_)

# Scatter of the first principal component against the second.
plt.scatter(result[:, 0], result[:, 1], s=1)
plt.show()

### Multilevel 1D Discrete Wavelet Transform
- https://www.mathworks.com/help/wavelet/gs/choose-a-wavelet.html
- https://www.mathworks.com/help/wavelet/gs/introduction-to-the-wavelet-families.html

In [None]:
axis = 'ax'
wavelet = 'dmey'
# Read one recording; the `with` block closes the archive that the original
# inline ZipFile(...) call left open.
with ZipFile(MAFAULDA_PATH) as archive:
    ts = mafaulda.csv_import(archive, 'vertical-misalignment/1.78mm/51.8144.csv')

# Three-level discrete wavelet decomposition of the selected channel.
result = pywt.wavedec(ts[axis], wavelet, mode='symmetric', level=3)

# Signal length, then the length of each coefficient array and the arrays themselves.
print(len(ts[axis]))
print([len(x) for x in result], '\n', result)

#### Energy in partitioned regions
- Store several decomposition levels for comparison: {3, 6, 9}
- Try multiple wavelets: {db1, ...}
- https://stackoverflow.com/questions/30808430/how-to-select-columns-from-dataframe-by-regex

In [None]:
# List the wavelet families and wavelet names offered by PyWavelets.
# A notebook only echoes the last expression of a cell, so the bare calls
# produced no output; print the results explicitly.
print(pywt.families())
print(pywt.wavelist())

# Show the description of the 'dmey' wavelet.
w = pywt.Wavelet('dmey')
print(w)

### Empirical Wavelet transform

In [None]:
# Ignore FutureWarning messages from here on (installs a global warnings filter).
warnings.simplefilter(action='ignore', category=FutureWarning)

def ewt_transform(dataset: pd.DataFrame, axis: str, scales: int):
    """Run the 1D empirical wavelet transform on one column of ``dataset``.

    Args:
        dataset: Table holding the signal; ``dataset[axis]`` is transformed.
        axis: Name of the column to decompose.
        scales: Passed to ``ewtpy.EWT1D`` as ``N``.

    Returns:
        The three values returned by ``ewtpy.EWT1D``, in order: the
        decomposed signal, the filter bank and the detected boundaries.
    """
    signal = dataset[axis]
    # Fixed settings: local-maxima boundary detection on the linear spectrum,
    # with 'average' regularization (filter length 10, sigma 5).
    settings = {
        'N': scales,
        'log': 0,
        'detect': 'locmax',
        'completion': 0,
        'reg': 'average',
        'lengthFilter': 10,
        'sigmaFilter': 5,
    }
    modes, filter_bank, edges = ewtpy.EWT1D(signal, **settings)
    return modes, filter_bank, edges
# NOTE(review): this rebinds `plt` (imported from matplotlib.pylab at the top
# of the file) to matplotlib.pyplot; `plt` is not used in the rest of this cell.
import matplotlib.pyplot as plt
# Run the EWT on the 'ax' column of `ts` with scales=3.
ewt, mfb, boundaries = ewt_transform(ts, 'ax', 3)
# Last expression of the cell: the notebook echoes the decomposed signal.
ewt

### Experiment to find the spectral envelope

In [None]:
# TODO: calculate size of signal in PCM and DPCM (biggest number for differential), DPCM with len(hamming code) >= entropy in time domain
# Plot the spectral envelope of the 'ax' channel of one recording. The `with`
# block closes the archive that the original inline ZipFile(...) call left open.
with ZipFile(MAFAULDA_PATH) as archive:
    fdiscovery.plot_spectral_envelope(archive, 'vertical-misalignment/1.78mm/51.8144.csv', 'ax')