# Feature extraction

In [None]:
from zipfile import ZipFile
import pandas as pd
import numpy as np

import pywt
import ewtpy
import warnings

import mafaulda
import feature_discovery as fdiscovery


MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'

## Preprocessing

- DC Removal
- ANC - Noise removal
- Downsampling + LowPass

In [None]:
# https://medium.com/analytics-vidhya/how-to-filter-noise-with-a-low-pass-filter-python-885223e5e9b7
# Low-pass filter 10 kHz (Butterworth)

# Downsampling factor: 50 kHz / 10 kHz = 5
# src.axis_spectrograms(misalign_sub)  (pure subsampling vs. low-pass)

## Time domain features

In [None]:
# Extract time-domain features for every file listed in the MAFAULDA archive.
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes instead of staying open for the whole session.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    td_features = mafaulda.import_files(dataset, files, fdiscovery.features_time_domain)
# Kept at top level so the notebook still renders the preview as cell output.
td_features.head()

In [None]:
# Save the time-domain feature table to CSV; index=False omits the row index column.
td_features.to_csv('td_features_no_filter.csv', index=False)

## Frequency domain features
Read also: https://librosa.org/doc/0.10.1/feature.html

### Extract frequency domain features

In [None]:
# Window overlap setting (a fraction: 0.5); not used further in this cell.
OVERLAP = 0.5
# Candidate window lengths: powers of two from 2**8 to 2**16 in steps of two exponents.
WINDOW_SIZES = tuple(2 ** exponent for exponent in range(8, 17, 2))

# Call the resolution helper once per window length at the dataset sampling
# rate constant; presumably it prints the resulting resolution — confirm in
# `mafaulda.resolution_calc`. The empty print() separates the entries.
for w in WINDOW_SIZES:
    mafaulda.resolution_calc(mafaulda.FS_HZ, w)
    print()

In [None]:
# Extract frequency-domain features for every file listed in the MAFAULDA archive.
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes instead of staying open for the whole session.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    fd_features = mafaulda.import_files(dataset, files, fdiscovery.features_frequency_domain)
# Kept at top level so the notebook still renders the preview as cell output.
fd_features.head()

In [None]:
# Print a summary of the frequency-domain feature table.
fd_features.info()

In [None]:
# Save the frequency-domain feature table to CSV; index=False omits the row index column.
fd_features.to_csv('fd_features_no_filter.csv', index=False)

## Time-frequency domain features

#### Features:
- Energy
- Energy ratios
- Kurtosis
- Negentropy

#### Transforms:
- Discrete wavelet transform
- Wavelet packet decomposition (Meyer wavelet, Fejér-Korovkin wavelet)
- Empirical wavelet transform

### Wavelet packet decomposition

In [None]:
# Extract wavelet-domain features for every file listed in the MAFAULDA archive.
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes instead of staying open for the whole session.
with ZipFile(MAFAULDA_PATH) as dataset:
    files = mafaulda.get_mafaulda_files(dataset)
    wp_features = mafaulda.import_files(dataset, files, fdiscovery.features_wavelet_domain)
# Kept at top level so the notebook still renders the preview as cell output.
wp_features.head()

In [None]:
# Save the wavelet-packet feature table to CSV; index=False omits the row index column.
wp_features.to_csv('wpd_features_no_filter.csv', index=False)

### Multilevel 1D Discrete Wavelet Transform
- https://www.mathworks.com/help/wavelet/gs/choose-a-wavelet.html
- https://www.mathworks.com/help/wavelet/gs/introduction-to-the-wavelet-families.html

In [None]:
# Multilevel DWT demo on one column of a single recording.
axis = 'ax'
wavelet = 'dmey'
# Open the archive in a `with` block so the handle is closed after the one
# recording has been read (previously an anonymous ZipFile was left open).
with ZipFile(MAFAULDA_PATH) as archive:
    ts = mafaulda.csv_import(archive, 'vertical-misalignment/1.78mm/51.8144.csv')
# 3-level decomposition; per the PyWavelets docs the result is the list
# [cA3, cD3, cD2, cD1] (approximation first, then details coarse to fine).
result = pywt.wavedec(ts[axis], wavelet, mode='symmetric', level=3)
# Length of the input signal, then the length of each coefficient array
# followed by the coefficient arrays themselves.
print(len(ts[axis]))
print([len(x) for x in result], '\n', result)

#### Energy in partitioned regions
- store multiple levels to compare = {3, 6, 9}
- multiple wavelets = {db1, ...}
- https://stackoverflow.com/questions/30808430/how-to-select-columns-from-dataframe-by-regex

In [None]:
# NOTE(review): the return values of the next two calls are discarded, so they
# produce no visible output in this cell; wrap them in print() if the listing
# of wavelet families / wavelet names is actually wanted.
pywt.families()
pywt.wavelist()
# Build the wavelet object named 'dmey' and print it.
w = pywt.Wavelet('dmey')
print(w)

### Empirical Wavelet transform

In [None]:
# Ignore FutureWarning from here on (the filter applies process-wide).
warnings.simplefilter(action='ignore', category=FutureWarning)

def ewt_transform(dataset: pd.DataFrame, axis: str, scales: int):
    """Run the 1D empirical wavelet transform on one column of a table.

    Args:
        dataset: Table holding the signal; only ``dataset[axis]`` is read.
        axis: Name of the column to transform.
        scales: Value forwarded to ``ewtpy.EWT1D`` as its ``N`` argument.

    Returns:
        The three values produced by ``ewtpy.EWT1D``, in the order it
        returns them (named ``ewt``, ``mfb`` and ``boundaries`` by callers
        in this notebook).
    """
    signal = dataset[axis]
    # All settings other than N are fixed for this notebook.
    components, filter_bank, edges = ewtpy.EWT1D(
        signal,
        N=scales,
        log=0,
        detect='locmax',
        completion=0,
        reg='average',
        lengthFilter=10,
        sigmaFilter=5,
    )
    return components, filter_bank, edges
# NOTE(review): `plt` is not used in any cell visible here — confirm it is still needed.
import matplotlib.pyplot as plt
# Apply the empirical wavelet transform to the 'ax' column of `ts` (loaded in
# the DWT cell earlier in this notebook) with scales=3.
ewt, mfb, boundaries = ewt_transform(ts, 'ax', 3)
# Bare expression: rendered as the cell output when run in a notebook.
ewt

### Experiment to find spectral envelope

In [None]:
# TODO: calculate size of signal in PCM and DPCM (biggest number for differential), DPCM with len(hamming code) >= entropy in time domain
# Open the archive in a `with` block so the handle is closed once plotting is
# done (previously an anonymous ZipFile was left open).
with ZipFile(MAFAULDA_PATH) as archive:
    envelope = fdiscovery.plot_spectral_envelope(archive, 'vertical-misalignment/1.78mm/51.8144.csv', 'ax')
# Re-emit the call's result as the last expression so the notebook shows the
# same cell output as before (nothing is shown if the result is None).
envelope