# Features EDA

In [None]:
import pandas as pd
import seaborn as sb

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Relative paths to the feature tables (CSV) that the cells below load
# with `pd.read_csv`.
# Table used in the time-domain sections.
TIME_FEATURES_PATH = '../../datasets/features_data/td_features_no_filter.csv'
# Table used in the frequency-domain sections.
FREQ_FEATURES_PATH = '../../datasets/features_data/fd_features_no_filter.csv'
# Table used in the wavelet (WPD) sections.
WPD_FEATURES_PATH = '../../datasets/features_data/wpd_features_no_filter.csv'

## Correlations

### Time domain

In [None]:
# Load the time-domain feature table and keep only the feature columns of
# the rows recorded on axis 'ax' with the 'horizontal-misalignment' fault.
features = pd.read_csv(TIME_FEATURES_PATH)
td_columns = [
    'mean', 'std', 'skew', 'kurt', 'rms',
    'pp', 'crest', 'margin', 'impulse', 'shape',
]
selection = (
    (features['axis'] == 'ax')
    & (features['fault'] == 'horizontal-misalignment')
)
rows = features.loc[selection, td_columns]
# Pairwise correlation matrix of the selected features (cell output).
rows.corr()

In [None]:
# Annotated heatmap of the correlation matrix of `rows`.
correlations = rows.corr()
sb.heatmap(correlations, annot=True)

In [None]:
# Horizontal bar chart of the variance of every column in `rows`.
feature_variance = rows.var()
ax = feature_variance.plot.barh(xlabel='Variance', ylabel='Feature')

In [None]:
# TODO: pairplot with color for fault
# Scatter of 'kurt' against 'margin', one colour per fault; faults missing
# from the mapping fall back to 'tab:brown'.
colors = {
    'horizontal-misalignment': 'tab:blue',
    'imbalance': 'tab:orange',
    'vertical-misalignment': 'tab:purple',
    'normal': 'tab:green',
}

fig, ax = plt.subplots(figsize=(6, 6))
for fault_name, fault_rows in features.groupby('fault'):
    fault_rows.plot.scatter(
        x='kurt',
        y='margin',
        ax=ax,
        label=fault_name,
        color=colors.get(fault_name, 'tab:brown'),
    )
plt.show()

In [None]:
# 3D scatter of 'kurt', 'margin' and 'rpm', one colour per fault; faults
# missing from the mapping fall back to 'tab:brown'.
ax = plt.figure().add_subplot(projection='3d')

colors = {
    'horizontal-misalignment': 'tab:blue',
    'imbalance': 'tab:orange',
    'vertical-misalignment': 'tab:purple',
    'normal': 'tab:green',
}

for key, group in features.groupby('fault'):
    ax.scatter(
        group['kurt'], group['margin'], group['rpm'],
        label=key, color=colors.get(key, 'tab:brown'),
    )

# Label all three axes directly on the 3D axes object; the z axis carries
# the 'rpm' column and was previously unlabeled.
ax.set_xlabel('Kurtosis')
ax.set_ylabel('Margin')
ax.set_zlabel('RPM')
# Passing `label=` to `scatter` does not draw a legend by itself.
ax.legend()
plt.show()

#### Faults and their severity in relation to feature value and rotational speed

In [None]:
def plot_feature_to_rpm(features, column):
    """Scatter one feature against rotational speed, one panel per fault.

    Every category of the ``fault`` column gets its own subplot with ``rpm``
    on the x axis and ``column`` on the y axis. Points are coloured by the
    ``severity`` value of their row; colours are assigned per fault from
    matplotlib's Tableau palette.

    Args:
        features: DataFrame with a categorical ``fault`` column and the
            columns ``severity``, ``rpm`` and ``column``.
        column: Name of the feature column plotted on the y axis.
    """
    faults = features['fault'].cat.categories

    # Keep the 5x2 layout (and its 20x15 figure) as a minimum, but add rows
    # when there are more than 10 faults so that none is silently dropped
    # by the zip() below.
    n_cols = 2
    n_rows = max(5, -(-len(faults) // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows))

    for fault, ax in zip(faults, axes.flat):
        rows = features[features['fault'] == fault]
        # Map each severity level present for this fault to a palette colour.
        severity = rows['severity'].astype('category').cat.categories
        colors = dict(zip(severity, mcolors.TABLEAU_COLORS))

        ax.scatter(rows['rpm'], rows[column], s=1, c=rows['severity'].map(colors))
        ax.set_xlabel('RPM')
        ax.set_ylabel(column)
        ax.set_title(fault)


def plot_fault_histogram(features, columns):
    """Draw overlaid histograms of several features, one panel per fault.

    Every category of the ``fault`` column gets its own subplot in which
    each column from ``columns`` is drawn as a 50-bin histogram.

    Args:
        features: DataFrame with a categorical ``fault`` column and all
            columns listed in ``columns``.
        columns: Names of the feature columns to draw in every panel.
    """
    faults = features['fault'].cat.categories

    # Keep the 5x2 layout (and its 20x15 figure) as a minimum, but add rows
    # when there are more than 10 faults so that none is silently dropped
    # by the zip() below.
    n_cols = 2
    n_rows = max(5, -(-len(faults) // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows))

    for fault, ax in zip(faults, axes.flat):
        rows = features[features['fault'] == fault]

        for col in columns:
            ax.hist(rows[col], bins=50, label=col)
        ax.set_title(fault)
        # The legend stays disabled as in the original; enable when needed:
        # ax.legend()

# Reload the time-domain table with 'fault' as a categorical column, which
# the plotting helpers read through `.cat.categories`.
columns = [
    'mean', 'std', 'skew', 'kurt', 'rms',
    'pp', 'crest', 'margin', 'impulse', 'shape',
]
features = pd.read_csv(TIME_FEATURES_PATH).astype({'fault': 'category'})

In [None]:
# Plot the first feature listed in `columns` against RPM.
feature_name = columns[0]
plot_feature_to_rpm(features, feature_name)

In [None]:
plot_fault_histogram(features, columns)  # TODO: calculate mutual information

In [None]:
# Load the frequency-domain table with 'fault' and 'fft_window_length' as
# categorical columns and list the available window lengths.
columns = [
    'centroid', 'std', 'skew', 'kurt', 'roll-off',
    'flux_mean', 'flux_std', 'hdev', 'noisiness',
    'inharmonicity', 'energy', 'entropy', 'negentropy',
]
features = pd.read_csv(FREQ_FEATURES_PATH).astype({
    'fault': 'category',
    'fft_window_length': 'category',
})
print(features['fft_window_length'].cat.categories)

# Keep only the rows of axis 'ax' computed with a window length of 1024.
selection = (features['axis'] == 'ax') & (features['fft_window_length'] == 1024)
features = features[selection]
features

In [None]:
# Plot the second-to-last feature listed in `columns` against RPM.
feature_name = columns[-2]
plot_feature_to_rpm(features, feature_name)

In [None]:
# Per-fault histograms of every feature listed in `columns`.
plot_fault_histogram(features=features, columns=columns)

### Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif

# MI: it is equal to zero if and only if two random variables are independent,
# and higher values mean higher dependency.
def calc_mutual_information(dataset, columns, summary=True, random_state=None):
    """Estimate mutual information between features and the 'fault' label.

    Args:
        dataset: DataFrame holding the feature columns and a 'fault' column
            used as the classification target. An 'axis' column is also
            required when ``summary`` is False.
        columns: List of feature column names to score.
        summary: When True, score the whole dataset at once. When False,
            score every 'axis' group separately.
        random_state: Forwarded to ``mutual_info_classif``. The default
            ``None`` keeps the previous behaviour; pass an int to get the
            same estimate on every run.

    Returns:
        DataFrame indexed by 'feature'. With ``summary=True`` it has a single
        'MI' column sorted in descending order; otherwise it has one column
        per value of 'axis', in the order of ``columns``.
    """
    feature_index = pd.Index(columns, name='feature')

    if summary:
        mi = mutual_info_classif(
            dataset[columns], dataset['fault'], random_state=random_state)
        return (pd.DataFrame({'MI': mi}, index=feature_index)
                  .sort_values(by='MI', ascending=False))

    # One MI vector per axis; the target is still the full 'fault' label.
    mutual_infos = {
        key: mutual_info_classif(
            group[columns], group['fault'], random_state=random_state)
        for key, group in dataset.groupby('axis')
    }
    return pd.DataFrame(mutual_infos, index=feature_index)

#### MI in Time domain

In [None]:
# Mutual information of the time-domain features with the fault label.
columns = [
    'mean', 'std', 'skew', 'kurt', 'rms',
    'pp', 'crest', 'margin', 'impulse', 'shape',
]
features = pd.read_csv(TIME_FEATURES_PATH).astype({'fault': 'category'})
#features = features[(features['axis'].isin(['ax', 'ay', 'az']))].dropna()

print(features.shape[0])
mi = calc_mutual_information(features, columns, summary=True)
mi.plot(kind='bar', figsize=(8, 5), grid=True, ylabel='MI')
plt.show()

In [None]:
# Same scores computed separately for every axis (cell output).
per_axis_mi = calc_mutual_information(features, columns, summary=False)
per_axis_mi

#### MI in Frequency domain

In [None]:
# TODO: MI between pairs of variables (e.g. mean to std, ..)
# Mutual information of the frequency-domain features with the fault label,
# restricted to one FFT window length and the axes 'ax', 'ay', 'az'.
FFT_WINDOW = 1024
columns = [
    'centroid', 'std', 'skew', 'kurt', 'roll-off',
    'flux_mean', 'flux_std', 'hdev', 'noisiness',
    'inharmonicity', 'energy', 'entropy', 'negentropy',
]
features = pd.read_csv(FREQ_FEATURES_PATH).astype({
    'fault': 'category',
    'fft_window_length': 'category',
})
selection = (
    (features['fft_window_length'] == FFT_WINDOW)
    & features['axis'].isin(['ax', 'ay', 'az'])
)
features = features[selection].dropna()

print(features.shape[0])
mi = calc_mutual_information(features, columns, summary=True)
mi.plot(kind='bar', figsize=(8, 5), grid=True, ylabel='MI')
plt.show()

In [None]:
# Mutual information between each feature and the fault label (predicted
# variable), computed separately for every axis. Reuses `columns` and
# `FFT_WINDOW` defined in the previous cell.
features = pd.read_csv(FREQ_FEATURES_PATH).astype({
    'fault': 'category',
    'fft_window_length': 'category',
})
window_match = features['fft_window_length'] == FFT_WINDOW
features = features[window_match].dropna()
mi = calc_mutual_information(features, columns, summary=False)
mi
#sb.heatmap(mi, annot=True)
#plt.show()

#### MI in Wavelets

In [None]:
# Load the WPD feature table.
# Using more axes at once significantly reduces MI, so keep a single axis.
features = pd.read_csv(WPD_FEATURES_PATH).loc[
    lambda table: table['axis'] == 'ax'                       # One axis
]
#features = features[features['axis'].isin(['ax', 'ay', 'az'])]  # One measurement position

# Every column that is not one of the listed metadata columns is a feature.
columns = [
    name for name in features.columns
    if name not in {'fault', 'severity', 'seq', 'rpm', 'axis', 'feature'}
]
features

In [None]:
# MI of the rows whose 'feature' is 'energy'; plot the 30 highest scores.
features_energy = features.loc[features['feature'] == 'energy']
print(features_energy.shape[0])

mi = calc_mutual_information(features_energy, columns, summary=True)
top_mi = mi.head(30)
top_mi.plot(kind='bar', figsize=(20, 4), grid=True, ylabel='MI', title='WPD Energy')
plt.show()

In [None]:
# MI of the rows whose 'feature' is 'energy_ratio'; plot the 30 highest scores.
features_energy_ratio = features.loc[features['feature'] == 'energy_ratio']
print(features_energy_ratio.shape[0])

mi = calc_mutual_information(features_energy_ratio, columns, summary=True)
top_mi = mi.head(30)
top_mi.plot(kind='bar', figsize=(20, 4), grid=True, ylabel='MI', title='WPD Energy ratio')
plt.show()

In [None]:
# MI of the rows whose 'feature' is 'negentropy'; plot the 30 highest scores.
features_entropy = features.loc[features['feature'] == 'negentropy']
print(features_entropy.shape[0])

mi = calc_mutual_information(features_entropy, columns, summary=True)
top_mi = mi.head(30)
top_mi.plot(kind='bar', figsize=(20, 4), grid=True, ylabel='MI', title='WPD Negentropy')
plt.show()

In [None]:
# MI of the rows whose 'feature' is 'kurtosis'; plot the 30 highest scores.
features_kurtosis = features[features['feature'] == 'kurtosis']
print(len(features_kurtosis))

# Score the kurtosis rows themselves; the negentropy subset was passed here
# before, so the 'WPD Kurtosis' chart repeated the negentropy results.
mi = calc_mutual_information(features_kurtosis, columns, summary=True)
mi.iloc[:30].plot.bar(figsize=(20, 4), grid=True, ylabel='MI', title='WPD Kurtosis')
plt.show()

#### PCA on time domain features

In [None]:
# Project the time-domain features onto their first two principal
# components and show the projection as a scatter plot.
columns = [
    'mean', 'std', 'skew', 'kurt', 'rms',
    'pp', 'crest', 'margin', 'impulse', 'shape',
]
features = pd.read_csv(TIME_FEATURES_PATH)
matrix = features[columns].to_numpy()

pca = PCA(n_components=2)
result = pca.fit_transform(matrix)

print(pca.explained_variance_ratio_)
# print(pca.singular_values_)
print(pca.components_)
first_component, second_component = result.T
plt.scatter(first_component, second_component, s=1)
plt.show()

In [None]:
# Load the WPD feature table and collect its non-metadata column names.
# NOTE(review): `wp_features` and `columns` are not used by the plot below,
# which reads `features` (set in the PCA cell above) and `colors` (set in
# the scatter cells near the top) — confirm whether the plot was meant to
# use the WPD table instead.
wp_features = pd.read_csv(WPD_FEATURES_PATH)
columns = [
    col for col in wp_features.columns 
    if col not in ('fault', 'severity', 'seq', 'rpm', 'axis', 'feature')
]

fig, ax = plt.subplots(figsize=(6, 6))

# One scatter series per fault; faults missing from `colors` use 'tab:brown'.
for key, group in features.groupby('fault'):
    group.plot(ax=ax, kind='scatter', x='kurt', y='margin', label=key, color=colors.get(key, 'tab:brown'))
plt.show()

In [None]:
# Work in progress - test wrong rpm
import mafaulda as m
from zipfile import ZipFile
import feature_discovery
import matplotlib.pyplot as plt

MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'
path = 'underhang/ball_fault/20g/49.3568.csv'
#path = 'horizontal-misalignment/0.5mm/12.288.csv'

# Open the archive in a context manager so the file handle is closed once
# the recording has been loaded (it was left open before).
# NOTE(review): assumes `m.csv_import` reads the member eagerly and returns
# in-memory data — confirm against the `mafaulda` module.
with ZipFile(MAFAULDA_PATH) as archive:
    ts = m.csv_import(archive, path)
#ts['tachometer'].iloc[3000:10000].plot()

from scipy.signal import find_peaks
import numpy as np

# Find the peaks of the tachometer signal, mark them on the plot and print
# 60 divided by the mean spacing between consecutive peaks.
# NOTE(review): the printed value is an RPM only if the index is time in
# seconds — confirm.
tachometer = ts['tachometer']
t = tachometer.index.to_numpy()
y = tachometer.to_numpy()
# t = t[1000:60000]
# y = y[1000:60000]
peaks, _ = find_peaks(y, prominence=3, width=50)
peak_positions = t[peaks]

plt.plot(t, y)
plt.scatter(peak_positions, y[peaks], color="red")

mean_spacing = np.diff(peak_positions).mean()
print(60 / mean_spacing)