# Features EDA

In [None]:
import pandas as pd
import seaborn as sb
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Feature selection
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, MinMaxScaler # OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA    # TODO: do an inverse transform to find importance of features in model
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif

# Models (TODO: split to separate notebook)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

from river import cluster       # cluster.DenStream
from river import anomaly       # anomaly.HalfSpaceTrees, LocalOutlierFactor
from river import preprocessing # preprocessing.StandardScaler
from river import neighbors     # neighbors.KNNClassifier, SWINN
from river import drift         # ADWIN
from river import stream

# from skmultiflow.anomaly_detection import HalfSpaceTrees
# from skmultiflow.lazy import KNNClassifier, KNNADWINClassifier

# Dataset paths and attributes
# CSV files with pre-computed features, one file per feature family
# (time domain, frequency domain, and "wpd" - presumably wavelet packet
# decomposition, TODO confirm). The paths are relative.
TIME_FEATURES_PATH = '../../datasets/features_data/td_features_no_filter.csv'
FREQ_FEATURES_PATH = '../../datasets/features_data/fd_features_no_filter.csv'
WPD_FEATURES_PATH = '../../datasets/features_data/wpd_features_no_filter.csv'

# Feature columns selected from the time-domain feature table.
TD_COLUMNS = ['mean', 'std', 'skew', 'kurt', 'rms', 'pp', 'crest', 'margin', 'impulse', 'shape']
# Feature columns selected from the frequency-domain feature table.
FD_COLUMNS = [
    'centroid', 'std', 'skew', 'kurt', 'roll-off', 'flux_mean', 'flux_std',
    'hdev', 'noisiness', 'inharmonicity', 'energy', 'entropy',
    'negentropy'
]

## Correlations

### Time domain

In [None]:
# Load the time-domain feature table and keep only the TD_COLUMNS of the rows
# recorded on axis 'ax' with the 'horizontal-misalignment' fault.
features = pd.read_csv(TIME_FEATURES_PATH)
rows = features[
     (features['axis'] == 'ax')
   & (features['fault'] == 'horizontal-misalignment')
][TD_COLUMNS]
# Pairwise correlation matrix of the selected features (displayed as cell output).
rows.corr()

## Absolute value of correlation

In [None]:
# Heatmap of |correlation| for the `rows` selected in the previous cell;
# the sign is dropped so only the strength of the relation is shown.
fig, ax = plt.subplots(figsize=(10, 5))  
sb.heatmap(np.abs(rows.corr()), annot=True, ax=ax, cmap='Greens')

## Variance of variables

In [None]:
# Horizontal bar chart of the per-feature variance of `rows` (features are not rescaled here).
ax = rows.var().plot(kind='barh', xlabel='Variance', ylabel='Feature')

In [None]:
# TODO: pairplot with color for fault
# Scatter of 'kurt' against 'margin' over the whole time-domain table,
# drawing every fault group on the same axes in its own colour.
fig, ax = plt.subplots(figsize=(6, 6))

# Fixed colour per fault label; labels missing from this mapping fall back to
# 'tab:brown' in the loop below.
colors = {
    'horizontal-misalignment': 'tab:blue', 
    'imbalance':'tab:orange', 
    'vertical-misalignment': 'tab:purple', 
    'normal': 'tab:green'
}

for key, group in features.groupby('fault'):
    group.plot(ax=ax, kind='scatter', x='kurt', y='margin', label=key, color=colors.get(key, 'tab:brown'))
plt.show()

In [None]:
# 3D scatter of kurtosis, margin and rotational speed, one colour per fault.
ax = plt.figure().add_subplot(projection='3d')

# Fixed colour per fault label; labels missing from this mapping fall back to
# 'tab:brown' in the loop below.
colors = {
    'horizontal-misalignment': 'tab:blue',
    'imbalance': 'tab:orange',
    'vertical-misalignment': 'tab:purple',
    'normal': 'tab:green'
}

for key, group in features.groupby('fault'):
    ax.scatter(group['kurt'], group['margin'], group['rpm'], label=key, color=colors.get(key, 'tab:brown'))

# Label all three axes on the 3D axes object and draw the legend; without the
# legend the `label=` passed to each scatter call is never shown.
ax.set_xlabel('Kurtosis')
ax.set_ylabel('Margin')
ax.set_zlabel('RPM')
ax.legend()
plt.show()

## Faults and their severity in relation to feature value and rotational speed

In [None]:
def plot_feature_to_rpm(features, column):
    """Scatter one feature against rotational speed, one subplot per fault.

    Points are coloured by the severity value of the row.

    Args:
        features: DataFrame with 'fault', 'severity' and 'rpm' columns and
            the column named by ``column``.
        column: Name of the feature column drawn on the y axis.
    """
    from itertools import cycle

    fig, axes = plt.subplots(5, 2, figsize=(20, 15))
    # astype() leaves an already categorical column unchanged, so callers no
    # longer have to convert 'fault' beforehand.
    faults = features['fault'].astype('category').cat.categories

    # zip() stops at the shorter input: faults beyond the 10 available
    # subplots are not drawn, and surplus subplots stay empty.
    for fault, ax in zip(faults, axes.flat):
        rows = features[features['fault'] == fault]
        severity = rows['severity'].astype('category').cat.categories
        # Cycle through the palette so that more than ten severity levels
        # still get a colour instead of being dropped by zip().
        colors = dict(zip(severity, cycle(mcolors.TABLEAU_COLORS)))

        ax.scatter(rows['rpm'], rows[column], s=1, c=rows['severity'].map(colors))
        ax.set_xlabel('RPM')
        ax.set_ylabel(column)
        ax.set_title(fault)


def plot_fault_histogram(features, columns):
    """Draw overlaid histograms of the given feature columns, one subplot per fault.

    Args:
        features: DataFrame with a 'fault' column and the columns named in
            ``columns``.
        columns: Names of the feature columns to histogram (50 bins each).
    """
    fig, axes = plt.subplots(5, 2, figsize=(20, 15))
    # astype() leaves an already categorical column unchanged, so callers no
    # longer have to convert 'fault' beforehand.
    faults = features['fault'].astype('category').cat.categories

    # zip() stops at the shorter input: faults beyond the 10 available
    # subplots are not drawn, and surplus subplots stay empty.
    for fault, ax in zip(faults, axes.flat):
        rows = features[features['fault'] == fault]

        for col in columns:
            ax.hist(rows[col], bins=50, label=col)
        ax.set_title(fault)
        # ax.legend()

# Reload the full time-domain table for the per-fault plots below.
features = pd.read_csv(TIME_FEATURES_PATH)
# Same column names as TD_COLUMNS, kept as a separate notebook-global list.
columns = ['mean', 'std', 'skew', 'kurt', 'rms', 'pp', 'crest', 'margin', 'impulse', 'shape']
# The plotting helpers read `features['fault'].cat`, which needs a categorical column.
features['fault'] = features['fault'].astype('category')

In [None]:
# Plot the first listed feature (columns[0]) against RPM for every fault.
plot_feature_to_rpm(features, columns[0])

In [None]:
# Histograms of all listed time-domain features, one subplot per fault.
plot_fault_histogram(features, columns)  # TODO: calculate mutual information

In [None]:
# Load the frequency-domain table for the per-fault plots below.
features = pd.read_csv(FREQ_FEATURES_PATH)
# Same column names as FD_COLUMNS, kept as a separate notebook-global list.
columns = [
    'centroid', 'std', 'skew', 'kurt', 'roll-off', 'flux_mean', 'flux_std',
    'hdev', 'noisiness', 'inharmonicity', 'energy', 'entropy',
    'negentropy'
]
features['fault'] = features['fault'].astype('category')
features['fft_window_length'] = features['fft_window_length'].astype('category')
# Show which FFT window lengths are present in the file.
print(features['fft_window_length'].cat.categories)

# Keep a single window length (1024) and a single axis ('ax') for the plots.
features = features[(features['fft_window_length'] == 1024) & (features['axis'] == 'ax')]
features

In [None]:
# Plot the second-to-last listed feature (columns[-2]; 'entropy' when
# `columns` is the list from the previous cell) against RPM for every fault.
plot_feature_to_rpm(features, columns[-2])

In [None]:
# Histograms of all listed frequency-domain features, one subplot per fault.
plot_fault_histogram(features, columns)

## Pearson correlation in Time domain
- Calculate correlation between fault (then severity) and features by axis
- Transform faults to binary indicator vectors (e.g. is imbalance / is not imbalance), one per fault - one-hot encoding

In [None]:
def corr_features_to_fault(dataframe, features):
    """Absolute Pearson correlation of every feature with every fault indicator.

    Each fault label is turned into a 0/1 indicator vector (one-hot encoding)
    and correlated with each feature column.

    Args:
        dataframe: DataFrame with a 'fault' column (categorical or plain
            labels) and the columns named in ``features``.
        features: Names of the feature columns to correlate.

    Returns:
        DataFrame with one row per (fault, feature) pair and the columns
        'fault' (category), 'feature' (category) and 'corr' (absolute value
        of the Pearson coefficient).
    """
    # The indicators are kept in their own frame so that a fault label can
    # never collide with a column name of ``dataframe``.
    fault_dummies = pd.get_dummies(dataframe['fault'])

    correlations = []
    for fault in fault_dummies.columns:
        indicator = fault_dummies[fault].to_numpy(dtype=float)
        for col in features:
            coefficient = pearsonr(dataframe[col].to_numpy(), indicator)[0]
            correlations.append({
                'fault': fault,
                'feature': col,
                'corr': np.abs(coefficient)
            })

    # Explicit column names keep the result well-formed when no pair was scored.
    correlations = pd.DataFrame(correlations, columns=['fault', 'feature', 'corr'])
    correlations['fault'] = correlations['fault'].astype('category')
    correlations['feature'] = correlations['feature'].astype('category')
    return correlations

def show_time_domain_correlation(corr_table):
    """Draw one bar chart per fault with the features sorted by correlation.

    Args:
        corr_table: DataFrame with a categorical 'fault' column and
            'feature' and 'corr' columns, in the layout returned by
            ``corr_features_to_fault``.
    """
    num_of_faults = len(corr_table['fault'].cat.categories)
    # Two rows of plots. The column count is rounded up so an odd number of
    # faults still gets one axes each, and is never zero.
    num_of_cols = max(1, (num_of_faults + 1) // 2)
    # squeeze=False keeps `axes` two-dimensional even for a single column, so
    # the row/column indexing below always works.
    fig, axes = plt.subplots(2, num_of_cols, figsize=(20, 10), squeeze=False)

    groups = corr_table.groupby(by='fault', observed=True)
    for o, (fault, rows) in zip(axes.flat, groups):
        x = rows.sort_values(by='corr', ascending=False)

        # Plot correlations
        o.bar(x['feature'], x['corr'])
        o.grid(True)
        o.set_title(f'Fault: {fault}')
        # Rotate x labels by 45 deg
        o.set_xticks(o.get_xticks())
        o.set_xticklabels(o.get_xticklabels(), rotation=45, ha='right')

    # Axis titles only on the outer edge of the grid: bottom row and left column.
    for ax in axes[-1]:
        ax.set_xlabel('Feature')
    for ax in axes[:, 0]:
        ax.set_ylabel('Correlation with fault')

    fig.tight_layout()

#### Ordered importance of features to fault based on their Pearson correlation coefficient

In [None]:
# Axes whose rows are included in the correlation analysis.
AXIS = ['ax', 'ay', 'az', 'bx', 'by', 'bz']
features = pd.read_csv(TIME_FEATURES_PATH)
features = features[features['axis'].isin(AXIS)]
# corr_features_to_fault reads `.cat.categories`, so 'fault' must be categorical.
features['fault'] = features['fault'].astype('category')

# `df` (one row per fault/feature pair) is reused by the following cells.
df = corr_features_to_fault(features, TD_COLUMNS)
show_time_domain_correlation(df)

#### Correlations among faults and time domain features (unordered)

In [None]:
# Reshape the long (fault, feature, corr) table into a fault x feature matrix
# and show it as an annotated heatmap.
corr_to_class = df.pivot(index='fault', columns='feature', values='corr')
fig, ax = plt.subplots(figsize=(10, 5))
sb.heatmap(corr_to_class, annot=True, cmap='Greens', ax=ax)
plt.show()

#### Rank order of features averaged among all fault types
**Lower is better**

In [None]:
# Fault x feature matrix of absolute correlations.
corr_fault_to_feat = df.pivot(index='fault', columns='feature', values='corr')

# Rank the features within each fault (row-wise; rank 1 = highest correlation),
# then average each feature's rank over all faults and sort ascending.
feature_ranks = corr_fault_to_feat.rank(axis='columns', method='dense', ascending=False)
common_rank = feature_ranks.mean().sort_values().to_frame(name='rank')
common_rank.plot.bar(grid=True, legend=False)
plt.show()

## F statistic in Time domain

In [None]:
# MI: it is equal to zero if and only if two random variables are independent, 
# and higher values mean higher dependency.
def calc_feature_selection_metric(fmetric, dataset, columns, summary=True):
    """Score feature columns against the fault label with a selection metric.

    Args:
        fmetric: Callable invoked as ``fmetric(X, y)`` with the feature
            columns and the integer codes of the categorical 'fault' column.
            When it returns a tuple, only the first element is used.
        dataset: DataFrame with a categorical 'fault' column, the columns
            named in ``columns`` and, for ``summary=False``, an 'axis' column.
        columns: Names of the feature columns to score.
        summary: If true, score the whole dataset at once; otherwise score
            the rows of every axis independently.

    Returns:
        For ``summary=True`` a DataFrame indexed by 'feature' with a single
        'stat' column sorted in descending order. Otherwise a DataFrame
        indexed by 'feature' with one column of scores per axis.
    """
    def score(frame):
        # The target is passed as category codes (labels would work as well).
        outcome = fmetric(frame[columns], frame['fault'].cat.codes)
        return outcome[0] if isinstance(outcome, tuple) else outcome

    if not summary:
        # For each axis and target category independently
        per_axis = {}
        for axis_name, axis_rows in dataset.groupby('axis'):
            per_axis[axis_name] = score(axis_rows)
        table = pd.DataFrame(per_axis)
        table['feature'] = columns
        return table.set_index('feature')

    ranking = pd.DataFrame(list(zip(columns, score(dataset))), columns=['feature', 'stat'])
    ranking = ranking.set_index('feature')
    return ranking.sort_values(by='stat', ascending=False)

def calc_f_stat(dataset, columns, summary=True):
    """Score ``columns`` against the fault label using ``f_classif``.

    Thin wrapper around ``calc_feature_selection_metric``; see that function
    for the meaning of the arguments and the layout of the result.
    """
    return calc_feature_selection_metric(
        fmetric=f_classif,
        dataset=dataset,
        columns=columns,
        summary=summary,
    )

def calc_mutual_information(dataset, columns, summary=True):
    """Score ``columns`` against the fault label using ``mutual_info_classif``.

    Thin wrapper around ``calc_feature_selection_metric``; see that function
    for the meaning of the arguments and the layout of the result.
    """
    return calc_feature_selection_metric(
        fmetric=mutual_info_classif,
        dataset=dataset,
        columns=columns,
        summary=summary,
    )

def load_time_domain_features(axis):
    """Load the time-domain feature table restricted to the given axes.

    Args:
        axis: Collection of axis names; only rows whose 'axis' value is in
            this collection are kept.

    Returns:
        DataFrame read from ``TIME_FEATURES_PATH`` with the 'fault' column
        converted to a categorical dtype.
    """
    features = pd.read_csv(TIME_FEATURES_PATH)
    # Take an explicit copy of the filtered rows so the column assignment
    # below modifies an independent frame (avoids SettingWithCopyWarning).
    features = features[features['axis'].isin(axis)].copy()
    features['fault'] = features['fault'].astype('category')
    return features

## F score in Time domain

#### Unnormalized features

In [None]:
# F statistic of every time-domain feature over the rows of all six axes,
# computed on the raw (unscaled) feature values.
features = load_time_domain_features(['ax', 'ay', 'az', 'bx', 'by', 'bz'])
fscore = calc_f_stat(features, TD_COLUMNS, summary=True)
fscore.plot.bar(figsize=(8, 5), grid=True, xlabel='Feature', ylabel='F statistic', legend=False)
plt.show()

#### Normalized features (Result found: F score is independent of scaling)

In [None]:
# Repeat the F statistic after standardising the TD_COLUMNS, to compare with
# the unscaled result.
features = load_time_domain_features(['ax', 'ay', 'az', 'bx', 'by', 'bz'])

standard_transformer = Pipeline(
    steps=[('standard', StandardScaler())]
)
# NOTE(review): minmax_transformer is defined but not used in this cell.
minmax_transformer = Pipeline(
    steps=[('minmax', MinMaxScaler())]
)

# Scale only TD_COLUMNS; every other column is passed through unchanged and
# the original column names are kept (no transformer-name prefix).
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('std', standard_transformer , TD_COLUMNS)
    ],
    verbose_feature_names_out=False
)
features_normalized = preprocessor.fit_transform(features)

# Wrap the transformed output back into a DataFrame with the column names
# reported by the transformer, and restore the categorical 'fault' dtype that
# calc_f_stat relies on.
features_normalized = pd.DataFrame(features_normalized, columns=preprocessor.get_feature_names_out())
features_normalized['fault'] = features_normalized['fault'].astype('category')
fscore = calc_f_stat(features_normalized, TD_COLUMNS, summary=True)
fscore.plot.bar(figsize=(8, 5), grid=True, xlabel='Feature', ylabel='F statistic', legend=False)
plt.show()

### F stat between feature on axis and target fault state

In [None]:
# Feature x axis heatmap of the F statistic, scored separately for the rows of
# each axis (summary=False). Uses the unscaled `features` from the cell above.
fig, ax = plt.subplots(figsize=(10, 5)) 
sb.heatmap(calc_f_stat(features, TD_COLUMNS, summary=False), annot=True, ax=ax, cmap="Greens")
plt.show()

## Mutual information

## MI in Time domain

In [None]:
# Mutual information between every time-domain feature and the fault label,
# over the rows of all six axes.
features = load_time_domain_features(['ax', 'ay', 'az', 'bx', 'by', 'bz'])
mi = calc_mutual_information(features, TD_COLUMNS, summary=True)
mi.plot.bar(figsize=(8, 5), grid=True, xlabel='Feature', ylabel='Mutual information', legend=False)
plt.show()

### MI between feature on axis and target fault state

In [None]:
# Feature x axis heatmap of the mutual information, scored separately for the
# rows of each axis (summary=False).
fig, ax = plt.subplots(figsize=(10, 5))  
sb.heatmap(calc_mutual_information(features, TD_COLUMNS, summary=False), annot=True, ax=ax, cmap="Greens")
plt.show()

## MI in Frequency domain
- By fft window length
- By measurement point: {(ax, ay, az), (bx, by, bz)}

In [None]:
# FFT window lengths for which the frequency-domain features are analysed.
WINDOW_SIZES = (2**8, 2**10, 2**12, 2**14, 2**16)

def show_freq_domain_mutual_info(features, cols, feature_columns=None):
    """Plot the mutual information of frequency-domain features per FFT window.

    One bar chart is drawn for every entry of ``WINDOW_SIZES``.

    Args:
        features: Frequency-domain DataFrame with 'fft_window_length', 'axis'
            and a categorical 'fault' column plus the feature columns.
        cols: Axis names; only rows whose 'axis' value is in this collection
            are used.
        feature_columns: Feature columns to score. Defaults to ``FD_COLUMNS``
            so the result does not depend on whichever notebook cell last
            assigned the global ``columns``.
    """
    if feature_columns is None:
        feature_columns = FD_COLUMNS

    # One subplot per window size; squeeze=False keeps the axes array
    # iterable even when only a single window size is configured.
    fig, ax = plt.subplots(
        1, len(WINDOW_SIZES), figsize=(4 * len(WINDOW_SIZES), 5), squeeze=False
    )
    for o, win in zip(ax.flat, WINDOW_SIZES):
        # Rows with any missing value are dropped before scoring.
        x = features[
                (features['fft_window_length'] == win) &
                (features['axis'].isin(cols))
            ].dropna()
        print('FFT:', win, 'Number of rows:', len(x))
        mi = calc_mutual_information(x, feature_columns, summary=True)

        o.bar(mi.index, mi.values.T[0])
        # Stylize bar graph
        o.grid(True)
        o.set_xlabel('Feature')
        o.set_ylabel('MI')
        o.set_title(f'FFT: {win}')
        # Rotate x labels by 45 deg
        o.set_xticks(o.get_xticks())
        o.set_xticklabels(o.get_xticklabels(), rotation=45, ha='right')

In [None]:
# TODO: MI between pairs of variables (e.g. mean to std, ..)
# Reload the full frequency-domain table (all window lengths and axes).
features = pd.read_csv(FREQ_FEATURES_PATH)
features['fault'] = features['fault'].astype('category')
features['fft_window_length'] = features['fft_window_length'].astype('category')

# Mutual information per FFT window for the axes ax, ay, az.
show_freq_domain_mutual_info(features, ['ax', 'ay', 'az'])
plt.show()

In [None]:
# Mutual information per FFT window for the axes bx, by, bz.
show_freq_domain_mutual_info(features, ['bx', 'by', 'bz'])
plt.show()

### Mutual information between feature in axis and various faults (predicted variable)

In [None]:
def mi_among_fault_and_axis(features, cols):
    """Draw one feature x axis heatmap of mutual information per FFT window.

    For every entry of ``WINDOW_SIZES`` the mutual information between each
    ``FD_COLUMNS`` feature and the fault label is scored separately for the
    rows of each axis.

    Args:
        features: Frequency-domain DataFrame with 'fft_window_length', 'axis'
            and a categorical 'fault' column plus the ``FD_COLUMNS`` columns.
        cols: Axis names; only rows whose 'axis' value is in this collection
            are used.
    """
    # One row of the figure per window size; squeeze=False keeps the axes
    # array iterable even when only a single window size is configured.
    fig, ax = plt.subplots(
        len(WINDOW_SIZES), 1, figsize=(8, 4 * len(WINDOW_SIZES)), squeeze=False
    )

    for o, win in zip(ax.flat, WINDOW_SIZES):
        # Rows with any missing value are dropped before scoring.
        x = features[
            (features['fft_window_length'] == win) &
            (features['axis'].isin(cols))
        ].dropna()
        mi = calc_mutual_information(x, FD_COLUMNS, summary=False)
        sb.heatmap(mi, annot=True, ax=o, cmap="Greens")
        o.set_title(f'FFT: {win}')

# Axes included in the per-axis mutual information heatmaps.
AXIS = ['ax', 'ay', 'az', 'bx', 'by', 'bz']
# Reload the full frequency-domain table (all window lengths and axes).
features = pd.read_csv(FREQ_FEATURES_PATH)
features['fault'] = features['fault'].astype('category')
features['fft_window_length'] = features['fft_window_length'].astype('category')

mi_among_fault_and_axis(features, AXIS)
plt.show()

## TODO: MI in Freq domain: Rank order of features averaged among all window sizes

## MI in Wavelets

In [None]:
# Load the WPD feature table.
features = pd.read_csv(WPD_FEATURES_PATH)

WPD_AXIS = 'ax'
# More axis at once significantly reduces MI
# NOTE: from here on `features` holds only the rows of WPD_AXIS; later cells
# that reuse `features` see a single axis.
features = features[features['axis'] == WPD_AXIS]                 # One axis
features['fault'] = features['fault'].astype('category')
#features = features[features['axis'].isin(['ax', 'ay', 'az'])]  # One measurement position

# Everything that is not one of the listed metadata columns is treated as a
# WPD feature column.
columns = [col for col in features.columns 
           if col not in ('fault', 'severity', 'seq', 'rpm', 'axis', 'feature')]
features.head()

In [None]:
# Mutual information of the WPD columns for the rows whose 'feature' value is
# 'energy'; only the 30 highest-scoring columns are plotted.
features_energy = features[features['feature'] == 'energy']
print(len(features_energy))

mi = calc_mutual_information(features_energy, columns, summary=True)
mi.iloc[:30].plot.bar(figsize=(20, 4), grid=True, ylabel='MI', title='WPD Energy')
plt.show()

In [None]:
def plot_wpd_energy_ratio_per_level(features, wpd_axis):
    """Plot the mutual information of WPD energy-ratio columns, one chart per level.

    Args:
        features: WPD DataFrame with 'axis', 'feature' and a categorical
            'fault' column; all columns other than the metadata columns
            listed below are treated as WPD feature columns.
        wpd_axis: Axis names; only rows whose 'axis' value is in this
            collection are used.
    """
    features = features[features['axis'].isin(wpd_axis)]
    features_energy_ratio = features[features['feature'] == 'energy_ratio']

    # Derive the candidate columns from the table itself rather than from the
    # notebook-global `columns`, which other cells reassign.
    metadata = ('fault', 'severity', 'seq', 'rpm', 'axis', 'feature')
    wpd_columns = [col for col in features.columns if col not in metadata]

    fig, ax = plt.subplots(6, 1, figsize=(15, 20))

    for level in range(1, 7):
        # Columns of one level are selected by their 'L<level>' name prefix.
        # NOTE(review): this prefix would also match two-digit levels such as
        # 'L10'; harmless while only levels 1-6 exist - confirm.
        cols = [col for col in wpd_columns if col.startswith(f'L{level}')]
        mi = calc_mutual_information(features_energy_ratio, cols, summary=True)

        o = ax.flatten()[level-1]
        o.bar(mi.index, mi.values.T[0])
        o.grid(True)
        o.set_xlabel('Feature')
        o.set_ylabel('MI')

        # Rotate x labels by 45 deg
        o.set_xticks(o.get_xticks())
        o.set_xticklabels(o.get_xticklabels(), rotation=45, ha='right')

    fig.suptitle(f'WPD energy ratio: Axis "{wpd_axis}"', fontsize=16, y=0.9)
    plt.show()

In [None]:
# Energy-ratio mutual information per WPD level for axis 'ax' only.
plot_wpd_energy_ratio_per_level(features, ['ax'])

In [None]:
# `features` was reduced to the single axis WPD_AXIS when the WPD table was
# loaded, so passing it here would silently plot 'ax' only. Load the full
# table again so that all three requested axes are present.
wpd_all_axes = pd.read_csv(WPD_FEATURES_PATH)
wpd_all_axes['fault'] = wpd_all_axes['fault'].astype('category')
plot_wpd_energy_ratio_per_level(wpd_all_axes, ['ax', 'ay', 'az'])

In [None]:
# Mutual information of the WPD columns for the rows whose 'feature' value is
# 'negentropy'; only the 30 highest-scoring columns are plotted.
features_entropy = features[features['feature'] == 'negentropy']
print(len(features_entropy))

mi = calc_mutual_information(features_entropy, columns, summary=True)
mi.iloc[:30].plot.bar(figsize=(20, 4), grid=True, ylabel='MI', title='WPD Negentropy')
plt.show()

In [None]:
# Mutual information of the WPD columns for the rows whose 'feature' value is
# 'kurtosis'; only the 30 highest-scoring columns are plotted.
features_kurtosis = features[features['feature'] == 'kurtosis']
print(len(features_kurtosis))

# Score the kurtosis rows selected above (previously the negentropy rows were
# scored by mistake, so this chart repeated the negentropy result).
mi = calc_mutual_information(features_kurtosis, columns, summary=True)
mi.iloc[:30].plot.bar(figsize=(20, 4), grid=True, ylabel='MI', title='WPD Kurtosis')
plt.show()

In [None]:
def level_to_frequency_bands(level, fs):
    """Print the frequency band covered by every bin of a decomposition level.

    The range from 0 to ``fs / 2`` is split into ``2 ** level`` bands of equal
    width and one line ``L<level>_<bin> = [<low>; <high>] Hz`` is printed
    per band.

    Args:
        level: Decomposition level; determines the number of bands.
        fs: Sampling frequency in Hz.
    """
    n_bands = 2 ** level
    band_width = (fs / 2) / n_bands
    lower_edges = (index * band_width for index in range(n_bands))
    for index, lower in enumerate(lower_edges):
        upper = lower + band_width
        print(f'L{level}_{index} = [{lower}; {upper}] Hz')

# Bands of level 4 for a 50 kHz sampling frequency.
level_to_frequency_bands(level=4, fs=50000)

### PCA on time domain features

In [None]:
# PCA of the standardised time-domain features (all rows, all axes).
features = pd.read_csv(TIME_FEATURES_PATH)
columns = ['mean', 'std', 'skew', 'kurt', 'rms', 'pp', 'crest', 'margin', 'impulse', 'shape']
matrix = features[columns].to_numpy()

# Standardise the columns before PCA.
scaler = StandardScaler()
matrix_scaled = scaler.fit_transform(matrix)
# No n_components given: PCA's default number of components is used.
pca = PCA()  #n_components=3)
result = pca.fit_transform(matrix_scaled)

print(pca.explained_variance_ratio_)
# print(pca.singular_values_)
print(pca.components_)
# Samples projected onto the first two principal components.
plt.scatter(result.T[0], result.T[1], s=1)
plt.show()

In [None]:
# Samples projected onto the first three principal components; `result` must
# still hold the output of the full PCA from the cell above.
ax = plt.figure().add_subplot(projection='3d')
p = ax.scatter(result.T[0], result.T[1], result.T[2], color='red', s=1)
plt.show()

In [None]:
# https://medium.com/@andymdc31/using-pca-in-a-machine-learning-pipeline-b6fe3492b1b9
# Cumulative share of variance explained by the first k components.
total_explained_variance = pca.explained_variance_ratio_.cumsum()
# Number of cumulative sums that are already at or above 95 %.
n_over_95 = len(total_explained_variance[total_explained_variance >= .95])

# The cumulative sum never decreases, so those entries are the trailing ones;
# the first of them is at position (number of columns - n_over_95 + 1).
# NOTE(review): assumes PCA kept as many components as `matrix` has columns - confirm.
n_to_reach_95 = matrix.shape[1] - n_over_95 + 1
print("Number features: {}\tTotal Variance Explained: {}".format(
    n_to_reach_95,
    total_explained_variance[n_to_reach_95-1]
))

In [None]:
# We can find original feature importance in original model
# Scaling and a two-component PCA combined in one pipeline.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=2))
])
# Fit on the raw feature matrix: the pipeline already contains the scaler, so
# feeding it `matrix_scaled` would fit that scaler on standardised data and
# leave the fitted pipeline unusable on raw features.
result = pipeline.fit_transform(matrix)

In [None]:
# Load the WPD table and collect its non-metadata column names.
# NOTE(review): `wp_features` and `columns` are not used by the plot below.
wp_features = pd.read_csv(WPD_FEATURES_PATH)
columns = [
    col for col in wp_features.columns 
    if col not in ('fault', 'severity', 'seq', 'rpm', 'axis', 'feature')
]

fig, ax = plt.subplots(figsize=(6, 6))

# The scatter reads the notebook-globals `features` and `colors` assigned by
# earlier cells (the 'kurt'/'margin' columns come from the time-domain table).
# NOTE(review): confirm whether `wp_features` was meant to be plotted instead.
for key, group in features.groupby('fault'):
    group.plot(ax=ax, kind='scatter', x='kurt', y='margin', label=key, color=colors.get(key, 'tab:brown'))
plt.show()