Feature EDA

In [None]:
import pandas as pd
import seaborn as sb
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import sys
sys.path.append('../../')
from feature import selection as sel

Correlations

Time domain

In [None]:
# Load the time-domain features and keep the rows of the 'ax' axis recorded
# under the 'horizontal-misalignment' fault, restricted to sel.TD_COLUMNS.
features = pd.read_csv(sel.TIME_FEATURES_PATH)
is_ax = features['axis'] == 'ax'
is_horizontal = features['fault'] == 'horizontal-misalignment'
rows = features[is_ax & is_horizontal][sel.TD_COLUMNS]
# Correlation matrix of the selected columns (shown as the cell output).
rows.corr()

Absolute value of correlation

In [None]:
# Heatmap of the correlation magnitudes (sign discarded) between the columns of `rows`.
fig, ax = plt.subplots(figsize=(10, 5))
sb.heatmap(rows.corr().abs(), annot=True, ax=ax, cmap='Greens')

Variance of variables

In [None]:
# Variance of every column in `rows`, drawn as horizontal bars.
feature_variance = rows.var()
ax = feature_variance.plot(kind='barh', xlabel='Variance', ylabel='Feature')

In [None]:
# TODO: pairplot with color for fault
# Scatter of the 'kurt' column against the 'margin' column, one color per fault.
fig, ax = plt.subplots(figsize=(6, 6))

# Fault name -> matplotlib color; faults missing from the mapping fall back
# to 'tab:brown' in the loop below.
colors = {
    'horizontal-misalignment': 'tab:blue',
    'imbalance': 'tab:orange',
    'vertical-misalignment': 'tab:purple',
    'normal': 'tab:green',
}

for fault_name, fault_rows in features.groupby('fault'):
    fault_color = colors.get(fault_name, 'tab:brown')
    fault_rows.plot(
        ax=ax,
        kind='scatter',
        x='kurt',
        y='margin',
        label=fault_name,
        color=fault_color,
    )
plt.show()

In [None]:
# 3D scatter of the 'kurt', 'margin' and 'rpm' columns, one color per fault.
ax = plt.figure().add_subplot(projection='3d')

# Fault name -> matplotlib color; unknown faults fall back to 'tab:brown'.
colors = {
    'horizontal-misalignment': 'tab:blue', 
    'imbalance': 'tab:orange', 
    'vertical-misalignment': 'tab:purple', 
    'normal': 'tab:green'
}

for key, group in features.groupby('fault'):
    ax.scatter(group['kurt'], group['margin'], group['rpm'], label=key, color=colors.get(key, 'tab:brown'))

# Label through the Axes object rather than the pyplot state machine, and
# label the z axis as well so that all three dimensions are identified.
ax.set_xlabel('Kurtosis')
ax.set_ylabel('Margin')
ax.set_zlabel('RPM')
# The labels passed to scatter() are only displayed once a legend is drawn.
ax.legend()
plt.show()

Faults and their severity in relation to feature value and rotational speed

In [None]:
def plot_feature_to_rpm(features, column):
    """Scatter one feature against RPM in a separate panel per fault.

    Points are colored by the value of the 'severity' column within each fault.

    Parameters
    ----------
    features : pandas.DataFrame
        Must provide the columns 'fault', 'severity', 'rpm' and `column`.
        'fault' may be categorical or a plain column; it is cast here.
    column : str
        Name of the column drawn on the y axis.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Casting is a no-op for a categorical column and makes plain columns work too.
    faults = features['fault'].astype('category').cat.categories

    # Keep the 5x2 layout for up to ten faults, but grow the grid instead of
    # silently dropping the remaining faults when there are more.
    n_cols = 2
    n_rows = max(5, -(-len(faults) // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows))
    panels = list(axes.flat)
    palette = list(mcolors.TABLEAU_COLORS)

    for fault, ax in zip(faults, panels):
        rows = features[features['fault'] == fault]
        severity = rows['severity'].astype('category').cat.categories
        # Cycle through the palette so that every severity level gets a color
        # even when there are more levels than palette entries.
        colors = {
            level: palette[position % len(palette)]
            for position, level in enumerate(severity)
        }

        ax.scatter(rows['rpm'], rows[column], s=1, c=rows['severity'].map(colors))
        ax.set_xlabel('RPM')
        ax.set_ylabel(column)
        ax.set_title(fault)

    # Hide the panels that did not receive a fault.
    for ax in panels[len(faults):]:
        ax.set_visible(False)


def plot_fault_histogram(features, columns, legend=False):
    """Draw overlaid 50-bin histograms of several columns, one panel per fault.

    Parameters
    ----------
    features : pandas.DataFrame
        Must provide the column 'fault' and every column named in `columns`.
        'fault' may be categorical or a plain column; it is cast here.
    columns : iterable of str
        Names of the columns whose histograms are overlaid in each panel.
    legend : bool, optional
        When True, show a legend with the column names in every panel.
        Defaults to False, which draws no legend.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Casting is a no-op for a categorical column and makes plain columns work too.
    faults = features['fault'].astype('category').cat.categories

    # Keep the 5x2 layout for up to ten faults, but grow the grid instead of
    # silently dropping the remaining faults when there are more.
    n_cols = 2
    n_rows = max(5, -(-len(faults) // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows))
    panels = list(axes.flat)

    for fault, ax in zip(faults, panels):
        rows = features[features['fault'] == fault]

        for col in columns:
            ax.hist(rows[col], bins=50, label=col)
        ax.set_title(fault)
        if legend:
            ax.legend()

    # Hide the panels that did not receive a fault.
    for ax in panels[len(faults):]:
        ax.set_visible(False)

# Column names passed to the plotting calls that follow.
columns = ['mean', 'std', 'skew', 'kurt', 'rms', 'pp', 'crest', 'margin', 'impulse', 'shape']
# Reload the time-domain features with the fault label stored as a categorical.
features = pd.read_csv(sel.TIME_FEATURES_PATH).astype({'fault': 'category'})

In [None]:
# Scatter the first entry of `columns` against RPM, one panel per fault.
plot_feature_to_rpm(features, columns[0])

In [None]:
# Histograms of every column listed in `columns`, one panel per fault.
plot_fault_histogram(features, columns)  # TODO: calculate mutual information

In [None]:
# Load the frequency-domain features; the fault label and the FFT window
# length are stored as categoricals.
features = pd.read_csv(sel.FREQ_FEATURES_PATH).astype({
    'fault': 'category',
    'fft_window_length': 'category',
})
print(features['fft_window_length'].cat.categories)

# Keep only the rows with window length 1024 on the 'ax' axis.
window_is_1024 = features['fft_window_length'] == 1024
axis_is_ax = features['axis'] == 'ax'
features = features[window_is_1024 & axis_is_ax]
features

In [None]:
# Scatter the second-to-last entry of sel.FD_COLUMNS against RPM, one panel per fault.
plot_feature_to_rpm(features, sel.FD_COLUMNS[-2])

In [None]:
# Histograms of every column listed in sel.FD_COLUMNS, one panel per fault.
plot_fault_histogram(features, sel.FD_COLUMNS)

PCA on time domain features

In [None]:
# Reload the time-domain features and extract the ten feature columns as a
# plain array.
columns = ['mean', 'std', 'skew', 'kurt', 'rms', 'pp', 'crest', 'margin', 'impulse', 'shape']
features = pd.read_csv(sel.TIME_FEATURES_PATH)
matrix = features[columns].to_numpy()

# Standardize the columns, then fit a PCA that keeps all components
# (pass n_components=3 to PCA to truncate).
scaler = StandardScaler()
matrix_scaled = scaler.fit_transform(matrix)
pca = PCA()
result = pca.fit_transform(matrix_scaled)

# Explained variance ratio per component, followed by the component vectors.
for fitted_attribute in (pca.explained_variance_ratio_, pca.components_):
    print(fitted_attribute)

# Projection onto the first two components.
plt.scatter(result[:, 0], result[:, 1], s=1)
plt.show()

In [None]:
# 3D scatter of the first three columns of `result`.
ax = plt.figure().add_subplot(projection='3d')
first, second, third = result[:, 0], result[:, 1], result[:, 2]
p = ax.scatter(first, second, third, color='red', s=1)
plt.show()

In [None]:
# https://medium.com/@andymdc31/using-pca-in-a-machine-learning-pipeline-b6fe3492b1b9
# Cumulative share of variance explained by the first k components.
total_explained_variance = pca.explained_variance_ratio_.cumsum()
reached_95 = total_explained_variance >= .95
n_over_95 = int(reached_95.sum())

# Locate the first component at which the cumulative share reaches 95%.
# Deriving the count from matrix.shape[1] is only valid when the PCA keeps
# every component; with fewer components it indexes past the end of the
# cumulative array. np.argmax returns the index of the first True entry.
if reached_95.any():
    n_to_reach_95 = int(np.argmax(reached_95)) + 1
else:
    # The fitted components never reach 95%: report all of them.
    n_to_reach_95 = len(total_explained_variance)

print("Number features: {}\tTotal Variance Explained: {}".format(
    n_to_reach_95,
    total_explained_variance[n_to_reach_95-1]
))

In [None]:
# Standardization and a 2-component PCA bundled into one pipeline, so that the
# importance of the original features can later be traced back through it.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=2))
])
# Fit on the raw feature matrix: the pipeline's own 'scale' step standardizes
# it. Passing the already standardized `matrix_scaled` would standardize the
# data twice and fit the scaler on values that are not in the original units.
result = pipeline.fit_transform(matrix)

In [None]:
# Load the WPD features and collect every column that is not one of the
# listed metadata columns.
wp_features = pd.read_csv(sel.WPD_FEATURES_PATH)
non_feature_columns = {'fault', 'severity', 'seq', 'rpm', 'axis', 'feature'}
columns = [name for name in wp_features.columns if name not in non_feature_columns]

fig, ax = plt.subplots(figsize=(6, 6))

# NOTE(review): the loop below plots `features`, not the `wp_features` frame
# loaded above, and `columns` is not used by the plot - confirm which data
# this cell is meant to show.
# NOTE(review): `colors` is not defined in this cell; it must already exist
# from a previously executed cell.
for fault_name, fault_rows in features.groupby('fault'):
    fault_color = colors.get(fault_name, 'tab:brown')
    fault_rows.plot(
        ax=ax,
        kind='scatter',
        x='kurt',
        y='margin',
        label=fault_name,
        color=fault_color,
    )
plt.show()

TODO: apply an inverse transform to the PCA result to find the importance of the original features in the model.