##### k-Nearest Neighbors with different feature sets
Compared to the original paper, oversampling is fixed so that it is applied to the training set only, not to the validation set.

In [None]:
# Run configuration for the notebook.
# USE_ONE_AXIS selects which axis list from the dataset dicts is passed to
# extraction.load_features, and is embedded in the name of the cached model summary CSV.
USE_ONE_AXIS = False  # False, True
# Labeling strategies recognised by the MaFaulDa labeling cell.
MAFAULDA_LABEL_METHODS = ['bearing-A', 'all-bearings', 'severity']
# Active strategy; index 1 selects 'all-bearings'.
MAFAULDA_LABEL_METHOD = MAFAULDA_LABEL_METHODS[1]

In [None]:
import os
import re
from zipfile import ZipFile
from typing import Tuple

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

import sys
sys.path.append('../')
from vibrodiagnostics import (
    mafaulda, 
    pumps,
    extraction,
    ranking,
    visualize,
    models
)

# Use a larger default font for every figure in this notebook.
plt.rcParams['font.size'] = 14

In [None]:
# Classifier passed to models.all_features / models.enumerate_models.
MODEL_TYPE = 'knn'          # alternatives: 'lda', 'bayes', 'svm'
# NOTE(review): KNN_METRIC and FFT_WINDOW are not referenced by any cell visible in this notebook.
KNN_METRIC = 'euclidean'    # alternatives: 'cityblock', 'cosine'
FFT_WINDOW = 2 ** 15
# True: recompute the enumerated-model summary and write it to CSV.
# False: load the previously saved CSV instead.
GENERATE = True

# Dataset locations, all relative to the notebook directory.
PATH = '../datasets/'
FEATURES_PATH = os.path.join(PATH, 'features')
# NOTE(review): the two archive paths below are not referenced by any cell visible in this notebook.
MAFAULDA_PATH = os.path.join(PATH, 'MAFAULDA.zip')
PUMPS_PATH = os.path.join(PATH, 'FluidPump.zip')
# Feature CSVs for MaFaulDa; presumably TD = time domain and FD = frequency domain.
MAFAULDA_TEMPORAL = os.path.join(FEATURES_PATH, 'MAFAULDA_TD.csv')
MAFAULDA_SPECTRAL = os.path.join(FEATURES_PATH, 'MAFAULDA_FD.csv')

#### Feature analysis:
MaFaulDa (3) and Custom (4)
- 1 table (how many faults have how many recordings)
- 1 plot (2 lines: TD, FD) - number of PCs vs. explained variance
- 1 plot (2 subplots: TD, FD) - loading plot (PC2)
- 1 figure (4 subplots) custom: all machines, pumps, compressors, motors

##### 1. Explained variance by PCA components and loading plots

In [None]:
def explained_variance(X, n_components=10):
    """Return the explained-variance ratio of the leading principal components.

    The columns of ``X`` are min-max scaled before PCA is fitted.

    Args:
        X: DataFrame of numeric features (one column per feature).
        n_components: Maximum number of principal components to fit.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        ``pca.explained_variance_ratio_``: one ratio per fitted component.
        The array is shorter than ``n_components`` when ``X`` has fewer
        rows or columns than requested.
    """
    x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)
    # PCA rejects n_components above min(n_samples, n_features); clamp so that
    # narrow inputs (e.g. a single-axis feature set) or small subsets still work.
    n_components = min(n_components, *x_scaled.shape)
    pca = PCA(n_components=n_components)
    # Only the fitted model is needed, so the projected data is not computed.
    pca.fit(x_scaled)
    return pca.explained_variance_ratio_


def get_principal_components(X, n_components=2):
    """Return the principal axes (loadings) of min-max scaled features.

    Args:
        X: DataFrame of numeric features (one column per feature).
        n_components: Maximum number of principal components to fit.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        ``pca.components_``: one row per fitted component and one column per
        feature of ``X``. It has fewer rows than ``n_components`` when ``X``
        has fewer rows or columns than requested.
    """
    x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)
    # PCA rejects n_components above min(n_samples, n_features); clamp to stay valid.
    n_components = min(n_components, *x_scaled.shape)
    pca = PCA(n_components=n_components)
    # Only the fitted model is needed, so the projected data is not computed.
    pca.fit(x_scaled)
    return pca.components_

MaFaulDa

In [None]:
# MaFaulDa: feature CSV locations plus the column selectors handed to the loader.
dmafaulda = {
    'TD': MAFAULDA_TEMPORAL,
    'FD': MAFAULDA_SPECTRAL,
    'axis': ['ax', 'ay', 'az'],
    'labels': ['fault', 'severity', 'rpm'],
    'one-axis': ['ay']
}

# Pick the axis selector according to the notebook-wide switch.
axis = dmafaulda['one-axis' if USE_ONE_AXIS else 'axis']

# Load both domains; the results are stored under 'TDx' and 'FDx'.
for domain_key in ('TD', 'FD'):
    dmafaulda[f'{domain_key}x'] = extraction.load_features(
        dmafaulda[domain_key], axis, dmafaulda['labels']
    )

In [None]:
# Labeling
# Apply the selected labeling strategy to both feature tables (time domain first,
# then frequency domain). An unrecognised strategy used to fall through silently,
# leaving the tables unchanged; it now fails immediately.
for domain in ('TDx', 'FDx'):
    frame = dmafaulda[domain]
    if MAFAULDA_LABEL_METHOD == 'bearing-A':
        labeled = mafaulda.assign_labels(frame, 'A')
    elif MAFAULDA_LABEL_METHOD == 'all-bearings':
        labeled = mafaulda.assign_labels(frame, None)
    elif MAFAULDA_LABEL_METHOD == 'severity':
        # NOTE(review): 0.5 is forwarded unchanged to label_severity; its meaning
        # (presumably a severity threshold) should be confirmed in mafaulda.
        labeled = mafaulda.label_severity(frame, None, 0.5)
    else:
        raise ValueError(
            f'Unknown MAFAULDA_LABEL_METHOD: {MAFAULDA_LABEL_METHOD!r}; '
            "expected 'bearing-A', 'all-bearings' or 'severity'"
        )
    dmafaulda[domain] = labeled

In [None]:
# Count classes
# Name the column explicitly: pandas < 2.0 names the value_counts() result after the
# source column ('label'), so relying on an implicit 'count' column raises KeyError there.
counts = dmafaulda['TDx']['label'].value_counts().to_frame('count')
# Share of each class in percent.
counts['freq'] = (counts['count'] / counts['count'].sum()) * 100
counts

In [None]:
# Explained variance
# Variance ratios for the time-domain and frequency-domain features (labels removed),
# drawn together as cumulative curves.
td_variance, fd_variance = (
    explained_variance(dmafaulda[domain_key].drop(columns=['label']))
    for domain_key in ('TDx', 'FDx')
)
visualize.plot_cumulative_explained_variance(td_variance, fd_variance)

In [None]:
# Loading plots
# One loading plot per domain (time domain first), using the first two principal axes.
for domain_key in ('TDx', 'FDx'):
    domain_features = dmafaulda[domain_key].drop(columns=['label'])
    principal_axes = get_principal_components(domain_features)
    visualize.loading_plot(principal_axes, domain_features.columns, -0.8, 0.8)

Fluid pumps

In [None]:
# pump dataset (all devices, each type - pump, motor, compressor)
# FEATURES_PATH comes from the configuration cell; it was redefined here with the same value.
dpump = {
    'TD': os.path.join(FEATURES_PATH, 'PUMPS_TD.csv'),
    'FD': os.path.join(FEATURES_PATH, 'PUMPS_FD.csv'),
    # Axis selectors are lists, matching what the MaFaulDa cell passes to load_features.
    'axis': ['x', 'y', 'z'],
    'labels': ['date', 'device', 'position'],
    'one-axis': ['z']
}

# The branches were swapped: USE_ONE_AXIS=True loaded all axes and False loaded only 'z',
# the opposite of the MaFaulDa cell.
if USE_ONE_AXIS:
    axis = dpump['one-axis']
else:
    axis = dpump['axis']

dpump['TDx'] = extraction.load_features(dpump['TD'], axis, dpump['labels'])
dpump['FDx'] = extraction.load_features(dpump['FD'], axis, dpump['labels'])

In [None]:
# Class count
# Two-level mapping handed to pumps.get_classes; leaf values are the class labels.
# NOTE(review): outer keys look like installations and inner keys like measurement
# points - confirm against pumps.get_classes.
# Motor/pump installations are numbered 1..3 (M1/P1, M2/P2, M3/P3) and
# compressors 1..2 (C1, C2), in the order listed.
machines = {
    **{
        site: {
            'MTR001': f'M{number}',
            'MTR002': f'M{number}',
            'PMP003': f'P{number}',
            'PMP004': f'P{number}',
        }
        for number, site in enumerate(('KSB1', 'KSB7', 'Sigma'), start=1)
    },
    **{
        site: {'001': f'C{number}', '002': f'C{number}'}
        for number, site in enumerate(('K3', 'K5'), start=1)
    },
}
# Label both domains with the machine classes ('TDm', then 'FDm').
for domain_key in ('TD', 'FD'):
    dpump[f'{domain_key}m'] = pumps.get_classes(dpump[f'{domain_key}x'], machines)

In [None]:
# Class counts for the pump dataset (time-domain table labeled by machine).
# Name the column explicitly: pandas < 2.0 names the value_counts() result after the
# source column ('label'), so relying on an implicit 'count' column raises KeyError there.
counts = dpump['TDm']['label'].value_counts().to_frame('count')
# Share of each class in percent.
counts['freq'] = (counts['count'] / counts['count'].sum()) * 100
counts

In [None]:
# Motor and pump only
# Label by device: KSB1 -> M1/P1, KSB7 -> M2/P2.
labels_machines = {
    site: {
        'MTR001': f'M{number}',
        'MTR002': f'M{number}',
        'PMP003': f'P{number}',
        'PMP004': f'P{number}',
    }
    for number, site in enumerate(('KSB1', 'KSB7'), start=1)
}
# Label both domains ('TDpmp', then 'FDpmp').
for domain_key in ('TD', 'FD'):
    dpump[f'{domain_key}pmp'] = pumps.get_classes(dpump[f'{domain_key}x'], labels_machines)

In [None]:
# Label by position: each measurement point gets its own class,
# e.g. KSB1 -> M1-1, M1-2, P1-3, P1-4.
label_positions = {
    site: {
        'MTR001': f'M{number}-1',
        'MTR002': f'M{number}-2',
        'PMP003': f'P{number}-3',
        'PMP004': f'P{number}-4',
    }
    for number, site in enumerate(('KSB1', 'KSB7'), start=1)
}
# Label both domains ('TDpos', then 'FDpos').
for domain_key in ('TD', 'FD'):
    dpump[f'{domain_key}pos'] = pumps.get_classes(dpump[f'{domain_key}x'], label_positions)

In [None]:
# Explained variance for the pump dataset (all machine classes), labels removed.
td_features = dpump['TDm'].drop(columns=['label'])
fd_features = dpump['FDm'].drop(columns=['label'])
td_variance = explained_variance(td_features)
fd_variance = explained_variance(fd_features)
visualize.plot_cumulative_explained_variance(td_variance, fd_variance)

In [None]:
# Loading plots
# One loading plot per domain (time domain first) for the machine-labeled pump data.
for domain_key in ('TDm', 'FDm'):
    domain_features = dpump[domain_key].drop(columns=['label'])
    principal_axes = get_principal_components(domain_features)
    visualize.loading_plot(principal_axes, domain_features.columns, -0.5, 1)

In [None]:
# Display the pump table stored under 'TDm', i.e. the result of pumps.get_classes
# applied to the 'TDx' features with the `machines` mapping.
dpump['TDm']

In [None]:
# Split by machine
# Repeat the explained-variance and loading-plot analysis per machine type.
# NOTE(review): 'M3' and 'P3' exist in the `machines` mapping but are not part of
# any group here - confirm whether that omission is intentional.
class_groups = [
    ['C1', 'C2'],
    ['M1', 'M2'],
    ['P1', 'P2']
]

for group in class_groups:
    print(group)
    # Keep only the rows of the current group and remove the label column.
    td_rows = dpump['TDm']['label'].isin(group)
    fd_rows = dpump['FDm']['label'].isin(group)
    td = dpump['TDm'].loc[td_rows].drop(columns=['label'])
    fd = dpump['FDm'].loc[fd_rows].drop(columns=['label'])

    visualize.plot_cumulative_explained_variance(
        explained_variance(td),
        explained_variance(fd)
    )

    td_pc = get_principal_components(td)
    fd_pc = get_principal_components(fd)
    for principal_axes, columns in ((td_pc, td.columns), (fd_pc, fd.columns)):
        visualize.loading_plot(principal_axes, columns, -0.8, 0.8)

##### Scatter plots of labels after PCA
- 1 figure (5 subplots) scatter: MaFaulDa, all machines, pumps, compressors, motors

In [None]:
# Mafaulda time domain
# Separate the class labels from the features, then draw the 2D and 3D class projections.
td_frame = dmafaulda['TDx']
td, td_label = td_frame.drop(columns=['label']), td_frame['label']

visualize.project_classes(td, td_label, boundary=True)
visualize.project_classes_3d(td, td_label)

In [None]:
# Mafaulda frequency domain
# Separate the class labels from the features, then draw the 2D and 3D class projections.
fd_frame = dmafaulda['FDx']
fd, fd_label = fd_frame.drop(columns=['label']), fd_frame['label']

visualize.project_classes(fd, fd_label, boundary=True)
visualize.project_classes_3d(fd, fd_label)

#### Classification accuracy 
- choices of k and feature count, 5-fold cross-validation

- All features 
    - for MaFaulDa and custom (which classes - all or just one machine)
    - 1 figure, all features (2 subplots: TD, FD)
        - Each line plot (k = 3, 5, 7)

In [None]:
# Evaluate the classifier on all features of each domain and keep the feature
# tables (TD first, then FD) for the cells that follow.
feature_sets = []
feature_sets_y = None
results = []
for domain_key in ('TDx', 'FDx'):
    labeled = dmafaulda[domain_key]
    # NOTE(review): this is overwritten on every pass, so later cells pair the FDx
    # labels with both feature tables - confirm TDx and FDx rows are aligned.
    feature_sets_y = labeled['label']
    domain_features = labeled.drop(columns=['label'])
    feature_sets.append(domain_features)
    results.append(models.all_features(domain_features, feature_sets_y, MODEL_TYPE))

visualize.plot_all_knn(*results)

- All models (exhaustive) 
    - draw rank, corr, f-stat, mi as horizontal line
    - 3 plots (2, 3, 4 features)
        - Each plot has 2 boxplot subplots (TD, FD) - k-neighbors vs. accuracy of all models

In [None]:
# Exhaustive model enumeration for both domains, cached as a CSV whose name encodes
# the labeling method and the axis switch.
domains = dict(zip(('TD', 'FD'), feature_sets))
filename = os.path.join(
    FEATURES_PATH,
    f'paper_model_brute_force#{MAFAULDA_LABEL_METHOD}#{USE_ONE_AXIS}.csv'
)
results = []

if GENERATE is not True:
    # Reuse the summary saved by an earlier run.
    models_summary = []
    mafaulda_models_summary = pd.read_csv(filename)
else:
    # Enumerate models per domain, merge the per-domain summaries and persist them.
    models_summary = [
        models.enumerate_models(domain, feature_sets_y, domain_name, model=MODEL_TYPE)
        for domain_name, domain in domains.items()
    ]
    mafaulda_models_summary = pd.concat(models_summary)
    mafaulda_models_summary.to_csv(filename, index=False)

In [None]:
# Display the enumerated-model summary (generated or loaded from CSV, depending on GENERATE).
mafaulda_models_summary

In [None]:
# Box plot of enumerated-model accuracies, 'train' variant, with arguments 'f' then 'k'.
# NOTE(review): presumably 'f' = number of features and 'k' = number of neighbours,
# with the argument order choosing axis vs. grouping - confirm in visualize.
visualize.boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'train', 'f', 'k')

In [None]:
# Box plot of enumerated-model accuracies, 'test' variant, with arguments 'f' then 'k'.
# NOTE(review): presumably 'f' = number of features and 'k' = number of neighbours,
# with the argument order choosing axis vs. grouping - confirm in visualize.
visualize.boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'test', 'f', 'k')

In [None]:
# Box plot of enumerated-model accuracies, 'train' variant, with arguments 'k' then 'f'.
# NOTE(review): presumably 'k' = number of neighbours and 'f' = number of features,
# with the argument order choosing axis vs. grouping - confirm in visualize.
visualize.boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'train', 'k', 'f')

In [None]:
# Box plot of enumerated-model accuracies, 'test' variant, with arguments 'k' then 'f'.
# NOTE(review): presumably 'k' = number of neighbours and 'f' = number of features,
# with the argument order choosing axis vs. grouping - confirm in visualize.
visualize.boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'test', 'k', 'f')

- Compare accuracies of the best models in each category for a given number of features and k:
    - 1 plot - bar chart - rainbow colors - one x (TD), second x (FD)
    - Scores side by side (bar chart)
    - best permuted, pca, rank product, corr, fstat, mi

In [None]:
# Compare feature-selection strategies against the enumerated-model summary.
domains = dict(zip(['TD', 'FD'], feature_sets))

# Load the summary from the file the generation cell writes for the current labeling
# method and axis switch. The previous fixed name 'paper_model_brute_force.csv' is never
# written by this notebook, so it pointed at a stale or missing file.
filename = os.path.join(
    FEATURES_PATH,
    f'paper_model_brute_force#{MAFAULDA_LABEL_METHOD}#{USE_ONE_AXIS}.csv'
)
mafaulda_models_summary = pd.read_csv(filename)

# Collect the per-domain records in their own list so that `results` holds only
# the final DataFrame.
accuracy_records = []
for domain_name, domain in domains.items():
    accuracy_records.extend(
        models.feature_selection_accuracies(
            domain,
            feature_sets_y,
            domain_name,
            mafaulda_models_summary
        )
    )

results = pd.DataFrame(accuracy_records)
visualize.plot_models_performance_bar(results)

In [None]:
# Display the accuracy table built from models.feature_selection_accuracies.
results

In [None]:
# Time-domain features (first entry of feature_sets): scatter of the subset
# chosen by the 'rank' strategy, with boundaries drawn.
X, Y = feature_sets[0], feature_sets_y
best_subset = list(models.find_best_subset(X, Y, 'rank'))
visualize.scatter_features_3d(X, Y, best_subset, boundary=True)

In [None]:
# Frequency-domain features (second entry of feature_sets): scatter of the subset
# chosen by the 'rank' strategy, with boundaries drawn.
X, Y = feature_sets[1], feature_sets_y
best_subset = list(models.find_best_subset(X, Y, 'rank'))
visualize.scatter_features_3d(X, Y, best_subset, boundary=True)

3d plot

In [None]:
# Time-domain features (first entry of feature_sets): 3D plot of the subset
# chosen by the 'rank' strategy.
X, Y = feature_sets[0], feature_sets_y
best_subset = list(models.find_best_subset(X, Y, 'rank'))
visualize.scatter_features_3d_plot(X, Y, best_subset)

In [None]:
# Frequency-domain features (second entry of feature_sets): 3D plot of the subset
# chosen by the 'rank' strategy.
X, Y = feature_sets[1], feature_sets_y
best_subset = list(models.find_best_subset(X, Y, 'rank'))
visualize.scatter_features_3d_plot(X, Y, best_subset)