##### k-Nearest Neighbors with different feature sets

In [None]:
# Collapse each feature to a single axis (True) or to the norm over all axes (False).
USE_ONE_AXIS = False  # False, True
# Supported ways of deriving the MaFaulDa target label.
MAFAULDA_LABEL_METHODS = ['bearing-A', 'all-bearings', 'severity']
# Labelling method applied in the class-count cell below.
MAFAULDA_LABEL_METHOD = MAFAULDA_LABEL_METHODS[0]

In [None]:
import os
import re
from zipfile import ZipFile
from typing import Tuple

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from adjustText import adjust_text

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

#import mafaulda
import pumps
import extraction
import ranking
import visualize
import models

# Set the default font size of all matplotlib figures in this notebook to 14.
plt.rcParams.update({'font.size': 14})

In [None]:
# Model identifier handed to the `models` helpers in the classification cells.
MODEL_TYPE = 'knn'          # 'lda', 'bayes', 'svm' 
# NOTE(review): KNN_METRIC and FFT_WINDOW are not read anywhere in the visible cells.
KNN_METRIC = 'euclidean'    # 'cityblock', 'cosine'
FFT_WINDOW = 2 ** 15

# Dataset locations (relative paths).
PATH = '../datasets/'
FEATURES_PATH = os.path.join(PATH, 'features')
MAFAULDA_PATH = os.path.join(PATH, 'MAFAULDA.zip')
PUMPS_PATH = os.path.join(PATH, 'FluidPump.zip')
# Pre-extracted MaFaulDa feature tables: temporal (TD) and spectral (FD) domain.
MAFAULDA_TEMPORAL = os.path.join(FEATURES_PATH, 'MAFAULDA_TD.csv')
MAFAULDA_SPECTRAL = os.path.join(FEATURES_PATH, 'MAFAULDA_FD.csv')

#### Feature analysis:
MaFaulDa (3) and custom dataset (4)
- 1× table (how many faults have how many recordings)
- 1× plot (2 lines: TD, FD) - number of PCs vs. explained variance
- 1× plot (2 subplots: TD, FD) - loading plot (PC2)
- 1× figure (4 subplots) for the custom dataset: all machines, pumps, compressors, motors

##### 1. Explained variance by PCA components and loading plots

In [None]:
def load_whole_dataset(dataset: dict, domain: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load one feature table and reduce the per-axis columns to one column per feature.

    Args:
        dataset: Dataset description. Keys read here: ``'axis'`` (column-name
            prefixes of the measured axes), ``'labels'`` (list of label column
            names, or ``None``), ``domain`` (path of the CSV file to load) and,
            when ``USE_ONE_AXIS`` is true, ``'one-axis'`` (the axis prefix to keep).
        domain: Key of ``dataset`` that holds the CSV path.

    Returns:
        ``(X, Y)``: ``X`` has one column per feature name; ``Y`` is an
        independent copy of the label columns, or an empty frame when
        ``dataset['labels']`` is ``None``.
    """
    axis = dataset['axis']
    label_cols = dataset['labels']

    features = pd.read_csv(dataset[domain])

    axis_columns = features.columns[features.columns.str.startswith(axis)]
    X = features[axis_columns]
    # Copy the labels: callers add columns to Y afterwards, and assigning into
    # a column selection of `features` triggers pandas' SettingWithCopyWarning.
    Y = features[label_cols].copy() if label_cols is not None else pd.DataFrame()

    # Columns are named '<axis>_<feature>'; collect the distinct feature parts.
    names = X.columns.str.extract(r'([a-z]+)_([a-z\_\-]+)')[1].unique()
    feature_names = [f'{domain}_{col.strip("_")}' for col in names if not pd.isnull(col)]

    result = pd.DataFrame()
    for prefixed_name in feature_names:
        # Drop the '<domain>_' prefix that was added above
        name = re.search(r'[a-z]+_([\w\_]+)', prefixed_name).group(1)

        if USE_ONE_AXIS:
            dim = dataset['one-axis']
            result[name] = X[f'{dim}_{name}']
        else:
            # Combine the axes into a single magnitude per row (vector norm)
            vector_dims = [f'{dim}_{name}' for dim in axis]
            result[name] = X[vector_dims].apply(np.linalg.norm, axis=1)
    return result, Y


def explained_variance(X, n_components=10):
    """Return the explained-variance ratio of the leading principal components.

    The features are min-max scaled before PCA is fitted.

    Args:
        X: Feature table (rows are observations, columns are features).
        n_components: Upper bound on the number of components to fit. It is
            capped at ``min(n_samples, n_features)`` because PCA raises a
            ``ValueError`` when asked for more components than that.

    Returns:
        ``pca.explained_variance_ratio_`` of the fitted model, one value per
        component.
    """
    x_scaled = MinMaxScaler().fit_transform(X)
    # Small subsets (e.g. a single machine) may have fewer rows or columns
    # than the requested number of components.
    n_components = min(n_components, *x_scaled.shape)
    pca = PCA(n_components=n_components)
    pca.fit(x_scaled)
    return pca.explained_variance_ratio_


def get_principal_components(X):
    """Return the loadings of the first two principal components of ``X``.

    The features are min-max scaled first; the returned value is
    ``components_`` of a two-component PCA fitted on the scaled table.
    """
    scaled = pd.DataFrame()
    scaled[X.columns] = MinMaxScaler().fit_transform(X)
    model = PCA(n_components=2).fit(scaled)
    return model.components_


def plot_cumulative_explained_variance(td_variance, fd_variance):
    """Plot the cumulative explained variance (in percent) of both feature domains.

    Args:
        td_variance: Per-component explained-variance ratios, temporal features.
        fd_variance: Per-component explained-variance ratios, spectral features.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    curves = (
        (td_variance, 'Temporal features'),
        (fd_variance, 'Spectral features'),
    )
    for ratios, legend_label in curves:
        # x axis counts components starting at 1
        component_numbers = np.arange(1, len(ratios) + 1)
        cumulative_percent = 100 * np.cumsum(ratios)
        ax.plot(component_numbers, cumulative_percent, marker='s', label=legend_label)

    ax.set_xlabel('Number of principal components')
    ax.set_ylabel('Explained variance [%]')
    ax.grid()
    ax.legend()
    plt.show()


def loading_plot(loadings, feature_names, bottom, top):
    """Draw a PCA loading plot with one labelled arrow per feature.

    Args:
        loadings: Two rows of loadings; row 0 is drawn along the PC1 axis and
            row 1 along the PC2 axis.
        feature_names: Label of each arrow, in the same order as the loadings.
        bottom: Lower limit applied to both axes.
        top: Upper limit applied to both axes.
    """
    pc1, pc2 = loadings[0], loadings[1]

    labels = []
    for idx, feature in enumerate(feature_names):
        dx, dy = pc1[idx], pc2[idx]
        # Arrow from the origin to the feature's (PC1, PC2) loading
        plt.arrow(0, 0, dx, dy, color='r', head_width=0.01)
        labels.append(plt.text(dx, dy, feature))

    # Let adjustText reposition the labels, moving them along y only
    adjust_text(labels, only_move={'points':'y', 'texts':'y'})

    # Same limits on both axes
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.xlim(bottom, top)
    plt.ylim(bottom, top)
    plt.grid()
    plt.show()

MaFaulDa

In [None]:
# NOTE(review): re-assigns FEATURES_PATH to the location already set with the path constants.
FEATURES_PATH = '../datasets/features'
# Dataset description read by load_whole_dataset: CSV path per domain, axis
# column prefixes, label columns and the axis kept when USE_ONE_AXIS is set.
mafaulda = {
    'temporal': MAFAULDA_TEMPORAL,
    'spectral': MAFAULDA_SPECTRAL,
    'axis': ('ax', 'ay', 'az'),
    'labels': ['fault', 'severity', 'rpm'],
    'one-axis': 'ay'
}

# Each call returns (features, labels); the spectral call overwrites the 'Y'
# stored by the temporal call.
mafaulda['X_td'], mafaulda['Y'] = load_whole_dataset(mafaulda, 'temporal')
mafaulda['X_fd'], mafaulda['Y'] = load_whole_dataset(mafaulda, 'spectral')

In [None]:
# Class count
# 1) Label independently
# Maps the raw 'fault' value to a class name; both misalignment directions and
# the underhang/overhang variants of each bearing fault share one class.
faults = {
    'normal': 'normal',
    'imbalance': 'imbalance',
    'horizontal-misalignment': 'misalignment',
    'vertical-misalignment': 'misalignment',
    'underhang-outer_race': 'outer race fault',
    'underhang-cage_fault': 'cage fault',
    'underhang-ball_fault': 'ball fault',
    'overhang-outer_race': 'outer race fault',
    'overhang-cage_fault': 'cage fault',
    'overhang-ball_fault': 'ball fault'
}

# Per-bearing variants of the mapping: 'A' lists only the underhang bearing
# faults and 'B' only the overhang ones. Only 'A' is read in this cell.
bearing_faults = {
    'A': {
        'normal': 'normal',
        'imbalance': 'imbalance',
        'horizontal-misalignment': 'misalignment',
        'vertical-misalignment': 'misalignment',
        'underhang-outer_race': 'outer race fault',
        'underhang-cage_fault': 'cage fault',
        'underhang-ball_fault': 'ball fault'
    },
    'B': {
        'normal': 'normal',
        'imbalance': 'imbalance',
        'horizontal-misalignment': 'misalignment',
        'vertical-misalignment': 'misalignment',
        'overhang-outer_race': 'outer race fault',
        'overhang-cage_fault': 'cage fault',
        'overhang-ball_fault': 'ball fault',
    }
}

if MAFAULDA_LABEL_METHOD == 'bearing-A':
    # Faults missing from the bearing-A mapping get None (dict.get) as target
    mafaulda['Y']['target'] = mafaulda['Y'].apply(lambda row: bearing_faults['A'].get(row['fault']), axis=1)

elif MAFAULDA_LABEL_METHOD == 'all-bearings':
    mafaulda['Y']['target'] = mafaulda['Y'].apply(lambda row: faults.get(row['fault']), axis=1)

elif MAFAULDA_LABEL_METHOD == 'severity':
    table = mafaulda['Y'].copy()
    table['target'] = mafaulda['Y']['fault'].replace(faults)
    table['target'] = table['target'].astype('category')
    # Numeric part of the severity string (first integer or decimal number found)
    table['severity_no'] = table['severity'].str.extract(r'(\d+\.?\d*)').astype(float)

    # Severity is ranked and scaled separately within each fault class
    for name, group in table.groupby(by=['target'], observed=True):
        group = group.sort_values(by='severity_no')
            
        # Category codes of the distinct severity values, as a column vector for the scaler
        severities = group['severity_no'].astype('category').cat.codes.values.reshape(-1, 1)
        scale_severities = MinMaxScaler().fit_transform(severities)

        # Distinct names, codes and scaled levels, collected only for the summary print
        sev_names = list(group['severity'].astype('category').cat.categories)
        sev = list(group['severity'].astype('category').cat.codes.astype('category').cat.categories)
        scale = [float(f'{p:.2f}') for p in pd.Series(scale_severities[:, 0]).astype('category').cat.categories]
        print(f'Fault: {name[0]}, Files: {len(group)}, Severity names: {sev_names}, Severity: {sev}, Severity Levels: {scale}')
        
        table.loc[group.index, 'severity_class'] = severities
        table.loc[group.index, 'severity_level'] = scale_severities

    # Recordings whose scaled severity is below 0.5 are relabelled as 'normal'
    table.loc[table['severity_level'] < 0.5, 'target'] = 'normal'
    mafaulda['Y'] = table


# Whichever branch ran, store the target as a categorical column
mafaulda['Y']['target'] = mafaulda['Y']['target'].astype('category')

In [None]:
# Count classes
# Number of rows per target class plus its share of all counted rows in percent.
# NOTE(review): relies on value_counts() naming its column 'count' — confirm the pandas version.
counts = mafaulda['Y']['target'].value_counts().to_frame()
counts['freq'] = (counts['count'] / counts['count'].sum()) * 100
counts

In [None]:
# Explained variance
# Cumulative explained variance of the MaFaulDa temporal vs. spectral features
td_variance = explained_variance(mafaulda['X_td'])
fd_variance = explained_variance(mafaulda['X_fd'])
plot_cumulative_explained_variance(td_variance, fd_variance)

In [None]:
# Loading plots
# First two principal components per domain, drawn with axis limits -0.8 to 0.8
td_pc = get_principal_components(mafaulda['X_td'])
fd_pc = get_principal_components(mafaulda['X_fd'])
loading_plot(td_pc, mafaulda['X_td'].columns, -0.8, 0.8)
loading_plot(fd_pc, mafaulda['X_fd'].columns, -0.8, 0.8)

Fluid pumps

In [None]:
# pump dataset (all devices, each type - pump, motor, compressor)
# NOTE(review): re-assigns FEATURES_PATH to the location already set with the path constants.
FEATURES_PATH = '../datasets/features'
# Dataset description read by load_whole_dataset: CSV path per domain, axis
# column prefixes, label columns and the axis kept when USE_ONE_AXIS is set.
pump = {
    'temporal': os.path.join(FEATURES_PATH, 'PUMPS_TD.csv'),
    'spectral': os.path.join(FEATURES_PATH, 'PUMPS_FD.csv'),
    'axis': ('x', 'y', 'z'),
    'labels': ['date', 'device', 'position'],
    'one-axis': 'z'
}

# Each call returns (features, labels); the spectral call overwrites the 'Y'
# stored by the temporal call.
pump['X_td'], pump['Y'] = load_whole_dataset(pump, 'temporal')
pump['X_fd'], pump['Y'] = load_whole_dataset(pump, 'spectral')

In [None]:
# Class count
# Machine label for every known (device, position) pair; both measurement
# positions of one machine share a label.
machines = {
    'KSB1': {
        'MTR001': 'M1',
        'MTR002': 'M1',
        'PMP003': 'P1',
        'PMP004': 'P1'
    },
    'KSB7': {
        'MTR001': 'M2',
        'MTR002': 'M2',
        'PMP003': 'P2',
        'PMP004': 'P2'
    },
    'K3': {
        '001': 'C1',
        '002': 'C1'
    },
    'K5': {
        '001': 'C2',
        '002': 'C2'
    }
}

# Pairs missing from the mapping get None as target (chained dict.get)
pump['Y']['target'] = pump['Y'].apply(lambda row: machines.get(row['device'], {}).get(row['position']), axis=1)
pump['Y']['target'] = pump['Y']['target'].astype('category')

# Number of rows per machine label plus its share of all counted rows in percent.
# NOTE(review): relies on value_counts() naming its column 'count' — confirm the pandas version.
counts = pump['Y']['target'].value_counts().to_frame()
counts['freq'] = (counts['count'] / counts['count'].sum()) * 100
counts

In [None]:
# Motor and pump only
# The three mappings below cover only devices KSB1 and KSB7; every other
# (device, position) pair gets None through the chained dict.get.

# Label by device
labels_machines = {
    'KSB1': {
        'MTR001': 'M1',
        'MTR002': 'M1',
        'PMP003': 'P1',
        'PMP004': 'P1'
    },
    'KSB7': {
        'MTR001': 'M2',
        'MTR002': 'M2',
        'PMP003': 'P2',
        'PMP004': 'P2'
    }
}
pump['Y']['label_machine'] = pump['Y'].apply(
    lambda row: labels_machines.get(row['device'], {}).get(row['position']), axis=1
)

# Label by position (one class per measurement position)
label_positions = {
    'KSB1': {
        'MTR001': 'M1-1',
        'MTR002': 'M1-2',
        'PMP003': 'P1-3',
        'PMP004': 'P1-4'
    },
    'KSB7': {
        'MTR001': 'M2-1',
        'MTR002': 'M2-2',
        'PMP003': 'P2-3',
        'PMP004': 'P2-4'
    }
}
pump['Y']['label_position'] = pump['Y'].apply(
    lambda row: label_positions.get(row['device'], {}).get(row['position']), axis=1
)

# Label only P1-3, P2-3 (two-class problem; all other rows stay unlabelled)
label_binary = {
    'KSB1': {
        'PMP003': 'P1-3'
    },
    'KSB7': {
        'PMP003': 'P2-3'
    }
}
pump['Y']['label_binary'] = pump['Y'].apply(
    lambda row: label_binary.get(row['device'], {}).get(row['position']), axis=1
)

In [None]:
# Cumulative explained variance of the pump temporal vs. spectral features
td_variance = explained_variance(pump['X_td'])
fd_variance = explained_variance(pump['X_fd'])
plot_cumulative_explained_variance(td_variance, fd_variance)

In [None]:
# Loading plots
# First two principal components per domain, drawn with axis limits -0.5 to 1
td_pc = get_principal_components(pump['X_td'])
fd_pc = get_principal_components(pump['X_fd'])
loading_plot(td_pc, pump['X_td'].columns, -0.5, 1)
loading_plot(fd_pc, pump['X_fd'].columns, -0.5, 1)

In [None]:
# Split by machine
# Row indices of each machine type, selected by device and measurement position
Y = pump['Y']
is_ksb = Y['device'].isin(['KSB1', 'KSB7'])
compressor = Y[Y['device'].isin(['K3', 'K5'])].index
waterpump = Y[is_ksb & Y['position'].isin(['PMP003', 'PMP004'])].index
motor = Y[is_ksb & Y['position'].isin(['MTR001', 'MTR002'])].index

# Same analysis for every machine type: cumulative explained variance first,
# then the temporal and spectral loading plots.
for machine_rows in (compressor, waterpump, motor):
    td_variance = explained_variance(pump['X_td'].loc[machine_rows])
    fd_variance = explained_variance(pump['X_fd'].loc[machine_rows])
    plot_cumulative_explained_variance(td_variance, fd_variance)

    td_pc = get_principal_components(pump['X_td'].loc[machine_rows])
    fd_pc = get_principal_components(pump['X_fd'].loc[machine_rows])
    loading_plot(td_pc, pump['X_td'].columns, -0.5, 1)
    loading_plot(fd_pc, pump['X_fd'].columns, -0.5, 1)


##### Scatter plots of labels after PCA
- 1× figure (5 subplots) scatter: MaFaulDa, all machines, pumps, compressors, motors

In [None]:
# Mafaulda temporal
vizualize.project_classes(mafaulda['X_td'], mafaulda['Y']['target'], boundary=True)
vizualize.project_classes_3d(mafaulda['X_td'], mafaulda['Y']['target'])

In [None]:
# Mafaulda spectral
vizualize.project_classes(mafaulda['X_fd'], mafaulda['Y']['target'], boundary=True)
vizualize.project_classes_3d(mafaulda['X_fd'], mafaulda['Y']['target'])

In [None]:
all_places = (pump['Y']['device'] + ',' + pump['Y']['position']).astype('category')
vizualize.project_classes(pump['X_td'], all_places, size=(10, 7), boundary=True)
vizualize.project_classes(pump['X_fd'], all_places, size=(10, 7), boundary=True)

In [None]:
# Fluid pump temporal
vizualize.project_classes(pump['X_td'], pump['Y']['target'], boundary=True)
vizualize.project_classes_3d(pump['X_td'], pump['Y']['target'])

In [None]:
vizualize.project_classes(pump['X_td'], pump['Y']['label_machine'], boundary=True)

In [None]:
vizualize.project_classes(pump['X_td'], pump['Y']['label_position'], boundary=True)

In [None]:
vizualize.project_classes(pump['X_td'], pump['Y']['label_binary'], boundary=True)

In [None]:
# Fluid pump spectral
vizualize.project_classes(pump['X_fd'], pump['Y']['target'], boundary=True)
vizualize.project_classes_3d(pump['X_fd'], pump['Y']['target'])

In [None]:
vizualize.project_classes(pump['X_fd'], pump['Y']['label_machine'], boundary=True)

In [None]:
vizualize.project_classes(pump['X_fd'], pump['Y']['label_position'], boundary=True)

In [None]:
vizualize.project_classes(pump['X_fd'], pump['Y']['label_binary'], boundary=True)

In [None]:
def visualize_machines(features: pd.DataFrame, labels: pd.DataFrame, machines: tuple):
    """Plot the 2D and 3D class projections of the selected machines only.

    Args:
        features: Feature table, one row per recording.
        labels: Machine label of every row in ``features`` (aligned on the index).
        machines: Labels to keep; rows with any other label are dropped.
    """
    feature_columns = features.columns
    table = features.copy()
    # Compare labels as strings so missing labels never match a machine name
    table['target'] = labels.astype('str')
    table = table[table['target'].isin(machines)].reset_index(drop=True)

    X = table[feature_columns].copy()
    # Re-categorise after filtering so only the kept machines remain as classes
    Y = table['target'].astype('category')
    # `visualize` is the imported plotting module (`vizualize` was undefined)
    visualize.project_classes(X, Y, boundary=True)
    visualize.project_classes_3d(X, Y)

In [None]:
# Compare the two compressors in both feature domains
compressors = ('C1', 'C2')
visualize_machines(pump['X_td'], pump['Y']['target'], compressors)
visualize_machines(pump['X_fd'], pump['Y']['target'], compressors)

In [None]:
# Compare the two water pumps in both feature domains
waterpumps = ('P1', 'P2')
visualize_machines(pump['X_td'], pump['Y']['target'], waterpumps)
visualize_machines(pump['X_fd'], pump['Y']['target'], waterpumps)

In [None]:
# Compare the two motors in both feature domains
motors = ('M1', 'M2')
visualize_machines(pump['X_td'], pump['Y']['target'], motors)
visualize_machines(pump['X_fd'], pump['Y']['target'], motors)

#### Change in compressors features over time
- Each domain (2x)
    - Scatter plot PCA - position - colors are dates

In [None]:
def visualize_compressors_by_date(X, Y):
    """Plot each compressor measurement position with recordings coloured by date.

    Args:
        X: Feature table, aligned with ``Y`` on the index.
        Y: Label table with 'device', 'position' and 'date' columns.
    """
    compressors = Y[Y['device'].isin(('K3', 'K5'))]
    for placement, rows in compressors.groupby(by=['device', 'position']):
        # Plot only this placement's recordings so features and date labels
        # match row for row. The index is reset as visualize_machines does
        # before calling project_classes.
        # NOTE(review): confirm project_classes expects a fresh 0..n-1 index.
        placement_features = X.loc[rows.index].reset_index(drop=True)
        dates = rows['date'].astype('category').reset_index(drop=True)
        print(placement)
        # `visualize` is the imported plotting module (`vizualize` was undefined)
        visualize.project_classes(placement_features, dates, boundary=True)

In [None]:
# Compressor features over time, temporal domain
visualize_compressors_by_date(pump['X_td'], pump['Y'])

In [None]:
# Compressor features over time, spectral domain
visualize_compressors_by_date(pump['X_fd'], pump['Y'])

#### Classification accuracy 
- choices of k and feature count, 5-fold cross-validation

- All features 
    - for MaFaulDa and the custom dataset (which classes - all or just one machine)
    - 1× figure with all features (2 subplots: TD, FD)
	    - Each line plot (k = 3, 5, 7)

In [None]:
def plot_all_knn(td_results, fd_results, kfold=5):
    """Plot train and test accuracy against k for both feature domains.

    Args:
        td_results: Results for temporal features; read keys 'k', 'train', 'test'.
        fd_results: Results for spectral features; read keys 'k', 'train', 'test'.
        kfold: Not used by this function.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    series = (
        (td_results, 'train', 'darkblue', 'train - temporal'),
        (td_results, 'test', 'blue', 'test - temporal'),
        (fd_results, 'train', 'darkgreen', 'train - spectral'),
        (fd_results, 'test', 'green', 'test - spectral'),
    )
    for results, split, colour, legend_label in series:
        ax.plot(results['k'], results[split], marker='x', color=colour, label=legend_label)

    ax.set_ylabel('Accuracy')
    ax.set_xlabel('K-neighbors')
    # One tick per evaluated k (taken from the temporal results)
    ax.set_xticks(td_results['k'])
    ax.grid(True)
    ax.legend()
    plt.show()

In [None]:
# MaFaulDa: accuracy with all features, target = fault class
td_results = models.all_features(mafaulda['X_td'], mafaulda['Y']['target'], MODEL_TYPE)
fd_results = models.all_features(mafaulda['X_fd'], mafaulda['Y']['target'], MODEL_TYPE)
plot_all_knn(td_results, fd_results)

In [None]:
# Pump dataset: target = machine label including compressors
td_results = models.all_features(pump['X_td'], pump['Y']['target'], MODEL_TYPE)
fd_results = models.all_features(pump['X_fd'], pump['Y']['target'], MODEL_TYPE)
plot_all_knn(td_results, fd_results)

In [None]:
# Pump dataset: target = machine label, motors and pumps only
td_results = models.all_features(pump['X_td'], pump['Y']['label_machine'], MODEL_TYPE)
fd_results = models.all_features(pump['X_fd'], pump['Y']['label_machine'], MODEL_TYPE)
plot_all_knn(td_results, fd_results)

In [None]:
# Pump dataset: target = measurement position
td_results = models.all_features(pump['X_td'], pump['Y']['label_position'], MODEL_TYPE)
fd_results = models.all_features(pump['X_fd'], pump['Y']['label_position'], MODEL_TYPE)
plot_all_knn(td_results, fd_results)

In [None]:
# Binary P1-3 / P2-3 target: currently disabled
#td_results = models.all_features(pump['X_td'], pump['Y']['label_binary'], MODEL_TYPE)
#fd_results = models.all_features(pump['X_fd'], pump['Y']['label_binary'], MODEL_TYPE)
#plot_all_knn(td_results, fd_results)

- All models (exhaustive) 
    - draw rank, corr, f-stat, mi as horizontal lines
    - 3× plots (2, 3, 4 features)
	    - Each plot has 2 boxplot subplots (TD, FD) - k-neighbors vs. accuracy of all models

In [None]:
# 6156 models (120/165, 210/330, 252/462), 25 minutes (longer because of oversampling)
# Long-running cell: evaluates the models enumerated by models.enumerate_models
# on the MaFaulDa temporal and spectral features.
mafaulda_models_summary = models.enumerate_models(mafaulda['X_td'], mafaulda['X_fd'], mafaulda['Y']['target'], model=MODEL_TYPE)

In [None]:
# Uncomment to persist the summary; the bare name below displays it in the notebook
#mafaulda_models_summary.to_csv('mafaulda_models_summary.csv')
mafaulda_models_summary

In [None]:
def boxplot_enumerate_models_accuracy(results, metric, plots_col, inplot_col):
    """Draw accuracy box plots of the enumerated models, one figure per group.

    Args:
        results: Model summary table; must contain a 'domain' column plus the
            columns named by the other arguments.
        metric: Column whose values are shown in the boxes.
        plots_col: Column that splits the data into figures; 'k' and 'f' also
            select the matching title and x-axis label.
        inplot_col: Column that splits each subplot into individual boxes.
    """
    for figure_value, figure_rows in results.groupby(by=plots_col):
        # One subplot per domain, in order of first appearance
        fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
        domains = figure_rows.groupby(by='domain', sort=False)
        for position, (domain_name, domain_rows) in enumerate(domains):
            panel = axes[position]
            panel.grid()

            if plots_col == 'k':
                panel.set_title(f'K-neighbors: {figure_value}, Domain: {domain_name}')
            elif plots_col == 'f':
                panel.set_title(f'Features: {figure_value}, Domain: {domain_name}')

            # Grouping by a one-element list yields tuple keys; unwrap them
            boxplot_data = {
                key[0]: rows[metric].to_list()
                for key, rows in domain_rows.groupby(by=[inplot_col])
            }

            panel.boxplot(
                boxplot_data.values(),
                labels=boxplot_data.keys(),
                medianprops = {'linewidth': 2, 'color': 'black'})
            panel.set_ylabel('Accuracy')
            if plots_col == 'f':
                panel.set_xlabel('K-neighbors')
            elif plots_col == 'k':
                panel.set_xlabel('Number of features')
    plt.show()

#### MaFaulDa

In [None]:
# Train accuracy: one figure per feature count, boxes per k
boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'train', 'f', 'k')

In [None]:
# Test accuracy: one figure per feature count, boxes per k
boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'test', 'f', 'k')

In [None]:
# Train accuracy: one figure per k, boxes per feature count
boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'train', 'k', 'f')

In [None]:
# Test accuracy: one figure per k, boxes per feature count
boxplot_enumerate_models_accuracy(mafaulda_models_summary, 'test', 'k', 'f')

- Compare accuracies of the best models in each category for a given number of features and k:
    - 1× plot - bar chart - rainbow colors - one x (TD), second x (FD)
    - Scores side by side (bar chart)
    - best permuted, pca, rank product, corr, fstat, mi

In [None]:
# Bar chart built from the MaFaulDa features, target and enumerated-model summary
visualize.plot_models_performance_bar(mafaulda['X_td'], mafaulda['X_fd'], mafaulda['Y']['target'], mafaulda_models_summary)

In [None]:
# MaFaulDa temporal features: subset selected by the 'rank' method.
# find_best_subset lives in the `models` module; the bare name was undefined.
X = mafaulda['X_td']
Y = mafaulda['Y']['target']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d(X, Y, list(features), boundary=True)

In [None]:
# MaFaulDa spectral features: subset selected by the 'rank' method.
# find_best_subset lives in the `models` module; the bare name was undefined.
X = mafaulda['X_fd']
Y = mafaulda['Y']['target']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d(X, Y, list(features), boundary=True)

3d plot

In [None]:
# MaFaulDa temporal features: 3D plot of the subset selected by the 'rank' method.
# find_best_subset lives in the `models` module; the bare name was undefined.
X = mafaulda['X_td']
Y = mafaulda['Y']['target']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Each cell below rebinds the module-level X, Y and features, selects a feature
# subset with the 'rank' method and shows it as a 3D plot.
# MaFaulDa, spectral features
X = mafaulda['X_fd']
Y = mafaulda['Y']['target']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Pumps, temporal features, labelled by machine
X = pump['X_td']
Y = pump['Y']['label_machine']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Pumps, temporal features, labelled by measurement position
X = pump['X_td']
Y = pump['Y']['label_position']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Pumps, temporal features, binary P1-3 / P2-3 label
X = pump['X_td']
Y = pump['Y']['label_binary']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Pumps, spectral features, labelled by machine
X = pump['X_fd']
Y = pump['Y']['label_machine']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Pumps, spectral features, labelled by measurement position
X = pump['X_fd']
Y = pump['Y']['label_position']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))

In [None]:
# Pumps, spectral features, binary P1-3 / P2-3 label
X = pump['X_fd']
Y = pump['Y']['label_binary']
features = models.find_best_subset(X, Y, 'rank')
visualize.scatter_features_3d_plot(X, Y, list(features))