In [None]:
import numpy as np
import pandas as pd
from zipfile import ZipFile

# Feature selection
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif, f_classif

# Models
from scipy.spatial.distance import euclidean, mahalanobis
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier

# Model evaluation
from sklearn import metrics
from kneed import KneeLocator

# Plotting and table formatting
import matplotlib.pyplot as plt
from IPython.display import Markdown
from tabulate import tabulate
import seaborn as sb
from collections import Counter
from tqdm.notebook import tqdm

# System modules
import os
import sys
sys.path.append('../../')

# Custom modules
from feature import mafaulda
from feature import discovery as fdiscovery
from feature.selection import load_td_feat, load_fd_feat, corr_classif
from feature.models import (
    fault_labeling, pipeline_v1, pipeline_v1_core, filter_out_metadata_columns
)


# Location of the pre-extracted feature files, relative to this notebook.
# Passed as `path=` to the feature loaders and used to locate the metadata CSV.
FEATURES_PATH =  '../../datasets/features_data/'
# Fault classes used in this notebook with their short codes. The mapping is
# passed to `fault_labeling`; its keys also filter the metadata table.
FAULT_CLASSES = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM'
}

Parameters:
- Distance metric
- k neighbours (odd numbers because of majority voting) - elbow curve

#### 2A-TD. Time domain features import and transformations

In [None]:
# Settings for the time-domain experiment.
TRAINING_SET_RATIO = 0.6         # forwarded as `train=` to pipeline_v1
N_FEATURES = 3                   # forwarded as `nfeat=` to pipeline_v1
N_NEIGHBOURS = 5                 # k of the kNN classifier fitted below
DIST_METRIC = 'euclidean'        # also read by the get_knn_* helpers
FSEL_METHOD = mutual_info_classif

# Load time-domain features of the three axes, label them, and run the
# project pipeline, which returns the train/test split used by later cells.
# NOTE(review): whether pipeline_v1 splits deterministically is not visible
# here -- confirm a seed is fixed if results must be reproducible.
features = load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
td_features = fault_labeling(features, FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(
    td_features, 
    train=TRAINING_SET_RATIO,
    func_select=FSEL_METHOD,
    nfeat=N_FEATURES
)

#### 2B-TD. Classification with kNN

In [None]:
# Fit kNN on the training split and predict both splits; the train
# predictions are only used to report the train accuracy in the next cell.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=DIST_METRIC, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_predict_train = knn.predict(X_train)
y_predict_test = knn.predict(X_test)

#### 2C-TD. Get the classification report

In [None]:
# Accuracy on both splits (as a 0-1 fraction), followed by the per-class
# report for the test split.
print(f'Train accuracy: {metrics.accuracy_score(y_train, y_predict_train):.4f}')
print(f'Test accuracy: {metrics.accuracy_score(y_test, y_predict_test):.4f}')
print(metrics.classification_report(y_test, y_predict_test))

#### 2D-TD. Confusion matrix

In [None]:
# Confusion matrix of the test split: rows are true classes, columns are
# predicted classes. The label order is fixed explicitly (sorted union of the
# labels that occur) so the tick labels are guaranteed to match the matrix.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_predict_test)]))
cm = metrics.confusion_matrix(y_test, y_predict_test, labels=labels)
ax = sb.heatmap(
    cm, cbar=True, cmap='BuGn', annot=True, fmt='d',
    xticklabels=labels, yticklabels=labels
)
ax.set(
    xlabel='Predicted class',
    ylabel='True class',
    title='kNN confusion matrix - time domain (test set)'
);

#### TD: Find best k neighbors - elbow analysis

In [None]:
def get_knn_error_rates(k_values, X_train, y_train, X_test, y_test):
    """Return the test error rate of a kNN classifier for every k in `k_values`.

    For each k a fresh classifier (kd-tree search, metric taken from the
    module-level `DIST_METRIC`) is fitted on the training data; the error
    rate is the fraction of test samples whose prediction differs from
    `y_test`. The result is a list in the same order as `k_values`.
    """
    def _error_rate(k):
        model = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree', metric=DIST_METRIC)
        model.fit(X_train, y_train)
        # Mean of the mismatch mask == share of misclassified samples.
        return np.mean(model.predict(X_test) != y_test)

    return [_error_rate(k) for k in k_values]


def get_knn_accuracies(k_values, X_train, y_train, X_test, y_test):
    """Return the test accuracy of a kNN classifier for every k in `k_values`.

    For each k a fresh classifier (kd-tree search, metric taken from the
    module-level `DIST_METRIC`) is fitted on the training data and scored
    with `metrics.accuracy_score` on the test data. The result is a list in
    the same order as `k_values`.
    """
    def _accuracy(k):
        model = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree', metric=DIST_METRIC)
        model.fit(X_train, y_train)
        return metrics.accuracy_score(y_test, model.predict(X_test))

    return [_accuracy(k) for k in k_values]


def plot_knn_k_param(k_values, accuracies, error_rates):
    """Plot accuracy (left panel) and error rate (right panel) against k.

    `accuracies` and `error_rates` must be aligned with `k_values`.
    The figure is shown immediately; nothing is returned.
    """
    fig, ax = plt.subplots(1, 2, figsize=(15, 4))

    # Both panels share the same look; only the series and y label differ.
    panels = (
        (ax[0], accuracies, 'Accuracy'),
        (ax[1], error_rates, 'Error rate'),
    )
    for panel, series, y_label in panels:
        panel.plot(
            k_values, series,
            color='darkblue', marker='o', markerfacecolor='darkgreen', markersize=5
        )
        panel.set_xlabel('K neighbors')
        panel.set_ylabel(y_label)
        panel.grid(True)

    plt.show()

In [None]:
# Sweep odd k from 3 to 39 on the current train/test split and plot
# accuracy and error rate side by side.
k_values = list(range(3, 40, 2))
accuracies = get_knn_accuracies(k_values, X_train, y_train, X_test, y_test)
error_rates = get_knn_error_rates(k_values, X_train, y_train, X_test, y_test)
plot_knn_k_param(k_values, accuracies, error_rates)

#### 2A-FD. Frequency domain features import and transformations

In [None]:
# Settings for the frequency-domain experiment (same values as the
# time-domain run above).
TRAINING_SET_RATIO = 0.6         # forwarded as `train=` to pipeline_v1
N_FEATURES = 3                   # forwarded as `nfeat=` to pipeline_v1
N_NEIGHBOURS = 5                 # k of the kNN classifier fitted below
DIST_METRIC = 'euclidean'        # also read by the get_knn_* helpers
FSEL_METHOD = mutual_info_classif

# Load frequency-domain features of the three axes, label a copy of them, and
# run the project pipeline. This overwrites X_train/X_test/y_train/y_test
# from the time-domain section.
features = load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
fd_features = fault_labeling(features.copy(), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(
    fd_features, 
    train=TRAINING_SET_RATIO,
    func_select=FSEL_METHOD,
    nfeat=N_FEATURES
)

#### 2B-FD. Classification with kNN

In [None]:
# Fit kNN on the frequency-domain training split and predict both splits;
# the train predictions are only used to report the train accuracy.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=DIST_METRIC, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_predict_train = knn.predict(X_train)
y_predict_test = knn.predict(X_test)

#### 2C-FD. Get the classification report

In [None]:
# Accuracy on both splits (as a 0-1 fraction), followed by the per-class
# report for the test split.
print(f'Train accuracy: {metrics.accuracy_score(y_train, y_predict_train):.4f}')
print(f'Test accuracy: {metrics.accuracy_score(y_test, y_predict_test):.4f}')
print(metrics.classification_report(y_test, y_predict_test))

#### 2D-FD. Confusion matrix

In [None]:
# Confusion matrix of the test split: rows are true classes, columns are
# predicted classes. The label order is fixed explicitly (sorted union of the
# labels that occur) so the tick labels are guaranteed to match the matrix.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_predict_test)]))
cm = metrics.confusion_matrix(y_test, y_predict_test, labels=labels)
ax = sb.heatmap(
    cm, cbar=True, cmap='BuGn', annot=True, fmt='d',
    xticklabels=labels, yticklabels=labels
)
ax.set(
    xlabel='Predicted class',
    ylabel='True class',
    title='kNN confusion matrix - frequency domain (test set)'
);

#### FD: Find best k neighbors - elbow analysis

In [None]:
# Sweep odd k from 3 to 39 on the frequency-domain split and plot
# accuracy and error rate side by side.
k_values = list(range(3, 40, 2))
accuracies = get_knn_accuracies(k_values, X_train, y_train, X_test, y_test)
error_rates = get_knn_error_rates(k_values, X_train, y_train, X_test, y_test)
plot_knn_k_param(k_values, accuracies, error_rates)

### Change number of features
- evaluate kNN classification accuracies in each feature domain
  - features for each axis separately (x, y, z)
  - features in each measurement point (a, b)

In [None]:
# Settings shared by the feature-count sweep helpers defined below.
TRAINING_SET_RATIO = 0.6
DIST_METRIC = 'euclidean'
FEATURE_SELECTION_METHOD = mutual_info_classif
KNN_K_VALUES = list(range(3, 20, 2))   # odd k from 3 to 19

def evaluate_knn_number_of_features(features, columns):
    """Find the elbow k and its error rate for every number of selected features.

    For n = 1 .. len(columns) the labelled `features` table is passed through
    `pipeline_v1` with `nfeat=n`, the kNN test error is computed for every k
    in `KNN_K_VALUES`, and the knee of that error curve is located.

    Parameters:
        features: labelled feature table handed to `pipeline_v1`.
        columns: feature column names; only their count is used, as the
            upper bound of the sweep.

    Returns:
        DataFrame with columns `n_features`, `k_neighbors` (k at the knee)
        and `error_rate` (error at the knee), one row per n.
    """
    results = []
    for n in tqdm(range(1, len(columns) + 1)):
        # Use the table passed by the caller; previously the global
        # `fd_features` was read here and the `features` argument was ignored.
        X_train, X_test, y_train, y_test = pipeline_v1(
            features,
            train=TRAINING_SET_RATIO,
            func_select=FEATURE_SELECTION_METHOD,
            nfeat=n
        )
        metric = get_knn_error_rates(KNN_K_VALUES, X_train, y_train, X_test, y_test)
        kneedle = KneeLocator(KNN_K_VALUES, metric, S=1.0, curve='convex', direction='decreasing')
        results.append([n, kneedle.elbow, kneedle.knee_y])

    return pd.DataFrame(results, columns=['n_features', 'k_neighbors', 'error_rate'])


def find_optimal_error_rate(success):
    """Plot error rate against feature count and return the row(s) at its knee.

    `success` is the table produced by `evaluate_knn_number_of_features`.
    The knee of the (convex, decreasing) error curve is used rather than the
    global minimum. The returned frame holds the rows of `success` whose
    `n_features` equals the knee location.
    """
    n_features = success['n_features']
    error_rate = success['error_rate']

    plt.plot(n_features, error_rate)
    plt.grid()
    plt.xlabel('Number of features')
    plt.ylabel('Error rate')

    locator = KneeLocator(
        n_features, error_rate,
        S=1.0, curve='convex', direction='decreasing'
    )
    at_knee = n_features == locator.elbow
    return success[at_knee]


def knn_best_evaluate(best, features=None):
    """Train and report a kNN classifier for the selected (n_features, k) pair.

    Parameters:
        best: table with `k_neighbors` and `n_features` columns (as returned
            by `find_optimal_error_rate`); only its first row is used.
        features: labelled feature table passed to `pipeline_v1`. Defaults to
            the notebook-level `fd_features`, which keeps the previous
            behaviour of `knn_best_evaluate(best)`.

    Raises:
        ValueError: if `best` has no rows (e.g. no knee was found).

    Prints train/test accuracy (as 0-1 fractions), the classification report
    for the test split, and the names of the selected feature columns.
    """
    if len(best) == 0:
        raise ValueError(
            '`best` is empty - no optimal (n_features, k_neighbors) pair was found'
        )
    if features is None:
        features = fd_features

    k_best = best['k_neighbors'].to_numpy()[0]
    n_best = best['n_features'].to_numpy()[0]

    # Use the same selection method as the sweep so that the evaluated feature
    # subset matches the one the error curve was computed on.
    X_train, X_test, y_train, y_test = pipeline_v1(
        features, train=TRAINING_SET_RATIO, nfeat=n_best,
        func_select=FEATURE_SELECTION_METHOD
    )
    knn = KNeighborsClassifier(n_neighbors=k_best, metric=DIST_METRIC)
    knn.fit(X_train, y_train)
    y_predict_train = knn.predict(X_train)
    y_predict_test = knn.predict(X_test)

    # accuracy_score returns a fraction in [0, 1], so no percent sign here.
    print(f'Train accuracy: {metrics.accuracy_score(y_train, y_predict_train):.4f}')
    print(f'Test accuracy: {metrics.accuracy_score(y_test, y_predict_test):.4f}')
    print(metrics.classification_report(y_test, y_predict_test))

    print('Most informative features:')
    print(list(X_train.columns))

1. Domain: time, Axis: az

In [None]:
# Time-domain features of the 'az' channel.
# NOTE: the labelled table is stored as `fd_features` although it holds
# time-domain features, because `knn_best_evaluate` reads that global name.
features = load_td_feat(['az'], path=FEATURES_PATH)
fd_features = fault_labeling(features, FAULT_CLASSES, debug=False)

# The number of non-metadata columns bounds the feature-count sweep.
columns = filter_out_metadata_columns(features.copy()).columns
success = evaluate_knn_number_of_features(fd_features, columns)

In [None]:
# Plot the error curve and display the row at its knee.
best = find_optimal_error_rate(success)
best

In [None]:
# Re-train with the selected feature count and k, then print the report.
knn_best_evaluate(best)

2. Domain: frequency, Axis: az

In [None]:
# Frequency-domain features of the 'az' channel.
# `features` is reloaded here so that `columns` below counts the columns of
# THIS feature table; previously it silently reused the table left over from
# the preceding experiment, giving the sweep the wrong upper bound.
features = load_fd_feat(['az'], path=FEATURES_PATH)
fd_features = fault_labeling(features, FAULT_CLASSES)

# The number of non-metadata columns bounds the feature-count sweep.
columns = filter_out_metadata_columns(features.copy()).columns
success = evaluate_knn_number_of_features(fd_features, columns)

Find the best error rate of the kNN classifier (the knee of the error-rate curve, not its global minimum)

In [None]:
# Plot the error curve and display the row at its knee.
best = find_optimal_error_rate(success)
best

In [None]:
# Re-train with the selected feature count and k, then print the report.
knn_best_evaluate(best)

3. Domain: time, Axis: (ax, ay, az)

In [None]:
# Time-domain features of all three axes.
# `features` is reloaded here so that `columns` below counts the columns of
# THIS feature table; previously it silently reused the single-axis table
# from an earlier experiment, so the sweep stopped at the wrong feature count.
# NOTE: the labelled table is stored as `fd_features` although it holds
# time-domain features, because `knn_best_evaluate` reads that global name.
features = load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
fd_features = fault_labeling(features, FAULT_CLASSES)

# The number of non-metadata columns bounds the feature-count sweep.
columns = filter_out_metadata_columns(features.copy()).columns
success = evaluate_knn_number_of_features(fd_features, columns)

In [None]:
# Plot the error curve and display the row at its knee.
best = find_optimal_error_rate(success)
best

In [None]:
# Re-train with the selected feature count and k, then print the report.
knn_best_evaluate(best)

4. Domain: frequency, Axis: (ax, ay, az)

In [None]:
# Frequency-domain features of all three axes.
# `features` is reloaded here so that `columns` below counts the columns of
# THIS feature table; previously it silently reused the single-axis table
# from an earlier experiment, so the sweep stopped at the wrong feature count.
features = load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
fd_features = fault_labeling(features, FAULT_CLASSES)

# The number of non-metadata columns bounds the feature-count sweep.
columns = filter_out_metadata_columns(features.copy()).columns
success = evaluate_knn_number_of_features(fd_features, columns)

In [None]:
# Plot the error curve and display the row at its knee.
best = find_optimal_error_rate(success)
best

In [None]:
# Re-train with the selected feature count and k, then print the report.
knn_best_evaluate(best)

K-Fold validation

In [None]:
# Stratified 5-fold cross-validation of the kNN classifier on the
# frequency-domain features of all three axes.
TRAINING_SET_RATIO = 0.6            # not used by the K-fold split below
N_FEATURES = 3
N_NEIGHBOURS = 5
DIST_METRIC = 'euclidean'           # euclidean, mahalanobis, rbf
FSEL_METHOD = mutual_info_classif   # corr_classif, f_classif, mutual_info_classif

features = fault_labeling(load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH), FAULT_CLASSES)

# crossvalid = StratifiedShuffleSplit(n_splits=5, test_size=0.6, random_state=100)
crossvalid = StratifiedKFold(n_splits=5)
X = filter_out_metadata_columns(features)
y = features['fault']

evaluation = {
    'train_accuracy': [],
    'train_precision': [],
    'train_recall': [],
    'test_accuracy': [],
    'test_precision': [],
    'test_recall': []
}

# `split` is a generator, so tqdm needs the fold count to draw a full bar.
for train_index, test_index in tqdm(crossvalid.split(X, y), total=crossvalid.get_n_splits()):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Feature selection is done inside the fold, on this fold's data.
    X_train, X_test, y_train, y_test = pipeline_v1_core(
        FSEL_METHOD, N_FEATURES, 
        X_train, y_train, X_test, y_test
    )

    if DIST_METRIC == 'euclidean':
        d = 'euclidean'
    elif DIST_METRIC == 'mahalanobis':
        # scipy's `mahalanobis` expects the INVERSE covariance matrix. It is
        # computed once per fold (pseudo-inverse, in case the covariance is
        # singular; atleast_2d covers the single-feature case) and bound as a
        # default argument so the callable does not depend on loop variables.
        inv_cov = np.linalg.pinv(
            np.atleast_2d(np.cov(np.asarray(X_train, dtype=float), rowvar=False))
        )
        d = lambda u, v, VI=inv_cov: mahalanobis(u, v, VI)
    elif DIST_METRIC == 'rbf':
        # The RBF kernel k(u, v) = exp(-gamma * ||u - v||^2), gamma = 1 / n_features,
        # is a similarity (1 for identical points), so it cannot be used as a
        # distance directly. Use the kernel-induced distance
        # sqrt(k(u,u) + k(v,v) - 2 k(u,v)) = sqrt(2 - 2 k(u,v)) instead.
        d = lambda u, v: np.sqrt(
            max(0.0, 2.0 - 2.0 * np.exp(-euclidean(u, v)**2 * (1 / len(u))))
        )
    else:
        raise ValueError(
            f"Unknown DIST_METRIC {DIST_METRIC!r}; expected 'euclidean', 'mahalanobis' or 'rbf'"
        )

    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=d)
    knn.fit(X_train, y_train)
    y_predict_train = knn.predict(X_train)
    y_predict_test = knn.predict(X_test)

    # NOTE(review): with average='micro' on single-label multiclass
    # predictions, precision and recall equal accuracy, so the three columns
    # per split carry the same number. Consider average='macro' if per-class
    # behaviour matters.
    evaluation['train_accuracy'].append(
        metrics.accuracy_score(y_train, y_predict_train)
    )
    evaluation['train_precision'].append(
        metrics.precision_score(y_train, y_predict_train, average='micro')
    )
    evaluation['train_recall'].append(
        metrics.recall_score(y_train, y_predict_train, average='micro')
    )
    
    evaluation['test_accuracy'].append(
        metrics.accuracy_score(y_test, y_predict_test)
    )
    evaluation['test_precision'].append(
        metrics.precision_score(y_test, y_predict_test, average='micro')
    )
    evaluation['test_recall'].append(
        metrics.recall_score(y_test, y_predict_test, average='micro')
    )

# Only the features selected in the LAST fold are printed here.
print(f'Most informative features: {list(X_train.columns)}')
evaluation = pd.DataFrame(evaluation)
evaluation.mean()

Compression ratio 
- when all features are in use
- calculate compression ratio: dimensions of feature matrix : dimensions of waveform source matrix

In [None]:
def load_dataset_info():
    """Load the MAFAULDA metadata rows relevant to this notebook.

    Reads `mafaulda_metadata.csv` from `FEATURES_PATH` (indexed by
    `filename`) and keeps the recordings whose `fault` is one of the
    `FAULT_CLASSES` keys and whose `rpm` lies within 2900 +/- 500
    (both bounds included). A copy of the filtered table is returned.
    """
    nominal_rpm, tolerance = 2900, 500
    metadata_csv = os.path.join(FEATURES_PATH, 'mafaulda_metadata.csv')
    meta = pd.read_csv(metadata_csv, index_col='filename')

    is_known_fault = meta['fault'].isin(FAULT_CLASSES)
    is_near_rpm = meta['rpm'].between(
        nominal_rpm - tolerance, nominal_rpm + tolerance, inclusive='both'
    )
    return meta[is_known_fault & is_near_rpm].copy()

def compression_percentage(waveforms, domain, one_file):
    """Size of the feature table as a percentage of the raw waveform data.

    Parameters:
        waveforms: list of channel names (e.g. ['ax']) whose features are loaded.
        domain: 'td' for time-domain or 'fd' for frequency-domain features.
        one_file: if true, compare only the first metadata row with the first
            feature row; otherwise compare the whole tables.
            NOTE(review): this assumes both tables are in the same order --
            confirm against the loaders.

    Returns:
        100 * (number of feature values) / (number of raw samples), where the
        raw sample count is len(waveforms) times the summed `length` column
        of the dataset metadata.

    Raises:
        ValueError: if `domain` is not 'td' or 'fd', or if the selected
            recordings contain no raw samples.
    """
    # Validate first; previously an unknown domain left `features` unbound
    # and failed later with an unrelated error.
    if domain not in ('td', 'fd'):
        raise ValueError(f"Unknown domain {domain!r}; expected 'td' or 'fd'")

    if domain == 'td':
        features = load_td_feat(waveforms, path=FEATURES_PATH)
    else:
        features = load_fd_feat(waveforms, path=FEATURES_PATH)

    files = load_dataset_info()
    features_clean = filter_out_metadata_columns(features.copy())

    if one_file:
        files = files[:1]
        features_clean = features_clean[:1]

    original_size = np.sum(len(waveforms) * files['length'].to_numpy())
    if original_size == 0:
        raise ValueError('No raw samples found for the selected recordings')

    new_size = np.prod(features_clean.shape)
    return (new_size / original_size) * 100

# Report, for the 'ax' channel, how large the feature table is relative to
# the raw waveforms (in percent) - for a single recording and for all
# selected recordings, in both feature domains.
columns = ['ax']
print('Time domain')
print(f'One file: features from raw data {compression_percentage(columns, "td", one_file=True):.4f} %')
print(f'All files: features from raw data {compression_percentage(columns, "td", one_file=False):.4f} %')

print()
print('Frequency domain')
print(f'One file: features from raw data {compression_percentage(columns, "fd", one_file=True):.4f} %')
print(f'All files: features from raw data {compression_percentage(columns, "fd", one_file=False):.4f} %')