In [None]:
import numpy as np
import pandas as pd
from zipfile import ZipFile

# Feature selection
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif, f_classif

# Models
from scipy.spatial.distance import euclidean, mahalanobis
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# Model evaluation
from sklearn import metrics
from kneed import KneeLocator

# Plotting and table formatting
import matplotlib.pyplot as plt
from IPython.display import Markdown
from tabulate import tabulate
import seaborn as sb
from collections import Counter
from tqdm.notebook import tqdm

# System modules
import os
import sys
sys.path.append('../')

# Custom modules
from vibrodiagnostics import (
    mafaulda,
    discovery, 
    selection,
    models
)

# --- Global experiment configuration -------------------------------------
# Fraction of the samples assigned to the training partition.
# NOTE(review): later cells define a second, differently named ratio
# (TRAINING_SET_RATIO); keep the two in sync or consolidate them.
TRAIN_SET_RATIO = 0.7
# Forwarded as `anomaly_severity` to models.features_subset_offline;
# presumably a severity threshold for labelling a sample as faulty - confirm.
ANOMALY_SEVERITY = 0.7
# Axis prefixes handed to the feature loader (feature columns are named
# like 'az_<feature>').
SOURCE_AXIS = ('ax', 'ay', 'az')
# Name of the label column used as the classification target.
TARGET = 'fault'

# Dataset locations, relative to the directory the notebook is run from.
PATH_PREFIX = '../../datasets/'
FEATURES_PATH =  os.path.join(PATH_PREFIX, 'features_data')

# Feature files for the custom feature sets (file names come from `selection`).
TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

# Feature files for the TSFEL feature sets (all / temporal / spectral / statistical).
TSFEL_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_FEATURES_PATH)
TSFEL_TD_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_TIME_FEATURES_PATH)
TSFEL_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_FREQ_FEATURES_PATH)
TSFEL_SD_FEATURES = os.path.join(FEATURES_PATH, selection.TSFEL_STAT_FEATURES_PATH)

Tunable kNN parameters:
- Distance metric
- Number of neighbours k (odd values, to reduce ties in majority voting), chosen with an elbow curve

#### Features import and transformations

In [None]:
# Hyper-parameters of the baseline experiment in this section.
TRAINING_SET_RATIO = 0.6          # fraction of samples used for training
N_FEATURES = 3                    # number of features kept by feature selection
N_NEIGHBOURS = 5                  # k of the kNN classifier
DIST_METRIC = 'euclidean'         # distance metric of the kNN classifier
FSEL_METHOD = mutual_info_classif # scoring function used for feature selection


# Load the labelled features and split them into train / test partitions.
# The ratio defined in this cell is passed on; previously the unrelated
# TRAIN_SET_RATIO was used, which left TRAINING_SET_RATIO without any effect.
X_train, y_train, X_test, y_test = models.features_subset_offline(
    TD_FD_FEATURES, selection.FAULT_CLASSES, SOURCE_AXIS, TARGET,
    train_size=TRAINING_SET_RATIO, anomaly_severity=ANOMALY_SEVERITY
)
# Keep the N_FEATURES best-scoring features. Mind the return order of
# pipeline_v1_core: (X_train, X_test, y_train, y_test), which differs from
# the loader above.
X_train, X_test, y_train, y_test = models.pipeline_v1_core(
    FSEL_METHOD, N_FEATURES,
    X_train, y_train, X_test, y_test
)

#### Classification with kNN

In [None]:
# Baseline classifier: kNN with the k and metric configured above, using a
# kd-tree for the neighbour search.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=DIST_METRIC, algorithm='kd_tree')
knn.fit(X_train, y_train)
# Predict on both partitions so train and test scores can be compared below.
y_predict_train = knn.predict(X_train)
y_predict_test = knn.predict(X_test)

#### Get the classification report

In [None]:
# Accuracies are printed as fractions in [0, 1], followed by the per-class
# precision / recall / F1 report for the test partition.
print(f'Train accuracy: {metrics.accuracy_score(y_train, y_predict_train):.4f}')
print(f'Test accuracy: {metrics.accuracy_score(y_test, y_predict_test):.4f}')
print(metrics.classification_report(y_test, y_predict_test))

#### Confusion matrix

In [None]:
# Rows are the true classes, columns the predicted ones. The label order is
# fixed explicitly (classes seen by the fitted classifier) so that the tick
# labels show class names instead of positional indices.
class_labels = knn.classes_
cm = metrics.confusion_matrix(y_test, y_predict_test, labels=class_labels)
ax = sb.heatmap(
    cm, cbar=True, cmap='BuGn', annot=True, fmt='d',
    xticklabels=class_labels, yticklabels=class_labels
)
ax.set_xlabel('Predicted class')
ax.set_ylabel('True class')
ax.set_title('kNN confusion matrix (test set)');

#### Find best k neighbors - elbow analysis

In [None]:
def get_knn_error_rates(k_values, X_train, y_train, X_test, y_test):
    """Return the test misclassification rate of a kNN classifier for every k.

    For each entry of ``k_values`` a kd-tree backed ``KNeighborsClassifier``
    with the notebook-level ``DIST_METRIC`` is fitted on the training data and
    used to predict the test data.

    Returns:
        list: fraction of wrongly predicted test samples, ordered like
        ``k_values``.
    """
    def _error_rate(n_neighbors):
        # Fit a fresh model for this k and compare its predictions to the truth.
        model = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree', metric=DIST_METRIC)
        predicted = model.fit(X_train, y_train).predict(X_test)
        return np.mean(predicted != y_test)

    return [_error_rate(k) for k in k_values]


def get_knn_accuracies(k_values, X_train, y_train, X_test, y_test):
    """Return the test accuracy of a kNN classifier for every k.

    Each k in ``k_values`` gets its own kd-tree backed classifier (metric taken
    from the notebook-level ``DIST_METRIC``) that is trained on the training
    partition and scored on the test partition.

    Returns:
        list: accuracy scores, ordered like ``k_values``.
    """
    return [
        metrics.accuracy_score(
            y_test,
            KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree', metric=DIST_METRIC)
            .fit(X_train, y_train)
            .predict(X_test),
        )
        for n_neighbors in k_values
    ]


def plot_knn_k_param(k_values, accuracies, error_rates):
    """Draw accuracy (left) and error rate (right) as functions of k.

    Args:
        k_values: neighbour counts shown on the x axis of both panels.
        accuracies: accuracy per entry of ``k_values``.
        error_rates: error rate per entry of ``k_values``.
    """
    fig, panels = plt.subplots(1, 2, figsize=(15, 4))
    line_style = dict(color='darkblue', marker='o', markerfacecolor='darkgreen', markersize=5)

    # Both panels share the same look; only the plotted series and y label differ.
    curves = zip(panels, (accuracies, error_rates), ('Accuracy', 'Error rate'))
    for panel, series, y_label in curves:
        panel.plot(k_values, series, **line_style)
        panel.set_xlabel('K neighbors')
        panel.set_ylabel(y_label)
        panel.grid(True)

    plt.show()

In [None]:
# Odd k from 3 to 39; accuracy and error rate are both computed on the same
# train/test split prepared in the feature import cell.
k_values = list(range(3, 40, 2))
accuracies = get_knn_accuracies(k_values, X_train, y_train, X_test, y_test)
error_rates = get_knn_error_rates(k_values, X_train, y_train, X_test, y_test)
plot_knn_k_param(k_values, accuracies, error_rates)

### Change number of features
- evaluate kNN classification accuracies in each feature domain
  - features for each axis separately (x, y, z)
  - features at each measurement point (a, b)

In [None]:
# Configuration read as globals by the helper functions defined below.
# TRAINING_SET_RATIO and DIST_METRIC rebind names already set (to the same
# values) in the feature import cell above.
TRAINING_SET_RATIO = 0.6
DIST_METRIC = 'euclidean'
# NOTE(review): same scoring function as FSEL_METHOD from the earlier cell,
# kept under a second name.
FEATURE_SELECTION_METHOD = mutual_info_classif
# Candidate neighbour counts for the elbow search: odd k from 3 to 19.
KNN_K_VALUES = list(range(3, 20, 2))

def evaluate_knn_number_of_features(features_filename):
    """Find, for every number of selected features, a suitable k and its error.

    The feature file is loaded and split once. Then, for n = 1 .. number of
    available columns, the n best features according to
    ``FEATURE_SELECTION_METHOD`` are kept, the kNN test error is computed for
    every k in ``KNN_K_VALUES`` and the elbow of that error curve is located.

    Args:
        features_filename: path of the feature file to evaluate.

    Returns:
        pandas.DataFrame: one row per n with the columns ``n_features``,
        ``k_neighbors`` and ``error_rate``.
    """
    X_train0, y_train0, X_test0, y_test0 = models.features_subset_offline(
        features_filename, selection.FAULT_CLASSES, SOURCE_AXIS, TARGET,
        train_size=TRAINING_SET_RATIO, anomaly_severity=ANOMALY_SEVERITY
    )

    results = []
    for n in tqdm(range(1, len(X_train0.columns) + 1)):
        X_train, X_test, y_train, y_test = models.pipeline_v1_core(
            FEATURE_SELECTION_METHOD, n,
            X_train0, y_train0, X_test0, y_test0
        )

        error_rates = get_knn_error_rates(KNN_K_VALUES, X_train, y_train, X_test, y_test)
        kneedle = KneeLocator(KNN_K_VALUES, error_rates, S=1.0, curve='convex', direction='decreasing')

        if kneedle.elbow is not None:
            k_best, error_best = kneedle.elbow, kneedle.knee_y
        else:
            # KneeLocator reports None when the curve has no detectable knee
            # (e.g. a flat or increasing error curve). Fall back to the k with
            # the lowest error so the row never contains missing values, which
            # would break the later elbow search and the final refit.
            lowest = int(np.argmin(error_rates))
            k_best, error_best = KNN_K_VALUES[lowest], error_rates[lowest]

        results.append([n, k_best, error_best])

    return pd.DataFrame(results, columns=['n_features', 'k_neighbors', 'error_rate'])


def find_optimal_error_rate(success):
    """Plot error rate vs. number of features and return the best trade-off row.

    Args:
        success: frame with the columns ``n_features``, ``k_neighbors`` and
            ``error_rate`` (as produced by ``evaluate_knn_number_of_features``).

    Returns:
        pandas.DataFrame: the single row of ``success`` at the elbow of the
        error curve, or the row with the lowest error rate when no elbow can
        be detected.
    """
    plt.plot(success['n_features'], success['error_rate'])
    plt.grid()
    plt.xlabel('Number of features')
    plt.ylabel('Error rate')

    kneedle = KneeLocator(
        success['n_features'], success['error_rate'],
        S=1.0, curve='convex', direction='decreasing'
    )
    # kneedle.elbow is None when no elbow exists; the comparison then matches
    # no row and the selection is empty.
    best = success[success['n_features'] == kneedle.elbow]
    if best.empty:
        # Without this fallback callers indexing the first row would fail.
        best = success.loc[[success['error_rate'].idxmin()]]
    return best


def knn_all_evaluate(features_filename, title, subset=None, k=5, axes=('az',)):
    """Score a kNN classifier on a whole feature file or on a column subset.

    The features are min-max scaled (scaler fitted on the training partition
    only) before a kNN classifier with ``DIST_METRIC`` is trained.

    Args:
        features_filename: path of the feature file to load.
        title: label stored under ``feature_set`` in the result.
        subset: optional list of feature names without axis prefix. When
            given, only the columns ``'<axis>_<name>'`` are used.
        k: number of neighbours.
        axes: axis prefixes combined with ``subset``. The default keeps the
            previous behaviour of using the ``az`` columns only; pass
            ``SOURCE_AXIS`` to use every axis.

    Returns:
        dict: ``feature_set`` plus accuracy, precision, recall and AUC for the
        train and the test partition.

    NOTE(review): if the target is single-label multi-class, micro-averaged
    precision and recall are equal to accuracy; consider ``average='macro'``
    if distinct numbers are wanted.
    """
    X_train, y_train, X_test, y_test = models.features_subset_offline(
        features_filename, selection.FAULT_CLASSES, SOURCE_AXIS, TARGET,
        train_size=TRAINING_SET_RATIO, anomaly_severity=ANOMALY_SEVERITY
    )

    if subset is not None:
        columns = [f'{axis}_{col}' for axis in axes for col in subset]
        X_train = X_train[columns]
        X_test = X_test[columns]

    # Build new frames instead of assigning into the (possibly sliced) inputs,
    # which triggered pandas chained-assignment warnings.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    knn = KNeighborsClassifier(n_neighbors=k, metric=DIST_METRIC)
    knn.fit(X_train, y_train)

    def _scores(prefix, X, y_true):
        # Same four metrics for either partition, keyed by the partition name.
        y_predict = knn.predict(X)
        y_proba = knn.predict_proba(X)
        return {
            f'{prefix}_accuracy': metrics.accuracy_score(y_true, y_predict),
            f'{prefix}_precision': metrics.precision_score(y_true, y_predict, average='micro'),
            f'{prefix}_recall': metrics.recall_score(y_true, y_predict, average='micro'),
            # one-vs-one with macro averaging is insensitive to class imbalance
            f'{prefix}_auc': metrics.roc_auc_score(y_true, y_proba, multi_class='ovo', average='macro'),
        }

    return {
        'feature_set': title,
        **_scores('train', X_train, y_train),
        **_scores('test', X_test, y_test),
    }


def knn_best_evaluate(features_filename, best):
    """Refit kNN with the selected n_features / k_neighbors and print a report.

    Args:
        features_filename: path of the feature file to load.
        best: frame whose first row holds the chosen ``k_neighbors`` and
            ``n_features`` (as returned by ``find_optimal_error_rate``).

    Raises:
        ValueError: if ``best`` has no rows or its first row contains missing
            values.
    """
    if len(best) == 0:
        raise ValueError('`best` is empty: no optimal configuration was found')

    k_raw = best['k_neighbors'].to_numpy()[0]
    n_raw = best['n_features'].to_numpy()[0]
    if pd.isna(k_raw) or pd.isna(n_raw):
        raise ValueError('`best` contains missing k_neighbors / n_features')
    # The columns can be float typed (e.g. after missing values elsewhere in
    # the frame); the classifier and the selector need plain integers.
    k_best = int(k_raw)
    n_best = int(n_raw)

    X_train0, y_train0, X_test0, y_test0 = models.features_subset_offline(
        features_filename, selection.FAULT_CLASSES, SOURCE_AXIS, TARGET,
        train_size=TRAINING_SET_RATIO, anomaly_severity=ANOMALY_SEVERITY
    )

    X_train, X_test, y_train, y_test = models.pipeline_v1_core(
        FEATURE_SELECTION_METHOD, n_best,
        X_train0, y_train0, X_test0, y_test0
    )
    knn = KNeighborsClassifier(n_neighbors=k_best, metric=DIST_METRIC)
    knn.fit(X_train, y_train)
    y_predict_train = knn.predict(X_train)
    y_predict_test = knn.predict(X_test)

    # accuracy_score returns a fraction in [0, 1]; the former ' %' suffix made
    # e.g. 0.95 read as "0.95 percent".
    print(f'Train accuracy: {metrics.accuracy_score(y_train, y_predict_train):.4f}')
    print(f'Test accuracy: {metrics.accuracy_score(y_test, y_predict_test):.4f}')
    print(metrics.classification_report(y_test, y_predict_test))

    print('Most informative features:')
    print(list(X_train.columns))


def kfold_validate_knn(features_filename):
    """Cross-validate the kNN classifier on one feature file and print the means.

    The file is read, labelled with ``models.fault_labeling`` and reduced to
    the columns whose names start with one of the ``SOURCE_AXIS`` prefixes.
    For each of 5 stratified folds the ``N_FEATURES`` best features
    (``FSEL_METHOD``) are selected, a kNN with ``N_NEIGHBOURS`` neighbours and
    the ``DIST_METRIC`` distance is trained, and accuracy / precision / recall
    are recorded for both partitions. The mean of every metric over the folds
    is printed together with the features selected in the last fold.

    Args:
        features_filename: path of the CSV feature file to validate.

    Raises:
        ValueError: if ``DIST_METRIC`` is not 'euclidean', 'mahalanobis' or 'rbf'.
    """
    # Read the requested file (previously FD_FEATURES was read regardless of
    # the argument, so every feature set reported the same numbers).
    features = pd.read_csv(features_filename)
    features = models.fault_labeling(features, selection.FAULT_CLASSES)
    is_feature = features.columns.str.startswith(tuple(SOURCE_AXIS))
    X = features[features.columns[is_feature]]
    y = features[TARGET].astype('category')

    crossvalid = StratifiedKFold(n_splits=5)
    evaluation = []

    folds = tqdm(crossvalid.split(X, y), total=crossvalid.get_n_splits())
    for train_index, test_index in folds:
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        X_train, X_test, y_train, y_test = models.pipeline_v1_core(
            FSEL_METHOD, N_FEATURES,
            X_train, y_train, X_test, y_test
        )

        if DIST_METRIC == 'euclidean':
            distance = 'euclidean'
        elif DIST_METRIC == 'mahalanobis':
            # scipy's mahalanobis expects the INVERSE covariance matrix. It is
            # computed once per fold (pseudo-inverse tolerates a singular
            # covariance) rather than on every pairwise distance call.
            covariance = np.atleast_2d(np.cov(np.asarray(X_train, dtype=float).T))
            inv_covariance = np.linalg.pinv(covariance)
            distance = lambda u, v: mahalanobis(u, v, inv_covariance)
        elif DIST_METRIC == 'rbf':
            # exp(-||u - v||^2 / d) is a similarity (1 for identical points);
            # kNN needs a dissimilarity, so use 1 - similarity. Passing the
            # kernel value itself would make the farthest points the
            # "nearest" neighbours.
            distance = lambda u, v: 1.0 - np.exp(-euclidean(u, v) ** 2 * (1 / len(u)))
        else:
            raise ValueError(f'Unsupported distance metric: {DIST_METRIC!r}')

        knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=distance)
        knn.fit(X_train, y_train)
        y_predict_train = knn.predict(X_train)
        y_predict_test = knn.predict(X_test)

        evaluation.append({
            'train_accuracy': metrics.accuracy_score(y_train, y_predict_train),
            'train_precision': metrics.precision_score(y_train, y_predict_train, average='micro'),
            'train_recall': metrics.recall_score(y_train, y_predict_train, average='micro'),
            'test_accuracy': metrics.accuracy_score(y_test, y_predict_test),
            'test_precision': metrics.precision_score(y_test, y_predict_test, average='micro'),
            'test_recall': metrics.recall_score(y_test, y_predict_test, average='micro')
        })

    # X_train still refers to the last fold here, so this lists that fold's selection.
    print(f'Most informative features: {list(X_train.columns)}')
    evaluation = pd.DataFrame.from_records(evaluation)
    print(evaluation.mean())

#### Feature set #1: Custom features all

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(TD_FD_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(TD_FD_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(TD_FD_FEATURES)

#### Feature set #2: Custom time domain features

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(TD_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(TD_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(TD_FEATURES)

#### Feature set #3: Custom frequency domain features

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(FD_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(FD_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(FD_FEATURES)

#### Feature set #4: TSFEL features all

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(TSFEL_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(TSFEL_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(TSFEL_FEATURES)

#### Feature set #5: TSFEL temporal domain features

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(TSFEL_TD_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(TSFEL_TD_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(TSFEL_TD_FEATURES)

#### Feature set #6: TSFEL spectral domain features

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(TSFEL_FD_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(TSFEL_FD_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(TSFEL_FD_FEATURES)

#### Feature set #7: TSFEL statistical domain features

In [None]:
# Sweep the number of selected features for this feature file.
# `success` is rebound by every feature-set section of the notebook.
success = evaluate_knn_number_of_features(TSFEL_SD_FEATURES)
success

In [None]:
# Row at the elbow of the error-rate curve; `best` is rebound per section too.
best = find_optimal_error_rate(success)
best

In [None]:
# Refit with the n_features / k_neighbors stored in `best` and print the report.
knn_best_evaluate(TSFEL_SD_FEATURES, best)

In [None]:
# Call kfold_validate_knn with this section's feature file as argument.
kfold_validate_knn(TSFEL_SD_FEATURES)

### Feature sets metrics (all features)

In [None]:
# Map every feature file to the label shown in the results table.
feature_sets = {
    TD_FD_FEATURES: 'Custom all features',
    TD_FEATURES: 'Custom time domain features',
    FD_FEATURES: 'Custom frequency domain features',
    TSFEL_FEATURES: 'TSFEL all features',
    TSFEL_TD_FEATURES: 'TSFEL temporal domain features',
    TSFEL_FD_FEATURES: 'TSFEL spectral domain features',  # label typo 'specral' fixed
    TSFEL_SD_FEATURES: 'TSFEL statistical features',
}
# Evaluate kNN on the complete column set of each file (no feature selection);
# one score dictionary is collected per feature set.
success_rates = []
for fset, fname in feature_sets.items():
    scores = knn_all_evaluate(fset, fname)
    success_rates.append(scores)

# One row per feature set.
success_rates = pd.DataFrame.from_records(success_rates)
success_rates

### TODO: Feature Subsets

In [None]:
# TODO fill list with 3 and 7 most informative features
# TODO: targets - fault (place A), anomaly
# Keys are (feature file, table label) pairs; values are feature names without
# the axis prefix, which knn_all_evaluate prepends itself.
# Note that `feature_sets` is rebound here with a different structure than in
# the previous cell (tuple keys, list values).
# NOTE(review): the statistical subset lists 'spectral_entropy' and
# 'wavelet_entropy', the same names as in the spectral subset - verify these
# columns exist in the statistical feature file.
feature_sets = {
    (FD_FEATURES, 'Custom frequency domain features'): ['noisiness_1024', 'flux_mean_1024', 'centroid_1024'],
    (TD_FEATURES, 'Custom time domain features'): ['shape', 'rms', 'skew'],
    (TSFEL_TD_FEATURES, 'TSFEL temporal domain features'): ['area_under_the_curve', 'autocorrelation', 'median_absolute_diff'],
    (TSFEL_FD_FEATURES, 'TSFEL spectral domain features'): ['spectral_entropy', 'spectral_roll-on', 'wavelet_entropy'],
    (TSFEL_SD_FEATURES, 'TSFEL statistical features'): ['spectral_entropy', 'wavelet_entropy', 'median_absolute_deviation'],
}

# Evaluate kNN on each hand-picked subset; one score dictionary per feature set.
success_rates_subset = []
for fset, features in feature_sets.items():
    scores = knn_all_evaluate(fset[0], fset[1], subset=features)   # TODO: use all axes (extend to ax, ay, az)
    success_rates_subset.append(scores)

# One row per feature set.
success_rates_subset = pd.DataFrame.from_records(success_rates_subset)
success_rates_subset