In [None]:
import numpy as np
import pandas as pd
from zipfile import ZipFile

# Feature selection
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif, f_classif

# Models
from scipy.spatial.distance import euclidean, mahalanobis
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import gmean

# Model evaluation
from sklearn import metrics
from kneed import KneeLocator

# Plotting and table formatting
import matplotlib.pyplot as plt
from IPython.display import Markdown
from tabulate import tabulate
import seaborn as sb
from collections import Counter
from tqdm.notebook import tqdm

# System modules
import re
import itertools
import os
import sys
sys.path.append('../')

# Custom modules
from vibrodiagnostics import (
    mafaulda,
    discovery, 
    selection,
    models
)

# Axis identifiers handed to models.load_feature_set; presumably the column-name
# prefixes of the accelerometer used as model input — TODO confirm.
SOURCE_AXIS = ('ax', 'ay', 'az')
# Name passed as the target argument when loading the temporal feature set.
TARGET = 'fault'

# Dataset locations, given relative to the notebook's working directory.
PATH_PREFIX = '../../datasets/'
FEATURES_PATH =  os.path.join(PATH_PREFIX, 'features_data')

# Feature tables; the file names are taken from the project's `selection` module.
TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

Parameters:
- Distance metric
- Number of neighbours k (odd values only, to reduce ties in majority voting), chosen with an elbow curve

### Temporal features

In [None]:
def knn_evaluation(x_train, y_train, x_test, y_test):
    """Fit a 5-NN classifier (Euclidean, kd-tree) and report how it performs.

    Prints the train and test accuracy in percent and the classification
    report for the test split, then draws the test confusion matrix as an
    annotated heatmap. Nothing is returned.
    """
    classifier = KNeighborsClassifier(
        n_neighbors=5, metric='euclidean', algorithm='kd_tree'
    ).fit(x_train, y_train)

    predicted_train = classifier.predict(x_train)
    predicted_test = classifier.predict(x_test)

    train_accuracy = metrics.accuracy_score(y_train, predicted_train)
    test_accuracy = metrics.accuracy_score(y_test, predicted_test)
    print(f'Train accuracy: {train_accuracy * 100:.2f} %')
    print(f'Test accuracy: {test_accuracy * 100:.2f} %')
    print(metrics.classification_report(y_test, predicted_test))

    # Confusion matrix of the test split, counts shown as integers.
    confusion = metrics.confusion_matrix(y_test, predicted_test)
    sb.heatmap(confusion, cbar=True, cmap='BuGn', annot=True, fmt='d')
    plt.show()


def knn_one_case_eval(neighbours, features, x_train, y_train, x_test, y_test):
    """Train and score one kNN classifier on a chosen subset of feature columns.

    Parameters
    ----------
    neighbours : int
        Number of neighbours used for voting.
    features : list
        Column names selected from ``x_train`` / ``x_test``.
    x_train, x_test : pandas.DataFrame
        Feature tables indexed by column name.
    y_train, y_test : array-like
        True labels of the corresponding rows.

    Returns
    -------
    dict
        The feature list plus accuracy, micro-averaged precision and recall,
        and error rate (share of misclassified rows) for both splits.
    """
    x_train_selected = x_train[features]
    x_test_selected = x_test[features]

    knn = KNeighborsClassifier(n_neighbors=neighbours, metric='euclidean', algorithm='kd_tree')
    knn.fit(x_train_selected, y_train)
    y_predict_train = knn.predict(x_train_selected)
    y_predict_test = knn.predict(x_test_selected)

    # Class probabilities are deliberately not computed: the AUC metrics that
    # needed them are disabled, and predict_proba would repeat the neighbour
    # search for every feature combination. To re-enable AUC, compute
    # knn.predict_proba(...) here and pass it to
    # metrics.roc_auc_score(y, proba, multi_class='ovo', average='macro').
    return {
        'features': features,
        'train_accuracy': metrics.accuracy_score(y_train, y_predict_train),
        'train_precision': metrics.precision_score(y_train, y_predict_train, average='micro'),
        'train_recall': metrics.recall_score(y_train, y_predict_train, average='micro'),
        'train_error_rate': np.mean(y_train != y_predict_train),
        'test_accuracy': metrics.accuracy_score(y_test, y_predict_test),
        'test_precision': metrics.precision_score(y_test, y_predict_test, average='micro'),
        'test_recall': metrics.recall_score(y_test, y_predict_test, average='micro'),
        'test_error_rate': np.mean(y_test != y_predict_test)
    }

def knn_feature_combinations(neighbours, all_features, combinations, x_train, y_train, x_test, y_test):
    """Score every feature subset of a given size with kNN.

    Each subset of ``combinations`` columns drawn from ``all_features`` is
    evaluated with ``knn_one_case_eval``. The result is a DataFrame with one
    row per subset, ordered by ascending test error rate and re-indexed
    from zero.
    """
    records = [
        knn_one_case_eval(neighbours, list(subset), x_train, y_train, x_test, y_test)
        for subset in itertools.combinations(all_features, r=combinations)
    ]
    table = pd.DataFrame.from_records(records)
    ranked = table.sort_values(by='test_error_rate', ascending=True)
    return ranked.reset_index(drop=True)

In [None]:
# Hold-out and labelling settings reused by the feature-loading cells.
TRAIN_SET_RATIO = 0.8
ANOMALY_SEVERITY = 0.9
# NOTE(review): WINDOW_SIZE is not used in this cell; the call below passes window_size=None.
WINDOW_SIZE = 2**14


# Load the time-domain feature table as a train/test split through the project helper.
x_train, y_train, x_test, y_test = models.load_feature_set(
    TD_FEATURES, 
    selection.FAULT_CLASSES,
    SOURCE_AXIS,
    TARGET, 
    train_size=TRAIN_SET_RATIO, 
    anomaly_severity=ANOMALY_SEVERITY,
    balance=True,
    rpm_limit=False,
    window_size=None,
    domain='temporal'
)
# Fit the min-max scaler on the training split only and reuse it on the test split.
scaler = MinMaxScaler()
x_train[x_train.columns] = scaler.fit_transform(x_train)
x_test[x_test.columns] = scaler.transform(x_test)

#### Classification with kNN (all features)

In [None]:
knn_evaluation(x_train, y_train, x_test, y_test)

#### Classification with kNN (choose features)

In [None]:
# Hand-picked three-feature subset of the temporal features.
# Alternative candidate kept for reference: ['std', 'rms', 'pp']
features = ['std', 'skewness', 'pp']
knn_evaluation(x_train[features], y_train, x_test[features], y_test)

#### Classification with kNN (all combinations)

In [None]:
# Score every 3-feature subset with k = 5 neighbours.
# Three is the minimal subset size used here (benchmark: at least 3 PC are needed).
# Reported runtime: about 20 s for subsets of 3, about 40 s for subsets of 4.
evaluation = knn_feature_combinations(5, list(x_train.columns), 3, x_train, y_train, x_test, y_test)
evaluation

In [None]:
evaluation.describe()

---
### Spectral features

In [None]:
# Window size forwarded to the loader as window_size for the spectral features.
WINDOW_SIZE = 2**14

# Load the frequency-domain feature table as a train/test split through the project helper.
# This overwrites x_train / y_train / x_test / y_test from the temporal section.
x_train, y_train, x_test, y_test = models.load_feature_set(
    FD_FEATURES, 
    selection.FAULT_CLASSES,
    SOURCE_AXIS,
    'fault', 
    train_size=TRAIN_SET_RATIO, 
    anomaly_severity=ANOMALY_SEVERITY,
    balance=True,
    rpm_limit=False,
    window_size=WINDOW_SIZE,
    domain='spectral'
)
# Fit the min-max scaler on the training split only and reuse it on the test split.
scaler = MinMaxScaler()
x_train[x_train.columns] = scaler.fit_transform(x_train)
x_test[x_test.columns] = scaler.transform(x_test)

#### Classification with kNN (all features)

In [None]:
knn_evaluation(x_train, y_train, x_test, y_test)

#### Classification with kNN (choose features)

In [None]:
# Hand-picked three-feature subset of the spectral features.
# Alternative candidate kept for reference: ['entropy', 'std', 'noisiness']
features = ['entropy', 'noisiness', 'centroid']
knn_evaluation(x_train[features], y_train, x_test[features], y_test)

In [None]:
# Score every 3-feature subset with k = 5 neighbours.
# Three is the minimal subset size used here (benchmark: at least 3 PC are needed).
# Reported runtime: about 30 s for subsets of 3, about 60 s for subsets of 4.
evaluation = knn_feature_combinations(5, list(x_train.columns), 3, x_train, y_train, x_test, y_test)
evaluation

In [None]:
evaluation.describe()

---

In [None]:
def load_source(dataset: str, domain: str, row: dict):
    """Load a feature table and build a balanced, min-max scaled train/test split.

    Parameters
    ----------
    dataset : str
        Path of the CSV file with the extracted features.
    domain : str
        Feature domain. For 'spectral' only the columns computed with the
        2**14 window are kept. The value is also used as the name prefix
        when listing the features of ``dataset``.
    row : dict
        Scenario description with the keys
        'hardware' ('shaft' or 'bearings', selects the fault classes),
        'rpm_limit' (truthy keeps only rows with rpm within 2900 +/- 500) and
        'target' (label column name, optionally followed by '_<severity in percent>';
        the severity defaults to 60 when it is absent).

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` from a stratified 80/20 split,
        with the features scaled by a MinMaxScaler fitted on the training part.
    """
    RPM = 2900
    RPM_RANGE = 500
    faults = {
        'shaft': {
            'normal': 'N',
            'imbalance': 'I',
            'horizontal-misalignment': 'HM',
            'vertical-misalignment': 'VM'
        },
        'bearings': {
            'overhang-cage_fault': 'O-Cage',
            'underhang-cage_fault': 'U-Cage',
            'underhang-ball_fault': 'U-Ball',
            'overhang-ball_fault': 'O-Ball',
            'underhang-outer_race': 'U-Race',
            # Fixed: this key used to repeat 'overhang-ball_fault', which
            # silently replaced 'O-Ball' with 'O-Race' and dropped the
            # overhang outer-race class from the selection.
            'overhang-outer_race': 'O-Race'
        }
    }
    placements = {
        'A': ['ax', 'ay', 'az'],
        'B': ['bx', 'by', 'bz']
    }

    features = pd.read_csv(dataset).fillna(0)

    # Filter categories for classification (membership is tested against the dict keys)
    classes = faults[row['hardware']]
    features = features[features['fault'].isin(classes)]

    # Choosing rpm range
    if row['rpm_limit']:
        features = features[features['rpm'].between(RPM - RPM_RANGE, RPM + RPM_RANGE, inclusive='both')]

    # Labeling anomaly severity levels: 'name_NN' -> label column 'name', severity NN / 100
    target = re.search(r'([a-z]+)_?(\d+)?', row['target'])
    anomaly_severity = target.group(2) or '60'
    anomaly_severity = float(anomaly_severity) / 100
    features = models.fault_labeling(features, classes, anomaly_severity)

    # Keeping columns for measurement placement: A, B
    axis = placements['A'] + placements['B'] # TODO: placements[row['placement']]
    columns = features.columns.str.startswith(tuple(axis))
    X = features[features.columns[columns]]

    # Select predicted variable column
    label = target.group(1)
    Y = features[label].astype('category')

    # Filter columns in feature domain with window size 2**14
    # and drop the window-size suffix from the remaining column names
    if domain == 'spectral':
        window_size = 2**14
        X = X.loc[:,X.columns.str.endswith(f'_{window_size}')]
        X.columns = X.columns.str.extract(r'(\w+)_\w+$')[0]

    # Calculate feature magnitudes from the per-axis components:
    # one column per feature, holding the Euclidean norm over all axes in `axis`
    feature_names = get_features_list({domain: dataset})
    result = pd.DataFrame()
    for name in feature_names:
        # Remove prefix: temporal, spectral
        name = re.search(r'[a-z]+_([\w\_]+)', name).group(1)
        vector_dims = [f'{dim}_{name}' for dim in axis]
        result[name] = X[vector_dims].apply(np.linalg.norm, axis=1)
    X = result

    # Batch / Online hold-out (balance and event sequencing)
    train_size = 0.8
    # if row['online']:
    #     X_train, X_test, Y_train, Y_test = train_test_split(
    #         X, Y, train_size=train_size, random_state=10
    #     )
    #     X_train, X_test, Y_train, Y_test = (
    #         X_train.sort_index(), X_test.sort_index(),
    #         Y_train.sort_index(), Y_test.sort_index()
    #     )

    # else:
    # NOTE(review): oversampling happens before the split, so copies of the same
    # minority-class row can land in both train and test — confirm this is intended.
    oversample = RandomOverSampler(sampling_strategy='not majority', random_state=10)
    X, Y = oversample.fit_resample(X, Y.to_numpy())
    X.reset_index(drop=True, inplace=True)
    Y = pd.Series(Y)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=train_size, stratify=Y, random_state=10
    )

    # The scaler is fitted on the training part only and reused for the test part
    scaler = MinMaxScaler()
    X_train[X_train.columns] = scaler.fit_transform(X_train)
    X_test[X_test.columns] = scaler.transform(X_test)

    return X_train, X_test, Y_train, Y_test

def get_features_list(domains):
    """List the feature names found in the headers of feature CSV files.

    Parameters
    ----------
    domains : dict
        Maps a domain name (used as prefix of the returned names) to the
        path of a CSV file.

    Returns
    -------
    list of str
        ``'<domain>_<feature>'`` for every distinct feature part of a column
        named ``'<two lowercase letters>_<feature>'``, in order of first
        appearance. Columns that do not follow this pattern are skipped, and
        leading/trailing underscores of the feature part are removed.
    """
    features = []
    for dname, dataset in domains.items():
        # Only the header is needed, so no data rows are parsed.
        header = pd.read_csv(dataset, nrows=0).columns
        names = header.str.extract(r'([a-z]{2})_([a-z\_\-]+)')[1].unique()
        features.extend([f'{dname}_{col.strip("_")}' for col in names if not pd.isnull(col)])

    return features

def knn_fsel_evaluation(domain, source, neighbours=5, exhaustive=False, ranks=False):
    """Evaluate kNN for every scenario listed in the feature-selection results.

    Parameters
    ----------
    domain : str
        Feature domain name; used as the prefix of the feature columns looked
        up in the selection results and forwarded to ``load_source``.
    source : str
        Path of the feature CSV file.
    neighbours : int, default 5
        Number of neighbours for the classifier.
    exhaustive : bool, default False
        When True, every 3-feature subset is scored and the best one (lowest
        test error) is reported instead of the pre-selected features.
    ranks : bool, default False
        When True, the three features with the lowest geometric mean in
        'best_set_ranks.csv' are used; otherwise the three features most
        often marked True in 'best_set_membership.csv'.

    Returns
    -------
    pandas.DataFrame
        One row per (rpm_limit, hardware, target) group with the scenario
        keys and the metrics returned by ``knn_one_case_eval``.
    """
    columns = get_features_list({domain: source})
    if ranks:
        best_set_membership = pd.read_csv('../FeatureSelection/best_set_ranks.csv')
    else:
        best_set_membership = pd.read_csv('../FeatureSelection/best_set_membership.csv')

    # Keep only the leading lowercase word of the target (drops e.g. a severity suffix)
    best_set_membership['target'] = (
        best_set_membership['target'].str.extract(r'([a-z]+)', expand=False)
    )

    model = []
    groupby_columns = ['rpm_limit', 'hardware', 'target']
    for key, group in tqdm(best_set_membership.groupby(by=groupby_columns)):
        if ranks:
            situation = group[columns].apply(gmean, axis=0).sort_values(ascending=True).head(3)
        else:
            situation = group[columns][group == True].count(axis=0).sort_values(ascending=False).head(3)

        # Strip the domain prefix. The pattern was '[a-z+]_' (a character class
        # that also matched a literal '+'); '[a-z]+_' is what load_source uses
        # for the same purpose.
        features = [re.search(r'[a-z]+_(\w+)', s).group(1) for s in situation.index]

        result = dict(zip(groupby_columns, key))
        x_train, x_test, y_train, y_test = load_source(source, domain, result)
        if exhaustive:
            score = knn_feature_combinations(
                neighbours, list(x_train.columns), 3, x_train, y_train, x_test, y_test
            ).head(1).to_dict(orient='index')[0]
        else:
            score = knn_one_case_eval(neighbours, features, x_train, y_train, x_test, y_test)
        result.update(score)
        model.append(result)

    return pd.DataFrame.from_records(model)

def knn_neighbours_eval(domain, dataset):
    """Collect train and test error rates for odd neighbour counts 3..19.

    Runs ``knn_fsel_evaluation`` once per neighbour count and returns two
    DataFrames (train, test): rows are the (rpm_limit, hardware, target)
    scenarios, columns are the neighbour counts.
    """
    scenario_keys = ['rpm_limit', 'hardware', 'target']
    train_rates, test_rates = pd.DataFrame(), pd.DataFrame()

    for k in range(3, 21, 2):
        per_scenario = knn_fsel_evaluation(domain, dataset, neighbours=k)
        per_scenario = per_scenario.drop(columns=['features']).set_index(scenario_keys)
        train_rates[k] = per_scenario['train_error_rate']
        test_rates[k] = per_scenario['test_error_rate']

    return train_rates, test_rates

Chosen features

In [None]:
knn_fsel_evaluation('temporal', TD_FEATURES)

In [None]:
knn_fsel_evaluation('temporal', TD_FEATURES, exhaustive=False, ranks=True)

In [None]:
knn_fsel_evaluation('spectral', FD_FEATURES)

In [None]:
knn_fsel_evaluation('spectral', FD_FEATURES, exhaustive=False, ranks=True)

In [None]:
train, test = knn_neighbours_eval('temporal', TD_FEATURES)

In [None]:
train

In [None]:
test

In [None]:
# Error rate versus number of neighbours for the temporal features: one figure
# per split, one line per row of the table. Titles tell the two figures apart.
for split_name, rates in (('train', train), ('test', test)):
    rates.T.plot(
        marker='.', grid=True, figsize=(10, 5),
        xlabel='Neighbours', ylabel='Error rate',
        title=f'kNN {split_name} error rate (temporal features)'
    )
    plt.show()

In [None]:
knn_fsel_evaluation('temporal', TD_FEATURES, exhaustive=False, ranks=True)

In [None]:
train, test = knn_neighbours_eval('spectral', FD_FEATURES)

In [None]:
train

In [None]:
test

In [None]:
# Error rate versus number of neighbours for the spectral features: one figure
# per split, one line per row of the table. Titles tell the two figures apart.
for split_name, rates in (('train', train), ('test', test)):
    rates.T.plot(
        marker='.', grid=True, figsize=(10, 5),
        xlabel='Neighbours', ylabel='Error rate',
        title=f'kNN {split_name} error rate (spectral features)'
    )
    plt.show()

In [None]:
knn_fsel_evaluation('spectral', FD_FEATURES, exhaustive=False, ranks=True)

Exhaustive

In [None]:
knn_fsel_evaluation('temporal', TD_FEATURES, exhaustive=True)

In [None]:
knn_fsel_evaluation('temporal', TD_FEATURES, exhaustive=True, ranks=True)

In [None]:
knn_fsel_evaluation('spectral', FD_FEATURES, exhaustive=True)

In [None]:
knn_fsel_evaluation('spectral', FD_FEATURES, exhaustive=True, ranks=True)

---
#### Find best k neighbors - elbow analysis

In [None]:
# NOTE(review): get_knn_accuracies, get_knn_error_rates and plot_knn_k_param are
# neither defined nor imported in the visible part of this notebook, and the
# upper-case X_train / X_test are not assigned at top level above (the loading
# cells create x_train / x_test). On a fresh kernel this cell presumably fails
# with NameError — confirm where these names are supposed to come from.
# Candidate odd neighbour counts 3, 5, ..., 39.
k_values = list(range(3, 40, 2))
accuracies = get_knn_accuracies(k_values, X_train, y_train, X_test, y_test)
error_rates = get_knn_error_rates(k_values, X_train, y_train, X_test, y_test)
plot_knn_k_param(k_values, accuracies, error_rates)

### Change number of features
- evaluate kNN classification accuracies in each feature domain
  - features for each axis separately (x, y, z)
  - features at each measurement point (a, b)

In [None]:
# Settings for the experiments defined below.
# NOTE(review): same value as TRAIN_SET_RATIO from an earlier cell, under a second name.
TRAINING_SET_RATIO = 0.8
# Distance selector read by kfold_validate_knn: 'euclidean', 'mahalanobis' or 'rbf'.
DIST_METRIC = 'euclidean'
# Scoring function handed to models.pipeline_v1_core for feature selection.
FEATURE_SELECTION_METHOD = mutual_info_classif
# Odd neighbour counts 3, 5, ..., 19 scanned when locating the elbow.
KNN_K_VALUES = list(range(3, 20, 2))
# Window size forwarded to models.load_feature_set as window_size.
WINDOW_SIZE = 2**14

def evaluate_knn_number_of_features(features_filename, domain):
    """Find the elbow of the kNN error curve for each number of selected features.

    The feature set is loaded once. For every feature count from 1 up to the
    number of available columns, the selection pipeline is run, the error
    rates over KNN_K_VALUES are computed, and the elbow of that curve is
    located. Returns a DataFrame with the columns 'n_features',
    'k_neighbors' (elbow position) and 'error_rate' (curve value at the elbow).
    """
    full_train_x, full_train_y, full_test_x, full_test_y = models.load_feature_set(
        features_filename,
        selection.FAULT_CLASSES,
        SOURCE_AXIS,
        'fault',
        train_size=TRAINING_SET_RATIO,
        anomaly_severity=ANOMALY_SEVERITY,
        balance=True,
        rpm_limit=False,
        window_size=WINDOW_SIZE,
        domain=domain
    )

    total_features = len(full_train_x.columns)
    rows = []
    for count in tqdm(range(1, total_features + 1)):
        # Keep the `count` best features according to FEATURE_SELECTION_METHOD.
        sel_train_x, sel_test_x, sel_train_y, sel_test_y = models.pipeline_v1_core(
            FEATURE_SELECTION_METHOD, count,
            full_train_x, full_train_y, full_test_x, full_test_y
        )

        error_curve = get_knn_error_rates(
            KNN_K_VALUES, sel_train_x, sel_train_y, sel_test_x, sel_test_y
        )
        locator = KneeLocator(
            KNN_K_VALUES, error_curve, S=1.0, curve='convex', direction='decreasing'
        )
        rows.append((count, locator.elbow, locator.knee_y))

    return pd.DataFrame(rows, columns=['n_features', 'k_neighbors', 'error_rate'])


def find_optimal_error_rate(success):
    """Plot error rate against feature count and return the row(s) at the elbow.

    ``success`` needs the columns 'n_features' and 'error_rate'. The curve is
    drawn on the current matplotlib axes, and the rows whose 'n_features'
    equals the detected elbow are returned.
    """
    feature_counts = success['n_features']
    error_rates = success['error_rate']

    plt.plot(feature_counts, error_rates)
    plt.grid()
    plt.xlabel('Number of features')
    plt.ylabel('Error rate')

    locator = KneeLocator(
        feature_counts, error_rates,
        S=1.0, curve='convex', direction='decreasing'
    )
    return success[feature_counts == locator.elbow]


def kfold_validate_knn(features_filename):
    """Cross-validate a kNN classifier with 5 stratified folds and print the mean scores.

    Parameters
    ----------
    features_filename : str
        Path of the feature CSV file. Columns starting with 'ax', 'ay' or
        'az' are the inputs; the 'fault' column is the label.

    Raises
    ------
    ValueError
        If DIST_METRIC is not 'euclidean', 'mahalanobis' or 'rbf'.

    Notes
    -----
    Reads the module-level settings DIST_METRIC, FSEL_METHOD, N_FEATURES and
    N_NEIGHBOURS.
    NOTE(review): FSEL_METHOD, N_FEATURES and N_NEIGHBOURS are not defined in
    the visible part of the notebook — confirm they are set before the call.
    """
    # The path argument is honoured now; FD_FEATURES used to be hard-coded here.
    features = pd.read_csv(features_filename)
    features = models.fault_labeling(features, selection.FAULT_CLASSES)
    columns = features.columns.str.startswith(('ax', 'ay', 'az'))
    X = features[features.columns[columns]]
    y = features['fault'].astype('category')

    # crossvalid = StratifiedShuffleSplit(n_splits=5, test_size=0.6, random_state=100)
    crossvalid = StratifiedKFold(n_splits=5)
    evaluation = []

    for train_index, test_index in tqdm(crossvalid.split(X, y)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        X_train, X_test, y_train, y_test = models.pipeline_v1_core(
            FSEL_METHOD, N_FEATURES,
            X_train, y_train, X_test, y_test
        )

        if DIST_METRIC == 'euclidean':
            d = 'euclidean'
        elif DIST_METRIC == 'mahalanobis':
            # scipy's mahalanobis expects the INVERSE covariance matrix. It is
            # computed once per fold (pseudo-inverse, so a singular covariance
            # does not fail) instead of once per pair of points.
            inv_cov = np.linalg.pinv(
                np.atleast_2d(np.cov(np.asarray(X_train, dtype=float).T))
            )
            d = lambda u, v, inv_cov=inv_cov: mahalanobis(u, v, inv_cov)
        elif DIST_METRIC == 'rbf':
            # exp(-||u - v||^2 / n) is a similarity (1 for identical points),
            # so it cannot serve as a distance directly. Use the distance the
            # kernel induces: sqrt(k(u,u) + k(v,v) - 2 k(u,v)) = sqrt(2 - 2 k(u,v)).
            d = lambda u, v: np.sqrt(
                2.0 * (1.0 - np.exp(-euclidean(u, v)**2 * (1 / len(u))))
            )
        else:
            raise ValueError(f'Unsupported DIST_METRIC: {DIST_METRIC!r}')

        knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=d)
        knn.fit(X_train, y_train)
        y_predict_train = knn.predict(X_train)
        y_predict_test = knn.predict(X_test)

        evaluation.append({
            'train_accuracy': metrics.accuracy_score(y_train, y_predict_train),
            'train_precision': metrics.precision_score(y_train, y_predict_train, average='micro'),
            'train_recall': metrics.recall_score(y_train, y_predict_train, average='micro'),
            'test_accuracy': metrics.accuracy_score(y_test, y_predict_test),
            'test_precision': metrics.precision_score(y_test, y_predict_test, average='micro'),
            'test_recall': metrics.recall_score(y_test, y_predict_test, average='micro')
        })

    # Columns kept by the selection step in the last fold
    print(f'Most informative features: {list(X_train.columns)}')
    evaluation = pd.DataFrame.from_records(evaluation)
    print(evaluation.mean())