In [None]:
import itertools
import os
import re
import sys
from typing import Dict, List, Tuple
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from tabulate import tabulate
from tqdm.notebook import tqdm

sys.path.append('../')
from vibrodiagnostics import (
    mafaulda,
    extraction
)

In [None]:
# Default number of neighbours (k) used by the kNN classifiers in this notebook.
N_NEIGHBORS = 5

Parameters:
- Distance metric
- Number of neighbours k (odd values, because of majority voting), chosen with an elbow curve

In [None]:
def knn_one_case_eval(
        k: int,
        features: List[str],
        x_train: pd.DataFrame,
        y_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_test: pd.DataFrame) -> Dict[str, float]:
    """Fit a kNN classifier on a subset of columns and score it on both splits.

    Args:
        k: Number of neighbours for the classifier.
        features: Column names selected from ``x_train`` and ``x_test``.
        x_train: Training feature table.
        y_train: Training labels.
        x_test: Test feature table.
        y_test: Test labels.

    Returns:
        A dict with the evaluated ``features`` list plus accuracy,
        micro-averaged precision and recall, and error rate, each prefixed
        with ``train_`` or ``test_``.
    """
    x_train_selected = x_train[features]
    x_test_selected = x_test[features]

    knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean', algorithm='kd_tree')
    knn.fit(x_train_selected, y_train)
    # Only hard predictions are scored, so class probabilities are not computed
    # (this function is called once per feature combination).
    y_predict_train = knn.predict(x_train_selected)
    y_predict_test = knn.predict(x_test_selected)

    return {
        'features': features,
        'train_accuracy': metrics.accuracy_score(y_train, y_predict_train),
        'train_precision': metrics.precision_score(y_train, y_predict_train, average='micro'),
        'train_recall': metrics.recall_score(y_train, y_predict_train, average='micro'),
        'train_error_rate': np.mean(y_train != y_predict_train),
        'test_accuracy': metrics.accuracy_score(y_test, y_predict_test),
        'test_precision': metrics.precision_score(y_test, y_predict_test, average='micro'),
        'test_recall': metrics.recall_score(y_test, y_predict_test, average='micro'),
        'test_error_rate': np.mean(y_test != y_predict_test)
    }


def knn_feature_combinations(k, all_features, combinations, x_train, y_train, x_test, y_test):
    """Evaluate kNN on every feature subset of a given size.

    Each subset of ``combinations`` features drawn from ``all_features`` is
    scored with ``knn_one_case_eval``. The scores are returned as a DataFrame
    sorted by descending ``train_accuracy`` with a fresh 0-based index.
    """
    subsets = itertools.combinations(all_features, r=combinations)
    records = [
        knn_one_case_eval(k, list(subset), x_train, y_train, x_test, y_test)
        for subset in tqdm(subsets)
    ]

    table = pd.DataFrame.from_records(records)
    ranked = table.sort_values(by='train_accuracy', ascending=False)
    return ranked.reset_index(drop=True)


def find_knn_best(conditions) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Search all 3-feature kNN models for every condition and domain.

    For each condition dict in ``conditions`` and each domain ('TD', 'FD'),
    the data is loaded with ``mafaulda.load_source`` and every 3-feature
    combination is evaluated with ``N_NEIGHBORS`` neighbours.

    Args:
        conditions: Iterable of dicts passed to ``mafaulda.load_source``;
            their items are also copied into the output rows.

    Returns:
        A 4-tuple of DataFrames:
        1. every evaluated combination, tagged with its condition and domain,
        2. the top combination (highest train accuracy) per condition/domain,
        3. ``describe()`` statistics of train accuracy per condition/domain,
        4. ``describe()`` statistics of test accuracy per condition/domain.
    """
    best = []
    train_range = []
    test_range = []
    frames = []
    domains = ('TD', 'FD')

    for row in tqdm(conditions):
        for domain_label in domains:
            x_train, x_test, y_train, y_test = mafaulda.load_source(domain_label, row)
            result = knn_feature_combinations(
                N_NEIGHBORS, list(x_train.columns), 3, x_train, y_train, x_test, y_test)

            # `result` is sorted by train accuracy, so the first row is the best model.
            top_result = result.head(1).to_dict(orient='index')[0]
            # Summarise once, before the condition columns are added below.
            summary = result.describe()
            train_accuracies = summary['train_accuracy'].to_dict()
            test_accuracies = summary['test_accuracy'].to_dict()

            for k, v in row.items():
                result[k] = v
            result['domain'] = domain_label
            frames.append(result)

            best.append({**row, 'domain': domain_label, **top_result})
            train_range.append({**row, 'domain': domain_label, **train_accuracies})
            test_range.append({**row, 'domain': domain_label, **test_accuracies})

    # Concatenate once instead of growing a frame inside the loop;
    # pd.concat rejects an empty list, so fall back to an empty frame.
    all_results = pd.concat(frames) if frames else pd.DataFrame()

    return (
        all_results,
        pd.DataFrame.from_records(best),
        pd.DataFrame.from_records(train_range),
        pd.DataFrame.from_records(test_range)
    )

In [None]:
def knn_fsel_evaluation(domain, neighbours=N_NEIGHBORS, filename='best_set/rank_product.csv'):
    """Evaluate kNN on the three features most often flagged in a best-set CSV.

    The CSV at ``filename`` is read, the columns whose name starts with
    ``domain`` are kept, and the three with the most ``True`` entries are
    chosen. Their ``<domain>_`` prefix is removed to obtain feature names,
    which are then evaluated on the placement 'A', ``online=False`` data
    returned by ``mafaulda.load_source``.

    Args:
        domain: Domain label used both as column prefix and data source key.
        neighbours: Number of neighbours for the classifier.
        filename: Path of the CSV with membership flags.

    Returns:
        A one-row DataFrame with the scores from ``knn_one_case_eval`` plus
        ``features``, ``domain`` and ``placement`` columns.
    """
    best_set_membership = pd.read_csv(filename)
    columns = [col for col in best_set_membership if col.startswith(domain)]

    # Number of True flags per column; keep the three most frequent.
    situation = (
        best_set_membership[columns]
        .eq(True)
        .sum(axis=0)
        .sort_values(ascending=False)
        .head(3)
    )
    # Remove the exact '<domain>_' prefix. str.lstrip() would instead strip any
    # leading run of those characters and could eat into the feature name.
    prefix = domain + '_'
    columns = [c[len(prefix):] if c.startswith(prefix) else c for c in situation.index]
    x_train, x_test, y_train, y_test = mafaulda.load_source(domain, {'placement': 'A', 'online': False})
    score = knn_one_case_eval(neighbours, columns, x_train, y_train, x_test, y_test)
    score['features'] = columns
    score['domain'] = domain
    score['placement'] = 'A'
    model = [score]

    return pd.DataFrame.from_records(model)

def knn_neighbours_eval(domain):
    """Collect train and test error rates for odd neighbour counts 3 to 19.

    For each neighbour count, ``knn_fsel_evaluation`` is run for ``domain``
    and its error rates are stored in a column named after that count, with
    rows indexed by ``placement``.

    Returns:
        ``(train_rates, test_rates)`` DataFrames.
    """
    train_rates, test_rates = pd.DataFrame(), pd.DataFrame()
    candidate_counts = range(3, 21, 2)

    for count in candidate_counts:
        evaluation = knn_fsel_evaluation(domain, neighbours=count)
        by_placement = evaluation.drop(columns=['features']).set_index(['placement'])
        train_rates[count] = by_placement['train_error_rate']
        test_rates[count] = by_placement['test_error_rate']

    return train_rates, test_rates


def knn_evaluation(x_train, y_train, x_test, y_test, n=5):
    """Fit kNN on all given features, print scores and plot a confusion matrix.

    Args:
        x_train: Training feature table.
        y_train: Training labels.
        x_test: Test feature table.
        y_test: Test labels.
        n: Number of neighbours for the classifier.

    Prints train/test accuracy and the test classification report, then shows
    a heatmap of the test confusion matrix. Returns nothing.
    """
    knn = KNeighborsClassifier(n_neighbors=n, metric='euclidean', algorithm='kd_tree')
    knn.fit(x_train, y_train)
    y_predict_train = knn.predict(x_train)
    y_predict_test = knn.predict(x_test)

    print(f'Train accuracy: {metrics.accuracy_score(y_train, y_predict_train) * 100:.2f} %')
    print(f'Test accuracy: {metrics.accuracy_score(y_test, y_predict_test) * 100:.2f} %')
    print(metrics.classification_report(y_test, y_predict_test))

    # Use the union of true and predicted labels and pass it explicitly, so the
    # matrix shape always matches the axis labels even when the model predicts
    # a class that does not occur in y_test.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_predict_test)]))
    cm = metrics.confusion_matrix(y_test, y_predict_test, labels=labels)
    cm = pd.DataFrame(cm, index=labels, columns=labels)

    ax = sb.heatmap(cm, cbar=True, cmap='BuGn', annot=True, fmt='d')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.show()

#### Classification with kNN (all features)

In [None]:
# Time domain ('TD') features to fault: placement 'A', online=False, all features.
x_train, x_test, y_train, y_test = mafaulda.load_source('TD', {'placement': 'A', 'online': False})
knn_evaluation(x_train, y_train, x_test, y_test)

In [None]:
# 'FD' domain features: placement 'A', online=False, all features.
x_train, x_test, y_train, y_test = mafaulda.load_source('FD', {'placement': 'A', 'online': False})
knn_evaluation(x_train, y_train, x_test, y_test)

Placement B

In [None]:
# 'TD' domain features: placement 'B', online=False, all features.
x_train, x_test, y_train, y_test = mafaulda.load_source('TD', {'placement': 'B', 'online': False})
knn_evaluation(x_train, y_train, x_test, y_test)

In [None]:
# 'FD' domain features: placement 'B', online=False, all features.
x_train, x_test, y_train, y_test = mafaulda.load_source('FD', {'placement': 'B', 'online': False})
knn_evaluation(x_train, y_train, x_test, y_test)

Performance - Global best and PCA

In [None]:
def knn_all_features_accuracy(conditions, pca=None) -> pd.DataFrame:
    """Score kNN on all features for every condition and domain.

    For each condition dict and each domain ('TD', 'FD') the data is loaded
    with ``mafaulda.load_source``. When ``pca`` is given, a PCA with that many
    components is fitted on the training split and applied to both splits
    before classification with ``N_NEIGHBORS`` neighbours.

    Returns:
        One row per (condition, domain) holding the condition items, the
        domain, and train/test accuracy, micro precision, micro recall and
        error rate.
    """
    records = []

    for condition in tqdm(conditions):
        for domain_label in ('TD', 'FD'):
            x_train, x_test, y_train, y_test = mafaulda.load_source(domain_label, condition)

            if pca is not None:
                # Fit the projection on the training split only, then reuse it.
                projection = PCA(n_components=pca).fit(x_train)
                x_train = pd.DataFrame(projection.transform(x_train))
                x_test = pd.DataFrame(projection.transform(x_test))

            classifier = KNeighborsClassifier(
                n_neighbors=N_NEIGHBORS, metric='euclidean', algorithm='kd_tree')
            classifier.fit(x_train, y_train)
            train_pred = classifier.predict(x_train)
            test_pred = classifier.predict(x_test)

            scores = {
                'train_accuracy': metrics.accuracy_score(y_train, train_pred),
                'train_precision': metrics.precision_score(y_train, train_pred, average='micro'),
                'train_recall': metrics.recall_score(y_train, train_pred, average='micro'),
                'train_error_rate': np.mean(y_train != train_pred),
                'test_accuracy': metrics.accuracy_score(y_test, test_pred),
                'test_precision': metrics.precision_score(y_test, test_pred, average='micro'),
                'test_recall': metrics.recall_score(y_test, test_pred, average='micro'),
                'test_error_rate': np.mean(y_test != test_pred)
            }

            record = dict(condition)
            record['domain'] = domain_label
            record.update(scores)
            records.append(record)

    return pd.DataFrame.from_records(records)


# Every combination of placement ('A', 'B') and online flag (False, True).
column_names = ['placement', 'online']
initial_conditions = [
    dict(zip(column_names, row)) 
    for row in itertools.product(['A', 'B'], [False, True])
]
# kNN on all features, without PCA.
all_features_accuracies = knn_all_features_accuracy(initial_conditions, pca=None)
all_features_accuracies

In [None]:
# Every combination of placement ('A', 'B') and online flag (False, True).
column_names = ['placement', 'online']
initial_conditions = [
    dict(zip(column_names, row)) 
    for row in itertools.product(['A', 'B'], [False, True])
]
# kNN on the first 3 principal components of all features.
pca_all_features_accuracies = knn_all_features_accuracy(initial_conditions, pca=3)
pca_all_features_accuracies


Experiments and best performing feature combinations
- Evaluate every 3-member subset of the features and keep the one with the best train accuracy

In [None]:
# Every combination of placement ('A', 'B') and online flag (False, True).
column_names = ['placement', 'online']
initial_conditions = [
    dict(zip(column_names, row)) 
    for row in itertools.product(['A', 'B'], [False, True])
]

# Exhaustive search over 3-feature subsets; returns all results, the best
# model per case, and train/test accuracy statistics.
all_permut_results, best, train_range, test_range = find_knn_best(initial_conditions)

In [None]:
# Persist the search summaries as CSV files (row index is not written).
best.to_csv('best_set/best_knn.csv', index=False)
train_range.to_csv('best_set/knn_train_accuracy_range.csv', index=False)
test_range.to_csv('best_set/knn_test_accuracy_range.csv', index=False)

In [None]:
# Scores of every evaluated feature combination, tagged with condition and domain.
all_permut_results

In [None]:
# Plain model range
def plot_all_models_performance_permut(experiments, metric):
    """Draw one box per (placement, domain) group for the given metric column.

    ``experiments`` is grouped by ``placement`` and ``domain``; the values of
    ``metric`` in each group form one box. Boxes are ordered by placement
    ascending and, within a placement, by domain descending.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 5), sharey=True)
    ax.grid()

    groups = [
        (key[0], key[1], frame[metric].to_list())
        for key, frame in experiments.groupby(by=['placement', 'domain'])
    ]

    # Two stable passes: domain descending first, then placement ascending.
    groups.sort(key=lambda item: item[1], reverse=True)
    groups.sort(key=lambda item: item[0])
    boxplot_data = {f'{item[0]}, {item[1]}': item[2] for item in groups}

    ax.boxplot(
        boxplot_data.values(),
        labels=boxplot_data.keys(),
        medianprops={'linewidth': 2, 'color': 'black'})
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Placement and domain')


# Spread of train accuracy, then test accuracy, over all evaluated feature combinations.
plot_all_models_performance_permut(all_permut_results, 'train_accuracy')
plt.show()
plot_all_models_performance_permut(all_permut_results, 'test_accuracy')
plt.show()

Best combination of features

In [None]:
# Reload the saved best models and show the key columns ordered by placement.
best = pd.read_csv('best_set/best_knn.csv')
best[['placement', 'domain', 'features', 'train_accuracy', 'test_accuracy']].sort_values(by=['placement'])

### Rank product chosen features

In [None]:
# kNN scores for the top-3 features per domain taken from the rank product CSV.
filename = 'best_set/rank_product.csv'
frames = []
for domain in ('TD', 'FD'):
    frames.append(knn_fsel_evaluation(domain, filename=filename))

chosen_rankproduct = pd.concat(frames).sort_values(by=['placement'])
chosen_rankproduct

#### Correlation chosen features

In [None]:
# kNN scores for the top-3 features per domain taken from the correlation CSV.
filename = 'best_set/corr.csv'
frames = []
for domain in ('TD', 'FD'):
    frames.append(knn_fsel_evaluation(domain, filename=filename))

chosen_corr = pd.concat(frames).sort_values(by=['placement'])
chosen_corr

#### F statistic chosen features

In [None]:
# kNN scores for the top-3 features per domain taken from the F statistic CSV.
filename = 'best_set/fstat.csv'
frames = []
for domain in ('TD', 'FD'):
    frames.append(knn_fsel_evaluation(domain, filename=filename))

chosen_fstat = pd.concat(frames).sort_values(by=['placement'])
chosen_fstat

#### Mutual information chosen features

In [None]:
# kNN scores for the top-3 features per domain taken from the mutual information CSV.
filename = 'best_set/mi.csv'
frames = []
for domain in ('TD', 'FD'):
    frames.append(knn_fsel_evaluation(domain, filename=filename))

chosen_mi = pd.concat(frames).sort_values(by=['placement'])
chosen_mi

### Number of neighbors

In [None]:
# Train/test error rates for odd neighbour counts 3..19 with 'TD' features.
train, test = knn_neighbours_eval('TD')

In [None]:
# Train error rates: one column per neighbour count, indexed by placement.
train

In [None]:
# Test error rates: one column per neighbour count, indexed by placement.
test

In [None]:
# Transpose so neighbour counts run along the x axis; train plot first, then test.
train.T.plot(marker='.', grid=True, figsize=(10, 5), xlabel='Neighbours', ylabel='Error rate')
plt.show()
test.T.plot(marker='.', grid=True, figsize=(10, 5), xlabel='Neighbours', ylabel='Error rate')
plt.show()

In [None]:
# Train/test error rates for odd neighbour counts 3..19 with 'FD' features.
train, test = knn_neighbours_eval('FD')

In [None]:
# Train error rates: one column per neighbour count, indexed by placement.
train

In [None]:
# Test error rates: one column per neighbour count, indexed by placement.
test

In [None]:
# Transpose so neighbour counts run along the x axis; train plot first, then test.
train.T.plot(marker='.', grid=True, figsize=(10, 5), xlabel='Neighbours', ylabel='Error rate')
plt.show()
test.T.plot(marker='.', grid=True, figsize=(10, 5), xlabel='Neighbours', ylabel='Error rate')
plt.show()