In [None]:
# 1) Label: machine
# 2) Label: machine, placement
# 3) Hold-out validations
# - Parameters: k-neighbors, number of features (how to choose them)

import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn import metrics

from sklearn.decomposition import PCA
import seaborn as sb

import sys
sys.path.append('../')
from vibrodiagnostics import (
    ranking,
    nn,
    discovery,
    models
)

In [None]:
# Locations of the precomputed feature tables (one CSV per signal domain)
FEATURES_PATH = '../../datasets/features_data/'
TEMPORAL_NAME = 'industry_temporal.csv'
SPECTRAL_NAME = 'industry_spectral.csv'

TD_FEATURES, FD_FEATURES = (
    os.path.join(FEATURES_PATH, csv_name)
    for csv_name in (TEMPORAL_NAME, SPECTRAL_NAME)
)

# Domain label -> path of its feature CSV
domains = dict(temporal=TD_FEATURES, spectral=FD_FEATURES)
N_NEIGHBORS = 5

# Class label for every (device, position) pair; 6 distinct classes in total
LABELS = {
    'KSB-1': {'MTR001': 'M1', 'MTR002': 'M1', 'PMP003': 'P1', 'PMP004': 'P1'},
    'KSB-7': {'MTR001': 'M2', 'MTR002': 'M2', 'PMP003': 'P2', 'PMP004': 'P2'},
    'K3': {'001': 'C1', '002': 'C1'},
    'K5': {'001': 'C2', '002': 'C2'},
}


def load_source(dataset: str, domain: str, train_size: float = 0.8, kfold: int = 5):
    """Yield min-max scaled train/test folds of per-feature vector magnitudes.

    The feature CSV is read and every row is labelled through ``LABELS`` using
    its ``device`` and ``position`` columns. Rows containing any missing value
    (including rows whose device/position has no label) are dropped. Columns
    whose name starts with ``x``, ``y`` or ``z`` are treated as per-axis
    features and each ``x_<name>``/``y_<name>``/``z_<name>`` triple is
    collapsed into a single ``<name>`` column holding its Euclidean norm.

    Rows are split with a shuffled ``KFold`` using a fixed ``random_state``,
    so repeated calls on the same file produce the same folds.

    Args:
        dataset: Path of the feature CSV.
        domain: ``'spectral'`` keeps only columns ending in ``_16384``
            (window size 2**14) and strips that suffix; any other value uses
            the axis columns unchanged.
        train_size: Unused; kept so existing callers keep working.
        kfold: Number of folds.

    Yields:
        ``(x_train, x_test, y_train, y_test)`` for each fold. The scaler is
        fit on the training fold only and then applied to the test fold.
    """
    features = pd.read_csv(dataset)

    axis_prefixes = ('x', 'y', 'z')

    # Unknown devices are handled like unknown positions: the row gets no
    # target and is removed by dropna() instead of raising KeyError.
    features['target'] = features.apply(
        lambda row: LABELS.get(row['device'], {}).get(row['position']),
        axis=1
    )
    features = features.dropna().reset_index(drop=True)

    is_axis_column = features.columns.str.startswith(axis_prefixes)
    x = features[features.columns[is_axis_column]]
    y = features['target'].astype('category')

    if domain == 'spectral':
        window_size = 2**14
        x = x.loc[:, x.columns.str.endswith(f'_{window_size}')]
        # Strip the trailing "_<window>" so names match the "<axis>_<name>" pattern
        x.columns = x.columns.str.extract(r'(\w+)_\w+$')[0]

    # Feature names are taken from all CSV columns; columns that do not match
    # the "<letter>_<name>" pattern come back as NaN and are skipped.
    feature_names = features.columns.str.extract(r'([a-z]{1})_([a-z\_\-]+)')[1].unique()
    magnitudes = {}
    for name in feature_names:
        if pd.isnull(name):
            continue
        name = name.rstrip('_')
        vector_dims = [f'{dim}_{name}' for dim in axis_prefixes]
        magnitudes[name] = x[vector_dims].apply(np.linalg.norm, axis=1)
    x = pd.DataFrame(magnitudes)

    kf = KFold(n_splits=kfold, shuffle=True, random_state=10)

    for train, test in kf.split(x):
        # KFold returns positional indices, hence iloc
        x_train, x_test = x.iloc[train].copy(), x.iloc[test].copy()
        y_train, y_test = y.iloc[train].copy(), y.iloc[test].copy()

        scaler = MinMaxScaler()
        x_train[x_train.columns] = scaler.fit_transform(x_train)
        x_test[x_test.columns] = scaler.transform(x_test)

        yield x_train, x_test, y_train, y_test

In [None]:
# Preview the scaled training features of the first spectral cross-validation fold
x_train, x_test, y_train, y_test = next(load_source(FD_FEATURES, 'spectral'))
x_train

In [None]:
# Class labels belonging to the training fold loaded in the previous cell
y_train

### Classification with kNN - all features (k-fold)
- 7-fold cross-validation
- find best k

In [None]:
def all_features(domain, domain_name, kfold=7):
    """Plot mean cross-validated kNN accuracy on all features for odd k in 1..15.

    For every k the classifier is evaluated on each fold through
    ``nn.knn_one_case_eval`` and the train/test accuracies are averaged over
    the folds. The averages for k = 5 are also printed.

    Args:
        domain: Path of the feature CSV, forwarded to ``load_source``.
        domain_name: Domain label forwarded to ``load_source`` and used as the
            figure title.
        kfold: Number of cross-validation folds.
    """
    kvalues = list(range(1, 16, 2))

    # load_source seeds its KFold, so the folds are identical for every k:
    # build them once instead of re-reading the CSV per k.
    # NOTE(review): assumes nn.knn_one_case_eval does not modify the frames it is given.
    folds = list(load_source(domain, domain_name, kfold=kfold))

    train_scores = []
    test_scores = []
    for k in kvalues:
        scores = [
            nn.knn_one_case_eval(k, x_train.columns, x_train, y_train, x_test, y_test)
            for x_train, x_test, y_train, y_test in folds
        ]
        r = pd.DataFrame.from_records(scores).drop(columns=['features']).mean()
        train_scores.append(r['train_accuracy'])
        test_scores.append(r['test_accuracy'])

    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    ax.plot(kvalues, train_scores, marker='x', color='darkblue', label='train')
    ax.plot(kvalues, test_scores, marker='x', color='green', label='test')
    ax.set_title(domain_name)
    ax.set_ylabel(f'accuracy ({kfold}-fold mean)')
    ax.set_xlabel('k-neighbors')

    # kvalues[2] == 5: report the accuracies at k = 5
    print(train_scores[2])
    print(test_scores[2])

    ax.grid()
    ax.legend()
    ax.set_xticks(kvalues)
    plt.show()


# Sweep k on all features, once per feature domain
all_features(TD_FEATURES, 'temporal')
all_features(FD_FEATURES, 'spectral')

### All models exhaustive

In [None]:
def find_knn_best() -> tuple:
    """Evaluate 3-feature kNN models for k in (3, 5, 7) on both feature domains.

    For every (k, domain) pair the first cross-validation fold from
    ``load_source`` is passed to ``nn.knn_feature_combinations``.

    Returns:
        A 4-tuple of DataFrames:
        ``all_results`` - every returned row, tagged with ``k`` and ``domain``;
        ``best`` - the first row of each result, one row per (k, domain);
        ``train_range`` - ``describe()`` statistics of ``train_accuracy``;
        ``test_range`` - ``describe()`` statistics of ``test_accuracy``.
    """
    kvalues = list(range(3, 9, 2))
    frames = []
    best = []
    train_range = []
    test_range = []
    for k in kvalues:
        for domain_label, dataset in domains.items():
            x_train, x_test, y_train, y_test = next(load_source(dataset, domain_label))
            # TODO: find best number of features (3)
            result = nn.knn_feature_combinations(k, list(x_train.columns), 3, x_train, y_train, x_test, y_test)

            case = {'k': k, 'domain': domain_label}
            top_result = result.head(1).to_dict(orient='index')[0]
            summary = result.describe()

            best.append({**case, **top_result})
            train_range.append({**case, **summary['train_accuracy'].to_dict()})
            test_range.append({**case, **summary['test_accuracy'].to_dict()})

            # Tag a copy; the frame returned by the helper is left untouched
            frames.append(result.assign(k=k, domain=domain_label))

    # Concatenate once instead of growing a DataFrame inside the loop
    all_results = pd.concat(frames) if frames else pd.DataFrame()
    return (
        all_results,
        pd.DataFrame.from_records(best),
        pd.DataFrame.from_records(train_range),
        pd.DataFrame.from_records(test_range)
    )

# Run the search once; the four frames feed the tables and plots below
all_permut_results, best, train_range, test_range = find_knn_best()

In [None]:
# First result row of each (k, domain) search
best

In [None]:

def all_permut_models_plot(results, metric):
    """Draw one box-plot panel per domain showing the spread of `metric` for each k.

    Args:
        results: Frame with `domain` and `k` columns plus the `metric` column.
        metric: Name of the column whose values are box-plotted.
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
    for column, (domain_name, domain_rows) in enumerate(results.groupby(by='domain')):
        panel = axes[column]
        # Grouping by a one-element list is expected to give 1-tuple keys, hence key[0]
        values_per_k = {
            key[0]: rows[metric].to_list()
            for key, rows in domain_rows.groupby(by=['k'])
        }

        panel.boxplot(
            values_per_k.values(),
            labels=values_per_k.keys(),
            medianprops={'linewidth': 2, 'color': 'black'},
        )
        axes[0].set_ylabel('accuracy')
        panel.set_xlabel('k-neighbors')
        panel.set_ylim(0.8, 1.01)
        panel.set_title(domain_name)
        panel.grid()
    plt.show()


# Accuracy spread over all evaluated feature combinations: training first, then test
all_permut_models_plot(all_permut_results, 'train_accuracy')
all_permut_models_plot(all_permut_results, 'test_accuracy')


### Best performing feature combinations

In [None]:
# TODO: rework MaFaulDa to use k-fold cross-validation
# TODO: pairplot of feature pairs, coloured by class

### Scatter plots of principal components

In [None]:
def project_classes(X, Y):
    """Scatter the first two principal components of X, one colour per category of Y.

    Args:
        X: Feature frame; PCA with two components is fit on it.
        Y: Categorical series of class labels, row-aligned with X by position.
    """
    features = X.reset_index(drop=True)
    labels = Y.reset_index(drop=True)

    pca = PCA(n_components=2)
    projected = pd.DataFrame(pca.fit_transform(features))

    class_names = labels.cat.categories
    palette = sb.color_palette('hls', len(class_names))
    fig, ax = plt.subplots(1, 1, figsize=(5, 4))

    for class_name, colour in zip(class_names, palette):
        # Both frames share the fresh 0..n-1 index, so label rows select projected rows
        members = labels[labels == class_name].index
        ax.scatter(
            projected.loc[members, 0],
            projected.loc[members, 1],
            s=2, color=colour, label=class_name
        )

    # Explained variance of each component, in percent, goes into the axis labels
    var = 100 * pca.explained_variance_ratio_
    ax.set_xlabel(f'PC1 ({var[0]:.2f} %)')
    ax.set_ylabel(f'PC2 ({var[1]:.2f} %)')
    ax.grid(True)
    ax.legend()

In [None]:
# Project each feature domain (temporal, then spectral) onto its first two
# principal components; train and test parts of the first fold are re-joined
# so that every sample is shown.
for domain_label, dataset in domains.items():
    x_train, x_test, y_train, y_test = next(load_source(dataset, domain_label))
    x = pd.concat([x_train, x_test])
    y = pd.concat([y_train, y_test])
    project_classes(x, y)

In [None]:
from adjustText import adjust_text

def loading_plot(loadings, feature_names, bottom, top):
    """Draw one labelled arrow per feature from the origin to its (PC1, PC2) loading.

    Args:
        loadings: Indexable whose items 0 and 1 hold the per-feature arrow
            components along the PC1 and PC2 axes.
        feature_names: Label for each arrow, in the same order as the loadings.
        bottom: Lower limit applied to both axes.
        top: Upper limit applied to both axes.
    """
    pc1, pc2 = loadings[0], loadings[1]

    annotations = []
    for idx, feature in enumerate(feature_names):
        dx, dy = pc1[idx], pc2[idx]
        # Arrow from the origin to the feature's loading
        plt.arrow(0, 0, dx, dy, color='r', head_width=0.01)
        annotations.append(plt.text(dx, dy, feature))

    # Let adjust_text shift the labels (vertically only) to reduce overlap
    adjust_text(annotations, only_move={'points':'y', 'texts':'y'})

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.xlim(bottom, top)
    plt.ylim(bottom, top)
    plt.grid()
    plt.show()

def _fit_domain_pca(dataset, domain_label, n_components=10):
    """Return (features, fitted PCA) for all samples of the first fold of one domain."""
    x_train, x_test, _, _ = next(load_source(dataset, domain_label))
    features = pd.concat([x_train, x_test])
    return features, PCA(n_components=n_components).fit(features)


X_td, pca_td = _fit_domain_pca(TD_FEATURES, 'temporal')
X_fd, pca_fd = _fit_domain_pca(FD_FEATURES, 'spectral')

# loading_plot() shows its own figure, so no extra plt.show() is needed
loading_plot(pca_td.components_, X_td.columns, -0.5, 1)
loading_plot(pca_fd.components_, X_fd.columns, -0.5, 1)

## best knn

In [None]:
def plot_models_performance_bar(df, best, all_feat, all_feat_pca, ch_rank, ch_corr, ch_fstat, ch_mi, accuracy_type):
    """Compare the accuracy of the feature-selection strategies, one panel per domain.

    Each panel ('temporal', 'spectral') shows a single cluster of seven bars,
    one per strategy, annotated with the accuracy in percent.

    Args:
        df: Unused; kept so existing callers keep working.
        best: Scores of the best 3-feature model.
        all_feat: Scores of the model using all features.
        all_feat_pca: Scores of the model using the PCA projection.
        ch_rank: Scores of the rank-product selection.
        ch_corr: Scores of the correlation-based selection.
        ch_fstat: Scores of the F-statistic selection.
        ch_mi: Scores of the mutual-information selection.
        accuracy_type: Name of the score column to plot, e.g. 'test_accuracy'.

    Every score frame needs a 'domain' column and the `accuracy_type` column,
    and is expected to hold one row per domain.
    """
    strategies = (
        ('All features', all_feat),
        ('PCA 3 PC', all_feat_pca),
        ('Best 3 features', best),
        ('Rank product', ch_rank),
        ('Correlation', ch_corr),
        ('F statistic', ch_fstat),
        ('Mutual information', ch_mi),
    )
    width = 0.13
    centre = len(strategies) // 2  # index of the bar drawn at the tick position

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    columns = ('temporal', 'spectral')
    for i, pos in enumerate(columns):
        # A panel shows a single domain, so it gets a single cluster position.
        # (Two positions made matplotlib broadcast each bar into two identical clusters.)
        x = np.arange(1)

        for slot, (label, scores) in enumerate(strategies):
            heights = list(scores[scores['domain'] == pos][accuracy_type])
            rect = ax[i].bar(x + (slot - centre) * width, heights, width, label=label)
            ax[i].bar_label(rect, padding=3, fmt=lambda value: f'{value * 100:.0f}')

        ax[i].set_xticks(x, [pos])
        ax[i].legend(loc='lower right')
        ax[i].set_ylim(0.5, None)
        ax[i].set_title(pos)

        ax[0].set_ylabel('Accuracy')
        ax[i].set_xlabel('Domain')

def run_feature_selection(exp_output):
    """Select features for every domain with the requested ranking strategy.

    Args:
        exp_output: A ``ranking.ExperimentOutput`` member. ``BEST_SET``,
            ``BEST_CORR``, ``BEST_F_STAT`` and ``BEST_MI`` rank the training
            features (default ranking, 'corr', 'f_stat' or 'mi' respectively)
            and pass the ranks with the correlated-feature groups to
            ``ranking.best_subset(..., n=3)``. ``RANKS`` uses the default
            ranking directly.

    Returns:
        Dict mapping each domain label to the list of feature names whose
        'rank' value is truthy.

    Raises:
        ValueError: If ``exp_output`` is none of the members above.
    """
    outputs = ranking.ExperimentOutput
    experiments = {}
    for domain_label, dataset in domains.items():
        x_train, x_test, y_train, y_test = next(load_source(dataset, domain_label))

        if exp_output == outputs.RANKS:
            # Previously referenced the undefined name `Y_train` (NameError)
            output = ranking.batch_feature_ranking(x_train, y_train)
        else:
            if exp_output == outputs.BEST_SET:
                ranks = ranking.batch_feature_ranking(x_train, y_train)
            elif exp_output == outputs.BEST_CORR:
                ranks = ranking.batch_feature_ranking(x_train, y_train, 'corr')
            elif exp_output == outputs.BEST_F_STAT:
                ranks = ranking.batch_feature_ranking(x_train, y_train, 'f_stat')
            elif exp_output == outputs.BEST_MI:
                ranks = ranking.batch_feature_ranking(x_train, y_train, 'mi')
            else:
                raise ValueError(f'Unsupported experiment output: {exp_output!r}')

            synonyms = ranking.compute_correlations(x_train, corr_above=0.95)
            output = ranking.best_subset(ranks, synonyms, n=3)

        output = output.reset_index()
        rank_by_feature = dict(zip(output['feature'], output['rank']))
        experiments[domain_label] = [name for name, rank in rank_by_feature.items() if rank]

    return experiments

def knn_eval_features(feature_names, n=5):
    """Score kNN on the given features, using the first fold of each domain.

    Args:
        feature_names: Dict mapping a domain label to the feature names to use
            for that domain.
        n: Number of neighbours passed to ``nn.knn_one_case_eval``.

    Returns:
        DataFrame with one score record per domain, tagged with a 'domain' column.
    """
    scores = []
    for domain_label, dataset in domains.items():
        # Use the argument; the global `membership` was read here before
        features = feature_names[domain_label]
        x_train, x_test, y_train, y_test = next(load_source(dataset, domain_label))
        score = nn.knn_one_case_eval(n, features, x_train, y_train, x_test, y_test)
        score['domain'] = domain_label
        scores.append(score)

    return pd.DataFrame.from_records(scores)

In [None]:
# For each selection strategy: pick the features per domain, then score kNN on them.

# Default ranking (plotted later as "Rank product")
membership = run_feature_selection(ranking.ExperimentOutput.BEST_SET)
chosen_rankproduct = knn_eval_features(membership)

# Correlation ranking
membership = run_feature_selection(ranking.ExperimentOutput.BEST_CORR)
chosen_corr = knn_eval_features(membership)

# F-statistic ranking
membership = run_feature_selection(ranking.ExperimentOutput.BEST_F_STAT)
chosen_fstat = knn_eval_features(membership)

# Mutual-information ranking
membership = run_feature_selection(ranking.ExperimentOutput.BEST_MI)
chosen_mi = knn_eval_features(membership)

In [None]:
def knn_all_features_accuracy(pca=None, n=5) -> pd.DataFrame:
    """Score a kNN classifier on all features of the first fold of each domain.

    Args:
        pca: If given, number of principal components; PCA is fit on the
            training fold and both folds are projected before classification.
        n: Number of neighbours.

    Returns:
        DataFrame with one row per domain holding train/test accuracy,
        micro-averaged precision and recall, and error rate.
    """
    def _evaluate(domain_label, dataset):
        train_x, test_x, train_y, test_y = next(load_source(dataset, domain_label))
        if pca is not None:
            projection = PCA(n_components=pca).fit(train_x)
            train_x = pd.DataFrame(projection.transform(train_x))
            test_x = pd.DataFrame(projection.transform(test_x))

        classifier = KNeighborsClassifier(n_neighbors=n, metric='euclidean', algorithm='kd_tree')
        classifier.fit(train_x, train_y)
        train_pred = classifier.predict(train_x)
        test_pred = classifier.predict(test_x)

        return {
            'domain': domain_label,
            'train_accuracy': metrics.accuracy_score(train_y, train_pred),
            'train_precision': metrics.precision_score(train_y, train_pred, average='micro'),
            'train_recall': metrics.recall_score(train_y, train_pred, average='micro'),
            'train_error_rate': np.mean(train_y != train_pred),
            'test_accuracy': metrics.accuracy_score(test_y, test_pred),
            'test_precision': metrics.precision_score(test_y, test_pred, average='micro'),
            'test_recall': metrics.recall_score(test_y, test_pred, average='micro'),
            'test_error_rate': np.mean(test_y != test_pred),
        }

    return pd.DataFrame.from_records(
        [_evaluate(domain_label, dataset) for domain_label, dataset in domains.items()]
    )


all_features_accuracies = knn_all_features_accuracy(pca=None)
pca_all_features_accuracies = knn_all_features_accuracy(pca=3)

# Best 3-feature model at k = 5, filtered with the frame's own 'k' column
best_k5 = best[best['k'] == 5]

# Keyword arguments keep every score frame bound to the bar label it belongs to
# (the positional order of the signature is df, best, all_feat, all_feat_pca, ...).
for accuracy_type, accuracy_range in (
    ('train_accuracy', train_range),
    ('test_accuracy', test_range),
):
    plot_models_performance_bar(
        accuracy_range[accuracy_range['k'] == 5],
        best=best_k5,
        all_feat=all_features_accuracies,
        all_feat_pca=pca_all_features_accuracies,
        ch_rank=chosen_rankproduct,
        ch_corr=chosen_corr,
        ch_fstat=chosen_fstat,
        ch_mi=chosen_mi,
        accuracy_type=accuracy_type,
    )
    plt.tight_layout()
    plt.show()

#### Compressor and pump scatter plots separately

In [None]:
# TODO: fix the figures - the same cluster is drawn twice, side by side

In [None]:
# TODO: separate compressors and pumps