In [None]:
import os
import json
from zipfile import ZipFile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import PowerTransformer
from tqdm.notebook import tqdm

import sys
sys.path.append('../')
from vibrodiagnostics import (
    mafaulda, 
    extraction,
    visualize,
    models
)

# Use a larger default font size for every figure drawn in this notebook.
plt.rcParams['font.size'] = 14

In [None]:
# --- Experiment switches ---
# Default for the `parts` argument of the feature extraction wrappers
# (presumably the number of segments per recording - TODO confirm).
PARTS = 1
# Passed as `window` to the frequency-domain feature extraction.
FFT_WINDOW = 2 ** 15
# True: recompute features from the raw archive; False: read the cached CSV files.
EXTRACT = False
# True: re-run the model experiments; False: load previously stored results.
GENERATE = False
# Forwarded as `power_transform` to the functions of `models`.
POWER_TRANSFORM = False
# NOTE(review): not referenced by any other visible cell.
MODEL_TYPE = 'knn'

# --- Hyper-parameter grids for the k-NN experiments ---
K_NEIGHBORS_OPTIONS = (3, 5, 11)
NUM_FEATURES_OPTIONS = (2, 3, 4)

# --- File locations ---
PATH = '../datasets'
DATASET_PATH = os.path.join(PATH, 'MAFAULDA.zip')
KNN_BRUTE_FORCE_PATH = os.path.join(PATH, 'knn_brute_force_features')

FEATURES_PATH = os.path.join(PATH, 'features')
RESULTS_PATH = os.path.join(FEATURES_PATH, 'results.json')
BEST_FEATURES_PATH = os.path.join(FEATURES_PATH, 'best_features_accuracy.csv')
LABELED_DATASET_PATH = os.path.join(FEATURES_PATH, 'MAFAULDA_LABEL.csv')

# Cached feature table per domain key ('TD' / 'FD').
FEATURES = {
    domain: os.path.join(FEATURES_PATH, f'MAFAULDA_{domain}.csv')
    for domain in ('TD', 'FD')
}

In [None]:
def features_time_domain(dataset: ZipFile, filename: str, parts: int = PARTS) -> pd.DataFrame:
    """Compute time-domain features for one file of the dataset archive.

    Thin wrapper that calls ``mafaulda.features_by_domain`` with
    ``extraction.time_features_calc`` as the feature calculator.

    :param dataset: ZIP archive holding the recordings
    :param filename: file inside the archive to process
    :param parts: forwarded unchanged as ``parts``; defaults to ``PARTS``
    :return: result of ``mafaulda.features_by_domain``
    """
    calculator = extraction.time_features_calc
    return mafaulda.features_by_domain(calculator, dataset, filename, parts=parts)


def features_frequency_domain(dataset: ZipFile, filename: str, parts: int = PARTS) -> pd.DataFrame:
    """Compute frequency-domain features for one file of the dataset archive.

    Thin wrapper that calls ``mafaulda.features_by_domain`` with
    ``extraction.frequency_features_calc`` as the feature calculator and
    ``FFT_WINDOW`` as the ``window`` argument.

    :param dataset: ZIP archive holding the recordings
    :param filename: file inside the archive to process
    :param parts: forwarded unchanged as ``parts``; defaults to ``PARTS``
    :return: result of ``mafaulda.features_by_domain``
    """
    calculator = extraction.frequency_features_calc
    return mafaulda.features_by_domain(
        calculator,
        dataset,
        filename,
        window=FFT_WINDOW,
        parts=parts,
    )

In [None]:
# Time-domain features: recompute them from the raw archive when EXTRACT is
# set, otherwise load the cached CSV.
if EXTRACT is True:
    # The context manager closes the archive as soon as extraction is done
    # (the bare ZipFile(...) call left the handle open).
    with ZipFile(DATASET_PATH) as archive:
        features = extraction.load_files_split(archive, features_time_domain)
    features.to_csv(FEATURES['TD'], index=False)
else:
    features = pd.read_csv(FEATURES['TD'])
features

In [None]:
# Frequency-domain features: recompute them from the raw archive when EXTRACT
# is set, otherwise load the cached CSV.
if EXTRACT is True:
    # The context manager closes the archive as soon as extraction is done
    # (the bare ZipFile(...) call left the handle open).
    with ZipFile(DATASET_PATH) as archive:
        features = extraction.load_files_split(archive, features_frequency_domain)
    features.to_csv(FEATURES['FD'], index=False)
else:
    features = pd.read_csv(FEATURES['FD'])
features

In [None]:
# Display example severities for bearing A, using the time-domain features.
# NOTE(review): the trailing True presumably makes label_severity show its
# intermediate output - confirm against mafaulda.label_severity.
td_bearing_a = extraction.load_features(
    FEATURES['TD'],
    mafaulda.BEARING_A_COLUMNS,
    mafaulda.LABEL_COLUMNS,
)
df = mafaulda.label_severity(td_bearing_a, 'A', 0.5, True)

In [None]:
# Generate the different feature sets: one entry for every combination of
# domain (TD/FD), dim (1 or 3 source columns), bearing (A, B, A+B) and
# labeling scheme (severity False/True).
datasets = []
domains = ('TD', 'FD')
dimensions = (1, 3)
columns = {
    'A': {
        1: ['ay'],
        3: mafaulda.BEARING_A_COLUMNS
    },
    'B': {
        1: ['by'],
        3: mafaulda.BEARING_B_COLUMNS
    }
}


def _load_labeled(domain, dim, bearing, severity):
    """Load the features of one bearing and attach its labels.

    With ``severity`` set the labels come from ``mafaulda.label_severity``
    (called with 0.5 as its third argument), otherwise from
    ``mafaulda.assign_labels``.
    """
    frame = extraction.load_features(FEATURES[domain], columns[bearing][dim], mafaulda.LABEL_COLUMNS)
    if severity:
        return mafaulda.label_severity(frame, bearing, 0.5)
    return mafaulda.assign_labels(frame, bearing)


for domain in domains:
    for dim in dimensions:
        # Same order as before: A, B, A+B without severity, then with severity.
        for severity in (False, True):
            a = _load_labeled(domain, dim, 'A', severity)
            b = _load_labeled(domain, dim, 'B', severity)
            ab = pd.concat([a, b]).reset_index(drop=True)
            for bearing, data in (('A', a), ('B', b), ('A+B', ab)):
                datasets.append({
                    'domain': domain, 'dim': dim, 'bearing': bearing,
                    'severity': severity, 'data': data
                })


datasets_domains = pd.DataFrame.from_records(datasets)

# Join columns of features in time and frequency domain
for (dim, bearing, severity), group in datasets_domains.groupby(by=['dim', 'bearing', 'severity']):
    frames_by_domain = [
        frame.drop(columns=['label']).reset_index(drop=True).add_prefix(f'{domain}-')
        for domain, frame in zip(group['domain'].to_list(), group['data'].values)
    ]
    df = pd.concat(frames_by_domain, axis=1)
    # The feature frames were re-indexed above, so the labels must be
    # re-indexed too; otherwise pandas aligns them on the old index and rows
    # can end up with missing or shifted labels.
    df['label'] = group['data'].values[0]['label'].reset_index(drop=True)
    datasets.append({'domain': 'TD+FD', 'dim': dim, 'bearing': bearing, 'severity': severity, 'data': df})


datasets = pd.DataFrame.from_records(datasets)

# TODO: temp - the combined 'TD+FD' sets built above are filtered out again here
# datasets = datasets.iloc[25:]
datasets = datasets[datasets['domain'].isin(['TD', 'FD'])]

In [None]:
# Show the table of scenarios; the 'data' entry of each row holds that scenario's DataFrame.
datasets

In [None]:
# Count the samples of each class - the counts depend on severity and bearing.
# The domain only changes the number of columns, and dim only how many of the
# original columns were used, so the first dataset of every group is enough.
# Rows    - bearing / severity combinations
# Columns - number of samples in each class, plus their sum
label_counts = []
for (severity, bearing), scenarios in datasets_domains.groupby(by=['severity', 'bearing']):
    per_class = scenarios['data'].values[0]['label'].value_counts().to_dict()
    label_counts.append({
        'bearing': bearing,
        'severity': severity,
        **per_class,
        'sum': sum(per_class.values()),
    })

pd.DataFrame.from_records(label_counts)

In [None]:
# Range of values in features: one row of box plots per (domain, dim, bearing),
# drawn from the first dataset of each group.
for name, group in datasets_domains.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].values[0].drop(columns=['label'])

    # squeeze=False keeps `ax` two-dimensional, so indexing also works when
    # the frame has a single feature column (otherwise a bare Axes is returned).
    fig, ax = plt.subplots(1, len(df.columns), figsize=(20, 4), squeeze=False)
    print(name)
    for panel, col in zip(ax[0], df.columns):
        df.boxplot([col], ax=panel)
    fig.tight_layout()
    plt.show()

In [None]:
# Range of values in features - Power transform: same box plots as for the raw
# features, after a standardized Yeo-Johnson transform of every column.
for name, group in datasets_domains.groupby(by=['domain', 'dim', 'bearing']):
    df = group['data'].values[0].drop(columns=['label'])

    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    df[df.columns] = pt.fit_transform(df)

    # squeeze=False keeps `ax` two-dimensional, so indexing also works when
    # the frame has a single feature column (otherwise a bare Axes is returned).
    fig, ax = plt.subplots(1, len(df.columns), figsize=(20, 4), squeeze=False)
    print(name)
    for panel, col in zip(ax[0], df.columns):
        df.boxplot([col], ax=panel)
    fig.tight_layout()
    plt.show()

#### All features on each scenario

In [None]:
# Evaluate a model on all features of every scenario (only when GENERATE is
# set), store the results as JSON and load them back from disk.
results = []
if GENERATE is True:
    for index, row in tqdm(datasets.iterrows(), total=len(datasets)):
        source = row['data']
        y = source['label']
        x = source.drop(columns=['label'])
        r = models.all_features(
            x, y, power_transform=POWER_TRANSFORM
        )
        r.update({
            'domain': row['domain'],
            'dim': row['dim'],
            'bearing': row['bearing'],
            'severity': row['severity']
        })
        results.append(r)
    # `with` flushes and closes the file before it is read back below.
    with open(RESULTS_PATH, 'w') as results_file:
        json.dump(results, results_file)

with open(RESULTS_PATH, 'r') as results_file:
    results = json.load(results_file)
results[:1]

In [None]:
# Reload the stored results and plot each scenario.
with open(RESULTS_PATH, 'r') as results_file:  # close the handle after reading
    results = json.load(results_file)
for row in results:
    print(row['domain'], row['dim'], row['bearing'], row['severity'])
    visualize.plot_all_knn_simple(row)

#### Enumerate feature combinations on each scenario

In [None]:
def make_filename(row: dict, separator: str = '#'):
    """Build the file name under which the results of one scenario are stored.

    The name joins the scenario's domain, dim, bearing and severity with the
    notebook-level ``POWER_TRANSFORM`` flag, each converted with ``str``.

    :param row: mapping with the keys 'domain', 'dim', 'bearing' and 'severity'
    :param separator: string placed between the individual fields
    :return: the joined file name (no directory, no extension)
    """
    fields = (
        row['domain'],
        row['dim'],
        row['bearing'],
        row['severity'],
        POWER_TRANSFORM,
    )
    return separator.join(map(str, fields))

# Reset the result list and make sure the output directory exists.
results = []
Path(KNN_BRUTE_FORCE_PATH).mkdir(parents=True, exist_ok=True)

# Enumerate the models of every scenario and store one CSV summary per scenario.
if GENERATE is True:
    for _, scenario in tqdm(datasets.iterrows()):
        frame = scenario['data']
        summary = models.enumerate_models(
            frame.drop(columns=['label']),
            frame['label'],
            scenario['domain'],
            power_transform=POWER_TRANSFORM,
            k_neighbors=K_NEIGHBORS_OPTIONS,
            num_of_features=NUM_FEATURES_OPTIONS
        )
        target = os.path.join(KNN_BRUTE_FORCE_PATH, make_filename(scenario))
        summary.to_csv(target, index=False)

In [None]:
# Plot the accuracy distribution of every stored model summary.
# os.listdir returns entries in arbitrary order; sorting makes the sequence of
# figures reproducible between runs and machines.
for filename in sorted(os.listdir(KNN_BRUTE_FORCE_PATH)):
    summary_path = os.path.join(KNN_BRUTE_FORCE_PATH, filename)
    if not os.path.isfile(summary_path):
        continue
    models_summary = pd.read_csv(summary_path)
    print(filename)
    visualize.boxplot_enumerate_models_accuracy(models_summary, 'test', 'f', 'k')
    visualize.boxplot_enumerate_models_accuracy(models_summary, 'test', 'k', 'f')

#### Bar chart for specific number of features

In [None]:
# Accuracy of the feature selection methods for every scenario, k and number
# of features: recomputed when GENERATE is set, otherwise read from the cache.
if GENERATE is True:
    records = []
    for _, scenario in tqdm(datasets.iterrows()):
        frame = scenario['data']
        targets = frame['label']
        inputs = frame.drop(columns=['label'])
        models_summary = pd.read_csv(
            os.path.join(KNN_BRUTE_FORCE_PATH, make_filename(scenario))
        )
        scenario_tags = {key: scenario[key] for key in ('dim', 'bearing', 'severity')}

        for fnum in NUM_FEATURES_OPTIONS:
            for k in K_NEIGHBORS_OPTIONS:
                accuracies = models.feature_selection_accuracies(
                    inputs, targets,
                    scenario['domain'],
                    models_summary,
                    k_neighbors=k,
                    number_of_features=fnum,
                    power_transform=POWER_TRANSFORM
                )
                # Tag every record with the scenario and the hyper-parameters used.
                for entry in accuracies:
                    entry.update({**scenario_tags, 'k': k, 'f': fnum})
                records.extend(accuracies)

    results = pd.DataFrame.from_records(records)
    results.to_csv(BEST_FEATURES_PATH, index=False)
else:
    results = pd.read_csv(BEST_FEATURES_PATH)
results

In [None]:
# One bar chart per scenario, k and number of features.
bar_chart_keys = ['dim', 'bearing', 'severity', 'domain', 'k', 'f']
for scenario, scenario_results in results.groupby(by=bar_chart_keys):
    print(scenario)
    visualize.plot_models_performance_bar(scenario_results)

#### Accuracy and percentile of feature selection methods to number of features
- The best features do not always reach the 100th percentile: the best subset is selected after sorting by the training set, while the distribution of accuracies is taken from the validation set.

In [None]:
# Test accuracy of every feature set against the number of features,
# one figure per scenario and k.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'k']):
    print(name)
    accuracy_by_set = group.pivot(index='f', columns='set', values='test_accuracy')
    accuracy_by_set.plot(
        figsize=(8, 5),
        marker='o',
        grid=True,
        xlabel='Number of features',
        ylabel='Accuracy',
    )
    plt.xticks(NUM_FEATURES_OPTIONS)
    plt.show()

In [None]:
# Test percentile of every feature set against the number of features,
# one figure per scenario and k.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'k']):
    print(name)
    percentile_by_set = group.pivot(index='f', columns='set', values='test_percentile')
    percentile_by_set.plot(
        figsize=(8, 5),
        marker='o',
        grid=True,
        xlabel='Number of features',
        ylabel='Percentile',
    )
    plt.xticks(NUM_FEATURES_OPTIONS)
    plt.show()

#### Accuracy and percentile of feature selection methods to number of neighbors (k)

In [None]:
# Test accuracy of every feature set against k,
# one figure per scenario and number of features.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'f']):
    print(name)
    accuracy_by_set = group.pivot(index='k', columns='set', values='test_accuracy')
    accuracy_by_set.plot(
        figsize=(8, 6),
        marker='o',
        grid=True,
        xlabel='k-Neighbors',
        ylabel='Accuracy',
    )
    plt.xticks(K_NEIGHBORS_OPTIONS)
    plt.show()

In [None]:
# Test percentile of every feature set against k,
# one figure per scenario and number of features.
for name, group in results.groupby(by=['dim', 'bearing', 'severity', 'domain', 'f']):
    print(name)
    # xlabel (not label) sets the x-axis caption, matching the accuracy plot
    # of the same section.
    (group[['k', 'set', 'test_percentile']]
     .pivot(index='k', columns='set', values='test_percentile')
     .plot(figsize=(8, 6), marker='o', grid=True, xlabel='k-Neighbors', ylabel='Percentile'))
    plt.xticks(K_NEIGHBORS_OPTIONS)
    plt.show()

#### In how many cases is rank product best among all the selection methods?

In [None]:
def count_wins_of_methods(datasets, results, methods):
    """Count how often each selection method has the highest test percentile.

    ``results`` is grouped by every column of ``datasets`` except 'data',
    plus 'k' and 'f'. Within each group the row with the highest
    'test_percentile' among the requested ``methods`` is the winner.

    :param datasets: DataFrame whose columns (except 'data') identify a scenario
    :param results: DataFrame with the scenario columns, 'k', 'f', 'set'
        and 'test_percentile'
    :param methods: iterable of values of the 'set' column to compare
    :return: DataFrame indexed by method name plus a final 'total' row, with
        the columns 'count' (number of wins; number of groups for 'total'),
        'percentage' (share of all groups, in percent) and 'score' (mean
        percentile over the wins, 0 for a method that never won, NaN for
        'total')
    """
    # Accept any iterable (tuple, set, ...), not only a list.
    methods = list(methods)
    group_columns = [column for column in datasets.columns if column != 'data'] + ['k', 'f']

    wins = {method: 0.0 for method in methods}
    winning_percentiles = {method: [] for method in methods}
    total = 0.0

    for _, group in results.groupby(by=group_columns):
        total += 1
        candidates = group[group['set'].isin(methods)]
        if candidates.empty:
            # None of the requested methods was evaluated for this group: it
            # still counts towards the total, but has no winner.
            continue
        best = candidates.sort_values(by='test_percentile', ascending=False).iloc[0]
        wins[best['set']] += 1
        winning_percentiles[best['set']].append(best['test_percentile'])

    scores = pd.DataFrame.from_dict(
        # `or [0]` avoids taking the mean of an empty list.
        {method: np.mean(values or [0]) for method, values in winning_percentiles.items()},
        orient='index',
        columns=['score'],
    )

    counts = pd.DataFrame.from_dict({**wins, 'total': total}, orient='index', columns=['count'])
    counts['percentage'] = 100 * (counts['count'] / total)
    return counts.join(scores)

In [None]:
# Compare the selection methods, leaving out PCA and the two reference sets.
excluded_sets = {'PCA PC', 'All features', 'Best features'}
methods = list(set(results['set'].unique()) - excluded_sets)
count_wins_of_methods(datasets, results, methods)

In [None]:
# Compare the selection methods including PCA, leaving out the two reference sets.
excluded_sets = {'All features', 'Best features'}
methods = list(set(results['set'].unique()) - excluded_sets)
count_wins_of_methods(datasets, results, methods)

In [None]:
# Compare every feature set except 'Best features'.
excluded_sets = {'Best features'}
methods = list(set(results['set'].unique()) - excluded_sets)
count_wins_of_methods(datasets, results, methods)

#### Histogram of model accuracy distribution and vertical line for individual feature selection methods
- dim=3, bearing=A, severity=False, k=5, f=3

In [None]:
from cycler import cycler

# Grid of histograms: one column per feature domain, top row for the 'train'
# accuracies and bottom row for the 'test' accuracies of the enumerated models.
# Dashed vertical lines mark the accuracy of each feature selection method.
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
hist_style = dict(
    grid=True,
    bins=50,
    edgecolor='black',
    linewidth=0.5,
    color='gray',
    range=(0.5, 1)
)
# (axes row, distribution column, results column, title suffix)
panels = (
    (0, 'train', 'train_accuracy', ', Training set'),
    (1, 'test', 'test_accuracy', ', Testing set'),
)

for i, domain in enumerate(FEATURES):
    row = {
        'domain': domain,
        'dim': 3,
        'bearing': 'A',
        'severity': False
    }
    distribution = pd.read_csv(os.path.join(KNN_BRUTE_FORCE_PATH, make_filename(row)))
    scenario_mask = (
        (results['domain'] == row['domain']) &
        (results['dim'] == row['dim']) &
        (results['bearing'] == row['bearing']) &
        (results['severity'] == row['severity']) &
        (results['k'] == 5) &
        (results['f'] == 3)
    )
    mselection = results[scenario_mask]
    accuracy_by_column = mselection.set_index('set').to_dict()

    for panel_row, split, accuracy_column, title_suffix in panels:
        panel = ax[panel_row][i]
        panel.set_xlabel('Accuracy')
        panel.set_ylabel('Number of k-NN models')
        panel.set_title(visualize.DOMAIN_TITLES[domain] + title_suffix)
        distribution[split].hist(ax=panel, **hist_style)

        # zip stops at the shorter input, so at most len(colors) methods are drawn.
        selected = accuracy_by_column[accuracy_column]
        for line_color, (method, accuracy) in zip(cycler(color=colors), selected.items()):
            panel.axvline(accuracy, linestyle='--', lw=2, label=method, **line_color)
        panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# TODO: same experiments for online models (4 experiments) - Po
# TODO: check if fsel is done on train set - best set is broken!

# TODO: Features EDA - corr to rpm, histograms per machine, time waveform, frequency spectra, time-frequency waveform, features in feature space
# TODO: EDA of compressors

# .................... Writing - St