### kNN streaming data

In [None]:
# Feature domains with a precomputed MAFAULDA feature file:
# 'TD' (temporal features) and 'FD' (spectral features).
DOMAINS = ['TD', 'FD']
# Domain analysed by this notebook run; index 0 selects 'TD'.
DOMAIN = DOMAINS[0]

In [None]:
import os
import re
import random
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn import metrics as skmetrics

from tqdm.notebook import tqdm
from river import metrics
from river import preprocessing
from river import neighbors, utils, evaluate, stream

import sys
sys.path.append('../')
from vibrodiagnostics import (
    mafaulda, 
    extraction,
    visualize,
    models
)

# Use a larger default font for every figure in this notebook.
plt.rcParams['font.size'] = 14

Load features

In [None]:
def load_source(domain: str, row: dict):
    """Load MAFAULDA features for one sensor placement as an ordered stream.

    Rows are shuffled within each severity level (fixed seed) and the levels
    are concatenated in increasing order, so the returned frames read like a
    stream in which the fault severity grows over time.

    Args:
        domain: Feature domain, 'TD' (temporal) or 'FD' (spectral).
        row: Mapping with a 'placement' key ('A' or 'B'). Other keys are
            not read by this function.

    Returns:
        Tuple ``(X, Y)``: the feature frame and the label series, both
        re-indexed 0..n-1 in stream order.

    Raises:
        KeyError: If ``domain`` or ``row['placement']`` is not a known key.
    """
    features_path = os.path.join('../datasets/', 'features')
    # Keep file paths apart from the axis/label settings so that a wrong
    # domain such as 'axis' fails with KeyError instead of being passed on
    # as if it were a file path.
    feature_files = {
        'TD': os.path.join(features_path, 'MAFAULDA_TD.csv'),
        'FD': os.path.join(features_path, 'MAFAULDA_FD.csv'),
    }
    axes = {
        'A': ['ax', 'ay', 'az'],
        'B': ['bx', 'by', 'bz'],
    }
    label_columns = ['fault', 'severity', 'rpm']

    placement = row['placement']
    df = extraction.load_features(
        feature_files[domain],
        axes[placement],
        label_columns
    )
    frame = mafaulda.assign_labels(df, placement)
    Y = frame['label']
    X = frame.drop(columns=['label'])

    # NOTE(review): the meaning of the 0.5 argument is defined by
    # mafaulda.label_severity and is not visible here - confirm.
    features = mafaulda.label_severity(df, placement, 0.5, keep=True)

    # Shuffle rows within each severity level, then chain the levels in
    # increasing order (groupby iterates its keys in sorted order).
    groups = [
        group.sample(frac=1, random_state=10)
        for _, group in (
            features
            .sort_values(by='severity_level')
            .groupby('severity_level')
        )
    ]
    rows = list(pd.concat(groups).index)

    features = features.loc[rows].reset_index(drop=True)
    X = X.loc[rows].reset_index(drop=True)
    Y = Y.loc[rows].reset_index(drop=True)

    # The severity level is attached only for the overview plot and removed
    # again so that it never reaches the classifier as a feature.
    X['severity_level'] = features['severity_level']
    visualize.evolution_of_severity_levels(X)
    X = X.drop(columns=['severity_level'])

    return X, Y

In [None]:
# Stream for sensor placement 'A' in the configured feature domain.
X, Y = load_source(
    domain=DOMAIN,
    row={'placement': 'A', 'domain': DOMAIN},
)

Evolution of faults 

In [None]:
# Plot label occurrences along the stream.
# NOTE(review): the two returned values are not used in the visible cells - confirm what they hold.
a, b = visualize.plot_label_occurences(Y)

Test

Gradual learning
- 6 classes

In [None]:
# Online kNN run with a learning window of length 1.
results = models.knn_online_learn(X, Y, window_len=1)
ax = results[['accuracy']].plot(
    figsize=(10, 5),
    grid=True,
    legend=False,
    xlabel='Observations',
    ylabel='Accuracy',
)
# Last row of the results, i.e. the state after the final observation.
best = results.tail(1)
best

Window learning
- Compare classification accuracies for several window sizes in one graph: (1, 10, 100)

In [None]:
learning_window_lengths = (1, 10, 100)

# One accuracy curve per window length, stored as a column named after it.
fault_evolution = pd.DataFrame()
for n in tqdm(learning_window_lengths):
    results = models.knn_online_learn(X, Y, window_len=n)
    accuracy = results['accuracy']
    # Shift the curve by the window length: learning starts only after at
    # least one window has been filled.
    accuracy.index = accuracy.index + n
    # Column assignment aligns every later curve on the index of the first
    # curve stored in the frame.
    fault_evolution[str(n)] = accuracy

In [None]:
# Positions without a value for a given window length are drawn as 0.
ax = fault_evolution.fillna(0).plot(
    figsize=(10, 5),
    grid=True,
    legend=True,
    xlabel='Sample',
    ylabel='Accuracy',
)
fault_evolution.tail(1)

Missing labels - Faults

In [None]:
window_len = 10
labels_skips = (0, 2, 10, 50)

# One accuracy curve per label-skip setting, stored as a column named after it.
fault_skip_evolution = pd.DataFrame()
for s in tqdm(labels_skips):
    results = models.knn_online_learn(
        X, Y,
        window_len=window_len,
        learn_skip=s,
    )
    accuracy = results['accuracy']
    # Right-align the curve with the stream: the offset is the number of
    # observations for which no accuracy value was reported.
    accuracy.index = accuracy.index + (len(X) - len(accuracy))
    fault_skip_evolution[str(s)] = accuracy

In [None]:
# Positions without a value for a given skip setting are drawn as 0.
ax = fault_skip_evolution.fillna(0).plot(
    figsize=(10, 5),
    grid=True,
    legend=True,
    xlabel='Sample',
    ylabel='Accuracy',
)
fault_skip_evolution.tail(1)

#### Accuracy at same observation point with different label skips

In [None]:
# One line per sampled observation point; Y-axis: accuracy, X-axis: skip amount.
# Take every 400th row starting at row 200 (the last row is excluded), then
# transpose so that each row stands for one skip setting.
evolution = (
    fault_skip_evolution[200:len(fault_skip_evolution) - 1:400]
    .T
    .reset_index()
)
# Column labels are the skip amounts stored as strings; convert each one to
# 100 / skip. A skip of 0 divides to infinity, which is mapped to 100.
evolution['index'] = 100 / evolution['index'].astype(int)
evolution = (
    evolution
    .replace([np.inf], 100)
    .set_index('index')
)
evolution

In [None]:
ax = evolution.plot(figsize=(8, 6), grid=True, marker='o')
# NOTE(review): the other accuracy plots in this notebook are unlabelled
# fractions - confirm that the plotted values really are percentages.
ax.set(
    xlabel='Fraction of original labels [%]',
    ylabel='Accuracy [%]',
)
plt.show()

Scatter plot - True labels vs. Predicted labels

In [None]:
# load_source() already returns X without the 'label' column, so a plain
# drop raised KeyError here; errors='ignore' makes the cell work whether or
# not the column is present.
map_features = X.drop(columns=['label'], errors='ignore').reset_index(drop=True)
map_labels = Y.reset_index(drop=True)

# Train and plot on the same feature frame so that the label can never be
# handed to the learner as an input feature.
map_predictions = models.knn_online_learn(
    map_features, map_labels,
    window_len=1, learn_skip=0, clusters=True
)
visualize.project_classifier_map_plot(map_features, map_labels, map_predictions)
plt.show()

In [None]:
def knn_model_setup(n):
    """Build the online kNN pipeline used throughout this notebook.

    Args:
        n: Number of neighbours used by the classifier.

    Returns:
        A river pipeline: min-max scaling followed by a kNN classifier that
        searches neighbours with a seeded SWINN engine.
    """
    # Minkowski distance with p=2.
    distance = functools.partial(utils.math.minkowski_distance, p=2)
    search_engine = neighbors.SWINN(dist_func=distance, seed=10)
    classifier = neighbors.KNNClassifier(n_neighbors=n, engine=search_engine)
    return preprocessing.MinMaxScaler() | classifier


def knn_accuracy_with_delays(X, y, delays):
    """Compare progressive kNN accuracy for several label delays.

    For every delay a new model is evaluated progressively over the whole
    stream and its accuracy is recorded every 100 observations. The curves
    are plotted together and returned.

    Args:
        X: Feature frame, streamed row by row.
        y: Labels aligned with ``X``.
        delays: Iterable of delays passed to river's progressive validation
            as the ``delay`` argument.

    Returns:
        DataFrame indexed by 'Observation' (the checkpoint step) with one
        accuracy column per delay.
    """
    checkpoints = defaultdict(dict)
    for delay in delays:
        # A fresh model per delay: reusing one instance would let every run
        # after the first start with the whole stream already learned, which
        # inflates its accuracy and makes the curves incomparable.
        knn = knn_model_setup(5)
        steps = evaluate.iter_progressive_val_score(
            model=knn,
            dataset=stream.iter_pandas(X, y),
            metric=metrics.Accuracy(),
            step=100,
            delay=delay
        )
        for step in steps:
            step_num = step['Step']
            checkpoints[step_num]['Observation'] = step_num
            checkpoints[step_num][delay] = step['Accuracy'].get()

    evolution = (
        pd.DataFrame
        .from_records(list(checkpoints.values()))
        .set_index('Observation')
    )
    evolution.plot(
        grid=True, figsize=(8, 4),
        ylabel='Accuracy'
    )
    return evolution


def knn_conf_matrix_plot(X, y):
    """Draw the confusion matrix of a predict-then-learn kNN run.

    Each observation is first predicted and only then learned. Observations
    for which the model returns no prediction (``None``, presumably only
    while it has not learned anything yet) are left out of the matrix
    instead of being counted as a class ``0``.

    Args:
        X: Feature frame, streamed row by row.
        y: True labels aligned with ``X``.
    """
    knn = knn_model_setup(5)
    truths = []
    predictions = []

    for x, y_true in stream.iter_pandas(X, y):
        y_predict = knn.predict_one(x)
        knn.learn_one(x, y_true)
        if y_predict is None:
            # No prediction available: substituting 0 would add a spurious
            # class or mix label types in the confusion matrix.
            continue
        truths.append(y_true)
        predictions.append(y_predict)

    cm = skmetrics.confusion_matrix(truths, predictions)
    ax = sb.heatmap(cm, cbar=True, cmap='BuGn', annot=True, fmt='d')
    ax.set(xlabel='Prediction', ylabel='Truth')


def knn_visualize_classes(X, y):
    """Plot true versus predicted classes of a predict-then-learn kNN run.

    Every observation is predicted before the model learns from it. The
    predictions are handed to ``visualize.project_classifier_map_plot`` and
    the error rate is printed, computed from the number of mismatches that
    call returns.

    Args:
        X: Feature frame, streamed row by row.
        y: True labels aligned with ``X``.
    """
    model = knn_model_setup(5)

    predicted = []
    for sample, target in stream.iter_pandas(X, y):
        # Predict first so the model is always scored on unseen data.
        predicted.append(model.predict_one(sample))
        model.learn_one(sample, target)

    mismatch = visualize.project_classifier_map_plot(X, y, pd.Series(predicted))
    error_rate = 100 * (len(mismatch) / len(y))
    print(f'Error rate: {error_rate:.2f} %')

### kNN classifier (Faults)

In [None]:
# load_source() returns X without a 'label' column, so a plain drop raised
# KeyError; errors='ignore' also keeps this cell safe to re-run.
X, y = X.drop(columns=['label'], errors='ignore'), Y

In [None]:
# Accuracy curves for label delays of 1, 50, 100 and 250.
evolution = knn_accuracy_with_delays(X, y, delays=(1, 50, 100, 250))
plt.show()
# Accuracy at the last recorded checkpoint.
evolution.tail(1)

### Classification report (Faults)

In [None]:
# Progressive validation of a fresh 5-neighbour model over the whole stream;
# the returned classification report is shown as the cell output.
evaluate.progressive_val_score(
    dataset=stream.iter_pandas(X, y),
    model=knn_model_setup(5),
    metric=metrics.ClassificationReport(),
)

### Confusion matrix (Faults)

In [None]:
# Confusion matrix of the predict-then-learn run over the fault stream.
knn_conf_matrix_plot(X=X, y=y)