### kNN streaming data

In [5]:
# Feature domain loaded further below: 'td' selects load_td_feat, 'fd' selects load_fd_feat
DOMAIN = 'fd'

import functools
import os
import random
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
from river import preprocessing
from river import neighbors, utils, evaluate, stream
# Stable aliases: the bare names `stream` and `metrics` are rebound later in this notebook
from river import metrics as river_metrics
from river import stream as river_stream
from sklearn import metrics
from sklearn import metrics as skmetrics
from sklearn.feature_selection import SelectKBest
from tqdm.notebook import tqdm

sys.path.append('../')
from vibrodiagnostics import selection, models
from vibrodiagnostics.selection import load_td_feat, load_fd_feat, METADATA_COLUMNS_ALL
from vibrodiagnostics.models import (
    fault_labeling, filter_out_metadata_columns, project_classifier_map_plot
)


# Root directory of the pre-computed feature files
FEATURES_PATH = '../../datasets/features_data/'

# Fault name -> short class code
FAULT_CLASSES = dict([
    ('normal', 'N'),
    ('imbalance', 'I'),
    ('horizontal-misalignment', 'HM'),
    ('vertical-misalignment', 'VM'),
])

# Full paths of the time-domain, frequency-domain and combined feature sets
TD_FEATURES, FD_FEATURES, TD_FD_FEATURES = (
    os.path.join(FEATURES_PATH, relative_path)
    for relative_path in (
        selection.TIME_FEATURES_PATH,
        selection.FREQ_FEATURES_PATH,
        selection.TIME_AND_FREQ_FEATURES_PATH,
    )
)

FEATURE_SETS = [TD_FEATURES, FD_FEATURES, TD_FD_FEATURES]

# Feature set used by the river-based cells at the end of the notebook
FEATURES = FEATURE_SETS[0]

In [6]:

def knn_online_learn(dataset, label='fault', window_len=1, learn_skip=0, clusters=False):
    """Simulate online kNN classification with delayed and partially missing labels.

    Rows of ``dataset`` are visited in a fixed pseudo-random order (seed 10).
    Every row is min-max scaled and buffered; once the classifier has learned
    at least one sample, the row is also predicted and the cumulative scores
    are recorded. The buffer is handed to the classifier each time it reaches
    ``window_len`` rows, which imitates annotations that arrive late.

    Args:
        dataset: pandas DataFrame with feature columns and the ``label`` column.
            Columns listed in ``METADATA_COLUMNS_ALL`` and the ``label`` column
            itself are excluded from the model input.
        label: name of the column holding the ground truth.
        window_len: number of rows buffered before their labels are released
            to the classifier.
        learn_skip: number of released samples discarded before each learned
            one (0 learns every sample); imitates missing labels.
        clusters: when true, return the predictions instead of the scores.

    Returns:
        When ``clusters`` is true, a ``pd.Series`` of predicted labels indexed
        by the index label of each predicted row. Otherwise a ``pd.DataFrame``
        with columns ``step``, ``accuracy``, ``precision`` and ``recall``: one
        row of cumulative scores per prediction.
    """
    # Model: scaler giving approximately the same weight to all features, then kNN
    scaler = preprocessing.MinMaxScaler()
    knn = neighbors.KNNClassifier(n_neighbors=5)

    learning_window = []        # (scaled features, true label) pairs awaiting annotation
    scores = []                 # [step, accuracy, precision, recall] per prediction
    v_true = []                 # True label of every predicted sample
    v_predict = []              # Predicted label of every predicted sample
    order_saved = []            # Index label of every predicted sample

    # Shuffle row *positions* so `iloc` stays correct for any dataset index;
    # a private RNG keeps the order reproducible without reseeding the global one.
    positions = list(range(len(dataset)))
    random.Random(10).shuffle(positions)

    skipping = 0
    started = False

    for step, position in enumerate(positions):
        row = dataset.iloc[position]
        x = {
            k: v for k, v in row.items()
            if k != label and k not in METADATA_COLUMNS_ALL
        }

        # Not chained: river releases from 0.19 on return None from learn_one
        scaler.learn_one(x)
        x_scaled = scaler.transform_one(x)
        y_true = row[label]
        learning_window.append((x_scaled, y_true))

        if started:
            # Predict only after at least one example has been learned
            y_predict = knn.predict_one(x_scaled)
            v_true.append(y_true)
            v_predict.append(y_predict)
            order_saved.append(dataset.index[position])

            # `skmetrics` is used because the bare name `metrics` is rebound
            # to river's metrics module by a later cell.
            scores.append([
                step,
                skmetrics.accuracy_score(v_true, v_predict),
                skmetrics.precision_score(v_true, v_predict, average='micro'),
                skmetrics.recall_score(v_true, v_predict, average='micro')
            ])

        # Release the labels once a full window has been collected
        if len(learning_window) == window_len:
            for x_seen, y_seen in learning_window:
                if skipping == learn_skip:
                    started = True
                    knn.learn_one(x_seen, y_seen)
                    skipping = 0
                else:
                    skipping += 1
            learning_window = []

    if clusters:
        return pd.Series(v_predict, index=order_saved)

    return pd.DataFrame(scores, columns=['step', 'accuracy', 'precision', 'recall'])

Load features

In [7]:

# Load the features of the selected domain for the 'az' axis and attach labels.
# NOTE: this rebinds the name `stream`, shadowing the `river.stream` module imported above.
if DOMAIN == 'td':
    stream = load_td_feat(['az'], path=FEATURES_PATH)
    stream = fault_labeling(stream, FAULT_CLASSES)
    #stream = stream[['fault', 'anomaly', 'az_rms', 'az_pp', 'az_shape']]

elif DOMAIN == 'fd':
    stream = load_fd_feat(['az'], path=FEATURES_PATH)
    stream = fault_labeling(stream, FAULT_CLASSES)
    #stream = stream[['fault', 'anomaly', 'az_centroid_64', 'az_centroid_256', 'az_skew_64', 'az_roll_off_256']]

else:
    # Fail early instead of leaving `stream` unset for the cells below
    raise ValueError(f"Unknown DOMAIN {DOMAIN!r}; expected 'td' or 'fd'")

# Warning: information leakage if feature importance is learned on the whole dataset
# Chicken-and-egg problem: the best features cannot be known without seeing the whole dataset,
# yet the whole dataset is necessary to find the best features

Gradual learning
- 4 classes - N, VM, HM, I

In [8]:
# Labels become available immediately (window of one sample), four fault classes
results = knn_online_learn(stream, label='fault', window_len=1)

ax = results.loc[:, ['accuracy']].plot(
    title='Fault classes: 4, Window size: 1',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=False,
)

# Cumulative scores after the last sample
best = results.iloc[-1:]
best

Gradual learning
- Binary classifier - anomaly

In [None]:
# Labels become available immediately (window of one sample), binary anomaly label
results = knn_online_learn(stream, label='anomaly', window_len=1)

ax = results.loc[:, ['accuracy']].plot(
    title='Fault classes: 1, Window size: 1',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=False,
)

# Cumulative scores after the last sample
best = results.iloc[-1:]
best

Window learning
- Compare classification accuracies for window sizes in one graph: (1, 10, 50, 100, 250)
- Scenarios: fault, anomaly

In [None]:
learning_window_lengths = (1, 10, 50, 100, 250)

# One accuracy curve per label delay. Each curve is indexed by the stream
# position recorded in the `step` column, and the frame is built in one go so
# the curves align on the union of positions regardless of the loop order.
fault_evolution = pd.DataFrame({
    str(n): (
        knn_online_learn(stream, label='fault', window_len=n)
        .set_index('step')['accuracy']
    )
    for n in tqdm(learning_window_lengths)
})

In [None]:
# Positions without a prediction yet are drawn as zero accuracy
ax = (
    fault_evolution
    .fillna(0)
    .plot(
        title='Faults: Label with delay',
        xlabel='Sample',
        ylabel='Accuracy',
        figsize=(8, 4),
        grid=True,
        legend=True,
    )
)
# Final accuracy for every window size
fault_evolution.iloc[-1:]

In [None]:
# One accuracy curve per label delay for the binary anomaly label. Each curve
# is indexed by the stream position recorded in the `step` column, and the
# frame is built in one go so the curves align regardless of the loop order.
anomaly_evolution = pd.DataFrame({
    str(n): (
        knn_online_learn(stream, label='anomaly', window_len=n)
        .set_index('step')['accuracy']
    )
    for n in tqdm(learning_window_lengths)
})

In [None]:
# Positions without a prediction yet are drawn as zero accuracy
ax = (
    anomaly_evolution
    .fillna(0)
    .plot(
        title='Anomaly: Label with delay',
        xlabel='Sample',
        ylabel='Accuracy',
        figsize=(8, 4),
        grid=True,
        legend=True,
    )
)
# Final accuracy for every window size
anomaly_evolution.iloc[-1:]

Missing labels - Faults

In [None]:
window_len = 10
labels_skips = (0, 5, 15, 25, 50, 100)

# One accuracy curve per number of skipped labels. Each curve is indexed by
# the stream position recorded in the `step` column instead of an offset
# re-derived from the curve length.
fault_skip_evolution = pd.DataFrame({
    str(s): (
        knn_online_learn(stream, label='fault', window_len=window_len, learn_skip=s)
        .set_index('step')['accuracy']
    )
    for s in tqdm(labels_skips)
})

In [None]:
# Positions without a prediction yet are drawn as zero accuracy
ax = (
    fault_skip_evolution
    .fillna(0)
    .plot(
        title=f'Faults (4 classes): Skip labels (out of {len(stream)} total), Window: {window_len}',
        xlabel='Sample',
        ylabel='Accuracy',
        figsize=(8, 4),
        grid=True,
        legend=True,
    )
)
# Final accuracy for every skip count
fault_skip_evolution.iloc[-1:]

Missing labels - Anomaly

In [None]:
# One accuracy curve per number of skipped labels for the binary anomaly
# label; reuses `window_len` and `labels_skips` from the faults cell above.
# Each curve is indexed by the stream position recorded in the `step` column.
anomaly_skip_evolution = pd.DataFrame({
    str(s): (
        knn_online_learn(stream, label='anomaly', window_len=window_len, learn_skip=s)
        .set_index('step')['accuracy']
    )
    for s in tqdm(labels_skips)
})

In [None]:
# Positions without a prediction yet are drawn as zero accuracy
ax = (
    anomaly_skip_evolution
    .fillna(0)
    .plot(
        title=f'Anomaly: Skip labels (out of {len(stream)} total), Window: {window_len}',
        xlabel='Sample',
        ylabel='Accuracy',
        figsize=(8, 4),
        grid=True,
        legend=True,
    )
)
# Final accuracy for every skip count
anomaly_skip_evolution.iloc[-1:]

Scatter plot - True labels vs. Predicted labels
- Faults
- Anomaly

In [None]:
# True fault labels vs. labels predicted online (immediate labels, none skipped)
fault_features = filter_out_metadata_columns(stream)
fault_truth = stream['fault']
fault_predictions = knn_online_learn(stream, label='fault', clusters=True)

project_classifier_map_plot(fault_features, fault_truth, fault_predictions)
plt.show()

In [None]:
from river import metrics

def knn_model_setup(n):
    """Build a fresh river pipeline: standard scaler followed by a kNN classifier.

    Args:
        n: number of neighbours used by the classifier.

    Returns:
        ``StandardScaler | KNNClassifier`` composition whose neighbour search
        uses a SWINN engine with Minkowski distance (p=2) and seed 10.
    """
    euclidean = functools.partial(utils.math.minkowski_distance, p=2)
    search_engine = neighbors.SWINN(dist_func=euclidean, seed=10)
    classifier = neighbors.KNNClassifier(n_neighbors=n, engine=search_engine)
    return preprocessing.StandardScaler() | classifier


def knn_accuracy_with_delays(X, y, delays):
    """Plot progressive-validation accuracy of kNN for several label delays.

    A fresh model is evaluated for every delay, so that no run benefits from
    samples already learned during a previous run.

    Args:
        X: feature DataFrame streamed row by row.
        y: labels aligned with ``X``.
        delays: iterable of label delays passed to river's progressive
            validation; each delay becomes one curve in the plot.

    Returns:
        ``pd.DataFrame`` indexed by ``Observation`` (checkpoint every 100
        samples) with one accuracy column per delay; empty when ``delays``
        produced no checkpoints.
    """
    evolution = defaultdict(dict)
    for delay in delays:
        steps = evaluate.iter_progressive_val_score(
            model=knn_model_setup(5),                  # new, untrained model per delay
            dataset=river_stream.iter_pandas(X, y),    # alias: `stream` is rebound to a DataFrame
            metric=river_metrics.Accuracy(),
            step=100,
            delay=delay
        )
        for step in steps:
            step_num = step['Step']
            evolution[step_num]['Observation'] = step_num
            evolution[step_num][delay] = step['Accuracy'].get()

    if not evolution:
        # Nothing to plot (no delays given or no checkpoint reached)
        return pd.DataFrame()

    evolution = (
        pd.DataFrame
        .from_records(list(evolution.values()))
        .set_index('Observation')
    )
    evolution.plot(
        grid=True, figsize=(15, 4),
        marker='.', ylabel='Accuracy',
        title='Accuracy with different delays'
    )
    return evolution


def knn_conf_matrix_plot(X, y):
    """Draw the confusion matrix of online kNN predictions as a heatmap.

    Every sample is predicted first and learned afterwards, so each prediction
    is made by a model that has not yet seen that sample's label.

    Args:
        X: feature DataFrame streamed row by row.
        y: true labels aligned with ``X``.
    """
    knn = knn_model_setup(5)
    y_predictions = []

    # `river_stream` alias: the bare name `stream` is rebound to a DataFrame earlier
    for x, y_true in river_stream.iter_pandas(X, y):
        # A falsy prediction (presumably None before anything was learned) is recorded as 0
        y_predict = knn.predict_one(x) or 0
        knn.learn_one(x, y_true)
        y_predictions.append(y_predict)

    cm = skmetrics.confusion_matrix(y, y_predictions)
    ax = sb.heatmap(cm, cbar=True, cmap='BuGn', annot=True, fmt='d')
    ax.set(xlabel='Prediction', ylabel='Truth')


def knn_visualize_classes(X, y):
    """Plot true vs. online-predicted classes and print the error rate.

    Every sample is predicted first and learned afterwards. The predictions
    are passed to ``models.project_classifier_map_plot`` together with ``X``
    and ``y``; the length of its return value is reported as the number of
    misclassified samples.

    Args:
        X: feature DataFrame streamed row by row.
        y: true labels aligned with ``X``.
    """
    knn = knn_model_setup(5)

    y_predictions = []
    # `river_stream` alias: the bare name `stream` is rebound to a DataFrame earlier
    for xs, ys in river_stream.iter_pandas(X, y):
        # A falsy prediction (presumably None before anything was learned) becomes class 0
        y_predict = int(knn.predict_one(xs) or 0)
        knn.learn_one(xs, ys)
        y_predictions.append(y_predict)

    y_predictions = pd.Series(y_predictions)
    mismatch = models.project_classifier_map_plot(X, y, y_predictions)
    print(f'Error rate: {100 * (len(mismatch) / len(y)):.2f} %')

### kNN classifier (Faults)

In [None]:
# Features of the 'az' axis with the multi-class 'fault' label as target
X, y = models.features_subset(
    FEATURES,
    selection.FAULT_CLASSES,
    ['az'],
    'fault',
)
X.head()

In [None]:
# Accuracy curves for label delays of 1, 50, 100 and 250 samples
knn_accuracy_with_delays(X, y, delays=(1, 50, 100, 250))
plt.show()

### Classification report (Faults)

In [None]:
# Progressive validation of a fresh model over the whole stream.
# The `river_stream` / `river_metrics` aliases are used because the bare names
# `stream` and `metrics` are rebound elsewhere in this notebook.
evaluate.progressive_val_score(
    model=knn_model_setup(5),
    dataset=river_stream.iter_pandas(X, y),
    metric=river_metrics.ClassificationReport()
)

### Confusion matrix (Faults)

In [None]:
# Confusion matrix of the online predictions for the fault classes
knn_conf_matrix_plot(X=X, y=y)

### Visualize clusters of nearest neighbors (Faults)

In [None]:
# True vs. predicted fault classes and the resulting error rate
knn_visualize_classes(X=X, y=y)

### kNN classifier (Anomaly)

In [None]:
# Features of the 'az' axis with the binary 'anomaly' label as target.
# Uses FEATURES, the same feature set as the faults section
# (`FEATURES_FILENAME` is not defined anywhere in this notebook).
X, y = models.features_subset(FEATURES, selection.FAULT_CLASSES, ['az'], 'anomaly')
X.head(5)

In [None]:
# Accuracy curves for label delays of 1, 50, 100 and 250 samples
knn_accuracy_with_delays(X, y, delays=(1, 50, 100, 250))
plt.show()

### Classification report (Anomaly)

In [None]:
# Progressive validation of a fresh model over the whole stream.
# The `river_stream` / `river_metrics` aliases are used because the bare names
# `stream` and `metrics` are rebound elsewhere in this notebook.
evaluate.progressive_val_score(
    model=knn_model_setup(5),
    dataset=river_stream.iter_pandas(X, y),
    metric=river_metrics.ClassificationReport()
)

### Confusion matrix (Anomaly)

In [None]:
# Confusion matrix of the online predictions for the anomaly label
knn_conf_matrix_plot(X=X, y=y)

### Visualize clusters of nearest neighbors (Anomaly)

In [None]:
# True vs. predicted anomaly labels and the resulting error rate
knn_visualize_classes(X=X, y=y)