### kNN streaming data

In [None]:
# Feature domain handed to load_source(): 'TD' and 'FD' are the two keys it
# maps to feature CSV files (MAFAULDA_TD.csv and MAFAULDA_FD.csv).
#DOMAIN = 'TD' 
DOMAIN = 'FD'

In [None]:
import os
import re
import random
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn import metrics as skmetrics

from tqdm.notebook import tqdm
from river import metrics
from river import preprocessing
from river import neighbors, utils, evaluate, stream

import extraction
import mafaulda
import visualize

In [None]:
def knn_online_learn(X, Y, window_len=1, learn_skip=0, clusters=False):
    """Evaluate an online kNN classifier on a stream with delayed, sparse labels.

    Every row of ``X`` is min-max scaled incrementally and, once the model has
    learned at least one sample, predicted before its label becomes available.
    Labels are released in batches of ``window_len`` samples; within the
    released labels only every ``learn_skip + 1``-th sample is learned.

    Args:
        X: Feature frame, iterated row by row in its current order.
            NOTE: a ``'label'`` column holding ``Y`` is written into this
            frame in place; other cells of this notebook rely on that column
            being present afterwards.
        Y: True labels, aligned with ``X`` by index.
        window_len: Number of samples buffered before their labels are
            handed to the classifier.
        learn_skip: Number of labelled samples skipped between two samples
            that are actually learned (0 learns every sample).
        clusters: When true, return the predictions instead of the scores.

    Returns:
        If ``clusters`` is true, a ``pd.Series`` of predicted labels indexed
        by the index labels of the predicted rows. Otherwise a
        ``pd.DataFrame`` with columns ``step``, ``accuracy``, ``precision``
        and ``recall`` holding the cumulative scores after each prediction.
    """
    # Buffer true samples for learning for later: simulate delayed annotation
    learning_window = []

    # Model consists of scaler to give approximately same weight to all features and kNN
    scaler = preprocessing.MinMaxScaler()
    knn = neighbors.KNNClassifier(n_neighbors=5)

    scores = []                 # Rows of [step, accuracy, precision, recall], one per prediction
    v_true = []                 # True label of every predicted sample
    v_predict = []              # Predicted label of every predicted sample

    skipping = 0                # Labelled samples skipped since the last learned one
    started = False             # Becomes True once the classifier has learned a sample
    order_saved = []            # Index labels of the predicted rows, in prediction order
    # Deliberate in-place write: callers later drop this column from the same frame.
    X['label'] = Y

    for idx, row in tqdm(X.iterrows(), total=len(X)):
        x = {k: v for k, v in dict(row).items() if k != 'label'}
        # Update the scaler first, then transform. The two calls are kept
        # separate because learn_one() is not guaranteed to return the
        # estimator (newer River releases return None), so chaining breaks.
        scaler.learn_one(x)
        x_scaled = scaler.transform_one(x)
        y_true = row['label']
        learning_window.append((x_scaled, y_true))

        if started:
            # Predict sample after at least one example has been learned
            y_predict = knn.predict_one(x_scaled)
            v_true.append(y_true)
            v_predict.append(y_predict)
            order_saved.append(idx)

            # Scores are cumulative: recomputed over every prediction made so far.
            scores.append([
                idx,
                skmetrics.accuracy_score(v_true, v_predict),
                skmetrics.precision_score(v_true, v_predict, average='micro'),
                skmetrics.recall_score(v_true, v_predict, average='micro')
            ])

        # Provide labels after window length has passed
        if len(learning_window) == window_len:
            for x_buffered, y_buffered in learning_window:
                # Learn one sample, then skip the next `learn_skip` labelled samples
                if skipping == learn_skip:
                    started = True
                    knn.learn_one(x_buffered, y_buffered)
                    skipping = 0
                else:
                    skipping += 1
            learning_window = []

    if clusters:
        return pd.Series(v_predict, index=order_saved)

    return pd.DataFrame(scores, columns=['step', 'accuracy', 'precision', 'recall'])

Load features

In [None]:
def load_source(domain: str, row: dict):
    """Load MAFAULDA features for one placement and split them into ordered train/test sets.

    Args:
        domain: Key of the feature file to read, ``'TD'`` or ``'FD'``.
        row: Mapping from which only ``row['placement']`` is read; it selects
            the axis columns (``'A'`` or ``'B'``).

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)``. Samples are ordered by
        ``severity_level`` (shuffled within each level) and both splits keep
        that order. A plot of severity level over the training observations
        is shown as a side effect.
    """
    features_dir = os.path.join('../datasets/', 'features')
    sources = {
        'TD': os.path.join(features_dir, 'MAFAULDA_TD.csv'),
        'FD': os.path.join(features_dir, 'MAFAULDA_FD.csv'),
        'axis': {
            'A': ['ax', 'ay', 'az'],
            'B': ['bx', 'by', 'bz']
        },
        'labels': ['fault', 'severity', 'rpm']
    }

    placement = row['placement']
    df = extraction.load_features(
        sources[domain], sources['axis'][placement], sources['labels']
    )
    labelled = mafaulda.assign_labels(df, placement)
    targets = labelled['label']
    inputs = labelled.drop(columns=['label'])

    # Shuffle order within severity level and order events with increasing severity
    with_severity = mafaulda.label_severity(df, placement, 0.5, keep=True)
    per_level = (
        with_severity
        .sort_values(by='severity_level')
        .groupby('severity_level')
    )
    shuffled_levels = [
        level_rows.sample(frac=1, random_state=10)
        for _, level_rows in per_level
    ]
    ordered = pd.concat(shuffled_levels)
    order = list(ordered.index)

    # After this reset the index equals the position in the severity-ordered stream
    inputs = inputs.loc[order].reset_index(drop=True)
    targets = targets.loc[order].reset_index(drop=True)

    # Batch / Online hold-out (balance and event sequencing)
    X_train, X_test, Y_train, Y_test = train_test_split(
        inputs, targets, train_size=0.8, random_state=10
    )
    # The random split scrambles rows; sorting by index restores stream order
    X_train = X_train.sort_index()
    X_test = X_test.sort_index()
    Y_train = Y_train.sort_index()
    Y_test = Y_test.sort_index()

    severity_levels = ordered['severity_level'].reset_index(drop=True).sort_index()
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(
        np.arange(0, len(X_train)),
        severity_levels.loc[X_train.index],
        color='red'
    )
    ax.set_xlabel('Observations')
    ax.set_ylabel('Severity level')
    ax.grid()
    plt.show()

    return X_train, Y_train, X_test, Y_test   # WARNING: order matters

In [None]:
# load_source returns (X_train, Y_train, X_test, Y_test); the test features are
# discarded here. Only the 'placement' entry of the dict is read by load_source.
XStream_fault, YStream_fault, _, YF1 = load_source(DOMAIN, {'placement': 'A', 'domain': DOMAIN})

Evolution of faults and anomalies

Train

In [None]:
# Label occurrences over the training stream. The returned pair is bound to
# a, b but is not referenced by any other cell visible in this notebook.
a, b = visualize.plot_label_occurences(YStream_fault)

Test

In [None]:
# Same plot for the held-out test labels returned by load_source.
visualize.plot_label_occurences(YF1)

Gradual learning
- 6 classes

In [None]:
# window_len=1: every label is handed to the classifier right after its sample.
results = knn_online_learn(XStream_fault, YStream_fault, window_len=1)
ax = results[['accuracy']].plot(
    grid=True, legend=False, figsize=(8, 4),
    xlabel='Sample', ylabel='Accuracy', title='Fault classes: 6, Window size: 1'
)
# Last row = cumulative scores after the final sample of the stream.
best = results.tail(1)
best

Window learning
- Compare classification accuracies for window sizes in one graph: (1, 10, 50, 100, 250)
- Scenarios: fault, anomaly

In [None]:
# Number of samples buffered before their labels are released to the classifier.
learning_window_lengths = (1, 10, 50, 100, 250)

# One accuracy column per window length, named by the length as a string.
fault_evolution = pd.DataFrame()
for n in tqdm(learning_window_lengths):
    results = knn_online_learn(XStream_fault, YStream_fault, window_len=n)
    accuracy = results['accuracy']
    accuracy.index += n             # Starts learning after at least one window has been filled
    fault_evolution[str(n)] = accuracy

In [None]:
# Positions without an accuracy value for a given window length are drawn as 0.
ax = fault_evolution.fillna(0).plot(
    grid=True, legend=True, figsize=(8, 4), #ylim=(0.8, 1.01),
    xlabel='Sample', ylabel='Accuracy' #, title='Faults: Label with delay'
)
# Final cumulative accuracy for each window length.
fault_evolution.tail(1)

Missing labels - Faults

In [None]:
# Fixed label delay; only the number of skipped labels varies between runs.
window_len = 10
# Labelled samples skipped between two learned samples (0 = learn every sample).
labels_skips = (0, 5, 15, 25, 50, 100)

fault_skip_evolution = pd.DataFrame()
for s in tqdm(labels_skips):
    results = knn_online_learn(XStream_fault, YStream_fault, window_len=window_len, learn_skip=s)
    accuracy = results['accuracy']
    # Shift so the last score lines up with the last sample of the stream:
    # the offset is the number of samples seen before the first prediction.
    accuracy.index += len(XStream_fault) - len(accuracy)
    fault_skip_evolution[str(s)] = accuracy

In [None]:
# Positions without an accuracy value for a given skip setting are drawn as 0.
ax = fault_skip_evolution.fillna(0).plot(
    grid=True, legend=True, figsize=(8, 4), # ylim=(0, 1.01),
    xlabel='Sample', ylabel='Accuracy' # , title=f'Faults (4 classes): Skip labels (out of {len(XStream_fault)} total), Window: {window_len}'
)
# Final cumulative accuracy for each skip setting.
fault_skip_evolution.tail(1)

Scatter plot - True labels vs. Predicted labels

In [None]:
# XStream_fault carries a 'label' column at this point because earlier
# knn_online_learn calls wrote it in place; it is dropped to get pure features.
# The third argument is the Series of online predictions (clusters=True).
visualize.project_classifier_map_plot(
    XStream_fault.drop(columns=['label']).reset_index(drop=True),
    YStream_fault.reset_index(drop=True),
    knn_online_learn(XStream_fault.reset_index(drop=True), YStream_fault.reset_index(drop=True), window_len=1, learn_skip=0, clusters=True)
)
plt.show()

In [None]:
def knn_model_setup(n):
    """Build the River pipeline used by the evaluation helpers.

    The pipeline is a min-max scaler followed by a kNN classifier with ``n``
    neighbours. Neighbour search uses a SWINN engine with Minkowski distance
    at ``p=2`` and a fixed seed of 10.

    Args:
        n: Number of neighbours for the classifier.

    Returns:
        The composed ``MinMaxScaler | KNNClassifier`` pipeline.
    """
    distance = functools.partial(utils.math.minkowski_distance, p=2)
    search_engine = neighbors.SWINN(dist_func=distance, seed=10)
    classifier = neighbors.KNNClassifier(n_neighbors=n, engine=search_engine)
    return preprocessing.MinMaxScaler() | classifier


def knn_accuracy_with_delays(X, y, delays):
    """Compare progressive-validation accuracy of kNN under several label delays.

    For every delay a fresh model is evaluated with River's progressive
    validation, sampling the accuracy every 100 observations. The curves are
    plotted on one figure.

    Args:
        X: Feature frame streamed with ``stream.iter_pandas``.
        y: Labels aligned with ``X``.
        delays: Iterable of label delays passed to
            ``evaluate.iter_progressive_val_score``.

    Returns:
        ``pd.DataFrame`` indexed by ``'Observation'`` (the step number) with
        one accuracy column per delay.
    """
    evolution = defaultdict(dict)
    for delay in delays:
        # A new model per delay: reusing one instance would let every run after
        # the first start from a model already trained on the whole stream,
        # which leaks labels and inflates its accuracy.
        knn = knn_model_setup(5)
        steps = evaluate.iter_progressive_val_score(
            model=knn,
            dataset=stream.iter_pandas(X, y),
            metric=metrics.Accuracy(),
            step=100,
            delay=delay
        )
        for step in steps:
            step_num = step['Step']
            evolution[step_num]['Observation'] = step_num
            # Read the value immediately; the yielded entry is a metric object.
            evolution[step_num][delay] = step['Accuracy'].get()

    evolution = (
        pd.DataFrame
        .from_records(list(evolution.values()))
        .set_index('Observation')
    )
    evolution.plot(
        grid=True, figsize=(8, 4),
        ylabel='Accuracy'
    )
    return evolution


def knn_conf_matrix_plot(X, y):
    """Draw the confusion matrix of test-then-train kNN predictions as a heatmap.

    Each sample is predicted before it is learned. A falsy prediction (for
    example ``None``) is recorded as ``0``.

    Args:
        X: Feature frame streamed with ``stream.iter_pandas``.
        y: True labels aligned with ``X``.
    """
    model = knn_model_setup(5)
    predicted = []

    for features, target in stream.iter_pandas(X, y):
        guess = model.predict_one(features) or 0
        model.learn_one(features, target)
        predicted.append(guess)

    matrix = skmetrics.confusion_matrix(y, predicted)
    heatmap = sb.heatmap(matrix, cbar=True, cmap='BuGn', annot=True, fmt='d')
    heatmap.set(xlabel='Prediction', ylabel='Truth')


def knn_visualize_classes(X, y):
    """Plot true against predicted classes for test-then-train kNN and print the error rate.

    Each sample is predicted before it is learned. The predictions are passed
    to ``visualize.project_classifier_map_plot`` together with ``X`` and ``y``;
    the size of its return value relative to ``len(y)`` is printed as the
    error rate (presumably it returns the misclassified samples; confirm in
    ``visualize``).

    Args:
        X: Feature frame streamed with ``stream.iter_pandas``.
        y: True labels aligned with ``X``.
    """
    model = knn_model_setup(5)

    predicted = []
    for features, target in stream.iter_pandas(X, y):
        guess = model.predict_one(features)
        model.learn_one(features, target)
        predicted.append(guess)

    predicted = pd.Series(predicted)
    mismatch = visualize.project_classifier_map_plot(X, y, predicted)
    error_share = len(mismatch) / len(y)
    print(f'Error rate: {100 * error_share:.2f} %')

### kNN classifier (Faults)

In [None]:
# The 'label' column exists only because knn_online_learn wrote it into
# XStream_fault in place, so this cell must run after one of those calls.
X, y = XStream_fault.drop(columns=['label']), YStream_fault

In [None]:
# Accuracy curves for label delays of 1, 50, 100 and 250.
evolution = knn_accuracy_with_delays(X, y, (1, 50, 100, 250))
plt.show()
# Accuracy at the last recorded step for each delay.
evolution.tail(1)

### Classification report (Faults)

In [None]:
# Progressive validation of a fresh pipeline over the whole stream,
# summarised with River's ClassificationReport metric.
evaluate.progressive_val_score(
    model=knn_model_setup(5),
    dataset=stream.iter_pandas(X, y),
    metric=metrics.ClassificationReport()
)

### Confusion matrix (Faults)

In [None]:
# Heatmap of true vs. predicted labels from a test-then-train pass over the stream.
knn_conf_matrix_plot(X, y)

### Visualize clusters of nearest neighbors (Faults)

In [None]:
# Work on copies with a fresh 0..n-1 index so X and y stay untouched.
x_scaled = X.reset_index(drop=True)
y_scaled = y.reset_index(drop=True)
# Batch min-max scaling with scikit-learn's MinMaxScaler, fitted on the whole
# frame. The pipeline built inside knn_visualize_classes also contains River's
# incremental MinMaxScaler.
x_scaled[x_scaled.columns] = MinMaxScaler().fit_transform(x_scaled)
knn_visualize_classes(x_scaled, y_scaled)