### kNN streaming data

In [None]:
# Feature domain used for this run: 'td' or 'fd'. It selects which feature loader
# and which feature columns the "Load features" cell uses
# (presumably time-domain vs. frequency-domain features - TODO confirm).
DOMAIN = 'td'

from river import preprocessing
from river import neighbors

import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sb

import sys
sys.path.append('../../')
from feature.selection import load_td_feat, load_fd_feat, METADATA_COLUMNS_ALL
from feature.models import fault_labeling, filter_out_metadata_columns


from sklearn.feature_selection import SelectKBest
from sklearn import metrics
import random


# Directory handed to the feature loaders (relative to this notebook's location)
FEATURES_PATH =  '../../datasets/features_data/'
# Fault name -> short class code; passed to fault_labeling() when the stream is built
FAULT_CLASSES = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM'
}

In [None]:

def knn_online_learn(dataset, label='fault', window_len=1, learn_skip=0, clusters=False):
    """Evaluate an online kNN classifier on a shuffled stream with delayed labels.

    Rows of ``dataset`` are visited in a fixed pseudo-random order (seed 10).
    Every row is min-max scaled by an incrementally fitted scaler and, once the
    classifier has learned at least one sample, predicted before its true label
    is revealed. True labels are buffered and handed to the classifier only when
    ``window_len`` samples have accumulated.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Stream of samples. Columns listed in ``METADATA_COLUMNS_ALL`` are not
        used as features. NOTE(review): the label column is only excluded from
        the features if it is part of ``METADATA_COLUMNS_ALL`` - confirm.
    label : str
        Name of the column holding the true class.
    window_len : int
        Number of samples buffered before their labels are given to the model.
        With a value below 1 the buffer is never flushed, so nothing is learned
        and no prediction is made.
    learn_skip : int
        Number of buffered samples skipped before each learned one; the skip
        counter carries over from one window to the next. ``0`` learns every
        sample.
    clusters : bool
        If true, return the predicted labels instead of the score history.

    Returns
    -------
    pandas.Series
        When ``clusters`` is true: predicted label per predicted row, indexed
        by the row's index label in ``dataset``.
    pandas.DataFrame
        Otherwise: one row per prediction with columns ``step``, ``accuracy``,
        ``precision`` and ``recall`` (micro-averaged), each computed over all
        predictions made up to that step.
    """
    # Buffer true samples for learning for later: simulate delayed annotation
    learning_window = []

    # Model consists of scaler to give approximately same weight to all features and kNN
    scaler = preprocessing.MinMaxScaler()
    knn = neighbors.KNNClassifier(n_neighbors=5)

    scores = []                 # Rows of [step, accuracy, precision, recall], one per prediction
    v_true = []                 # True label of every predicted sample
    v_predict = []              # Predicted label of every predicted sample
    order_saved = []            # Index label (in `dataset`) of every predicted sample

    # Randomize order of seen faults. Row *positions* are shuffled (not index
    # labels) so that the positional lookup below is valid for any index; for a
    # default RangeIndex this yields exactly the same order as shuffling labels.
    random.seed(10)
    positions = list(range(len(dataset)))
    random.shuffle(positions)

    skipping = 0                # Samples skipped since the last learned one
    started = False             # True once the classifier has learned a sample

    for step, pos in enumerate(positions):
        row = dataset.iloc[pos]
        x = {k: v for k, v in dict(row).items() if k not in METADATA_COLUMNS_ALL}

        # Update the scaler first, then transform. The calls are not chained
        # because learn_one() is not guaranteed to return the estimator
        # (recent river releases return None).
        scaler.learn_one(x)
        x_scaled = scaler.transform_one(x)
        y_true = row[label]
        learning_window.append((x_scaled, y_true))

        if started:
            # Predict sample after at least one example has been learned
            y_predict = knn.predict_one(x_scaled)
            v_true.append(y_true)
            v_predict.append(y_predict)
            order_saved.append(dataset.index[pos])

            # Cumulative scores over every prediction made so far
            scores.append([
                step,
                metrics.accuracy_score(v_true, v_predict),
                metrics.precision_score(v_true, v_predict, average='micro'),
                metrics.recall_score(v_true, v_predict, average='micro')
            ])

        # Provide labels after window length has passed
        if len(learning_window) == window_len:
            for x_buffered, y_buffered in learning_window:
                if skipping == learn_skip:
                    # Enough samples were skipped: learn this one and restart the count
                    started = True
                    knn.learn_one(x_buffered, y_buffered)
                    skipping = 0
                else:
                    skipping += 1
            learning_window = []

    if clusters:
        return pd.Series(v_predict, index=order_saved)

    return pd.DataFrame(scores, columns=['step', 'accuracy', 'precision', 'recall'])

Load features

In [None]:
# Build the sample stream for the configured domain: load the 'az' features,
# attach the fault labels and keep only the label columns plus a fixed feature subset.
if DOMAIN == 'td':
    stream = load_td_feat(['az'], path=FEATURES_PATH)
    stream = fault_labeling(stream, FAULT_CLASSES)
    stream = stream[['fault', 'anomaly', 'az_rms', 'az_pp', 'az_shape']]

elif DOMAIN == 'fd':
    stream = load_fd_feat(['az'], path=FEATURES_PATH)
    stream = fault_labeling(stream, FAULT_CLASSES)
    stream = stream[['fault', 'anomaly', 'az_centroid_64', 'az_centroid_kurt', 'az_centroid_skew', 'az_roll_off_256']]

else:
    # Fail here rather than with a NameError on `stream` in a later cell
    raise ValueError(f"Unsupported DOMAIN {DOMAIN!r}; expected 'td' or 'fd'")

# Warning: information leakage occurs if feature importance is learned from the whole dataset.
# Chicken-and-egg problem: the best features cannot be known without seeing the whole dataset,
# yet seeing the whole dataset is necessary to find them.

Gradual learning
- 4 classes - N, VM, HM, I

In [None]:


# Four-class fault label, every label available right after its sample (window of 1)
results = knn_online_learn(stream, label='fault', window_len=1)
ax = results[['accuracy']].plot(
    title='Fault classes: 4, Window size: 1',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=False,
)
# The last row holds the scores accumulated over the whole stream
best = results.iloc[-1:]
best

Gradual learning
- Binary classifier - anomaly

In [None]:
# Binary anomaly label, every label available right after its sample (window of 1)
results = knn_online_learn(stream, label='anomaly', window_len=1)
ax = results[['accuracy']].plot(
    title='Fault classes: 1, Window size: 1',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=False,
)
# The last row holds the scores accumulated over the whole stream
best = results.iloc[-1:]
best

Window learning
- Compare classification accuracies for window sizes in one graph: (1, 10, 50, 100, 250)
- Scenarios: fault, anomaly

In [None]:
learning_window_lengths = (1, 10, 50, 100, 250)

# One cumulative-accuracy curve per label delay (window length).
fault_curves = []
for n in tqdm(learning_window_lengths):
    results = knn_online_learn(stream, label='fault', window_len=n)
    accuracy = results['accuracy']
    accuracy.index = accuracy.index + n     # Starts learning after at least one window has been filled
    fault_curves.append(accuracy)

# Combine all curves at once on the union of their indexes. Assigning columns one
# by one into an empty frame would align on the first curve only and silently drop
# rows whenever the first window is not the smallest one.
fault_evolution = pd.concat(
    fault_curves, axis=1, keys=[str(n) for n in learning_window_lengths]
)

In [None]:
# Accuracy per window length; samples before a curve's first prediction are drawn as 0
ax = fault_evolution.fillna(0).plot(
    title='Faults: Label with delay',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=True,
)
# Final cumulative accuracy of every window length
fault_evolution.iloc[-1:]

In [None]:
# One cumulative-accuracy curve per label delay (window length), binary anomaly label.
anomaly_curves = []
for n in tqdm(learning_window_lengths):
    results = knn_online_learn(stream, label='anomaly', window_len=n)
    accuracy = results['accuracy']
    accuracy.index = accuracy.index + n     # Starts learning after at least one window has been filled
    anomaly_curves.append(accuracy)

# Combine all curves at once on the union of their indexes. Assigning columns one
# by one into an empty frame would align on the first curve only and silently drop
# rows whenever the first window is not the smallest one.
anomaly_evolution = pd.concat(
    anomaly_curves, axis=1, keys=[str(n) for n in learning_window_lengths]
)

In [None]:
# Accuracy per window length; samples before a curve's first prediction are drawn as 0
ax = anomaly_evolution.fillna(0).plot(
    title='Anomaly: Label with delay',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=True,
)
# Final cumulative accuracy of every window length
anomaly_evolution.iloc[-1:]

Missing labels - Faults

In [None]:
window_len = 10
labels_skips = (0, 5, 15, 25, 50, 100)

# One cumulative-accuracy curve per number of skipped labels, four-class fault label.
fault_skip_curves = []
for s in tqdm(labels_skips):
    results = knn_online_learn(stream, label='fault', window_len=window_len, learn_skip=s)
    accuracy = results['accuracy']
    # Shift the curve so that its last point sits at the last sample of the stream
    accuracy.index = accuracy.index + (len(stream) - len(accuracy))
    fault_skip_curves.append(accuracy)

# Combine all curves at once on the union of their indexes. Assigning columns one
# by one into an empty frame would align on the first curve only and silently drop
# rows whenever the first configuration is not the one that starts earliest.
fault_skip_evolution = pd.concat(
    fault_skip_curves, axis=1, keys=[str(s) for s in labels_skips]
)

In [None]:
# Accuracy per label-skip setting; samples before a curve's first prediction are drawn as 0
ax = fault_skip_evolution.fillna(0).plot(
    title=f'Faults (4 classes): Skip labels (out of {len(stream)} total), Window: {window_len}',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=True,
)
# Final cumulative accuracy of every skip setting
fault_skip_evolution.iloc[-1:]

Missing labels - Anomaly

In [None]:
# One cumulative-accuracy curve per number of skipped labels, binary anomaly label.
anomaly_skip_curves = []
for s in tqdm(labels_skips):
    results = knn_online_learn(stream, label='anomaly', window_len=window_len, learn_skip=s)
    accuracy = results['accuracy']
    # Shift the curve so that its last point sits at the last sample of the stream
    accuracy.index = accuracy.index + (len(stream) - len(accuracy))
    anomaly_skip_curves.append(accuracy)

# Combine all curves at once on the union of their indexes. Assigning columns one
# by one into an empty frame would align on the first curve only and silently drop
# rows whenever the first configuration is not the one that starts earliest.
anomaly_skip_evolution = pd.concat(
    anomaly_skip_curves, axis=1, keys=[str(s) for s in labels_skips]
)

In [None]:
# Accuracy per label-skip setting; samples before a curve's first prediction are drawn as 0
ax = anomaly_skip_evolution.fillna(0).plot(
    title=f'Anomaly: Skip labels (out of {len(stream)} total), Window: {window_len}',
    xlabel='Sample',
    ylabel='Accuracy',
    figsize=(8, 4),
    grid=True,
    legend=True,
)
# Final cumulative accuracy of every skip setting
anomaly_skip_evolution.iloc[-1:]

Scatter plot - True labels vs. Predicted labels
- Faults
- Anomaly

In [None]:
def scatter_classif(X, y_label, categories, colors, ax):
    """Draw one scatter series per category on ``ax``.

    For each ``(category, color)`` pair, the rows of ``X`` whose index labels
    carry that category in ``y_label`` are plotted, using column ``0`` of ``X``
    as the horizontal and column ``1`` as the vertical coordinate.
    """
    for category, shade in zip(categories, colors):
        # Index labels of the samples assigned to this category
        members = y_label[y_label == category].index.tolist()
        horizontal = X.loc[members, 0]
        vertical = X.loc[members, 1]
        ax.scatter(horizontal, vertical, s=2, color=shade, label=category)

from sklearn.decomposition import PCA

# Compare true and predicted fault labels in a 2-D PCA projection of the features
X = filter_out_metadata_columns(stream)
y_true = stream['fault']
y_predict = knn_online_learn(stream, label='fault', window_len=1, learn_skip=0, clusters=True)
y_predict = y_predict.astype('category')

# Index the projection like `stream`: the scatter helper and the good/bad split
# below look rows up by the index labels of y_true / y_predict.
# Assumes filter_out_metadata_columns() keeps every row of `stream` - TODO confirm.
X_pca = PCA(n_components=2).fit_transform(X)
X_pca = pd.DataFrame(X_pca, index=stream.index)

categories = y_true.cat.categories
colors = sb.color_palette('hls', len(categories))

# Plot
fig, ax = plt.subplots(1, 3, figsize=(20, 5))

scatter_classif(X_pca, y_true, categories, colors, ax[0])
scatter_classif(X_pca, y_predict, categories, colors, ax[1])

# Correct vs. wrong predictions; only samples that were actually predicted take part.
# NOTE(review): comparing two categorical Series may fail if their category sets
# differ (e.g. a class that is never predicted) - confirm with the real data.
match = y_predict == y_true.loc[y_predict.index]
good = y_predict[match].index
bad = y_predict[~match].index

ax[2].scatter(X_pca[0].loc[good], X_pca[1].loc[good], s=2, color='green', label='Good')
ax[2].scatter(X_pca[0].loc[bad], X_pca[1].loc[bad], s=2, color='red', label='Bad')

panel_titles = ('True labels', 'Predicted labels', 'Correct vs. wrong predictions')
for panel, panel_title in zip(ax, panel_titles):
    panel.set_title(panel_title)
    panel.grid()
    panel.legend()

plt.show()