### Online machine learning
- Incremental learning experiments

In [None]:
import functools
import os
import re
import sys
from collections import defaultdict
from pprint import pprint
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import river
import seaborn as sb
import tsfel
from river import (
    anomaly,
    cluster,
    evaluate,
    feature_selection,
    metrics,
    neighbors,
    preprocessing,
    stats,
    stream,
)
from sklearn import metrics as skmetrics

# The project-local `feature` package is resolved through this path entry,
# so the path must be extended before the imports below.
sys.path.append('../../')
from feature import mafaulda
from feature import selection
from feature import discovery
from feature import models


# Base directory for all dataset files (relative path).
PATH_PREFIX = '../../datasets/'
# Sub-directory of PATH_PREFIX that holds the feature tables and metadata.
FEATURES_PATH =  os.path.join(PATH_PREFIX, 'features_data')

# MAFAULDA dataset archive and its metadata table.
DATASET_PATH = os.path.join(PATH_PREFIX, 'MAFAULDA.zip')
MAFAULDA_METADATA = os.path.join(FEATURES_PATH, 'mafaulda_metadata.csv')

# Feature table read by every experiment below; the file name comes from
# the project's `selection` module.
FEATURES_FILENAME = os.path.join(FEATURES_PATH, selection.TSFEL_FEATURES_PATH)
# TODO: implement Mahalanobis distance (EmpiricalCovariance)
# TODO: implement feature selection: F test, MI

In [None]:
def tsfel_features_subset(filename, classes, axis, label=None, severity_sort=False, anomaly_severity=0.5):
    """Load a feature table, order its rows and select columns by prefix.

    The CSV at `filename` is read and passed through `models.fault_labeling`
    (project helper; presumably it adds the label columns used below such as
    'severity_level' -- confirm against its implementation).

    Parameters
    ----------
    filename : path of the CSV file with the features.
    classes : forwarded unchanged to `models.fault_labeling`.
    axis : iterable of column-name prefixes; a column is kept when its name
        starts with any of them.
    label : optional name of the label column. When given, the labels are
        returned as integer category codes next to the features.
    severity_sort : when true, rows are grouped by ascending
        'severity_level' and shuffled only inside each group; otherwise the
        whole table is shuffled.
    anomaly_severity : forwarded to `models.fault_labeling`.

    Returns
    -------
    `X` when `label` is None, otherwise the tuple `(X, y)`.
    """
    features = pd.read_csv(filename)
    features = models.fault_labeling(features, classes, anomaly_severity=anomaly_severity, debug=False)

    if severity_sort:
        # Shuffle within each severity level (fixed seed) while keeping the
        # levels themselves in ascending order.
        groups = [
            df.sample(frac=1, random_state=10)
            for _, df in (
                features.sort_values(by='severity_level')
                        .groupby('severity_level')
            )
        ]
        features = pd.concat(groups).reset_index(drop=True)
    else:
        # Shuffle all rows with a fixed seed so runs are reproducible.
        features = (
            features
            .sample(frac=1, random_state=10)
            .reset_index(drop=True)
        )

    # Boolean mask of the columns whose name starts with a requested prefix.
    columns = features.columns.str.startswith(tuple(axis))
    X = features[features.columns[columns]]
    if label is None:
        return X

    y = features[label].astype('category').cat.codes
    return X, y

### Evolution of classes in dataset
- [ ] Faults over observations (number of seen until observation)
- [ ] Fault severity
- [ ] Anomaly (classes and percentages)

In [None]:
def plot_label_occurences(y):
    """Plot how many times each label has been seen up to every observation.

    Parameters
    ----------
    y : pandas Series of labels (any dtype that can be cast to 'category').

    One line is drawn per category. Counts are attributed through the
    category *code* of each observation, so the result is correct for
    string labels and for integer labels that are not exactly 0..k-1.
    Missing labels (code -1) do not increase any counter.
    """
    labels = y.astype('category')
    columns = list(labels.cat.categories)
    codes = labels.cat.codes.to_numpy()

    # One-hot matrix: row = observation, column = position of its category.
    indicator = np.zeros((len(codes), len(columns)), dtype=int)
    observed = codes >= 0
    indicator[np.flatnonzero(observed), codes[observed]] = 1

    class_occurences = pd.DataFrame(indicator, columns=columns).cumsum()
    class_occurences.plot(grid=True, figsize=(10, 3), xlabel='Observations', ylabel='Label occurences')


# Running count of every fault label over the shuffled stream of
# observations (columns prefixed 'az').
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='fault',
)
plot_label_occurences(y)
plt.show()

In [None]:
# Same plot for the 'anomaly' label column.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='anomaly',
)
plot_label_occurences(y)
plt.show()

### Feature selection

- [X] Pearson correlation
- [ ] Fisher score
- [ ] Mutual information

In [None]:
# Follow the absolute Pearson correlation between each feature and the
# fault label while the selector consumes the stream one sample at a time.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='fault',
)
selector = feature_selection.SelectKBest(similarity=stats.PearsonCorr(), k=2)

# One snapshot of the leaderboard (absolute values) per observation
best = []
for xs, ys in stream.iter_pandas(X, y):
    selector.learn_one(xs, ys)
    best.append({k: abs(v) for k, v in selector.leaderboard.items()})

# Keep only the 10 features with the highest final score for plotting
ranked = sorted(
    selector.leaderboard.items(),
    key=lambda item: abs(item[1]),
    reverse=True,
)
n_top_names = {name for name, _ in ranked[:10]}
best = [
    {k: v for k, v in step.items() if k in n_top_names}
    for step in best
]

feature_set = pd.DataFrame.from_records(best)
feature_set.plot(
    grid=True,
    figsize=(15, 6),
    ylim=(0.4, 0.9),
    xlabel='Observation',
    ylabel='Correlation with fault',
)
plt.show()
feature_set.tail(1)

---
### K Nearest Neighbors

In [None]:
def knn_model_setup(n):
    """Return a pipeline that standardises features and classifies with kNN.

    `n` is the number of neighbours. Neighbour search uses a SWINN engine
    with Minkowski distance of order 2 and a fixed seed of 10.
    """
    distance = functools.partial(river.utils.math.minkowski_distance, p=2)
    search_engine = neighbors.SWINN(dist_func=distance, seed=10)
    classifier = neighbors.KNNClassifier(n_neighbors=n, engine=search_engine)
    return preprocessing.StandardScaler() | classifier


def knn_accuracy_with_delays(X, y, delays):
    """Plot progressive-validation accuracy of 5-NN for several label delays.

    Parameters
    ----------
    X : DataFrame of features.
    y : Series of labels aligned with `X`.
    delays : iterable of delay values passed as `delay` to
        `evaluate.iter_progressive_val_score`; each becomes one curve.

    Accuracy is sampled every 100 observations. The function only draws
    the plot and returns nothing.
    """
    evolution = defaultdict(dict)
    for delay in delays:
        # Start every run from an untrained model. Sharing one instance
        # across delays would let later runs predict on samples they have
        # already learned, which inflates their accuracy.
        knn = knn_model_setup(5)
        steps = evaluate.iter_progressive_val_score(
            model=knn,
            dataset=stream.iter_pandas(X, y),
            metric=metrics.Accuracy(),
            step=100,
            delay=delay
        )
        for step in steps:
            step_num = step['Step']
            evolution[step_num]['Observation'] = step_num
            evolution[step_num][delay] = step['Accuracy'].get()

    # One row per checkpoint, one column per delay.
    evolution = (
        pd.DataFrame
        .from_records(list(evolution.values()))
        .set_index('Observation')
    )
    evolution.plot(
        grid=True, figsize=(15, 4),
        marker='.', ylabel='Accuracy',
        title='Accuracy with different delays'
    )


def knn_conf_matrix_plot(X, y):
    """Draw the confusion matrix of 5-NN evaluated in predict-then-learn order.

    Each sample is predicted before the model learns from it; a missing
    (falsy) prediction is recorded as class 0.
    """
    knn = knn_model_setup(5)

    def predict_then_learn(x, y_true):
        guess = knn.predict_one(x) or 0
        knn.learn_one(x, y_true)
        return guess

    y_predictions = [
        predict_then_learn(x, y_true)
        for x, y_true in stream.iter_pandas(X, y)
    ]

    cm = skmetrics.confusion_matrix(y, y_predictions)
    ax = sb.heatmap(cm, annot=True, fmt='d', cmap='BuGn', cbar=True)
    ax.set(xlabel='Prediction', ylabel='Truth')


def knn_visualize_classes(X, y):
    """Plot 5-NN predictions with the project's classifier map and print the error rate.

    Predictions are collected in predict-then-learn order (a falsy
    prediction counts as class 0) and handed to
    `models.project_classifier_map_plot`; the length of its return value
    relative to `len(y)` is printed as the error rate.
    """
    knn = knn_model_setup(5)

    predicted = []
    for features, label in stream.iter_pandas(X, y):
        guess = knn.predict_one(features)
        guess = int(guess or 0)
        knn.learn_one(features, label)
        predicted.append(guess)

    y_predictions = pd.Series(predicted)
    mismatch = models.project_classifier_map_plot(X, y, y_predictions)
    print(f'Error rate: {100 * (len(mismatch) / len(y)):.2f} %')

### kNN classifier (Faults)

In [None]:
# Features and fault labels shared by the kNN fault cells that follow.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='fault',
)
X.head(n=5)

In [None]:
# Accuracy curves for four label delays (fault labels).
knn_accuracy_with_delays(X, y, delays=(1, 50, 100, 250))
plt.show()

### Classification report (Faults)

In [None]:
# Per-class report of a fresh 5-NN pipeline over the fault stream.
evaluate.progressive_val_score(
    dataset=stream.iter_pandas(X, y),
    model=knn_model_setup(5),
    metric=metrics.ClassificationReport(),
)

### Confusion matrix (Faults)

In [None]:
# Confusion matrix for the fault labels currently held in X, y.
knn_conf_matrix_plot(X=X, y=y)

### Visualize clusters of nearest neighbors (Faults)

In [None]:
# Classifier map and error rate for the fault labels currently held in X, y.
knn_visualize_classes(X=X, y=y)

### kNN classifier (Anomaly)

In [None]:
# Features and anomaly labels shared by the kNN anomaly cells that follow.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='anomaly',
)
X.head(n=5)

In [None]:
# Accuracy curves for four label delays (anomaly labels).
knn_accuracy_with_delays(X, y, delays=(1, 50, 100, 250))
plt.show()

### Classification report (Anomaly)

In [None]:
# Per-class report of a fresh 5-NN pipeline over the anomaly stream.
evaluate.progressive_val_score(
    dataset=stream.iter_pandas(X, y),
    model=knn_model_setup(5),
    metric=metrics.ClassificationReport(),
)

### Confusion matrix (Anomaly)

In [None]:
# Confusion matrix for the anomaly labels currently held in X, y.
knn_conf_matrix_plot(X=X, y=y)

### Visualize clusters of nearest neighbors (Anomaly)

In [None]:
# Classifier map and error rate for the anomaly labels currently held in X, y.
knn_visualize_classes(X=X, y=y)

---
### TODO: Hoeffding Tree classifier of faults

### TODO: DenStream

In [None]:
# Work in progress: Silhouette score of DenStream on standardised features.
# NOTE(review): river's clustering metrics are normally updated with the
# sample and the cluster centres, so progressive validation may reject or
# mis-handle `metrics.Silhouette()` -- confirm this cell runs as intended.
X, y = tsfel_features_subset(FEATURES_FILENAME, selection.FAULT_CLASSES, ['az'], 'fault')
denstream = (
    preprocessing.StandardScaler() |
    cluster.DenStream(
        decaying_factor=0.01, beta=0.5, mu=2.5, epsilon=0.5, n_samples_init=10
    )
)
# Clustering is unsupervised, so only the features are streamed.
steps = evaluate.iter_progressive_val_score(
    model=denstream,
    dataset=stream.iter_pandas(X),
    metric=metrics.Silhouette(),
    step=200,
    delay=10
)
for step in steps:
    print(step)

# TODO: plot the evolution once the evaluation above is confirmed to work:
#success = pd.DataFrame.from_records(steps).set_index('Step')
#success.plot(grid=True, figsize=(20, 5))

### DenStream Clusters visualization

In [None]:
# Compare DenStream cluster assignments with the true fault labels.
# The data is loaded here so the cell does not depend on earlier state.
X, y = tsfel_features_subset(FEATURES_FILENAME, selection.FAULT_CLASSES, ['az'], 'fault')
denstream = (
    preprocessing.MinMaxScaler() |
    cluster.DenStream(
        decaying_factor=0.01, beta=0.5, mu=2.5, epsilon=0.5, n_samples_init=10
    )
)

predictions = []
# `xs`/`ys` keep the loop from overwriting the label Series `y`.
for xs, ys in stream.iter_pandas(X, y):
    # Predict before learning so the sample is unseen when it is assigned.
    y_predict = denstream.predict_one(xs)
    denstream.learn_one(xs)
    predictions.append({'y_true': ys, 'y_predict': y_predict})


pd.DataFrame.from_records(predictions)

### Half-space trees (Anomaly)

#### HST: Parameter = Trees

In [None]:
# Sweep the number of trees; height and window size stay fixed.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='anomaly',
)

results = []
for tree in (1, 10, 30, 50, 100, 500):
    detector = anomaly.HalfSpaceTrees(n_trees=tree, height=3, window_size=30, seed=10)
    hst = preprocessing.MinMaxScaler() | detector
    steps = evaluate.iter_progressive_val_score(
        model=hst,
        dataset=stream.iter_pandas(X, y),
        metric=metrics.ROCAUC(),
        step=100,
    )
    # One column named after the swept value, indexed by checkpoint
    evolution = pd.DataFrame.from_records([
        {'Step': step['Step'], tree: step['ROCAUC'].get()}
        for step in steps
    ]).set_index('Step')
    results.append(evolution)

# Join the per-setting columns on their shared checkpoints
results = functools.reduce(
    lambda left, right: left.merge(right, on=['Step'], how='inner'),
    results,
)
ax = results.plot(
    grid=True,
    figsize=(15, 4),
    marker='.',
    title='Half-space Tree AUC (height = 3, window = 30)',
    xlabel='Observation',
    ylabel='AUC',
)
ax.legend(title='Trees')
plt.show()
results.tail(1)

#### HST: Parameter = Window size

In [None]:
# Sweep the window size; number of trees and height stay fixed.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='anomaly',
)

results = []
for window in (1, 10, 30, 50, 100, 500):
    detector = anomaly.HalfSpaceTrees(n_trees=10, height=3, window_size=window, seed=10)
    hst = preprocessing.MinMaxScaler() | detector
    steps = evaluate.iter_progressive_val_score(
        model=hst,
        dataset=stream.iter_pandas(X, y),
        metric=metrics.ROCAUC(),
        step=100,
    )
    # One column named after the swept value, indexed by checkpoint
    evolution = pd.DataFrame.from_records([
        {'Step': step['Step'], window: step['ROCAUC'].get()}
        for step in steps
    ]).set_index('Step')
    results.append(evolution)

# Join the per-setting columns on their shared checkpoints
results = functools.reduce(
    lambda left, right: left.merge(right, on=['Step'], how='inner'),
    results,
)
ax = results.plot(
    grid=True,
    figsize=(15, 4),
    marker='.',
    title='Half-space Tree AUC (trees = 10, height = 3)',
    xlabel='Observation',
    ylabel='AUC',
)
ax.legend(title='Window size')
plt.show()
results.tail(1)

#### HST: Parameter = Height of tree

In [None]:
# Sweep the tree height; number of trees and window size stay fixed.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='anomaly',
)

results = []
for height in (1, 2, 3, 4, 5):
    detector = anomaly.HalfSpaceTrees(n_trees=10, height=height, window_size=30, seed=10)
    hst = preprocessing.MinMaxScaler() | detector
    steps = evaluate.iter_progressive_val_score(
        model=hst,
        dataset=stream.iter_pandas(X, y),
        metric=metrics.ROCAUC(),
        step=100,
    )
    # One column named after the swept value, indexed by checkpoint
    evolution = pd.DataFrame.from_records([
        {'Step': step['Step'], height: step['ROCAUC'].get()}
        for step in steps
    ]).set_index('Step')
    results.append(evolution)

# Join the per-setting columns on their shared checkpoints
results = functools.reduce(
    lambda left, right: left.merge(right, on=['Step'], how='inner'),
    results,
)
ax = results.plot(
    grid=True,
    figsize=(15, 4),
    marker='.',
    title='Half-space Tree AUC (trees = 10, window = 30)',
    xlabel='Observation',
    ylabel='AUC',
)
ax.legend(title='Tree height')
plt.show()
results.tail(1)

#### HST: Classification clusters

In [None]:
def hst_visualize_classes(X, y):
    """Score every sample with half-space trees and plot the anomaly map.

    Each sample is scored before the min-max scaled detector learns from
    it; the labels in `y` are not used for training, only forwarded with
    the scores to `models.project_anomaly_map_plot`.
    """
    detector = anomaly.HalfSpaceTrees(n_trees=10, height=3, window_size=30, seed=10)
    hst = preprocessing.MinMaxScaler() | detector

    def score_then_learn(sample):
        score = hst.score_one(sample)
        hst.learn_one(sample)
        return score

    y_scores = np.array([
        score_then_learn(xs)
        for xs, _ in stream.iter_pandas(X, y)
    ])
    models.project_anomaly_map_plot(X, y, y_scores)


# Anomaly map of half-space tree scores against the anomaly labels.
X, y = tsfel_features_subset(
    filename=FEATURES_FILENAME,
    classes=selection.FAULT_CLASSES,
    axis=['az'],
    label='anomaly',
)
hst_visualize_classes(X, y)