In [None]:
import numpy as np
import pandas as pd
from zipfile import ZipFile

# Feature selection
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.feature_selection import SelectPercentile, SelectKBest

# Preprocessing of selected features
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models - Nearest neighbors, Isolation forest, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

# Streaming algorithms
import functools
from river import cluster       
from river import anomaly
from river import preprocessing
from river import neighbors
from river import drift
from river import stream
from river import utils
from river import evaluate
from river import metrics

# from skmultiflow.anomaly_detection import HalfSpaceTrees
# from skmultiflow.lazy import KNNClassifier, KNNADWINClassifier

# Model evaluation
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator

# Plotting and table formatting
import matplotlib.pyplot as plt
from IPython.display import Markdown
from tabulate import tabulate
import seaborn as sb

# System modules
import os
import sys
sys.path.append('../../')

# Custom modules
from feature import mafaulda
from feature import discovery as fdiscovery
from feature import selection as fselection

import warnings
warnings.filterwarnings('ignore')

# Constants
# When True, the guarded cells rebuild the metadata index and the feature CSV files
EXTRACT = False
# Zip archive handed to ZipFile and mafaulda.dataset_index
MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'
# Directory with metadata/feature CSV files; some cells concatenate file names
# directly onto it, so the trailing slash is required
FEATURES_PATH =  '../../datasets/features_data/'

# Fault names as they appear in the metadata -> short labels used for plots and reports
FAULT_CLASSES = {'normal': 'N', 'imbalance': 'I', 'horizontal-misalignment': 'HM', 'vertical-misalignment': 'VM'}
# Recordings are kept when their 'rpm' value lies within RPM +/- RPM_RANGE (inclusive)
RPM = 2900
RPM_RANGE = 500

In [None]:
def fault_labeling(df, classes, anomaly_severity=0.7, debug=True):
    """Label faults, rank severities per fault and flag anomalies.

    The frame is modified in place and also returned. Columns written:
    'fault' (categorical, categories renamed through ``classes``), 'seq',
    'severity' (float parsed from the original strings), 'severity_class',
    'severity_level' and 'anomaly'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'fault' column and a string-valued 'severity' column.
    classes : dict
        Mapping passed to ``rename_categories`` for the 'fault' column.
    anomaly_severity : float
        Rows whose 'severity_level' is greater than or equal to this value
        are flagged as anomalies.
    debug : bool
        When exactly ``True``, print the fault classes and per-fault scales.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra columns.
    """
    # Faults: categorical dtype with the short class names
    df['fault'] = df['fault'].astype('category').cat.rename_categories(classes)
    if debug is True:
        print('Faults:', list(df['fault'].cat.categories), end='\n\n')

    # Running number of each row within its (fault, severity) group;
    # computed before 'severity' is converted to a number
    per_severity = df.groupby(by=['fault', 'severity'], observed=True)
    df['seq'] = per_severity.cumcount().astype(int)

    # Keep only the decimal number found in the severity text
    df['severity'] = df['severity'].str.extract(r'(\d+\.?\d*)').astype(float)

    # Rank severities inside every fault (0 - best, 1 - worst)
    for name, group in df.groupby(by=['fault'], observed=True):
        ordered = group.sort_values(by='severity')
        severity_cat = ordered['severity'].astype('category')

        # Ordinal code of each distinct severity, as a column vector for the scaler
        codes = severity_cat.cat.codes.values.reshape(-1, 1)
        # Transform codes to the range (0, 1)
        levels = MinMaxScaler().fit_transform(codes)

        df.loc[ordered.index, 'severity_class'] = codes
        df.loc[ordered.index, 'severity_level'] = levels

        if debug is True:
            # Print severity scales
            sev_names = list(severity_cat.cat.categories)
            sev = list(severity_cat.cat.codes.astype('category').cat.categories)
            distinct_levels = pd.Series(levels[:, 0]).astype('category').cat.categories
            scale = [float(f'{p:.2f}') for p in distinct_levels]
            print(f'Fault: {name[0]}, Files: {len(ordered)}, Severity names: {sev_names}, Severity: {sev}, Severity Levels: {scale}')

    is_anomaly = df['severity_level'] >= anomaly_severity
    df['anomaly'] = is_anomaly.astype('category')
    return df


def highly_correlated_features(df, corr=0.95):
    """List columns whose absolute correlation with an earlier column exceeds ``corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``DataFrame.corr``.
    corr : float
        Threshold on the absolute correlation coefficient (strictly greater than).

    Returns
    -------
    list
        Column names, in column order, that can be dropped as redundant.
    """
    # https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
    abs_corr = df.corr().abs()
    # True strictly above the diagonal, so every pair is inspected once and
    # only the later column of a correlated pair is reported
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)
    # Masked cells are NaN and never compare greater than the threshold
    flagged = (pairwise > corr).any(axis=0)
    return [name for name, is_flagged in flagged.items() if is_flagged]


def pipeline_v1(features, train, nfeat, multiclass=True, random_state=None):
    """Split, de-correlate, select and scale features.

    Steps: stratified train/test split, removal of columns flagged by
    ``highly_correlated_features`` on the training set, selection of the
    ``nfeat`` best columns by mutual information, and min-max scaling fitted
    on the training set only.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table that also holds the metadata columns listed in
        ``fselection.METADATA_COLUMNS_ALL`` plus 'fault' and 'anomaly'.
    train : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    nfeat : int
        Number of features kept by ``SelectKBest``.
    multiclass : bool
        When exactly ``True`` the target is 'fault', otherwise 'anomaly'.
    random_state : int or None
        Seed for the split and for the mutual-information estimate. ``None``
        (default) keeps the previous non-deterministic behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X frames keep their
        original row index and hold the selected, scaled columns.
    """
    # Everything that is not metadata is treated as a feature
    feature_columns = features.columns[~features.columns.isin(fselection.METADATA_COLUMNS_ALL)]
    X = features[feature_columns]
    y = features['fault'] if multiclass is True else features['anomaly']

    # TODO: K-fold validation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train, stratify=y, random_state=random_state
    )

    # Drop colinear features; decided on the training set, applied to both.
    # New frames are built instead of mutating the split results in place.
    to_drop = highly_correlated_features(X_train)
    X_train = X_train.drop(columns=to_drop)
    X_test = X_test.drop(columns=to_drop)

    # Feature selection: only the support mask is needed, so fit without
    # computing transformed arrays that would be thrown away
    score_func = functools.partial(mutual_info_classif, random_state=random_state)
    selector = SelectKBest(score_func, k=nfeat)
    # selector = SelectPercentile(mutual_info_classif, percentile=20)
    selector.fit(X_train, y_train)
    idx = selector.get_support(indices=True)
    X_train = X_train.iloc[:, idx]
    X_test = X_test.iloc[:, idx]

    # Normalize features (See inverse transform); the scaler is fitted on the
    # training set only and results are wrapped back into labelled frames
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    return X_train, X_test, y_train, y_test


def cross_cuts_3d(X_train, y_train):
    """Scatter the first three feature columns pairwise, coloured by fault label.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose first three columns are plotted.
    y_train : pandas.Series
        Labels ('VM', 'N', 'I', 'HM') sharing the index of ``X_train``.
    """
    palette = (('VM', 'purple'), ('N', 'green'), ('I', 'blue'), ('HM', 'orange'))
    column_pairs = ((0, 1), (0, 2), (1, 2))
    fig, panels = plt.subplots(1, 3, figsize=(15, 3))

    for panel, (a, b) in zip(panels, column_pairs):
        col_x = X_train.columns[a]
        col_y = X_train.columns[b]

        for label, color in palette:
            # Row labels of the samples that belong to this fault class
            rows = list(y_train[y_train == label].index)
            panel.scatter(
                X_train.loc[rows, col_x],
                X_train.loc[rows, col_y],
                s=1, color=color, label=label
            )

        panel.set_xlabel(col_x)
        panel.set_ylabel(col_y)
        panel.grid()
        panel.legend()


def cross_cuts_3d_anomalies(dataframe, anomalies):
    """Scatter the first three columns pairwise, coloured by anomaly flag.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose first three columns are plotted; it is copied, not modified.
    anomalies : array-like or pandas.Series
        Boolean flag per row; a Series is aligned to ``dataframe`` by index.
    """
    df = dataframe.copy()
    df['anomaly'] = anomalies
    df['anomaly'] = df['anomaly'].astype('category')

    fig, ax = plt.subplots(1, 3, figsize=(15, 3))

    for i, (a, b) in enumerate(((0, 1), (0, 2), (1, 2))):
        # 'anomaly' was appended last, so positions 0-2 are still feature columns
        col_a, col_b = df.columns[a], df.columns[b]
        ax[i].grid()
        # Grey background of every point in the frame that was passed in
        # (previously taken from the notebook-global X_train)
        ax[i].scatter(df[col_a], df[col_b], color='grey', s=1)

        for flag, color in ((False, 'green'), (True, 'red')):
            subset = df[df['anomaly'] == flag]
            ax[i].scatter(subset[col_a], subset[col_b], color=color, s=1)

        ax[i].set_xlabel(col_a)
        ax[i].set_ylabel(col_b)

## Extract metadata about files from whole dataset

In [None]:
# Rebuild the file index from the archive and cache it as CSV (only when EXTRACT is set)
if EXTRACT:
    file_index = mafaulda.dataset_index(MAFAULDA_PATH)
    file_index.to_csv(os.path.join(FEATURES_PATH, 'mafaulda_metadata.csv'), index=False)

## Import metadata about Mafaulda

In [None]:
# Load the cached file index; rows are keyed by the 'filename' column
meta = pd.read_csv(os.path.join(FEATURES_PATH, 'mafaulda_metadata.csv'), index_col='filename')
meta.info()
meta.head(10)

## File names selection
Choose 4 types of faults within limited rpm range

In [None]:
# Keep recordings whose fault is a key of FAULT_CLASSES and whose rpm is
# within RPM +/- RPM_RANGE (bounds included); copy so later edits leave `meta` untouched
files = meta[
    (meta['fault'].isin(FAULT_CLASSES)) &
    (meta['rpm'].between(RPM - RPM_RANGE, RPM + RPM_RANGE, inclusive='both'))
].copy()
files.head(10)

### Frequency spectrum comparison of faults in low and high RPM

In [None]:
def plot_rpm_comparison(files, fault, dB):
    """Plot the frequency spectra of the slowest and fastest recordings in ``files``.

    Parameters
    ----------
    files : pandas.DataFrame
        Recordings indexed by file name, with an 'rpm' column.
    fault : str
        Fault label, used as the figure title.
    dB : bool
        Forwarded to ``fdiscovery.plot_frequency_spectrum`` to choose the scale.
    """
    # Rows at the minimum or maximum rpm (ties are all kept)
    table = files[
        (files['rpm'] == files['rpm'].min()) |
        (files['rpm'] == files['rpm'].max())
    ]
    fig, ax = plt.subplots(1, 1, figsize=(15, 3), sharey=True)
    ax.set_title(f'{fault}')

    # The context manager closes the archive handle once all spectra are drawn;
    # previously a new handle was leaked on every call
    with ZipFile(MAFAULDA_PATH) as dataset:
        for filename, series in table.iterrows():
            fdiscovery.plot_frequency_spectrum(dataset, filename, 'ax', ax, dB=dB, label=f'{series["rpm"]:.2f}')

    ax.legend(loc="upper right")
    fig.tight_layout()
    plt.show()

In [None]:
# Label the selected files; severity levels >= 0.6 count as anomalies.
# NOTE(review): this rebinds `files`, and fault_labeling applies `.str` to the
# 'severity' column before turning it into floats, so re-running this cell
# without re-running the selection cell above will presumably fail.
files = fault_labeling(files.copy(), FAULT_CLASSES, 0.6, debug=True)
files.head(5)

#### Scale in m/s^2: frequency spectrum between lowest rpm and highest RPM

In [None]:
# One figure per fault: 'N' at severity level 0, the other faults at level 1
# (the highest level produced by fault_labeling); linear scale
for fault, level in [('N', 0), ('I', 1), ('VM', 1), ('HM', 1)]:
    sources = files[(files['fault'] == fault) &  (files['severity_level'] == level)]
    plot_rpm_comparison(sources, fault, dB=False)

#### Scale in dB (baseline is 1 um/s^2): frequency spectrum between lowest rpm and highest RPM

In [None]:
# Same selection as the previous section, plotted with dB=True
for fault, level in [('N', 0), ('I', 1), ('VM', 1), ('HM', 1)]:
    sources = files[(files['fault'] == fault) &  (files['severity_level'] == level)]
    plot_rpm_comparison(sources, fault, dB=True)

---
## 1. Feature extraction

In [None]:
# Show the labelled file table that feature extraction works from
files

### Export features for chosen files

In [None]:
# Archive handle and file names consumed by the EXTRACT-guarded cells below.
# NOTE(review): the handle is opened even when EXTRACT is False and is never
# closed; `dataset` is later rebound to a DataFrame in the model sections.
dataset = ZipFile(MAFAULDA_PATH)
filenames = list(files.index)

#### 1A. Time domain features

In [None]:
if EXTRACT:
    from IPython.display import display

    # Compute time-domain features for every selected file and cache them as CSV
    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.features_time_domain, parts=5)
    features.to_csv(FEATURES_PATH + fselection.TIME_FEATURES_PATH_NEW, index=False)
    # A bare expression inside an `if` block is not rendered by Jupyter, so display it explicitly
    display(features.head(10))

#### 1B. Frequency domain features

In [None]:
if EXTRACT:
    from IPython.display import display

    # Compute frequency-domain features for every selected file and cache them as CSV
    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.features_frequency_domain, parts=5)
    features.to_csv(FEATURES_PATH + fselection.FREQ_FEATURES_PATH_NEW, index=False)
    # A bare expression inside an `if` block is not rendered by Jupyter, so display it explicitly
    display(features.head(10))

#### 1C. TSFEL package features

In [None]:
if EXTRACT:
    from IPython.display import display

    # Compute TSFEL features for every selected file and cache them as CSV
    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.tsfel_features_import, parts=5)
    features.to_csv(FEATURES_PATH + 'tsfel_features.csv', index=False)
    # A bare expression inside an `if` block is not rendered by Jupyter, so display it explicitly
    display(features.head(10))

#### 1D. Wavelet packet features (Meyer wavelet)

In [None]:
if EXTRACT:
    from IPython.display import display

    # Compute wavelet-domain features for every selected file and cache them as CSV
    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.features_wavelet_domain, parts=5)
    # Own file name: writing to 'tsfel_features.csv' overwrote the TSFEL export from section 1C
    features.to_csv(FEATURES_PATH + 'wavelet_features.csv', index=False)
    # A bare expression inside an `if` block is not rendered by Jupyter, so display it explicitly
    display(features.head(10))

## Exploratory data analysis of features
#### TD: Features from one sensor position: (ax, ay, az)

In [None]:
# Load time-domain features for the three axes, label them and run the
# split / selection / scaling pipeline keeping 3 features
features = fselection.load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
td_features = fault_labeling(features.copy(), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(td_features, train=0.6, nfeat=3)

# Blank line between the debug output of fault_labeling and the frame summary
print()
X_train.info()
X_train.head(5)

#### TD: Statistical distribution of features in training set

In [None]:
# One histogram per selected feature, side by side
X_train.hist(bins=100, figsize=(15, 3), layout=(1, 3), color='grey', ec='black')
plt.show()

#### TD: Cross-sectional plots for pairs of axes

In [None]:
# Pairwise scatter plots of the three selected features, coloured by fault class
cross_cuts_3d(X_train, y_train)
plt.show()

#### TD: 3D distribution of data points

In [None]:
# 3D scatter of the first three columns of the training set (no class colouring)
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')
ax.scatter(
    X_train.loc[:,X_train.columns[0]],
    X_train.loc[:,X_train.columns[1]],
    X_train.loc[:,X_train.columns[2]],
    s=1
)
# Zoom out slightly; presumably so the z-axis label is not clipped - TODO confirm
ax.set_box_aspect(aspect=None, zoom=0.85)
ax.set_xlabel(X_train.columns[0])
ax.set_ylabel(X_train.columns[1])
ax.set_zlabel(X_train.columns[2])
plt.show()

#### TD: Cross sectional plots of anomalies

In [None]:
X_train, X_test, y_train, y_test = pipeline_v1(td_features, train=0.6, nfeat=3, multiclass=True)
# X_train.index holds row labels, so select by label (.loc); .iloc would treat
# them as positions and only agrees when the index is a default RangeIndex
cross_cuts_3d_anomalies(X_train, td_features['anomaly'].loc[X_train.index])

percentage = len(td_features[td_features['anomaly'] == True]) / len(td_features)
print(f'Percentage of anomalies: {percentage * 100:.2f} %')  # TODO: too high anomaly rate (adjust stratify)

#### FD: Features from one sensor position: (ax, ay, az)

In [None]:
# Load frequency-domain features (this FD section previously loaded the
# time-domain table again), label them and run the pipeline keeping 3 features
features = fselection.load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
fd_features = fault_labeling(features.copy(), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(fd_features, train=0.6, nfeat=3)

# Blank line between the debug output of fault_labeling and the frame summary
print()
X_train.info()
X_train.head(5)

#### FD: Statistical distribution of features in training set

In [None]:
# One histogram per selected feature, side by side
X_train.hist(bins=100, figsize=(15, 3), layout=(1, 3), color='grey', ec='black')
plt.show()

#### FD: Cross-sectional plots for pairs of axes

In [None]:
# Pairwise scatter plots of the three selected features, coloured by fault class
cross_cuts_3d(X_train, y_train)
plt.show()

#### FD: Cross sectional plots of anomalies

In [None]:
X_train, X_test, y_train, y_test = pipeline_v1(fd_features, train=0.6, nfeat=3, multiclass=True)
# X_train.index holds row labels, so select by label (.loc); .iloc would treat
# them as positions and only agrees when the index is a default RangeIndex
cross_cuts_3d_anomalies(X_train, fd_features['anomaly'].loc[X_train.index])

---
## 2. K Nearest Neighbors
Parameters:
- Distance metric
- k neighbours (odd numbers because of majority voting) - elbow curve

#### 2A-TD. Time domain features import and transformations

In [None]:
# kNN experiment settings (the same names are assigned again in the FD section)
TRAINING_SET_RATIO = 0.6
N_FEATURES = 3
N_NEIGHBOURS = 5
# Also read as a global by find_best_k_parameter_knn
DIST_METRIC = 'euclidean'

# Time-domain features -> labelled frame -> train/test split with selected, scaled features
features = fselection.load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
td_features = fault_labeling(features.copy(), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(td_features, train=TRAINING_SET_RATIO, nfeat=N_FEATURES)

#### 2B-TD. Classification with kNN

In [None]:
# Fit kNN on the training split and predict fault classes for the test split
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=DIST_METRIC, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)

#### 2C-TD. Get the classification report

In [None]:
# Overall accuracy in percent, followed by the per-class report from scikit-learn
accuracy = accuracy_score(y_test, y_predict) * 100
print(f'Accuracy: {accuracy:.4f} %')
print(classification_report(y_test, y_predict))

#### 2D-TD. Confusion matrix

In [None]:
# Fix the class order explicitly so the heatmap ticks show fault labels
# instead of anonymous positions 0..n-1
labels = knn.classes_
cm = confusion_matrix(y_test, y_predict, labels=labels)
ax = sb.heatmap(cm, cbar=True, cmap='BuGn', annot=True, fmt='d',
                xticklabels=labels, yticklabels=labels)
# confusion_matrix puts true classes on rows and predictions on columns
ax.set_xlabel('Predicted fault')
ax.set_ylabel('True fault')
plt.show()

#### TD: Find best k neighbors - elbow analysis

In [None]:
def find_best_k_parameter_knn(X_train, y_train, X_test, y_test, k_values=None, metric=None):
    """Plot the kNN test error rate for a range of ``k`` (elbow analysis).

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit each classifier.
    X_test, y_test
        Held-out features and labels on which the error rate is measured.
    k_values : iterable of int, optional
        Neighbour counts to try. Defaults to the odd numbers 3, 5, ..., 39.
    metric : str, optional
        Distance metric. Defaults to the notebook-level ``DIST_METRIC``.

    NOTE(review): k is compared on the same test split that is used for the
    final report, which presumably makes the reported accuracy optimistic.
    """
    if k_values is None:
        k_values = range(3, 40, 2)
    k_values = list(k_values)
    if metric is None:
        metric = DIST_METRIC

    errors = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree', metric=metric)
        knn.fit(X_train, y_train)
        y_predict = knn.predict(X_test)
        # Fraction of misclassified test samples
        errors.append(np.mean(y_predict != y_test))

    # x values come from the same list the loop iterated over, so the two cannot drift apart
    plt.plot(k_values, errors, color='darkblue', marker='o', markerfacecolor='darkgreen', markersize=5)
    plt.xlabel('K neighbors')
    plt.ylabel('Error rate')
    plt.grid(True)

find_best_k_parameter_knn(X_train, y_train, X_test, y_test)

#### 2A-FD. Frequency domain features import and transformations

In [None]:
# kNN experiment settings (same values as in the TD section)
TRAINING_SET_RATIO = 0.6
N_FEATURES = 3
N_NEIGHBOURS = 5
# Also read as a global by find_best_k_parameter_knn
DIST_METRIC = 'euclidean'

# Frequency-domain features -> labelled frame -> train/test split with selected, scaled features
features = fselection.load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
fd_features = fault_labeling(features.copy(), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(fd_features, train=TRAINING_SET_RATIO, nfeat=N_FEATURES)

#### 2B-FD. Classification with kNN

In [None]:
# Fit kNN on the training split and predict fault classes for the test split
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBOURS, metric=DIST_METRIC, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)

#### 2C-FD. Get the classification report

In [None]:
# Overall accuracy in percent, followed by the per-class report from scikit-learn
accuracy = accuracy_score(y_test, y_predict) * 100
print(f'Accuracy: {accuracy:.4f} %')
print(classification_report(y_test, y_predict))

#### 2D-FD. Confusion matrix

In [None]:
# Fix the class order explicitly so the heatmap ticks show fault labels
# instead of anonymous positions 0..n-1
labels = knn.classes_
cm = confusion_matrix(y_test, y_predict, labels=labels)
ax = sb.heatmap(cm, cbar=True, cmap='BuGn', annot=True, fmt='d',
                xticklabels=labels, yticklabels=labels)
# confusion_matrix puts true classes on rows and predictions on columns
ax.set_xlabel('Predicted fault')
ax.set_ylabel('True fault')
plt.show()

#### FD: Find best k neighbors - elbow analysis

In [None]:
# Error rate versus k for the frequency-domain split
find_best_k_parameter_knn(X_train, y_train, X_test, y_test)

---
## Isolation Forest

In [None]:
def anomalies_cluster_plot(dataframe, outliers=None, background=None):
    """Scatter the first three columns pairwise, coloured by inlier/outlier flag.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Samples that were scored; it is copied, not modified.
    outliers : array-like, optional
        One value per row of ``dataframe``: +1 for inliers, -1 for outliers.
        Defaults to the notebook-global ``test_outliers`` so existing calls
        keep working.
    background : pandas.DataFrame, optional
        Points drawn in grey behind the scored samples. Defaults to the
        notebook-global ``X_train``.
    """
    # Fall back to the globals the original implementation read implicitly
    if outliers is None:
        outliers = test_outliers
    if background is None:
        background = X_train

    df = dataframe.copy()
    df['outlier'] = outliers
    df['outlier'] = df['outlier'].astype('category')

    fig, ax = plt.subplots(1, 3, figsize=(15, 3))

    for i, (a, b) in enumerate(((0, 1), (0, 2), (1, 2))):
        ax[i].grid()
        ax[i].scatter(
            background[background.columns[a]],
            background[background.columns[b]],
            color='grey', s=1
        )

        # Inliers first, outliers on top so they stay visible
        for flag, color in ((+1, 'green'), (-1, 'red')):
            subset = df[df['outlier'] == flag]
            ax[i].scatter(subset[df.columns[a]], subset[df.columns[b]], color=color, s=1)

In [None]:
# Frequency-domain features -> labelled frame -> train/test split (3 features).
# `dataset` is rebound here from the ZipFile handle to a DataFrame.
dataset = fselection.load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
dataset = fault_labeling(dataset.copy(), FAULT_CLASSES, debug=False)
X_train, X_test, y_train, y_test = pipeline_v1(dataset, train=0.6, nfeat=3)

# Unsupervised fit: labels are not passed. No random_state is set, so results vary between runs.
forest = IsolationForest(n_estimators=10)
forest.fit(X_train)
test_outliers = forest.predict(X_test)    # Per observation: +1 for an inlier, -1 for an outlier
# anomalies_cluster_plot reads `test_outliers` and `X_train` as globals
anomalies_cluster_plot(X_test)
plt.show()

In [None]:
# TODO: plot the true anomalies and compare them with the predicted outliers

# TBD

---
## K Nearest Neighbors Classifier (streaming algorithm)

In [None]:
# Minkowski distance with p=1, used as the distance function of the SWINN engine
l1_dist = functools.partial(utils.math.minkowski_distance, p=1)

# river pipeline: standard scaling followed by a kNN classifier.
# NOTE(review): `model` is not trained or evaluated anywhere in the visible
# cells; the evaluation call below is commented out.
model = (
    preprocessing.StandardScaler() |
    neighbors.KNNClassifier(
        engine=neighbors.SWINN(
            dist_func=l1_dist,
            seed=42
        )
    )
)
# learn_one, predict_one
dataset = fselection.load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
dataset = fault_labeling(dataset.copy(), FAULT_CLASSES, debug=False)
#evaluate.progressive_val_score(dataset, model, metrics.Accuracy())
dataset.head()

## Local outlier factor (streaming algorithm)

## DBSCAN
https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan

In [None]:
# Frequency-domain features -> labelled frame -> train/test split (3 features) for DBSCAN
dataset = fselection.load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH)
dataset = fault_labeling(dataset.copy(), FAULT_CLASSES, debug=False)
X_train, X_test, y_train, y_test = pipeline_v1(dataset, train=0.6, nfeat=3)

In [None]:
# Find distances between points
cnt_neighbors = 6
neighbors = NearestNeighbors(n_neighbors=cnt_neighbors)
neighbors.fit(X_train)
distances, indices = neighbors.kneighbors(X_train)

# PLot distances
distance_desc = sorted(distances[:, 1], reverse=True)
plt.plot(list(range(1,len(distance_desc)+1)), distance_desc)
plt.xlabel('Number of points')
plt.ylabel('Distance')
plt.grid()
plt.show()

In [None]:
# Locate the knee of the sorted neighbour-distance curve computed in the
# previous cell (`distance_desc`), following the eps-selection routine linked above
kneedle = KneeLocator(range(1, len(distance_desc) + 1), distance_desc,
                      S=1.0, curve='convex', direction='decreasing')
kneedle.plot_knee_normalized()
print(kneedle.elbow, kneedle.knee_y)

In [None]:
def cross_cuts_3d_cluster(X_train, y_train, cluster):
    """Scatter the first three feature columns pairwise, coloured by cluster id.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose first three columns are plotted; it is copied, not modified.
    y_train
        Not used by the plot; kept so the call signature stays the same.
    cluster : array-like
        Cluster label per row of ``X_train``.
    """
    frame = X_train.copy()
    frame['cluster'] = cluster
    frame['cluster'] = frame['cluster'].astype('category')

    # One colour per distinct cluster label
    cluster_ids = frame['cluster'].cat.categories
    palette = sb.color_palette("hls", len(cluster_ids))
    fig, panels = plt.subplots(1, 3, figsize=(15, 3))

    for panel, (a, b) in zip(panels, ((0, 1), (0, 2), (1, 2))):
        # 'cluster' was appended last, so positions 0-2 are still feature columns
        col_x = frame.columns[a]
        col_y = frame.columns[b]

        for cluster_id, color in zip(cluster_ids, palette):
            rows = list(frame[frame['cluster'] == cluster_id].index)
            panel.scatter(
                frame.loc[rows, col_x],
                frame.loc[rows, col_y],
                s=1, color=color, label=cluster_id
            )

        panel.set_xlabel(col_x)
        panel.set_ylabel(col_y)
        panel.grid()
        panel.legend()

In [None]:
# Range of values is MinMaxScaled in range (0, 1) - eps must be smaller than 1
# Noisy samples are given the label -1.
clustering = DBSCAN(eps=0.1, min_samples=5, metric='l2')
clustering.fit(X_train)
y_train_labels = clustering.labels_
y_predict = clustering.fit_predict(X_test)

cross_cuts_3d_cluster(X_train, y_train, y_train_labels)
plt.show()

## Half-space Trees (online)
Half-Space Trees (window size, ensemble size)

In [None]:
# Score a single feature: the first selected column of the training set
X = X_train.iloc[:,0].to_numpy()

hst = anomaly.HalfSpaceTrees(n_trees=5, height=3, window_size=3, seed=42)

# learn_one updates the model in place. Its return value is deliberately
# ignored: recent river releases return None, so `hst = hst.learn_one(...)`
# would replace the model with None.
# Warm-up on the first three samples (one full window)
for x in X[:3]:
    hst.learn_one({'x': x})

for x in X:
    # Named `sample` so the notebook-level `features` DataFrame is not overwritten
    sample = {'x': x}
    hst.learn_one(sample)
    print(f'Anomaly score for x={x:.3f}: {hst.score_one(sample):.3f}')

## DenStream (online)
DenStream (μ, ε, beta, λ)

In [None]:
denstream = cluster.DenStream(
    decaying_factor=0.01,
    beta=0.5,
    mu=2.5,
    epsilon=0.5,
    n_samples_init=10
)
# All selected features; iter_array yields one dict per row keyed by column position
X = X_train.to_numpy()

# learn_one updates the model in place. Its return value is deliberately
# ignored: recent river releases return None, so `denstream = denstream.learn_one(x)`
# would replace the model with None.
for x, _ in stream.iter_array(X):
    denstream.learn_one(x)

# NOTE(review): query point copied from the river example; it has two keys
# while the model was trained on every column of X_train - confirm this is intended.
denstream.predict_one({0: -1, 1: -2})