In [None]:
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif, f_classif

from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

from river import cluster       # cluster.DenStream
from river import anomaly       # anomaly.HalfSpaceTrees, LocalOutlierFactor
from river import preprocessing # preprocessing.StandardScaler
from river import neighbors     # neighbors.KNNClassifier, SWINN
from river import drift         # ADWIN
from river import stream
import warnings

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import numpy as np
import pandas as pd
from zipfile import ZipFile
import matplotlib.pyplot as plt
from IPython.display import Markdown
import seaborn as sb
import sys
sys.path.append('../../')
from feature import mafaulda
from feature import discovery as fdiscovery
from feature import selection as fselection

from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')

# from skmultiflow.anomaly_detection import HalfSpaceTrees
# from skmultiflow.lazy import KNNClassifier, KNNADWINClassifier
# When True, the extraction cells rebuild the metadata index and the feature CSVs from the archive
EXTRACT = False
# Zipped MAFAULDA dataset read by the extraction and spectrum-plotting code
MAFAULDA_PATH = '../../datasets/MAFAULDA.zip'
# Directory prefix for the metadata CSV and the extracted-feature CSV files
FEATURES_PATH =  '../../datasets/features_data/'

In [None]:
# Rebuild the per-file metadata index from the archive (runs only when EXTRACT is enabled)
if EXTRACT:
    metadata_csv = FEATURES_PATH + 'mafaulda_metadata.csv'
    file_index = mafaulda.dataset_index(MAFAULDA_PATH)
    file_index.to_csv(metadata_csv, index=False)

In [None]:
# Load the MAFAULDA file index; rows are keyed by the recording filename
meta = pd.read_csv(
    FEATURES_PATH + 'mafaulda_metadata.csv',
    index_col='filename',
)
# Print the column summary, then render the first rows as the cell output
meta.info()
meta.head(n=10)

In [None]:
# Keep four fault classes recorded within rpm +/- rpm_range (both bounds inclusive).
# `classes` maps the dataset's fault names to the short labels used from here on.
classes = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM',
}
rpm = 2900
rpm_range = 300

is_chosen_fault = meta['fault'].isin(classes)
is_near_rpm = meta['rpm'].between(rpm - rpm_range, rpm + rpm_range, inclusive='both')
files = meta[is_chosen_fault & is_near_rpm].copy()
files.head(10)

In [None]:
def fault_labeling(df, debug=True):
    """Attach short fault labels and per-fault severity rankings to ``df``.

    The frame is modified in place and also returned. Columns written:

    - ``fault``: categorical, renamed through the module-level ``classes`` mapping
    - ``seq``: running index of a row within its (fault, severity) group
    - ``severity``: first decimal number found in the original severity text
    - ``severity_class``: ordinal code of the severity within its fault
    - ``severity_level``: that code min-max scaled within the fault
      (0 - best, 1 - worst)

    The list of fault categories is always printed; a one-line summary per
    fault is printed only when ``debug`` is exactly ``True``.
    """
    # Short labels for the fault classes
    df['fault'] = df['fault'].astype('category')
    df['fault'] = df['fault'].cat.rename_categories(classes)
    print('Faults:', list(df['fault'].cat.categories), end='\n\n')

    # Running counter inside each (fault, severity) group, computed while
    # severity still holds its original text value
    df['seq'] = (
        df.groupby(by=['fault', 'severity'], observed=True)
             .cumcount().astype(int)
    )
    # Reduce the severity text to its first decimal number
    df['severity'] = df['severity'].str.extract(r'(\d+\.?\d*)').astype(float)

    # Rank severities separately for every fault
    for name, group in df.groupby(by=['fault'], observed=True):
        ordered = group.sort_values(by='severity')
        severity_cat = ordered['severity'].astype('category')

        # Ordinal codes as a column vector, then transformed to the range (0, 1)
        codes = severity_cat.cat.codes.values.reshape(-1, 1)
        scaled = MinMaxScaler().fit_transform(codes)

        df.loc[ordered.index, 'severity_class'] = codes
        df.loc[ordered.index, 'severity_level'] = scaled

        if debug is True:
            # Distinct severity values, their codes and the scaled levels
            sev_names = list(severity_cat.cat.categories)
            sev = list(severity_cat.cat.codes.astype('category').cat.categories)
            scale = [float(f'{p:.2f}') for p in pd.Series(scaled[:, 0]).astype('category').cat.categories]
            print(f'Fault: {name[0]}, Files: {len(group)}, Severity names: {sev_names}, Severity: {sev}, Severity Levels: {scale}')
    return df

In [None]:
# Label the selected files (fault_labeling writes the new columns into the frame it is given)
files = fault_labeling(files)
print()
# Column summary followed by a preview of the labelled rows
files.info()
files.head(10)

In [None]:
def plot_rpm_comparison(files, fault, dB):
    """Overlay the 'ax' frequency spectra of the slowest and fastest recordings.

    Parameters
    ----------
    files : pandas.DataFrame
        Candidate recordings indexed by filename; must contain an ``rpm`` column.
        Only rows at the minimum or maximum ``rpm`` are plotted.
    fault : str
        Label used as the figure title.
    dB : bool
        Forwarded unchanged to ``fdiscovery.plot_frequency_spectrum``.
    """
    table = files[
        (files['rpm'] == files['rpm'].min()) |
        (files['rpm'] == files['rpm'].max())
    ]
    fig, ax = plt.subplots(1, 1, figsize=(15, 3))
    ax.set_title(f'{fault}')
    # Open the archive only for the duration of the plotting so the file
    # handle is released on every call (and also when plotting raises)
    with ZipFile(MAFAULDA_PATH) as dataset:
        for filename, series in table.iterrows():
            fdiscovery.plot_frequency_spectrum(dataset, filename, 'ax', ax, dB=dB, label=f'{series["rpm"]:.2f}')

    ax.legend(loc="upper right")
    fig.tight_layout()
    plt.show()

In [None]:
# Spectra (non dB) at the lowest vs. highest rpm: level 0 for the normal state
# and level 1 (the top of the scaled severity range) for each fault
for fault, level in (('N', 0), ('I', 1), ('VM', 1), ('HM', 1)):
    same_fault = files['fault'] == fault
    same_level = files['severity_level'] == level
    sources = files[same_fault & same_level]
    plot_rpm_comparison(sources, fault, dB=False)

In [None]:
# Spectra (dB) at the lowest vs. highest rpm: level 0 for the normal state
# and level 1 (the top of the scaled severity range) for each fault
for fault, level in (('N', 0), ('I', 1), ('VM', 1), ('HM', 1)):
    same_fault = files['fault'] == fault
    same_level = files['severity_level'] == level
    sources = files[same_fault & same_level]
    plot_rpm_comparison(sources, fault, dB=True)

## Train-test split and feature selection
- Split severity into two levels (acceptable, not acceptable) for each fault - use the acceptable level as the normal baseline
- Stratified sampling (Train: 0.7, Test: 0.3)
- Choose a few faults/states: normal, imbalance, horizontal misalignment, vertical misalignment (exclude bearing faults)

In [None]:
# Show the labelled selection whose index provides the filenames for the feature export
files

### Export features for chosen files

In [None]:
# Names of the selected recordings and an open handle to the raw archive,
# both passed to the feature-extraction cells
filenames = list(files.index)
dataset = ZipFile(MAFAULDA_PATH)

In [None]:
# Time domain features (recomputed only when EXTRACT is enabled)
if EXTRACT:
    from IPython.display import display

    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.features_time_domain, parts=5)
    features.to_csv(FEATURES_PATH + fselection.TIME_FEATURES_PATH_NEW, index=False)
    # A bare expression nested inside `if` is not rendered by the notebook,
    # so the preview has to be displayed explicitly
    display(features.head(10))

In [None]:
# Frequency domain features (recomputed only when EXTRACT is enabled)
if EXTRACT:
    from IPython.display import display

    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.features_frequency_domain, parts=5)
    features.to_csv(FEATURES_PATH + fselection.FREQ_FEATURES_PATH_NEW, index=False)
    # A bare expression nested inside `if` is not rendered by the notebook,
    # so the preview has to be displayed explicitly
    display(features.head(10))

In [None]:
# TSFEL package features (recomputed only when EXTRACT is enabled)
if EXTRACT:
    from IPython.display import display

    features = mafaulda.import_files_split(dataset, filenames, fdiscovery.tsfel_features_import, parts=5)
    features.to_csv(FEATURES_PATH + 'tsfel_features.csv', index=False)
    # A bare expression nested inside `if` is not rendered by the notebook,
    # so the preview has to be displayed explicitly
    display(features.head(10))

### Import features

In [None]:
def highly_correlated_features(df, corr=0.95):
    """Return the columns of ``df`` that correlate strongly with an earlier column.

    A column is listed when its absolute correlation (as computed by
    ``DataFrame.corr``) with any column positioned before it exceeds ``corr``.
    The result is a list of column labels in their original order.
    """
    # Approach adapted from
    # https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
    abs_corr = df.corr().abs()
    # Keep only the cells strictly above the diagonal so every pair is
    # inspected once and a column is never compared with itself
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)
    # Per column: does any earlier column exceed the threshold?
    exceeds = (pairwise > corr).any(axis=0).to_numpy()
    return [name for name, flagged in zip(pairwise.columns, exceeds) if flagged]


def pipeline_v1(features, train, nfeat, random_state=None):
    """Split, de-correlate, select and scale features for fault classification.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table containing a ``fault`` column; columns named in
        ``fselection.METADATA_COLUMNS_ALL`` are excluded from the inputs.
    train : float or int
        Passed to ``train_test_split`` as ``train_size``.
    nfeat : int
        Number of features kept by ``SelectKBest``.
    random_state : int or None, optional
        Seed forwarded to the stratified split and to the mutual-information
        estimator. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The two feature frames hold the selected columns scaled with a
        ``MinMaxScaler`` fitted on the training split only.
    """
    from functools import partial

    # Split features dataset to training and testing sets
    feature_columns = features.columns[~features.columns.isin(fselection.METADATA_COLUMNS_ALL)]
    X = features[feature_columns]
    y = features['fault']

    # TODO: K-fold validation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train, stratify=y, random_state=random_state
    )

    # Drop collinear features. The decision is made on the training split and
    # applied to both splits; new frames are built instead of mutating the
    # split results in place.
    to_drop = highly_correlated_features(X_train)
    X_train = X_train.drop(columns=to_drop)
    X_test = X_test.drop(columns=to_drop)

    # Feature selection: only the fitted support mask is needed, so the
    # selector is fitted once and its transformed output is not computed
    score_func = partial(mutual_info_classif, random_state=random_state)
    selector = SelectKBest(score_func, k=nfeat)
    # selector = SelectPercentile(mutual_info_classif, percentile=20)
    selector.fit(X_train, y_train)
    idx = selector.get_support(indices=True)
    X_train = X_train.iloc[:, idx]
    X_test = X_test.iloc[:, idx]

    # Normalize features. Fresh frames are created rather than assigning into
    # the iloc slices above.
    # NOTE(review): the scaler is local and not returned, so callers cannot
    # apply its inverse transform.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X_test.columns
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Time-domain features of the ax, ay and az channels; labelling runs on a copy
# so the loaded frame keeps its original columns
features = fselection.load_td_feat(
    ['ax', 'ay', 'az'],
    path=FEATURES_PATH,
)
td_features = features.copy()
td_features = fault_labeling(td_features)
print()
td_features.info()
td_features.head()

In [None]:
# Use half of the time-domain rows for training and keep the 3 features ranked best by the selector
X_train, X_test, y_train, y_test = pipeline_v1(td_features, train=0.5, nfeat=3)
X_train.head(5)

In [None]:
# Histogram of every selected time-domain feature in the training split
X_train.hist(
    bins=100,
    layout=(1, 3),
    figsize=(15, 3),
    color='grey',
    ec='black',
)
plt.show()

In [None]:
# Cross-section plots for each pair of axes
def cross_cuts_3d(X_train, y_train):
    """Scatter the first three feature columns pairwise, one colour per fault label.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame; its first three columns are plotted as the pairs
        (0, 1), (0, 2) and (1, 2).
    y_train : pandas.Series
        Fault labels sharing the index of ``X_train``. Points labelled
        'VM', 'N', 'I' and 'HM' are drawn; other labels are not plotted.
    """
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    palette = (('VM', 'purple'), ('N', 'green'), ('I', 'blue'), ('HM', 'orange'))
    # Look up the row labels of each class once instead of twice per panel
    rows_by_label = {
        label: list(y_train[y_train == label].index)
        for label, _ in palette
    }

    for i, (a, b) in enumerate(((0, 1), (0, 2), (1, 2))):
        x_name = X_train.columns[a]
        y_name = X_train.columns[b]

        for label, color in palette:
            rows = rows_by_label[label]
            ax[i].scatter(
                X_train.loc[rows, x_name],
                X_train.loc[rows, y_name],
                s=1, color=color, label=label
            )

        ax[i].set_xlabel(x_name)
        ax[i].set_ylabel(y_name)
        ax[i].grid()
        ax[i].legend()

cross_cuts_3d(X_train, y_train)    # points are drawn in one colour per fault label
plt.show()


In [None]:
# All training points in the space of the first three selected features
# (a single colour; classes are not distinguished in this view)
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')
ax.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1], X_train.iloc[:, 2], s=1)
ax.set_box_aspect(aspect=None, zoom=0.85)
# Name each axis after the feature it shows
ax.set_xlabel(X_train.columns[0])
ax.set_ylabel(X_train.columns[1])
ax.set_zlabel(X_train.columns[2])
plt.show()

## K Nearest Neighbors
KNN (distance metric, k neighbours - elbow) 

In [None]:
# Fit a 5-neighbour kNN classifier on the training split and label the test split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_predict = knn.predict(X_test)

In [None]:
# Overall accuracy in percent, followed by the per-class report
accuracy = 100 * accuracy_score(y_test, y_predict)
print(f"Accuracy: {accuracy}%")
print(classification_report(y_test, y_predict))

In [None]:
# Confusion matrix (rows: true class, columns: predicted class)
# The label order is fixed explicitly so the tick labels match the matrix cells
fault_labels = sorted(set(y_test) | set(y_predict))
cm = confusion_matrix(y_test, y_predict, labels=fault_labels)
ax = sb.heatmap(
    cm, cbar=True, cmap="BuGn", annot=True, fmt='d',
    xticklabels=fault_labels, yticklabels=fault_labels
)
ax.set_xlabel('Predicted fault')
ax.set_ylabel('True fault')
plt.show()

### Frequency domain features - KNN

In [None]:
# Frequency-domain features of the ax, ay and az channels; labelling runs on a
# copy so the loaded frame keeps its original columns
features = fselection.load_fd_feat(
    ['ax', 'ay', 'az'],
    path=FEATURES_PATH,
)
fd_features = features.copy()
fd_features = fault_labeling(fd_features)
print()
fd_features.info()
fd_features.head()

In [None]:
# Use half of the frequency-domain rows for training and keep the 3 features ranked best by the selector
# (this rebinds the X_train / X_test / y_train / y_test names used by the time-domain cells)
X_train, X_test, y_train, y_test = pipeline_v1(fd_features, train=0.5, nfeat=3)
X_train.head(5)

In [None]:
# Histogram of every selected frequency-domain feature in the training split
X_train.hist(
    bins=100,
    layout=(1, 3),
    figsize=(15, 3),
    color='grey',
    ec='black',
)
plt.show()

In [None]:
# Pairwise cross-sections of the selected frequency-domain features
cross_cuts_3d(X_train, y_train)

In [None]:
# Fit a 7-neighbour kNN classifier (kd-tree search, L2 metric, uniform weights)
# on the training split and label the test split
knn = KNeighborsClassifier(
    n_neighbors=7,
    algorithm='kd_tree',
    metric='l2',
    weights='uniform',
).fit(X_train, y_train)
y_predict = knn.predict(X_test)

In [None]:
# Overall accuracy in percent, followed by the per-class report
accuracy = 100 * accuracy_score(y_test, y_predict)
print(f"Accuracy: {accuracy}%")
print(classification_report(y_test, y_predict))

In [None]:
# Confusion matrix (rows: true class, columns: predicted class)
# The label order is fixed explicitly so the tick labels match the matrix cells
fault_labels = sorted(set(y_test) | set(y_predict))
cm = confusion_matrix(y_test, y_predict, labels=fault_labels)
ax = sb.heatmap(
    cm, cbar=True, cmap="BuGn", annot=True, fmt='d',
    xticklabels=fault_labels, yticklabels=fault_labels
)
ax.set_xlabel('Predicted fault')
ax.set_ylabel('True fault')
plt.show()

## K Nearest Neighbors (online)

In [None]:
import functools
from river import utils
from river import evaluate
from river import metrics

# Minkowski distance with p=1, used as the neighbour-search distance function
l1_dist = functools.partial(utils.math.minkowski_distance, p=1)

# Online pipeline: standard scaling piped into a kNN classifier that searches
# neighbours with a seeded SWINN engine
model = preprocessing.StandardScaler() | neighbors.KNNClassifier(
    engine=neighbors.SWINN(dist_func=l1_dist, seed=42)
)
# learn_one, predict_one
# Labelled frequency-domain features (per-fault debug printout disabled)
dataset = fault_labeling(
    fselection.load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH).copy(),
    debug=False,
)
#evaluate.progressive_val_score(dataset, model, metrics.Accuracy())

## Local outlier factor (online, experimental)

## Isolation Forest

## Half-space Trees (online)
Half-Space Trees (window size, ensemble size)

## DBSCAN

## DenStream (online)
DenStream (μ, ε, beta, λ)