In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.model_selection import KFold
from sklearn import metrics

import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append('../')
from vibrodiagnostics.selection import load_td_feat, load_fd_feat
from vibrodiagnostics.models import (
    fault_labeling, pipeline_v1, pipeline_v1_core, filter_out_metadata_columns
)
from tqdm.notebook import tqdm


# Location of the feature files; passed as `path=` to load_td_feat / load_fd_feat.
FEATURES_PATH = '../../datasets/features_data/'

# Fault name -> short label; handed to fault_labeling() together with the loaded features.
FAULT_CLASSES = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM',
}

In [None]:
def anomalies_cluster_plot(dataframe, outliers):
    """Draw pairwise scatter plots of the first three columns, coloured by outlier flag.

    Three panels are created, one for each of the column pairs (0, 1), (0, 2)
    and (1, 2). In every panel all observations are drawn in grey first; rows
    whose flag equals ``False`` are then over-plotted in green and rows whose
    flag equals ``True`` in red.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Observations to plot. It is copied, so the caller's frame is left
        untouched. At least three columns are needed, because the columns
        are addressed by position 0, 1 and 2.
    outliers : array-like with a ``copy()`` method
        One flag per row of ``dataframe`` (``True`` = outlier,
        ``False`` = inlier).

    Returns
    -------
    None
        The figure is created on the current pyplot state; the caller is
        expected to add a title and call ``plt.show()``.
    """
    df = dataframe.copy()
    df['outlier'] = outliers.copy()
    df['outlier'] = df['outlier'].astype('category')

    # Build the row masks once, as plain positional boolean arrays. This avoids
    # rebinding the `outliers` argument inside the loop and avoids label-based
    # re-selection, which would repeat rows if the index had duplicate labels.
    is_inlier = (df['outlier'] == False).to_numpy()
    is_outlier = (df['outlier'] == True).to_numpy()

    fig, ax = plt.subplots(1, 3, figsize=(15, 3))

    for panel, (a, b) in zip(ax, ((0, 1), (0, 2), (1, 2))):
        panel.grid()
        panel.set_xlabel(df.columns[a])
        panel.set_ylabel(df.columns[b])

        # Grey background layer containing every observation.
        panel.scatter(df.iloc[:, a], df.iloc[:, b], color='grey', s=1)

        # Inliers first, outliers last, so the red points stay on top.
        for mask, colour in ((is_inlier, 'green'), (is_outlier, 'red')):
            panel.scatter(df.iloc[mask, a], df.iloc[mask, b], color=colour, s=1)

Time domain features

In [None]:
dataset = fault_labeling(load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(
    dataset, train=0.7, nfeat=3, multiclass=False, anomaly_ratio=0.1, func_select=mutual_info_classif
)

# NOTE: no random_state is passed, so the fitted forest can differ between runs.
forest = IsolationForest(n_estimators=15)
forest.fit(X_train)

# predict() labels each observation -1 (outlier) or +1 (inlier). Both splits are
# mapped to the same True/False encoding (True = outlier); previously only the
# training predictions were converted, so the test accuracy compared raw -1/+1
# values against the labels.
OUTLIER_FLAGS = {-1: True, 1: False}
train_outliers = pd.Categorical(forest.predict(X_train)).rename_categories(OUTLIER_FLAGS)
test_outliers = pd.Categorical(forest.predict(X_test)).rename_categories(OUTLIER_FLAGS)

print('Most important features:', list(X_train.columns))

In [None]:
# Colour the training observations by their labels (y_train).
anomalies_cluster_plot(X_train, y_train)
plt.gcf().suptitle('Ground truth')
plt.show()

In [None]:
# Colour the training observations by the isolation forest's predictions.
anomalies_cluster_plot(X_train, train_outliers)
plt.gcf().suptitle('Anomalies')
plt.show()

In [None]:
# Share of observations whose predicted flag matches the label, per split.
train_accuracy = metrics.accuracy_score(y_train, train_outliers)
print(f'Train accuracy: {train_accuracy:.4f}')
test_accuracy = metrics.accuracy_score(y_test, test_outliers)
print(f'Test accuracy: {test_accuracy:.4f}')

Frequency domain features

In [None]:
dataset = fault_labeling(load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH), FAULT_CLASSES)
X_train, X_test, y_train, y_test = pipeline_v1(
    dataset, train=0.7, nfeat=3, multiclass=False, anomaly_ratio=0.1, func_select=mutual_info_classif
)

# NOTE: no random_state is passed, so the fitted forest can differ between runs.
forest = IsolationForest(n_estimators=15)
forest.fit(X_train)

# predict() labels each observation -1 (outlier) or +1 (inlier). Both splits are
# mapped to the same True/False encoding (True = outlier); previously only the
# training predictions were converted, so the test accuracy compared raw -1/+1
# values against the labels.
OUTLIER_FLAGS = {-1: True, 1: False}
train_outliers = pd.Categorical(forest.predict(X_train)).rename_categories(OUTLIER_FLAGS)
test_outliers = pd.Categorical(forest.predict(X_test)).rename_categories(OUTLIER_FLAGS)

print('Most important features:', list(X_train.columns))

In [None]:
# Colour the training observations by their labels (y_train).
anomalies_cluster_plot(X_train, y_train)
plt.gcf().suptitle('Ground truth')
plt.show()

In [None]:
# Colour the training observations by the isolation forest's predictions.
anomalies_cluster_plot(X_train, train_outliers)
plt.gcf().suptitle('Anomalies')
plt.show()

In [None]:
# Share of observations whose predicted flag matches the label, per split.
train_accuracy = metrics.accuracy_score(y_train, train_outliers)
print(f'Train accuracy: {train_accuracy:.4f}')
test_accuracy = metrics.accuracy_score(y_test, test_outliers)
print(f'Test accuracy: {test_accuracy:.4f}')

Find the best number of features

In [None]:
def cross_validate(X, y, num_of_features, estimators, func_select=None, n_splits=5):
    """Cross-validate an IsolationForest for a given number of selected features.

    For every K-fold split the fold data are passed through
    ``pipeline_v1_core`` with the requested feature-selection function and
    feature count. An ``IsolationForest`` is then fitted on the training part
    and scored on both the training and the held-out part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected with ``.iloc``.
    y : pandas.Series
        Anomaly label per row of ``X``. Predictions are encoded as
        ``True`` (outlier) / ``False`` (inlier), so ``y`` is assumed to use
        the same encoding -- TODO confirm against ``fault_labeling``.
    num_of_features : int
        Number of features handed to ``pipeline_v1_core``.
    estimators : int
        ``n_estimators`` of the ``IsolationForest``.
    func_select : callable, optional
        Feature-selection function passed to ``pipeline_v1_core``. When
        omitted, the notebook-level ``FSEL_METHOD`` is used, which keeps
        existing calls working.
    n_splits : int, default 5
        Number of folds for ``KFold``.

    Returns
    -------
    pandas.DataFrame
        A single row with the mean over all folds of: ``features``,
        ``train_accuracy``, ``train_precision``, ``train_recall``,
        ``test_accuracy``, ``test_precision``, ``test_recall``.
        Precision and recall refer to the anomaly (``True``) class.
    """
    # Resolve the selector at call time so the global is only a fallback.
    selector = FSEL_METHOD if func_select is None else func_select
    # predict() yields -1 for outliers and +1 for inliers.
    outlier_flags = {-1: True, 1: False}

    evaluation = []
    kfold = KFold(n_splits=n_splits)

    for train_index, test_index in kfold.split(X, y):
        X_train, X_test, y_train, y_test = pipeline_v1_core(
            selector, num_of_features,
            X.iloc[train_index], y.iloc[train_index],
            X.iloc[test_index], y.iloc[test_index]
        )

        forest = IsolationForest(n_estimators=estimators)
        forest.fit(X_train)

        fold_scores = [num_of_features]
        for X_part, y_true in ((X_train, y_train), (X_test, y_test)):
            y_pred = pd.Categorical(forest.predict(X_part)).rename_categories(outlier_flags)
            # average='micro' on single-label targets reproduces accuracy, so
            # precision/recall are computed for the anomaly class instead.
            # zero_division=0 covers folds without predicted or true anomalies.
            fold_scores.extend([
                metrics.accuracy_score(y_true, y_pred),
                metrics.precision_score(y_true, y_pred, pos_label=True, zero_division=0),
                metrics.recall_score(y_true, y_pred, pos_label=True, zero_division=0),
            ])
        evaluation.append(fold_scores)

    evaluation = pd.DataFrame(
        evaluation,
        columns=[
            'features',
            'train_accuracy', 'train_precision', 'train_recall',
            'test_accuracy', 'test_precision', 'test_recall'
        ]
    )
    # Collapse the per-fold rows into one row of fold means.
    return evaluation.mean().to_frame().T

Time domain features

In [None]:
# Settings for the sweep over the number of selected features.
TRAINING_SET_RATIO = 0.6
N_ESTIMATORS = 25
FSEL_METHOD = mutual_info_classif   # f_classif, mutual_info_classif

# Time-domain features of the 'az' axis, labelled by fault class.
features = fault_labeling(
    load_td_feat(['az'], path=FEATURES_PATH),
    FAULT_CLASSES,
)
X = filter_out_metadata_columns(features)
y = features['anomaly']

# One cross-validated summary row for each feature count from 1 to all columns.
rows = [
    cross_validate(X, y, n, estimators=N_ESTIMATORS)
    for n in tqdm(range(1, len(X.columns) + 1))
]
isoforest = pd.concat(rows)
isoforest.sort_values(by='test_accuracy', ascending=False).head(10)

In [None]:
# best
X_train, X_test, y_train, y_test = pipeline_v1(
    features, 
    train=TRAINING_SET_RATIO,
    func_select=FSEL_METHOD,
    nfeat=3
)
list(X_train.columns)

Frequency domain features

In [None]:
# Settings for the sweep over the number of selected features.
TRAINING_SET_RATIO = 0.6
N_ESTIMATORS = 25
FSEL_METHOD = mutual_info_classif   # f_classif, mutual_info_classif

# Frequency-domain features of the 'az' axis, labelled by fault class.
features = fault_labeling(
    load_fd_feat(['az'], path=FEATURES_PATH),
    FAULT_CLASSES,
)
X = filter_out_metadata_columns(features)
y = features['anomaly']

# One cross-validated summary row for each feature count from 1 to all columns.
rows = [
    cross_validate(X, y, n, estimators=N_ESTIMATORS)
    for n in tqdm(range(1, len(X.columns) + 1))
]
isoforest = pd.concat(rows)
isoforest.sort_values(by='test_accuracy', ascending=False).head(10)

In [None]:
# Distribution of the mean train/test accuracy across all tested feature counts.
accuracy_columns = ['train_accuracy', 'test_accuracy']
isoforest.loc[:, accuracy_columns].hist(bins=10, figsize=(10, 3))
plt.show()

In [None]:
# best
X_train, X_test, y_train, y_test = pipeline_v1(
    features, 
    train=TRAINING_SET_RATIO,
    func_select=FSEL_METHOD,
    nfeat=17
)
list(X_train.columns)