DBSCAN
- https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm.notebook import tqdm

from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.model_selection import StratifiedKFold
from kneed import KneeLocator
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import sys
sys.path.append('../')
from vibrodiagnostics import (
    mafaulda,
    discovery, 
    selection,
    models
)
from vibrodiagnostics.models import (
    fault_labeling, pipeline_v1, pipeline_v1_core, filter_out_metadata_columns
)

import re
import os
# import warnings
# warnings.filterwarnings('ignore')

# Relative location of the datasets directory and of the feature tables inside it.
PATH_PREFIX = '../../datasets/'
FEATURES_PATH =  os.path.join(PATH_PREFIX, 'features_data')

# Paths of the individual feature tables; the file names are provided by the
# project's `selection` module.
TD_FD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_AND_FREQ_FEATURES_PATH)
TD_FEATURES = os.path.join(FEATURES_PATH, selection.TIME_FEATURES_PATH)
FD_FEATURES = os.path.join(FEATURES_PATH, selection.FREQ_FEATURES_PATH)

# Feature domain name -> path of the corresponding feature table.
domains = {'temporal': TD_FEATURES, 'spectral': FD_FEATURES}
# Option lists for experiment configurations. NOTE(review): none of the four
# lists below is referenced by the code visible in this notebook — presumably
# kept for parity with sibling notebooks; confirm before removing.
rpm_limit = [False, True]
target = ['fault', 'anomaly_60', 'anomaly_90']
placement = ['A', 'B']
online = [False]

In [None]:
# TODO: refactor
def get_features_list(domains):
    """Return domain-prefixed feature names found in the CSV column headers.

    For every ``domain name -> CSV path`` entry, the column names matching
    ``<two lowercase letters>_<feature name>`` are collected. The part after
    the two-letter prefix is the feature name; leading/trailing underscores are
    stripped and duplicates are dropped while keeping first-seen order.
    Columns that do not match the pattern are ignored.

    Args:
        domains: Mapping of domain name (used as the output prefix) to the
            path of a CSV file.

    Returns:
        List of strings of the form ``'<domain>_<feature>'``.
    """
    features = []
    for dname, dataset in domains.items():
        # Only the header row is needed, so skip parsing the data rows.
        header = pd.read_csv(dataset, nrows=0).columns
        names = header.str.extract(r'([a-z]{2})_([a-z\_\-]+)')[1].unique()
        features.extend(
            f'{dname}_{col.strip("_")}' for col in names if not pd.isnull(col)
        )

    return features
    
def load_source(dataset: str, domain: str, row: dict):
    """Load a feature table and return a balanced, scaled hold-out split.

    Steps performed, in order:
      1. read the CSV and replace missing values with 0,
      2. optionally keep only rows with ``rpm`` in 2500 +/- 500,
      3. keep only the fault classes listed for the chosen placement and label
         them through ``models.fault_labeling``,
      4. keep the columns of the three axes of the chosen placement (for the
         spectral domain only those ending with the window size 2**14),
      5. reduce every feature to the Euclidean norm of its three axis values,
      6. oversample all classes except the majority one, split 80/20
         (stratified), and min-max scale using statistics of the train part.

    Args:
        dataset: Path of the CSV feature table.
        domain: Feature domain name; ``'spectral'`` enables the window-size
            column filter, any other value skips it.
        row: Configuration with keys ``'rpm_limit'`` (bool), ``'placement'``
            (``'A'`` or ``'B'``) and ``'target'`` (label column name, optionally
            followed by ``_<severity in percent>``, e.g. ``'anomaly_90'``;
            the severity defaults to 60 when omitted).

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``; the feature frames are
        scaled with a ``MinMaxScaler`` fitted on ``X_train`` only.

    Raises:
        KeyError: If ``row`` lacks a required key, the placement is unknown,
            or an expected axis/label column is missing from the table.
    """
    faults = {
        'A': {
            'normal': 'normal',
            'imbalance': 'imbalance',
            'horizontal-misalignment': 'misalignment',
            'vertical-misalignment': 'misalignment',
            # 'underhang-outer_race': 'outer race fault',
            # 'underhang-cage_fault': 'cage fault',
            # 'underhang-ball_fault': 'ball fault'
        },
        'B': {
            'normal': 'normal',
            'imbalance': 'imbalance',
            'horizontal-misalignment': 'misalignment',
            'vertical-misalignment': 'misalignment',
            # 'overhang-cage_fault': 'cage fault',
            # 'overhang-ball_fault': 'ball fault',
            # 'overhang-outer_race': 'outer race fault'
        }
    }
    placements = {
        'A': ['ax', 'ay', 'az'],
        'B': ['bx', 'by', 'bz']
    }
    RPM = 2500
    RPM_RANGE = 500
    features = pd.read_csv(dataset).fillna(0)

    # Choosing rpm range
    if row['rpm_limit']:
        features = features[features['rpm'].between(RPM - RPM_RANGE, RPM + RPM_RANGE, inclusive='both')]

    # Split the target into the label column name and the optional anomaly
    # severity in percent (e.g. 'anomaly_90' -> 'anomaly', 0.9)
    target = re.search(r'([a-z]+)_?(\d+)?', row['target'])
    anomaly_severity = float(target.group(2) or '60') / 100

    # Choose measurement placement: A or B
    place = row['placement']
    axis = placements[place]
    features = features[features['fault'].isin(tuple(faults[place]))]
    # Presumably adds the label columns selected below — see models.fault_labeling
    features = models.fault_labeling(features, faults[place], anomaly_severity)

    columns = features.columns.str.startswith(tuple(axis))
    X = features[features.columns[columns]]

    # Select predicted variable column
    label = target.group(1)
    Y = features[label].astype('category')

    # Filter columns in feature domain with window size 2**14
    if domain == 'spectral':
        window_size = 2**14
        X = X.loc[:, X.columns.str.endswith(f'_{window_size}')]
        # Drop the trailing '_<window size>' so names match the temporal layout
        X.columns = X.columns.str.extract(r'(\w+)_\w+$')[0]

    # Calculate feature magnitudes from 3D vector (vectorized over all rows,
    # and assembled into a frame once instead of column by column)
    feature_names = get_features_list({domain: dataset})
    magnitudes = {}
    for name in feature_names:
        # Remove prefix: temporal, spectral
        name = re.search(r'[a-z]+_([\w\_]+)', name).group(1)
        vector_dims = [f'{dim}_{name}' for dim in axis]
        magnitudes[name] = np.linalg.norm(X[vector_dims].to_numpy(), axis=1)
    X = pd.DataFrame(magnitudes, index=X.index)

    # Batch / Online hold-out (balance and event sequencing)
    train_size = 0.8

    # NOTE(review): oversampling happens before the hold-out split, so copies
    # of the same minority sample can end up in both train and test parts —
    # confirm this is intended before trusting test-set scores.
    oversample = RandomOverSampler(sampling_strategy='not majority', random_state=10)
    X, Y = oversample.fit_resample(X, Y.to_numpy())
    X.reset_index(drop=True, inplace=True)
    Y = pd.Series(Y)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=train_size, stratify=Y, random_state=10
    )

    # Scaling must stay after the split: the scaler is fitted on the train
    # part only. New frames are built instead of assigning into the split
    # results, which avoids chained-assignment warnings.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X_test.columns
    )

    return X_train, X_test, Y_train, Y_test

In [None]:
# Load the balanced, scaled split of the time-domain features for placement A
# with the 'anomaly_90' target.
DOMAIN = 'temporal'
dataset = {
    'temporal': TD_FEATURES,
    'spectral': FD_FEATURES,
}
config = {
    'rpm_limit': False,
    'placement': 'A',
    'domain': DOMAIN,
    'target': 'anomaly_90',
}
X_train, X_test, y_train, y_test = load_source(
    dataset=dataset[DOMAIN], domain=config['domain'], row=config
)

Compute the distances from each point to its nearest neighbours

In [None]:
# Query the training set against itself for the 6 closest points of every
# sample; `distances` and `indices` have one row per sample, with neighbours
# ordered from nearest to farthest.
cnt_neighbors = 6
neighbors = NearestNeighbors(n_neighbors=cnt_neighbors).fit(X_train)
distances, indices = neighbors.kneighbors(X_train)


Plot the sorted nearest-neighbour distances (k-distance plot)

In [None]:
# k-distance plot used to pick `eps`: for every sample take the distance to its
# farthest computed neighbour (last column of `distances`) and sort the values
# in descending order. The first neighbour column is not suitable here: the
# training set is queried against itself and contains oversampled duplicates,
# so the closest distances are often zero and carry no density information.
fig, ax = plt.subplots(figsize=(5, 3))
distance_desc = sorted(distances[:, -1], reverse=True)
ax.plot(range(1, len(distance_desc) + 1), distance_desc)
ax.set_xlabel('Number of points')
ax.set_ylabel('Distance')
ax.grid(True)
plt.show()

In [None]:
# Locate the elbow of the descending distance curve; the distance at the elbow
# (second printed value) is a candidate for the DBSCAN `eps`.
kneedle = KneeLocator(
    range(1, len(distance_desc) + 1),
    distance_desc,
    S=1.0,
    curve='convex',
    direction='decreasing',
)
kneedle.plot_knee_normalized()
print(kneedle.elbow, kneedle.knee_y)

In [None]:
def cross_cuts_3d_cluster(X_train, cluster, title):
    """Draw the three pairwise scatter projections of the first three columns.

    Every point is coloured by its value in ``cluster``; one panel is drawn for
    each of the column pairs (0, 1), (0, 2) and (1, 2). The input frame is not
    modified. The figure is created but not shown or returned.

    Args:
        X_train: Data frame whose first three columns are plotted.
        cluster: Per-row group assignment (cluster id or class label).
        title: Title placed above the three panels.
    """
    frame = X_train.copy()
    frame['cluster'] = cluster
    frame['cluster'] = frame['cluster'].astype('category')

    groups = frame['cluster'].cat.categories
    palette = sb.color_palette('hls', len(groups))
    fig, panels = plt.subplots(1, 3, figsize=(15, 3))
    fig.suptitle(title)

    column_pairs = ((0, 1), (0, 2), (1, 2))
    for panel, (a, b) in zip(panels, column_pairs):
        x_name = frame.columns[a]
        y_name = frame.columns[b]

        # One scatter call per group so that each gets its own legend entry
        for label, color in zip(groups, palette):
            members = list(frame[frame['cluster'] == label].index)
            panel.scatter(
                frame.loc[members, x_name],
                frame.loc[members, y_name],
                s=1, color=color, label=label,
            )

        panel.set_xlabel(x_name)
        panel.set_ylabel(y_name)
        panel.grid()
        panel.legend()

- Features are min-max scaled to the range [0, 1], so `eps` should be smaller than 1.
- Noisy samples are given the label -1.

Time domain features

In [None]:
# Reload the time-domain split, keep three hand-picked features and plot them
# coloured by the true labels for later comparison with the clusters.
DOMAIN = 'temporal'
dataset = {
    'temporal': TD_FEATURES,
    'spectral': FD_FEATURES,
}
config = {
    'rpm_limit': False,
    'placement': 'A',
    'domain': DOMAIN,
    'target': 'anomaly_90',
}
X_train, X_test, y_train, y_test = load_source(
    dataset=dataset[DOMAIN], domain=config['domain'], row=config
)

fsel = ['shape', 'std', 'margin']
X_train, X_test = X_train[fsel], X_test[fsel]

cross_cuts_3d_cluster(X_train, y_train, 'Ground truth')
plt.show()

In [None]:
# Cluster the training set, then cluster the test set separately with the same
# parameters. The second call refits the estimator, so cluster ids of the two
# label arrays are independent of each other.
clustering = DBSCAN(eps=0.05, min_samples=5, metric='l2')
y_train_labels = clustering.fit_predict(X_train)
y_predict = clustering.fit_predict(X_test)

cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:

def evaluate_clustering(X_train, y_train_labels, X_test, y_predict, y_true=None):
    """Print clustering quality scores and draw a label/cluster heatmap.

    Prints the silhouette score and the Davies-Bouldin index for both the
    training and the test clustering, then draws a heatmap of the contingency
    matrix between the true training labels and the training cluster ids.

    Args:
        X_train: Training samples that were clustered.
        y_train_labels: Cluster ids assigned to ``X_train``.
        X_test: Test samples that were clustered.
        y_predict: Cluster ids assigned to ``X_test``.
        y_true: True labels of ``X_train`` for the contingency matrix. When
            omitted, the notebook-level ``y_train`` variable is used, which
            keeps existing four-argument calls working.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook global
        y_true = y_train

    # The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample.
    # The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters
    print('Silhouette score:')
    print('Train:', silhouette_score(X_train, y_train_labels, metric='euclidean'))
    print('Test:', silhouette_score(X_test, y_predict, metric='euclidean'))

    # Davies–Bouldin index: The minimum score is zero, with lower values indicating better clustering.
    print('Davies-Bouldin index')
    print('Train:', davies_bouldin_score(X_train, y_train_labels))
    print('Test:', davies_bouldin_score(X_test, y_predict))

    # Rows: true classes, columns: cluster ids, cells: number of samples
    occurences = pd.DataFrame(
        data=contingency_matrix(y_true, y_train_labels),
        index=np.unique(y_true),
        columns=np.unique(y_train_labels)
    )
    sb.heatmap(occurences, cbar=True, cmap='BuGn', annot=True, fmt='d')


evaluate_clustering(X_train, y_train_labels, X_test, y_predict, y_true=y_train)

Frequency domain features

In [None]:
# Load the balanced, scaled split of the frequency-domain features for
# placement A with the 'anomaly_90' target.
DOMAIN = 'spectral'
dataset = {
    'temporal': TD_FEATURES,
    'spectral': FD_FEATURES,
}
config = {
    'rpm_limit': False,
    'placement': 'A',
    'domain': DOMAIN,
    'target': 'anomaly_90',
}
X_train, X_test, y_train, y_test = load_source(
    dataset=dataset[DOMAIN], domain=config['domain'], row=config
)

In [None]:
# Cluster the training set, then cluster the test set separately with the same
# parameters. The second call refits the estimator, so cluster ids of the two
# label arrays are independent of each other.
clustering = DBSCAN(eps=0.5, min_samples=5, metric='l2')
y_train_labels = clustering.fit_predict(X_train)
y_predict = clustering.fit_predict(X_test)

cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:
# Print the silhouette and Davies-Bouldin scores of the clustering above and
# draw the contingency heatmap against the true training labels.
evaluate_clustering(X_train, y_train_labels, X_test, y_predict)

Find the best DBSCAN parameters with a cross-validated grid search on the labelled data (supervised setting)

In [None]:
def cross_validate_clustering_score(X, y, num_of_features, eps, min_samples):
    """Score one DBSCAN parameter combination over cross-validation folds.

    For every fold produced by the notebook-level ``crossvalid`` splitter the
    training part is passed through ``pipeline_v1_core`` (feature selection
    with ``FSEL_METHOD`` keeping ``num_of_features`` features) and clustered
    with DBSCAN. The silhouette score is recorded only for folds where more
    than one distinct label was found; the test part of a fold is not scored.

    NOTE(review): ``crossvalid`` and ``FSEL_METHOD`` are read from the notebook
    globals. ``FSEL_METHOD`` is not defined in the cells visible here — it must
    be set before this function is called.

    Args:
        X: Feature frame, indexed positionally by the fold indices.
        y: Label series aligned with ``X``.
        num_of_features: Number of features to keep (cast to ``int``).
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN ``min_samples`` (cast to ``int``).

    Returns:
        Tuple ``(clusters, train_score_mean, train_score_std)``: the most
        frequent number of distinct labels across folds (noise label included)
        and the mean and standard deviation of the recorded silhouette scores.
        Both score values are NaN when no fold produced more than one label.
    """
    train_scores = []
    clusters = []

    for train_index, test_index in crossvalid.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        X_train, X_test, y_train, y_test = pipeline_v1_core(
            FSEL_METHOD, int(num_of_features),
            X_train, y_train, X_test, y_test
        )

        # Only the training fold is clustered: refitting on the test fold
        # produced labels that were never used and doubled the run time.
        clustering = DBSCAN(eps=eps, min_samples=int(min_samples), metric='l2')
        y_train_labels = clustering.fit_predict(X_train)

        num_of_clusters = len(np.unique(y_train_labels))
        clusters.append(num_of_clusters)
        # The silhouette score is undefined for a single label
        if num_of_clusters > 1:
            train_scores.append(silhouette_score(X_train, y_train_labels, metric='euclidean'))

    # np.mean collapses the mode to a scalar whether scipy returns it as a
    # scalar or as a one-element array
    clusters = np.mean(stats.mode(clusters).mode)

    if not train_scores:
        # Avoid the "mean of empty slice" warnings; callers drop NaN rows
        return clusters, np.nan, np.nan

    return clusters, np.mean(train_scores), np.std(train_scores)



# Grid search over (number of features, eps, min_samples) on the training part
# of the time-domain data labelled with the 'fault' target.
DOMAIN = 'temporal'
dataset = {
    'temporal': TD_FEATURES,
    'spectral': FD_FEATURES,
}
config = {
    'rpm_limit': False,
    'placement': 'A',
    'domain': DOMAIN,
    'target': 'fault',
}
X, _, y, _ = load_source(
    dataset=dataset[DOMAIN], domain=config['domain'], row=config
)

crossvalid = StratifiedKFold(n_splits=5)

# Candidate values of each parameter
num_of_features = np.arange(1, X.shape[1] + 1)
eps = np.linspace(0.05, 0.8, 8)
min_samples = np.linspace(3, 8, 2)

# Every combination of the three parameters, one combination per row
grid = np.array(np.meshgrid(num_of_features, eps, min_samples)).T.reshape(-1, 3)

rows = []
for f, e, s in tqdm(grid):
    rows.append([f, e, s, *cross_validate_clustering_score(X, y, f, e, s)])

results = pd.DataFrame(rows, columns=[
    'num_of_features', 'eps', 'min_samples', 'clusters',
    'train_score_mean', 'train_score_std'
])
# Combinations without any scored fold have NaN scores and are discarded
results = results.dropna()

Top 10 parameter combinations ranked by silhouette score

In [None]:
# Keep combinations whose typical fold had more than one distinct label and
# show the ten with the highest mean silhouette score on the training folds.
results[results['clusters'] > 1].sort_values(by='train_score_mean', ascending=False).head(10)