#### DBSCAN

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm.notebook import tqdm
from kneed import KneeLocator

from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score

import sys
sys.path.append('../')
from vibrodiagnostics import mafaulda, visualize

Plot the sorted nearest-neighbour distances between points (used to choose `eps` for DBSCAN)
- https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan

In [None]:
def plot_points_distances(df: pd.DataFrame, n_neighbors: int) -> list:
    """Plot the sorted k-distance graph used to choose ``eps`` for DBSCAN.

    For every row of ``df`` the distance to the farthest of its
    ``n_neighbors`` returned neighbours is taken, the values are sorted in
    descending order and drawn as a line plot.

    Parameters
    ----------
    df : pd.DataFrame
        Samples (rows) and features (columns) to measure distances on.
    n_neighbors : int
        Number of neighbours requested from ``NearestNeighbors``. Because the
        query set equals the fitted set, each point is normally returned as
        its own first neighbour, so the plotted value is the distance to the
        ``n_neighbors - 1``-th other point.

    Returns
    -------
    list
        The plotted distances, sorted from largest to smallest.
    """
    neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    neighbors.fit(df)
    distances, _ = neighbors.kneighbors(df)

    # Use the LAST column (k-th neighbour). The previous `distances[:, 1]`
    # always plotted the first neighbour, which ignored `n_neighbors` and
    # raised IndexError for n_neighbors=1.
    distance_desc = sorted(distances[:, -1], reverse=True)

    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(list(range(1, len(distance_desc) + 1)), distance_desc)
    ax.set_xlabel('Number of points')
    ax.set_ylabel('Distance')
    ax.grid(True)
    plt.show()

    return distance_desc

- Feature values are MinMax-scaled to the range (0, 1), so `eps` must be smaller than 1
- Noisy samples are given the label -1.

Time domain features

In [None]:
# Load the 'TD' feature set with the sensor placement / offline configuration below.
CONFIG = dict(placement='A', online=False)
X_train, X_test, y_train, y_test = mafaulda.load_source('TD', CONFIG)

# Sorted neighbour distances (6 neighbours requested) feed the knee detection;
# the y-value at the knee is later used as DBSCAN's eps.
distance_desc = plot_points_distances(X_train, n_neighbors=6)
kneedle = KneeLocator(
    range(1, len(distance_desc) + 1),
    distance_desc,
    S=1.0,
    curve='convex',
    direction='decreasing',
)
kneedle.plot_knee_normalized()
print(kneedle.elbow, kneedle.knee_y)

In [None]:
# Keep only three features so the clusters can be drawn in 3-D cross cuts.
fsel = ['shape', 'rms', 'clearance']
X_train, X_test = X_train[fsel], X_test[fsel]

# Reference picture: the same points coloured by their true labels.
visualize.cross_cuts_3d_cluster(X_train, y_train, 'Ground truth')
plt.show()

In [None]:
# NOTE(review): kneedle.knee_y was derived from X_train as loaded, i.e. before
# the column subset `fsel` was applied in the previous cell - confirm that this
# eps is meant to be reused on the reduced feature space.
clustering = DBSCAN(
    eps=kneedle.knee_y,
    min_samples=5,
    metric='l2',
)

# fit() returns the estimator itself, so the training labels can be read directly.
y_train_labels = clustering.fit(X_train).labels_
# fit_predict() refits the same estimator on the test split, so the test
# labels come from an independent clustering of X_test.
y_predict = clustering.fit_predict(X_test)

visualize.cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:
def evaluate_clustering(X_train, y_train_labels, X_test, y_predict, y_true=None):
    """Print internal clustering scores and plot a class-vs-cluster heatmap.

    Parameters
    ----------
    X_train, X_test
        Feature matrices that were clustered.
    y_train_labels, y_predict
        Cluster labels assigned to ``X_train`` and ``X_test`` respectively.
    y_true : optional
        Ground-truth labels of ``X_train`` used for the contingency heatmap.
        When omitted, the notebook-level variable ``y_train`` is used so that
        existing four-argument calls keep working.

    Notes
    -----
    * Silhouette coefficient: computed from the mean intra-cluster distance
      (a) and the mean nearest-cluster distance (b) of each sample. Best value
      is 1, worst is -1, values near 0 indicate overlapping clusters.
    * Davies-Bouldin index: minimum is zero, lower values indicate better
      clustering.
    * Both scores need at least 2 and fewer than n_samples distinct labels;
      when that is not the case 'n/a' is printed instead of raising.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook global; prefer passing
        # y_true explicitly to avoid depending on hidden kernel state.
        y_true = y_train

    splits = (
        ('Train:', X_train, y_train_labels),
        ('Test:', X_test, y_predict),
    )

    def _print_scores(title, scorer, **scorer_kwargs):
        # Print one score for the train and the test split under a common title.
        print(title)
        for tag, data, labels in splits:
            n_labels = len(np.unique(labels))
            if 1 < n_labels < len(labels):
                print(tag, scorer(data, labels, **scorer_kwargs))
            else:
                print(tag, 'n/a (score needs 2 <= number of labels < number of samples)')

    _print_scores('Silhouette score:', silhouette_score, metric='euclidean')
    _print_scores('Davies-Bouldin index', davies_bouldin_score)

    # Rows: ground-truth classes, columns: cluster labels found by the model.
    occurrences = pd.DataFrame(
        data=contingency_matrix(y_true, y_train_labels),
        index=np.unique(y_true),
        columns=np.unique(y_train_labels),
    )
    ax = sb.heatmap(occurrences, cbar=True, cmap='BuGn', annot=True, fmt='d')
    ax.set_xlabel('Cluster label')
    ax.set_ylabel('Ground-truth class')


evaluate_clustering(X_train, y_train_labels, X_test, y_predict, y_true=y_train)

Frequency domain features

In [None]:
# Load the 'FD' feature set with the same placement / offline configuration.
CONFIG = dict(placement='A', online=False)
X_train, X_test, y_train, y_test = mafaulda.load_source('FD', CONFIG)

# Sorted neighbour distances (6 neighbours requested) feed the knee detection;
# the y-value at the knee is later used as DBSCAN's eps.
distance_desc = plot_points_distances(X_train, n_neighbors=6)
kneedle = KneeLocator(
    range(1, len(distance_desc) + 1),
    distance_desc,
    S=1.0,
    curve='convex',
    direction='decreasing',
)
kneedle.plot_knee_normalized()
print(kneedle.elbow, kneedle.knee_y)

In [None]:
# NOTE(review): min_samples is 15 here while the knee (eps) above was derived
# from a 6-neighbour distance curve - confirm the two are meant to differ.
clustering = DBSCAN(
    eps=kneedle.knee_y,
    min_samples=15,
    metric='l2',
)

# fit() returns the estimator itself, so the training labels can be read directly.
y_train_labels = clustering.fit(X_train).labels_
# fit_predict() refits the same estimator on the test split, so the test
# labels come from an independent clustering of X_test.
y_predict = clustering.fit_predict(X_test)

visualize.cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:
# Score the frequency-domain clustering produced in the previous cell.
evaluate_clustering(
    X_train=X_train,
    y_train_labels=y_train_labels,
    X_test=X_test,
    y_predict=y_predict,
)

Find the best DBSCAN parameters (grid search with stratified cross-validation on the labelled data)

In [None]:
def cross_validate_clustering_score(X, y, num_of_features, eps, min_samples):
    """Cross-validate DBSCAN with the silhouette score on 5 stratified folds.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix (rows are samples).
    y : pd.Series
        Class labels; only used to stratify the folds.
    num_of_features : int or float
        Number of leading columns of ``X`` to cluster on.
        NOTE(review): the original implementation accepted this argument but
        never used it, so every feature count produced identical scores.
        Taking the first N columns is the assumed intent - confirm the
        columns are ordered as expected (e.g. by importance).
    eps : float
        DBSCAN neighbourhood radius.
    min_samples : int or float
        DBSCAN ``min_samples``; cast to ``int``.

    Returns
    -------
    tuple
        ``(train_mean, train_std, test_mean, test_std)`` of the silhouette
        score over the folds in which both the train and the test clustering
        were scorable. All four values are NaN when no fold qualified.
    """
    X = X.iloc[:, :int(num_of_features)]
    min_samples = int(min_samples)

    def _scorable(labels):
        # silhouette_score needs 2 <= number of labels < number of samples.
        # np.unique counts every distinct label value, including DBSCAN's -1.
        return 1 < len(np.unique(labels)) < len(labels)

    train_scores = []
    test_scores = []

    crossvalid = StratifiedKFold(n_splits=5)
    for train_index, test_index in crossvalid.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]

        # Each split is clustered independently: the train labels come from a
        # fit on X_train, the test labels from a separate fit on X_test.
        y_train_labels = DBSCAN(eps=eps, min_samples=min_samples, metric='l2').fit(X_train).labels_
        y_predict = DBSCAN(eps=eps, min_samples=min_samples, metric='l2').fit_predict(X_test)

        if _scorable(y_train_labels) and _scorable(y_predict):
            train_scores.append(silhouette_score(X_train, y_train_labels, metric='euclidean'))
            test_scores.append(silhouette_score(X_test, y_predict, metric='euclidean'))

    if not train_scores:
        # Same NaN result np.mean([]) would give, without the RuntimeWarning.
        return np.nan, np.nan, np.nan, np.nan

    return (
        np.mean(train_scores), np.std(train_scores),
        np.mean(test_scores), np.std(test_scores),
    )

In [11]:
CONFIG = {'placement': 'A', 'online': False}
X, _, y, _  = mafaulda.load_source('TD', CONFIG)

num_of_features = np.arange(1, len(X.columns) + 1)
eps = np.linspace(0.05, 0.8, 8)
min_samples = np.linspace(3, 8, 2)
grid = np.array(np.meshgrid(num_of_features, eps, min_samples)).T.reshape(-1, 3)

rows = []
for f, e, s in tqdm(grid):
    row = [f, e, s]
    row.extend(cross_validate_clustering_score(X, y, f, e, s))
    rows.append(row)


results = pd.DataFrame(rows, columns=[
    'eps', 'min_samples', 'clusters', 
    'train_score_mean', 'train_score_std',
    'test_score_mean', 'test_score_std'
]).dropna()
results

Top 10 best scored parameters with silhouette score

In [None]:
results[results['clusters'] > 1].sort_values(by='train_score_mean', ascending=False).head(10)