# DBSCAN clustering of MaFaulDa dataset

Explore the separation of groups within the dataset and methods for setting DBSCAN clustering hyperparameters.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm.notebook import tqdm
from kneed import KneeLocator
from typing import List, Dict

from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score

import sys
sys.path.append('../')
from vibrodiagnostics import mafaulda, visualize

A recommended way to choose the DBSCAN `eps` parameter is to take the distance at the knee of the sorted nearest-neighbor distance curve.
- Source: https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan

In [None]:
def plot_distances_among_points(df: pd.DataFrame, n_neighbors: int) -> List[float]:
    """Plot and return the sorted k-nearest-neighbor distances of all points.

    For every row of ``df`` the distance to the farthest of its
    ``n_neighbors`` queried neighbors is computed. The distances are sorted
    in descending order, drawn as a curve and returned, so that the knee of
    the curve can be located by the caller.

    Args:
        df: Observations (rows) described by numeric features (columns).
        n_neighbors: Number of neighbors queried per point. The query points
            are the fitted points themselves, so the zero distance of a point
            to itself occupies the first of these neighbors.

    Returns:
        One distance per row of ``df``, sorted from largest to smallest.
    """
    neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    neighbors.fit(df)
    distances, _ = neighbors.kneighbors(df)

    # Use the last column (the farthest queried neighbor). Column 1, which was
    # used before, is always the single nearest neighbor and made the
    # `n_neighbors` argument irrelevant to the resulting curve.
    k_distances = np.sort(distances[:, -1])[::-1].tolist()

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(np.arange(1, len(k_distances) + 1), k_distances)
    ax.set_xlabel('Number of points')
    ax.set_ylabel('Distance')
    ax.grid(True)
    plt.show()

    return k_distances

Distances among points in the time domain and knee finder for the resulting curve.

In [None]:
# Load the time-domain ('TD') source and plot the sorted neighbor distances
# of the training observations.
CONFIG = {'placement': 'A', 'online': False}
K_NEIGHBORS = 6
X_train, X_test, y_train, y_test = mafaulda.load_source('TD', CONFIG)
distances = plot_distances_among_points(X_train, K_NEIGHBORS)

# Locate the knee of the decreasing, convex distance curve.
point_ranks = range(1, len(distances) + 1)
kneedle = KneeLocator(
    x=point_ranks,
    y=distances,
    S=1.0,
    curve='convex',
    direction='decreasing',
)

In [None]:
# Draw the normalized curve with the detected knee, then report where the
# knee was found.
kneedle.plot_knee_normalized()
print('Knee at:')
print(f'Number of points: {kneedle.elbow}')
print(f'Distance: {kneedle.knee_y}')

### Features in time domain
Scatter plots of three arbitrarily chosen features in the time domain

In [None]:
# Keep the same three features in both splits so that the training and the
# test observations are clustered and scored in the same feature space
# (previously only X_train was narrowed down).
# NOTE(review): `kneedle` was computed before this selection, i.e. on all
# loaded features - confirm that reusing its knee distance as eps is intended.
features = ['shape', 'rms', 'clearance']
X_train = X_train[features]
X_test = X_test[features]
visualize.cross_cuts_3d_cluster(X_train, y_train, 'Ground truth')

The result of clustering the time-domain features. The `eps` parameter is set to the distance at the knee found above. The vast majority of observations ended up in the same cluster.

In [None]:
# eps is the distance at the detected knee. `fit_predict` re-fits the
# estimator, so the test set is clustered on its own rather than being
# assigned to the clusters found on the training set.
clustering = DBSCAN(eps=kneedle.knee_y, min_samples=5, metric='l2')
y_train_labels = clustering.fit_predict(X_train)
y_predict = clustering.fit_predict(X_test)

# Same cross-cut view as for the ground truth, now colored by cluster label.
visualize.cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
print(X_train.shape)
plt.show()

Measure quality of clustering by **Silhouette score** and **Davies-Bouldin index**.  

**Silhouette Coefficient** is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

**Davies–Bouldin index**: The minimum score is zero, with lower values indicating better clustering.

In [None]:
def evaluate_clustering(
        X_train: pd.DataFrame,
        y_train: pd.DataFrame,
        y_train_labels: pd.DataFrame,
        X_test: pd.DataFrame,
        y_predict: pd.DataFrame):
    """Print clustering quality scores and plot a class-vs-cluster heatmap.

    The Silhouette score and the Davies-Bouldin index are printed for the
    training and the test clustering. A heatmap then shows how many training
    observations of each ground-truth class fell into each cluster.

    Args:
        X_train: Training observations that were clustered.
        y_train: Ground-truth classes of the training observations.
        y_train_labels: Cluster label assigned to each training observation.
        X_test: Test observations that were clustered.
        y_predict: Cluster label assigned to each test observation.

    Note:
        Every distinct label value is scored as a cluster of its own.
        NOTE(review): this presumably includes the DBSCAN noise label -1 -
        confirm whether noise should be excluded before scoring.
    """
    def score_or_na(score_fn, X, labels, **kwargs):
        # Both scores need at least two and fewer than n_samples distinct
        # labels; scikit-learn raises ValueError otherwise. Report such a
        # degenerate clustering instead of aborting the evaluation.
        n_labels = len(np.unique(labels))
        if not 1 < n_labels < len(labels):
            return f'n/a ({n_labels} distinct label(s))'
        return score_fn(X, labels, **kwargs)

    print('Silhouette score:')
    print('Train:', score_or_na(silhouette_score, X_train, y_train_labels, metric='euclidean'))
    print('Test:', score_or_na(silhouette_score, X_test, y_predict, metric='euclidean'))

    print('Davies-Bouldin index')
    print('Train:', score_or_na(davies_bouldin_score, X_train, y_train_labels))
    print('Test:', score_or_na(davies_bouldin_score, X_test, y_predict))

    true_classes = np.unique(y_train)
    cluster_labels = np.unique(y_train_labels)
    print(true_classes, cluster_labels)

    # Rows are ground-truth classes, columns are cluster labels.
    occurrences = pd.DataFrame(
        data=contingency_matrix(y_train, y_train_labels),
        index=true_classes,
        columns=cluster_labels
    )
    ax = sb.heatmap(occurrences, cbar=True, cmap='BuGn', annot=True, fmt='d')
    ax.set_xlabel('Cluster label')
    ax.set_ylabel('Ground truth')

# Score the clustering of the time-domain features.
evaluate_clustering(
    X_train=X_train,
    y_train=y_train,
    y_train_labels=y_train_labels,
    X_test=X_test,
    y_predict=y_predict,
)

### Features in frequency domain
Scatter plots of three arbitrarily chosen features in the frequency domain

Frequency domain features

In [None]:
# Load the frequency-domain ('FD') source and plot the sorted neighbor
# distances of the training observations.
CONFIG = {'placement': 'A', 'online': False}
K_NEIGHBORS = 6
X_train, X_test, y_train, y_test = mafaulda.load_source('FD', CONFIG)
distances = plot_distances_among_points(X_train, K_NEIGHBORS)

# Locate the knee of the decreasing, convex distance curve.
point_ranks = range(1, len(distances) + 1)
kneedle = KneeLocator(
    x=point_ranks,
    y=distances,
    S=1.0,
    curve='convex',
    direction='decreasing',
)

In [None]:
# Draw the normalized curve with the detected knee, then report where the
# knee was found.
kneedle.plot_knee_normalized()
print('Knee at:')
print(f'Number of points: {kneedle.elbow}')
print(f'Distance: {kneedle.knee_y}')

In [None]:
# Cluster the frequency-domain features with eps taken from the detected knee.
# `fit_predict` re-fits the estimator, so the test set is clustered on its own.
clustering = DBSCAN(eps=kneedle.knee_y, min_samples=10, metric='l2')
clustering.fit(X_train)
y_train_labels = clustering.labels_
y_predict = clustering.fit_predict(X_test)

# The points are colored by cluster label, so the title is 'Clusters'
# (it used to read 'Groud truth').
visualize.cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:
# Score the clustering of the frequency-domain features.
evaluate_clustering(
    X_train=X_train,
    y_train=y_train,
    y_train_labels=y_train_labels,
    X_test=X_test,
    y_predict=y_predict,
)

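Find the best DBSCAN parameters by stratified 5-fold cross-validation. The ground-truth labels are used only to stratify the folds; each fold is scored with the Silhouette score.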

In [None]:
def cross_validate_clustering_score(
        X: pd.DataFrame,
        Y: pd.DataFrame,
        num_of_features: int,
        eps: float,
        min_samples: int) -> Dict[str, float]:
    """Cross-validate the Silhouette score of DBSCAN for one parameter set.

    The data is split into 5 stratified folds. In every fold DBSCAN is fitted
    separately on the training part and on the test part. A fold contributes
    to the scores only when both parts produced more than one distinct label.

    Args:
        X: Observations (rows) described by numeric features (columns).
        Y: Ground-truth classes; used only to stratify the folds.
        num_of_features: Number of leading columns of ``X`` to cluster on.
            NOTE(review): this argument used to be ignored, so every feature
            count gave identical results. Columns are now taken in DataFrame
            order - confirm that this is the intended feature ranking.
        eps: DBSCAN neighborhood radius.
        min_samples: DBSCAN ``min_samples``; cast to ``int``.

    Returns:
        A dictionary with the most frequent number of distinct training labels
        across folds (``clusters``) and the mean and standard deviation of the
        training and test Silhouette scores. Score entries are 0 when no fold
        could be scored.

    Raises:
        ValueError: If ``num_of_features`` is smaller than 1.
    """
    # Grid values built with NumPy arrive as floats; cast the integral ones.
    num_of_features = int(num_of_features)
    min_samples = int(min_samples)
    if num_of_features < 1:
        raise ValueError('num_of_features must be at least 1')
    X = X.iloc[:, :num_of_features]

    train_scores = []
    test_scores = []
    cluster_counts = []
    crossvalid = StratifiedKFold(n_splits=5)

    for train_index, test_index in crossvalid.split(X, Y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]

        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='l2')
        y_train_labels = clustering.fit_predict(X_train)
        # `fit_predict` re-fits the estimator: the test part is clustered on
        # its own rather than assigned to the training clusters.
        y_predict = clustering.fit_predict(X_test)

        # Every distinct label value counts here.
        # NOTE(review): presumably this includes the DBSCAN noise label -1.
        num_of_clusters_train = len(np.unique(y_train_labels))
        num_of_clusters_test = len(np.unique(y_predict))
        cluster_counts.append(num_of_clusters_train)

        # The Silhouette score is undefined for a single label.
        if num_of_clusters_train > 1 and num_of_clusters_test > 1:
            train_scores.append(silhouette_score(X_train, y_train_labels, metric='euclidean'))
            test_scores.append(silhouette_score(X_test, y_predict, metric='euclidean'))

    # `stats.mode(...).mode` is a scalar or a one-element array depending on
    # the SciPy version; normalise it to a plain number.
    typical_clusters = np.atleast_1d(stats.mode(cluster_counts).mode)[0]

    def summarize(scores, reducer):
        # No scored fold -> 0, matching the previous behavior.
        return float(reducer(scores)) if len(scores) > 0 else 0

    result = {
        'clusters': float(typical_clusters),
        'train_score_mean': summarize(train_scores, np.mean),
        'train_score_std': summarize(train_scores, np.std),
        'test_score_mean': summarize(test_scores, np.mean),
        'test_score_std': summarize(test_scores, np.std)
    }
    return result

Grid search over the number of features, `eps`, and `min_samples` to find the combinations with the best clustering scores.

In [None]:
# Only the training split of the time-domain ('TD') source is used here.
CONFIG = {'placement': 'A', 'online': False}
X, _, y, _ = mafaulda.load_source('TD', CONFIG)

# Parameter axes of the grid.
num_of_features = np.arange(1, X.shape[1] + 1)
eps = np.linspace(0.05, 0.8, 8)
min_samples = np.linspace(3, 8, 2)

# One row per (number of features, eps, min_samples) combination.
grid = np.array(np.meshgrid(num_of_features, eps, min_samples)).T.reshape(-1, 3)

# Each record holds the parameters followed by their cross-validated scores.
rows = [
    {
        'features': f,
        'eps': e,
        'min_samples': s,
        **cross_validate_clustering_score(X, y, f, e, s),
    }
    for f, e, s in tqdm(grid)
]

results = pd.DataFrame.from_records(rows)
results

Top 10 best-scored parameters by Silhouette score

In [None]:
# Keep parameter sets whose 'clusters' value exceeds 1, then show the ten
# with the highest mean training Silhouette score.
multi_cluster = results.loc[results['clusters'] > 1]
multi_cluster.sort_values(by='train_score_mean', ascending=False).head(10)