DBSCAN
- https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm.notebook import tqdm

from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.model_selection import StratifiedKFold
from kneed import KneeLocator

from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score

import sys
sys.path.append('../../')
from feature.selection import load_td_feat, load_fd_feat
from feature.models import (
    fault_labeling, pipeline_v1, pipeline_v1_core, filter_out_metadata_columns
)

import warnings
warnings.filterwarnings('ignore')

FEATURES_PATH =  '../../datasets/features_data/'
FAULT_CLASSES = {
    'normal': 'N',
    'imbalance': 'I',
    'horizontal-misalignment': 'HM',
    'vertical-misalignment': 'VM'
}

In [None]:
# Load the frequency-domain features for 'ax', 'ay', 'az' and attach the fault labels.
dataset = fault_labeling(
    load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH),
    FAULT_CLASSES,
)
# pipeline_v1 is asked for a 0.6 training share and 3 features, with
# mutual_info_classif as the selection function.
X_train, X_test, y_train, y_test = pipeline_v1(
    dataset,
    train=0.6,
    nfeat=3,
    func_select=mutual_info_classif,
)

Find distances among points

In [None]:
# Number of neighbours returned for every query point.
cnt_neighbors = 6
neighbors = NearestNeighbors(n_neighbors=cnt_neighbors).fit(X_train)
# X_train is passed explicitly as the query set, so column 0 presumably holds each
# point's distance to itself (0) -- verify against the kneighbors documentation.
distances, indices = neighbors.kneighbors(X_train)


Plot distances among points

In [None]:
# k-distance plot used to pick eps for DBSCAN: for every point take the distance to
# the farthest of the queried neighbours (last column of `distances`) and sort the
# values in descending order. The knee of this curve is the eps candidate.
# Column 1 would only be the single nearest neighbour, which underestimates eps
# for any min_samples larger than 2.
fig, ax = plt.subplots(figsize=(5, 3))
distance_desc = sorted(distances[:, -1], reverse=True)
ax.plot(range(1, len(distance_desc) + 1), distance_desc)
ax.set_xlabel('Number of points')
ax.set_ylabel('Distance')
ax.grid(True)
plt.show()

In [None]:
# Locate the knee of the descending, convex distance curve.
kneedle = KneeLocator(
    range(1, len(distance_desc) + 1),
    distance_desc,
    S=1.0,
    curve='convex',
    direction='decreasing',
)
kneedle.plot_knee_normalized()
# Position of the knee on the x axis and the distance value at the knee.
print(kneedle.elbow, kneedle.knee_y)

In [None]:
def cross_cuts_3d_cluster(X_train, cluster, title):
    """Draw the three pairwise 2-D scatter plots of a 3-feature data set.

    The first three columns of ``X_train`` are plotted against each other
    (0 vs 1, 0 vs 2, 1 vs 2) in one row of subplots; points are coloured by
    their entry in ``cluster``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Samples to plot; at least three feature columns are required.
    cluster : array-like or pandas.Series
        One label per row of ``X_train`` (ground-truth class or cluster id).
        It is assigned as a new column, so a Series is aligned on its index.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is left open on the current matplotlib backend; the caller
        is expected to call ``plt.show()``.
    """
    df = X_train.copy()
    df['cluster'] = cluster
    df['cluster'] = df['cluster'].astype('category')

    categories = df['cluster'].cat.categories
    colors = sb.color_palette('hls', len(categories))
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    fig.suptitle(title)

    # One boolean mask per label, computed once and reused for all three panels.
    # Masks select rows by position, so a non-unique index cannot duplicate points
    # the way a label-based re-lookup of the matching index values would.
    masks = [(df['cluster'] == label).to_numpy() for label in categories]

    for i, (a, b) in enumerate(((0, 1), (0, 2), (1, 2))):
        x_values = df.iloc[:, a]
        y_values = df.iloc[:, b]

        for label, color, mask in zip(categories, colors, masks):
            ax[i].scatter(x_values[mask], y_values[mask], s=1, color=color, label=label)

        ax[i].set_xlabel(df.columns[a])
        ax[i].set_ylabel(df.columns[b])
        ax[i].grid()
        ax[i].legend()

- Feature values are MinMax-scaled to the range (0, 1), so `eps` must be smaller than 1.
- Noisy samples are given the label -1.

Time domain features

In [None]:
# Load the time-domain features for 'ax', 'ay', 'az' and attach the fault labels.
dataset = fault_labeling(
    load_td_feat(['ax', 'ay', 'az'], path=FEATURES_PATH),
    FAULT_CLASSES,
)
# pipeline_v1 is asked for a 0.6 training share and 3 features, with
# mutual_info_classif as the selection function.
X_train, X_test, y_train, y_test = pipeline_v1(
    dataset,
    train=0.6,
    nfeat=3,
    func_select=mutual_info_classif,
)
print('Features:', list(X_train.columns))

# Reference picture: training samples coloured by their true fault label.
cross_cuts_3d_cluster(X_train, y_train, 'Ground truth')
plt.show()

In [None]:
# Cluster the training split; fit_predict returns the fitted estimator's labels_.
clustering = DBSCAN(eps=0.1, min_samples=5, metric='l2')
y_train_labels = clustering.fit_predict(X_train)

# The test split is clustered from scratch (fit_predict always refits). A second
# estimator with the same parameters is used so that `clustering` stays fitted on
# X_train and remains consistent with y_train_labels.
y_predict = DBSCAN(**clustering.get_params()).fit_predict(X_test)

cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:

def _score_or_nan(metric, X, labels, **kwargs):
    """Return ``metric(X, labels, **kwargs)`` or NaN when the labelling cannot be scored.

    Both scores used below need at least 2 and at most ``n_samples - 1`` distinct
    labels; otherwise scikit-learn raises ``ValueError``.
    """
    n_labels = len(np.unique(labels))
    if not 1 < n_labels < len(labels):
        return np.nan
    return metric(X, labels, **kwargs)


def evaluate_clustering(X_train, y_train_labels, X_test, y_predict, y_true=None):
    """Print internal clustering scores and plot class-vs-cluster counts.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the training and test split.
    y_train_labels, y_predict : array-like
        Cluster labels assigned to the rows of ``X_train`` and ``X_test``.
    y_true : array-like, optional
        True classes of the training rows, used for the contingency heatmap.
        When omitted, the notebook-level ``y_train`` is used, which keeps
        existing four-argument calls working.

    Returns
    -------
    None
        Scores are printed; the heatmap is drawn on the current matplotlib axes.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook's global training labels.
        y_true = y_train

    # The Silhouette Coefficient is calculated using the mean intra-cluster distance (a)
    # and the mean nearest-cluster distance (b) for each sample.
    # The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.
    print('Silhouette score:')
    print('Train:', _score_or_nan(silhouette_score, X_train, y_train_labels, metric='euclidean'))
    print('Test:', _score_or_nan(silhouette_score, X_test, y_predict, metric='euclidean'))

    # Davies-Bouldin index: the minimum score is zero, with lower values indicating better clustering.
    print('Davies-Bouldin index')
    print('Train:', _score_or_nan(davies_bouldin_score, X_train, y_train_labels))
    print('Test:', _score_or_nan(davies_bouldin_score, X_test, y_predict))

    # Rows: true classes, columns: cluster labels, cells: number of training samples.
    occurences = pd.DataFrame(
        data=contingency_matrix(y_true, y_train_labels),
        index=np.unique(y_true),
        columns=np.unique(y_train_labels)
    )
    ax = sb.heatmap(occurences, cbar=True, cmap='BuGn', annot=True, fmt='d')
    ax.set_xlabel('Cluster label')
    ax.set_ylabel('True class')


evaluate_clustering(X_train, y_train_labels, X_test, y_predict, y_true=y_train)

Frequency domain features

In [None]:
# Load the frequency-domain features for 'ax', 'ay', 'az' and attach the fault labels.
dataset = fault_labeling(
    load_fd_feat(['ax', 'ay', 'az'], path=FEATURES_PATH),
    FAULT_CLASSES,
)
# pipeline_v1 is asked for a 0.6 training share and 3 features, with
# mutual_info_classif as the selection function.
X_train, X_test, y_train, y_test = pipeline_v1(
    dataset,
    train=0.6,
    nfeat=3,
    func_select=mutual_info_classif,
)
print('Features:', list(X_train.columns))

# Reference picture: training samples coloured by their true fault label.
cross_cuts_3d_cluster(X_train, y_train, 'Ground truth')
plt.show()

In [None]:
# Cluster the training split; fit_predict returns the fitted estimator's labels_.
clustering = DBSCAN(eps=0.1, min_samples=5, metric='l2')
y_train_labels = clustering.fit_predict(X_train)

# The test split is clustered from scratch (fit_predict always refits). A second
# estimator with the same parameters is used so that `clustering` stays fitted on
# X_train and remains consistent with y_train_labels.
y_predict = DBSCAN(**clustering.get_params()).fit_predict(X_test)

cross_cuts_3d_cluster(X_train, y_train_labels, 'Clusters')
plt.show()

In [None]:
# Score the clustering of the frequency-domain split produced by the cells above.
evaluate_clustering(X_train, y_train_labels, X_test, y_predict)

Find the best DBSCAN parameters by grid search with supervised feature selection

In [None]:
def cross_validate_clustering_score(X, y, num_of_features, eps, min_samples):
    """Cross-validate one DBSCAN configuration and summarise it over the folds.

    For every fold produced by the notebook-level ``crossvalid`` splitter the
    training part is passed through ``pipeline_v1_core`` (with the notebook-level
    ``FSEL_METHOD`` and ``num_of_features``), clustered with DBSCAN, and scored
    with the silhouette coefficient.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (indexed positionally with ``.iloc``).
    y : pandas.Series
        Class label per row of ``X``; used for the fold split and passed to the pipeline.
    num_of_features : int or float
        Number of features requested from the pipeline (cast to ``int``).
    eps : float
        DBSCAN neighbourhood radius.
    min_samples : int or float
        DBSCAN ``min_samples`` (cast to ``int``).

    Returns
    -------
    tuple of (float, float, float)
        ``(clusters, train_score_mean, train_score_std)`` where ``clusters`` is
        the most frequent number of clusters over the folds (smallest value on
        ties, noise label -1 not counted). The score statistics are NaN when no
        fold produced at least two clusters.
    """
    noise_label = -1
    cluster_counts = []
    train_scores = []

    for train_index, test_index in crossvalid.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        X_train, X_test, y_train, y_test = pipeline_v1_core(
            FSEL_METHOD, int(num_of_features),
            X_train, y_train, X_test, y_test
        )

        # Only the training fold is clustered: the test-fold score is not reported,
        # so refitting DBSCAN on X_test would be wasted work.
        clustering = DBSCAN(eps=eps, min_samples=int(min_samples), metric='l2')
        y_train_labels = clustering.fit_predict(X_train)

        # DBSCAN marks noise with -1; noise is not a cluster and must not be counted.
        found_labels = np.unique(y_train_labels)
        num_of_clusters = int(np.sum(found_labels != noise_label))
        cluster_counts.append(num_of_clusters)

        # Silhouette needs at least two groups. Noise points, if present, stay in
        # the score as their own label.
        if num_of_clusters > 1:
            train_scores.append(silhouette_score(X_train, y_train_labels, metric='euclidean'))

    # Mode of the per-fold cluster counts; np.unique sorts, so ties resolve to the smallest.
    values, counts = np.unique(cluster_counts, return_counts=True)
    clusters = float(values[np.argmax(counts)])

    if train_scores:
        train_score_mean = np.mean(train_scores)
        train_score_std = np.std(train_scores)
    else:
        # No fold could be scored; report NaN explicitly instead of relying on
        # numpy's empty-slice behaviour.
        train_score_mean = np.nan
        train_score_std = np.nan

    return clusters, train_score_mean, train_score_std


TRAINING_SET_RATIO = 0.6
FSEL_METHOD = mutual_info_classif   # f_classif, mutual_info_classif

# Frequency-domain features of 'az' only, with fault labels attached.
features = fault_labeling(load_fd_feat(['az'], path=FEATURES_PATH), FAULT_CLASSES)
crossvalid = StratifiedKFold(n_splits=5)
X = filter_out_metadata_columns(features)
y = features['fault']

# Hyper-parameter axes of the grid search.
num_of_features = np.arange(1, len(X.columns) + 1)
eps = np.linspace(0.05, 0.8, 8)
min_samples = np.linspace(3, 8, 2).astype(int)

# Cartesian product built as tuples so that the integer parameters keep their
# integer dtype (stacking them in one array would coerce everything to float).
# Loop nesting reproduces the previous row order: min_samples, then
# num_of_features, then eps varying fastest.
grid = [
    (f, e, s)
    for s in min_samples
    for f in num_of_features
    for e in eps
]

rows = []
for f, e, s in tqdm(grid):
    row = [f, e, s]
    row.extend(cross_validate_clustering_score(X, y, f, e, s))
    rows.append(row)

# Configurations without a valid silhouette score (NaN) are dropped.
results = pd.DataFrame(rows, columns=[
    'num_of_features', 'eps', 'min_samples', 'clusters',
    'train_score_mean', 'train_score_std'
]).dropna()

Top 10 parameter combinations ranked by mean training silhouette score

In [None]:
# Keep configurations with more than one cluster and show the ten best mean scores.
(
    results
    .loc[results['clusters'] > 1]
    .sort_values(by='train_score_mean', ascending=False)
    .head(10)
)