In [11]:
import pandas as pd

# Load the cleaned telemetry, then keep the first 10619 rows of the six
# columns that are fed to the clustering step.
telemetry = pd.read_csv('./telemetry_cleaned.csv')
X = telemetry.head(10619)[['Easting', 'Northing', 'WaterDepth', 'Roll', 'Pitch', 'Heading']]

## DBSCAN

In [8]:
def num_of_clusters(db):
	"""Return how many distinct cluster labels `db.labels_` holds, not counting -1.

	`db` is any object exposing a `labels_` sequence (e.g. a fitted DBSCAN
	estimator). The label -1 is excluded from the count.
	"""
	return len({label for label in db.labels_ if label != -1})

### Grid Search function

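This function accepts an input dataset and a dictionary of options that maps each DBSCAN parameter (`eps`, `min_samples`) to a list of candidate values. It runs DBSCAN on every combination of these values and returns one result row per combination (the parameters, the number of clusters, the number of noise points, and both clustering scores), so that the best combination can be picked afterwards.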

In [9]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
import itertools

def grid_search_dbscan(data, options, verbose=False):
	"""Fit DBSCAN for every (eps, min_samples) combination and score each fit.

	Parameters
	----------
	data
		Samples to cluster. The same object is used both for fitting and for
		computing the clustering metrics.
	options : dict
		Maps 'eps' and 'min_samples' to iterables of candidate values. If
		those keys are absent, the first value list is taken as eps and the
		second as min_samples (positional form).
	verbose : bool, default False
		When true, print one line before and one line after each fit.

	Returns
	-------
	list of dict
		One row per combination with the keys 'eps', 'min_samples',
		'clusters', 'noise', 'silhouette' and 'davies_bouldin'. Both scores
		are NaN when the labelling cannot be scored (see below).
	"""
	# Bind the candidate lists by name so the result does not depend on the
	# insertion order of the dict; fall back to positional order otherwise.
	if 'eps' in options and 'min_samples' in options:
		eps_values, min_samples_values = options['eps'], options['min_samples']
	else:
		eps_values, min_samples_values = options.values()

	results = []

	for eps, min_samples in itertools.product(eps_values, min_samples_values):
		if verbose: print(f'Running eps={eps}, min_samples={min_samples}')

		db = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
		labels = db.labels_

		# Estimated number of clusters and number of points labelled -1 (noise)
		clusters = num_of_clusters(db)
		noise = int((labels == -1).sum())

		# Both metrics need between 2 and n_samples - 1 distinct labels and raise
		# otherwise; record NaN so one degenerate fit does not abort the search.
		# Note: the -1 (noise) label is scored as if it were a cluster of its own.
		n_labels = len(set(labels))
		if 2 <= n_labels <= len(labels) - 1:
			# Score the data that was actually clustered, not a notebook global
			silhouette = metrics.silhouette_score(data, labels)
			davies_bouldin = metrics.davies_bouldin_score(data, labels)
		else:
			silhouette = davies_bouldin = float('nan')

		if verbose: print(f'Finished with silhouette={silhouette}, davies={davies_bouldin}')

		results.append({
			'eps': eps,
			'min_samples': min_samples,
			'clusters': clusters,
			'noise': noise,
			'silhouette': silhouette,
			'davies_bouldin': davies_bouldin
		})

	return results

### Evaluation
The DBSCAN parameters are based on the scikit-learn [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

The unsupervised clustering metrics are based on the scikit-learn [documentation](https://scikit-learn.org/stable/modules/classes.html#clustering-metrics).

In [10]:
# Candidate DBSCAN hyper-parameters; every eps is paired with every min_samples
options = dict(
	eps=[0.13, 0.14, 0.15, 0.16, 0.17],
	min_samples=[8, 9, 10, 11, 12],
)

# verbose=True prints a progress line before and after each combination
results = grid_search_dbscan(X, options, verbose=True)

Running eps=0.13, min_samples=8
Finished with silhouette=-0.5886320830179866, davies=1.2538953420747236
Running eps=0.13, min_samples=9
Finished with silhouette=-0.5856643896916622, davies=1.3725254232080082
Running eps=0.13, min_samples=10


KeyboardInterrupt: 

In [5]:
# Show the first grid-search row to check the structure of a result entry
results[0]

{'eps': 0.13,
 'min_samples': 8,
 'clusters': 9,
 'noise': 10532,
 'silhouette': -0.5886320830179866,
 'davies_bouldin': 1.2538953420747236}

We can then use the best parameters to perform DBSCAN and apply the labels to the dataset:

In [None]:
# Pick the grid-search row with the highest silhouette score (higher is better).
# Rows whose score is NaN are skipped; NaN is the only value not equal to itself.
scored = [row for row in results if row['silhouette'] == row['silhouette']]
if not scored:
	raise ValueError('No grid-search result has a usable silhouette score')
best = max(scored, key=lambda row: row['silhouette'])

# Refit DBSCAN with the selected parameters and attach the labels to a copy of X,
# leaving X itself untouched.
db = DBSCAN(eps=best['eps'], min_samples=best['min_samples']).fit(X)
X_labeled = X.copy(deep=True)
X_labeled['Label'] = db.labels_
X_labeled.head(5)

We can then filter out rows with a label of -1 (the label DBSCAN assigns to noise points) to denoise the dataset.

In [None]:
# Keep only the rows whose label is not -1, then report how many rows were dropped
X_denoised = X_labeled.loc[X_labeled['Label'].ne(-1)]
print(f'{X_labeled.shape[0] - X_denoised.shape[0]} points removed')

### Plot results
The following code was adapted from the [scikit-learn DBSCAN example](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py).

In [None]:
import numpy as np

def plot_clusters(db, data=None):
	"""Scatter-plot a fitted DBSCAN result on the Easting/Northing plane.

	Core samples are drawn with large markers and the remaining members of
	each label with small markers. Points labelled -1 (noise) are black.

	Parameters
	----------
	db
		Fitted estimator exposing `labels_` and `core_sample_indices_`.
	data : DataFrame, optional
		The frame `db` was fitted on; must have 'Easting' and 'Northing'
		columns and one row per label. Defaults to the notebook-level `X`, so
		existing `plot_clusters(db)` calls behave as before.

	Raises
	------
	ValueError
		If `data` does not have exactly one row per entry of `db.labels_`.

	NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible
	cells - confirm that an earlier cell imports it.
	"""
	if data is None:
		data = X

	labels = db.labels_
	if len(data) != len(labels):
		raise ValueError(
			f'data has {len(data)} rows but db has {len(labels)} labels; '
			'pass the frame that db was fitted on'
		)

	unique_labels = set(labels)
	is_core = np.zeros_like(labels, dtype=bool)
	is_core[db.core_sample_indices_] = True

	colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

	for k, color in zip(unique_labels, colors):
		if k == -1:
			# Black used for noise
			color = [0, 0, 0, 1]

		in_label = labels == k

		# Core samples first (marker size 10), then the non-core members (size 3)
		for mask, size in ((in_label & is_core, 10), (in_label & ~is_core, 3)):
			points = data[mask]
			plt.plot(
				points['Easting'],
				points['Northing'],
				"o",
				markerfacecolor=tuple(color),
				markeredgecolor='k',
				markersize=size,
			)

	plt.title(f'Estimated clusters: {num_of_clusters(db)}')
	plt.show()

In [None]:
# Plot the labelled points of the fitted DBSCAN model (noise is drawn in black)
plot_clusters(db)

After denoising:

In [None]:
# NOTE(review): plot2d is not defined in the visible cells - confirm an earlier
# cell defines or imports it before relying on Restart & Run All.
plot2d(X_denoised)

## Exporting Data

Finally, we can export the denoised data to CSV format to be used by the API.

In [None]:
# Preview the first five denoised rows before exporting
X_denoised.head(5)

In [None]:
# Re-add missing columns to keep Api code happy.
# DataFrame.assign returns a new frame, so its result has to be kept.
# Placeholders are only added for columns that are absent, so measured values
# already present in X_denoised are never overwritten by a constant.
placeholder_values = {
	'Date': '20-02-27',
	'Time': '20:50:47.502',
	'Roll': -6.3,
	'Pitch': 2.0,
	'Heading': 19.9,
}
missing_columns = {
	column: value
	for column, value in placeholder_values.items()
	if column not in X_denoised.columns
}
X_export = X_denoised.assign(**missing_columns)

X_export.to_csv('../Api/src/data/telemetry_denoised.csv', index=False)