# DBSCAN

## References
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html?highlight=dbscan#sklearn.cluster.DBSCAN

## Step-1: Generate Some Data

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

num_centers = 5

# Draw 1000 two-dimensional sample points grouped around `num_centers` blobs.
# `y` carries one label per point and is used below to colour the plot.
X, y = make_blobs(
    centers=num_centers,
    n_features=2,
    n_samples=1000,
)

print('X.shape:', X.shape)
print('y.shape:', y.shape)

# Colour each point by its label in `y` so the groups can be compared
# against the clusters DBSCAN finds later on.
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    s=25,
    marker='o',
    edgecolor='k',
)
plt.show()

## Step-2: Run DBSCAN Clustering

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# Build the estimator first, then fit it on the generated points.
dbscan = DBSCAN(min_samples=2, eps=5)
dbscan.fit(X)
labels = dbscan.labels_

# The label -1 marks noise points; leave it out of the cluster count and
# tally it separately.
n_clusters = len(set(labels) - {-1})
n_noise = int((labels == -1).sum())

# Print a random sample of 20 labels (drawn without replacement).
print("labels[:few]", np.random.choice(labels, size=20, replace=False))
print("Estimated number of clusters: %d" % n_clusters)
print("Points that were not clustered: %d" % n_noise)

## Step-3: Visualize Clusters

In [None]:
# Fit the estimator on X and keep the cluster label assigned to each point.
y_pred = dbscan.fit_predict(X)

# Same scatter plot as in Step-1, but coloured by the predicted labels.
# Compare the colouring with the earlier plot: do the groups match?
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_pred,
)

## Experiment

```python
DBSCAN(eps=3, min_samples=2)
```

Change these parameters and re-run the algorithm. Does it change the predicted clusters?

## Iterate over a few values to find the optimal values

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import calinski_harabasz_score
import pandas as pd

# Grid-search DBSCAN over eps / min_samples. For every combination record the
# number of clusters, the number of noise points and the Calinski-Harabasz
# score, then gather everything into the DataFrame `df`.
eps_range = [0.5, 1, 1.5, 2, 2.5, 3]
min_samples_range = [2, 3, 5, 7, 10]

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame one
# row at a time copies the whole frame on every iteration.
results = []

for eps in eps_range:
    for min_sample in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_sample).fit(X)
        labels = dbscan.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)

        # The fitted labels are what fit_predict would return, so reuse them
        # instead of fitting the same model a second time.
        y_pred = labels

        # calinski_harabasz_score raises ValueError unless the number of
        # distinct labels is between 2 and n_samples - 1 (the noise label -1
        # is passed through and counts as a label here). Record NaN for such
        # degenerate clusterings so one bad combination does not abort the
        # whole search.
        n_labels = len(set(labels))
        if 1 < n_labels < len(labels):
            ch_score = calinski_harabasz_score(X, y_pred)
        else:
            ch_score = float('nan')

        print("eps={}, min_samples={}, num_clusters={},  not_clustered={}, calinski_score={}".format(
            eps, min_sample, n_clusters, n_noise, ch_score))

        results.append({
            'eps': eps,
            'min_samples': min_sample,
            'num_clusters': n_clusters,
            'not_clustered': n_noise,
            'calinski_score': ch_score,
        })

# One row per (eps, min_samples) combination, in the order they were tried.
df = pd.DataFrame(
    results,
    columns=['eps', 'min_samples', 'num_clusters', 'not_clustered', 'calinski_score'],
)

In [None]:
# Display the table of grid-search results, one row per (eps, min_samples) pair.
df

In [None]:
# Rank the parameter combinations: highest Calinski-Harabasz score first,
# ties broken by the fewest unclustered points.
df_sorted = df.sort_values(
    by=['calinski_score', 'not_clustered'],
    ascending=[False, True],
)
df_sorted

## Optimal Values

In [None]:
import numpy as np

# Position of the best-scoring row. Working on the raw NumPy array makes the
# result an unambiguous row position (not an index label), and nanargmax
# ignores any rows whose score is NaN.
max_idx = int(np.nanargmax(df['calinski_score'].to_numpy()))
print ("optimal max_idx", max_idx)

# Select the column first and the row second so each value keeps its column's
# dtype. Going through a whole row (df.iloc[i][col]) can upcast the integer
# min_samples to a float, which recent scikit-learn versions reject for
# DBSCAN's min_samples parameter.
eps = float(df['eps'].iloc[max_idx])
print ("optimal eps", eps)
min_samples = int(df['min_samples'].iloc[max_idx])
print ("optimal min_samples", min_samples)

In [None]:
from sklearn.cluster import DBSCAN

# Re-fit DBSCAN with the parameters selected from the search results.
# min_samples is cast to int because a value read back from a DataFrame row
# may be a float, and scikit-learn expects an integer here.
dbscan = DBSCAN(eps=eps, min_samples=int(min_samples)).fit(X)

labels = dbscan.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters)
print("Points that were not clustered: %d" % n_noise)

# The fitted labels are what fit_predict would return, so reuse them for the
# plot colours instead of fitting the same model a second time.
y_pred = labels
plt.scatter(X[:, 0], X[:, 1], c=y_pred)