# MVD 8. cvičení

## 1. část - Vytvoření dat

Použijte stejný kód z minulého cvičení pro vytvoření dat. Navíc vytvořte jeden větší dataset, ve kterém sjednotíte výstupy různých funkcí (např. make_blobs + make_circles). 

In [382]:
from sklearn.datasets import make_circles, make_moons, make_blobs
import numpy as np
import pandas as pd
import plotly.express as px

In [383]:
def make_clusters(centers, n_data_points, center_box=(-10.0, 10.0)):
    """Generate blob data and return it as a labelled DataFrame.

    The arguments are forwarded to ``make_blobs``. The first two coordinates
    of every generated sample become the ``x`` and ``y`` columns; the blob
    index of each sample is stored, converted to ``str``, in ``label``.
    """
    points, blob_ids = make_blobs(n_data_points, centers=centers, center_box=center_box)

    frame = pd.DataFrame({'x': points[:, 0], 'y': points[:, 1], 'label': blob_ids})
    # String labels, not integers, are what the rest of the notebook works with.
    frame['label'] = frame['label'].apply(str)
    return frame

In [384]:
def display_clusters(X):
    """Show a scatter plot of the first two columns of ``X``.

    Columns 0 and 1 are converted to ``float64`` and used as the x / y
    coordinates. When ``X`` has more than two columns, column 2 is used as
    the ``label`` that colours the points.
    """
    columns = {
        'x': X[:, 0].astype('float64'),
        'y': X[:, 1].astype('float64'),
    }
    has_labels = X.shape[1] > 2
    if has_labels:
        columns['label'] = X[:, 2]

    # Only ask plotly to colour by label when a label column exists.
    colour_args = {'color': 'label'} if has_labels else {}
    px.scatter(pd.DataFrame(columns), x='x', y='y', **colour_args).show()

In [385]:
# 5 x 5 grid of blob centres, spaced 10 apart on both axes.
centers_25 = [(i * 10, k * 10) for i in range(5) for k in range(5)]

datasets = [
    # two blobs
    make_clusters(centers=[(0, 0), (0, 10)], n_data_points=100)[['x', 'y']].to_numpy(),
    # four blobs on the corners of a square
    make_clusters(
        centers=[(0, 0), (0, 10), (10, 10), (10, 0)],
        n_data_points=200,
    )[['x', 'y']].to_numpy(),
    # 25 blobs on the grid defined above
    make_clusters(centers=centers_25, n_data_points=1000)[['x', 'y']].to_numpy(),
    # two moons
    make_moons(200, noise=.03)[0],
    # two concentric circles
    make_circles(200, noise=.03, factor=.1)[0],
    # larger combined dataset: circles stacked with moons shifted by 0.5
    np.vstack([
        make_circles(200, noise=.03, factor=.5)[0],
        make_moons(200, noise=.03)[0] + 0.5,
    ]),
]

In [386]:
# Plot every generated dataset. The arrays only hold the x / y columns, so
# display_clusters draws them without a label colouring.
for X in datasets:
    display_clusters(X)

## 2. část - Implementace DBSCAN algoritmu
Dle přednášky implementujte DBSCAN algoritmus.

In [387]:
# | x | y |
# |...|...|
def knn(X):
    """Return the pairwise squared Euclidean distances between the rows of ``X``.

    Despite its name the function does no neighbour search: entry ``[i, j]``
    of the returned square ``float64`` matrix is the sum of squared
    coordinate differences between row ``i`` and row ``j`` (no square root is
    taken).
    """
    # Broadcast every row against every other row: shape (n, n, n_columns).
    offsets = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    return np.sum(offsets ** 2, axis=2).astype('float64')

In [388]:
def _link_core_points(core_distances, eps):
    """Assign a 0-based cluster id to every core point.

    ``core_distances`` is the square matrix of distances between core points
    only. Two core points end up in the same cluster when they are connected
    by a chain of core points whose consecutive distances are below ``eps``.
    Clusters are numbered in the order in which they are discovered, starting
    from the first core point.
    """
    n_core = core_distances.shape[0]
    cluster_of = np.full(n_core, -1, dtype=int)
    if n_core == 0:
        return cluster_of

    placed = np.zeros(n_core, dtype=bool)
    # For every core point: distance to, and position of, the closest core
    # point that has already been placed into a cluster.
    nearest_dist = np.full(n_core, np.inf)
    nearest_src = np.zeros(n_core, dtype=int)

    newest = 0
    cluster_of[newest] = 0
    placed[newest] = True
    n_clusters = 1

    for _ in range(n_core - 1):
        # Account for the core point placed last.
        row = core_distances[newest]
        closer = row < nearest_dist
        nearest_dist[closer] = row[closer]
        nearest_src[closer] = newest

        # Take the unplaced core point that is closest to any placed one.
        unplaced = np.flatnonzero(~placed)
        newest = unplaced[np.argmin(nearest_dist[unplaced])]

        if nearest_dist[newest] >= eps:
            # Nothing unplaced is within reach of the placed points, so every
            # existing cluster is complete -> open a new one.
            cluster_of[newest] = n_clusters
            n_clusters += 1
        else:
            cluster_of[newest] = cluster_of[nearest_src[newest]]
        placed[newest] = True

    return cluster_of


def dbscan(X, eps, min_pts):
    """Cluster the rows of ``X`` with a DBSCAN-style procedure.

    Distances come from ``knn`` and are therefore *squared* Euclidean
    distances; ``eps`` is compared against them directly.

    Point categories:
      * core         - at least ``min_pts`` *other* points lie closer than
                       ``eps`` (the point itself is not counted);
      * accessible   - not core, but closer than ``eps`` to some core point;
      * inaccessible - everything else (noise).

    Core points connected through chains of core points form a cluster. An
    accessible point joins the lowest-numbered cluster that has a core point
    closer than ``eps``; accessible points never pull in other accessible
    points.

    Returns ``X`` stacked horizontally with three string columns:
      1. cluster label: ``'1'``, ``'2'``, ... or ``'no cluster'``;
      2. point type: ``'core'``, ``'accessible'`` or ``'inaccessible'``;
      3. distance to the ``min_pts``-th nearest other point (``'inf'`` when
         the dataset has too few points for that neighbour to exist).
    Because of the string columns the whole returned array has a string dtype.
    """
    n_points = X.shape[0]
    distances = knn(X)

    # Column 0 of each sorted row is the point itself (distance 0), so column
    # `min_pts` is the distance to the min_pts-th nearest *other* point.
    if min_pts < n_points:
        kth_distances = np.sort(distances, axis=1)[:, min_pts]
    else:
        # Not enough points for any of them to have min_pts neighbours.
        kth_distances = np.full(n_points, np.inf)

    # Categorise the points.
    core_idx = np.flatnonzero(kth_distances < eps)
    near_core_idx = np.flatnonzero((distances[core_idx, :] < eps).any(axis=0))
    accessible_idx = np.setdiff1d(near_core_idx, core_idx)
    inaccessible_idx = np.setdiff1d(
        np.arange(n_points), np.union1d(core_idx, accessible_idx)
    )

    # Cluster the core points first ...
    core_cluster = _link_core_points(distances[np.ix_(core_idx, core_idx)], eps)
    cluster_ids = np.zeros(n_points, dtype=int)
    cluster_ids[core_idx] = core_cluster + 1

    # ... then attach every accessible point to a cluster via *core* points
    # only, so that border points cannot chain clusters together.
    for point in accessible_idx:
        reachable_clusters = core_cluster[distances[core_idx, point] < eps]
        cluster_ids[point] = reachable_clusters.min() + 1

    # Build the label columns; the explicit width keeps the longest label and
    # full float representations from being truncated.
    labels = np.empty((n_points, 3), dtype='<U32')
    labels[:, 0] = cluster_ids.astype('str')
    labels[inaccessible_idx, 0] = 'no cluster'
    labels[inaccessible_idx, 1] = 'inaccessible'
    labels[core_idx, 1] = 'core'
    labels[accessible_idx, 1] = 'accessible'
    labels[:, 2] = kth_distances.astype('str')

    return np.hstack([X, labels])

## 3. část - Vyhodnocení 
Aplikujte Váš DBSCAN na vytvořené datasety. Experimentálně najděte parametry ```Eps``` a ```MinPts```.

In [395]:
# Try the implementation on the last dataset (circles stacked with the
# shifted moons) and plot the resulting clusters.
X_labelled = dbscan(datasets[-1], eps=.03, min_pts=3)
display_clusters(X_labelled)

In [392]:
# One (eps, min_pts) pair per entry of `datasets`, in the same order.
# NOTE: dbscan compares eps with the *squared* distances returned by knn, and
# min_pts is used as an index into each row of sorted distances (where
# position 0 is the point itself).
configs = [
    (4., 5),
    (4., 5),
    (4., 5),
    (0.02, 3),
    (0.1, 3),
    (.03, 3),
]

# Cluster and plot every dataset with its own parameters.
for i, X in enumerate(datasets):
    eps, min_pts = configs[i]
    X_labelled = dbscan(X, eps=eps, min_pts=min_pts)
    display_clusters(X_labelled)

## 4. část - Porovnání s výstupem z knihovny

Porovnejte výstup s výstupem z knihovny sklearn. Dokumentaci naleznete [zde](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

In [393]:
from sklearn.cluster import DBSCAN

In [394]:
# Run scikit-learn's DBSCAN with parameters equivalent to the ones tuned for
# the custom dbscan():
#   * dbscan() compares *squared* Euclidean distances with eps, whereas
#     sklearn compares plain Euclidean distances -> pass sqrt(eps);
#   * dbscan() requires min_pts *other* points within reach, whereas
#     sklearn's min_samples also counts the point itself -> pass min_pts + 1.
# (A remaining difference: dbscan() uses a strict `<`, sklearn uses `<=`.)
for i, X in enumerate(datasets):
    eps, min_pts = configs[i]
    model = DBSCAN(eps=np.sqrt(eps), min_samples=min_pts + 1)
    labels = model.fit_predict(X)
    # Append the labels as a string column, like the custom dbscan() output.
    X_labelled = np.hstack([X, labels[None].T.astype('str')])
    #display_clusters(X_labelled)

# Not rendered, because the PC could not handle it