# H.04 | Clustering

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import datasets as sklearn_datasets

## Principal Component Analysis

**TODO:** Come up with an exercise for this section.

In [29]:
# Synthetic 3D Gaussian cloud: standard-normal samples, then stretched 10x
# along the first axis and 5x along the second (the third axis is unscaled).
np.random.seed(0)
n_samples = 1000
axis_scales = np.array([10, 5, 1])
data = np.random.normal(size=(n_samples, 3)) * axis_scales

# 3D scatter of the samples, one coordinate column per plot axis.
x_vals, y_vals, z_vals = data.T
fig = px.scatter_3d(x=x_vals, y=y_vals, z=z_vals)

# Shrink the markers so 1000 points stay readable.
fig.update_traces(marker=dict(size=1))

# Figure-level styling, with the same [-10, 10] window on every axis so the
# three directions are drawn on a common scale.
# NOTE(review): the first axis has a standard deviation of 10, so this window
# crops a sizeable share of the points -- confirm that is intended.
fig.update_layout(
    title="Original Data",
    template='plotly_white',
    width=800,
    height=800,
    scene={axis: dict(range=[-10, 10]) for axis in ('xaxis', 'yaxis', 'zaxis')},
)

# Axis captions for the 3D scene.
fig.update_layout(
    scene=dict(
        xaxis_title='Number of Rooms',
        yaxis_title='Number of Bathrooms',
        zaxis_title='Cost',
    )
)

fig.show()


In [43]:
# Project the 3D samples onto their first two principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
transformed_data = pca.fit(data).transform(data)  # fit() returns the estimator itself

# Scatter the 2D projection: first component on x, second on y.
# NOTE(review): marker size is left at the plotly default here, whereas the
# 3D plot of the original data uses size 1 -- confirm which is intended.
first_component, second_component = transformed_data.T
fig = px.scatter(
    x=first_component,
    y=second_component,
    title="PCA Projection",
    template='plotly_white',
)
fig.show()

## Unsupervised Learning

In this section, you will be asked to implement the following functions in `clustering.py`:

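1. **k_means**: Implement the K-means algorithm. The function takes the data points, the number of clusters K, and the initial centroids (plus an optional maximum number of iterations) as input and returns the centroids of the K clusters together with the cluster label of each data point. The algorithm should stop when the centroids no longer change or the maximum number of iterations is reached.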
2. **pairwise_manhattan_distance**: Implement a pairwise Manhattan distance function. The function takes two sets of data points as input and returns the pairwise Manhattan distance between the two sets of data points.
3. **local_silhouette_score**: Implement the silhouette score. The function takes the data points, the cluster labels, and the distance metric as input and returns the silhouette score of the clustering.
4. **dbscan**: Use the DBSCAN algorithm from sklearn. The function takes the data points, the epsilon value, and the minimum number of points as input and returns the cluster labels.

In [2]:
from clustering import k_means, local_silhouette_score

# Three-blob toy dataset for K-means.
X, y = sklearn_datasets.make_blobs(n_samples=500, centers=3, random_state=111)

# Use three of the samples (k = 3) as the initial centers.
start_points = X[[1, 2, 3]]

# Cluster (a copy of the start points is handed over, so `start_points`
# still holds the initial positions for plotting) and score the result.
centers, labels = k_means(X, 3, start_points.copy(), max_iter=5)
score = local_silhouette_score(X, labels, metric="euclidean")

# Shared look for the highlighted start/center points.
highlight_marker = dict(size=10, color='hotpink', line=dict(width=2, color='DarkSlateGrey'))

# Left panel: raw data with start points; right panel: data colored by
# assigned label with the fitted centers. List order is the draw order.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Original Data", f"K Means Clustering | Score {score:.2f}"),
)
panel_traces = [
    (go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', name="Data"), 1),
    (go.Scatter(x=start_points[:, 0], y=start_points[:, 1], mode='markers',
                marker=highlight_marker, name="Start Points"), 1),
    (go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', name="Data",
                marker=dict(color=labels)), 2),
    (go.Scatter(x=centers[:, 0], y=centers[:, 1], mode='markers',
                marker=highlight_marker, name="KMeans Centers"), 2),
]
for trace, col in panel_traces:
    fig.add_trace(trace, row=1, col=col)

fig.update_layout(
    title='K Means Clustering',
    height=600,
    width=1200,
    template='plotly_white',
    showlegend=True,
)
fig.show()

In [3]:
from clustering import dbscan

# Two-ring dataset (inner ring at half the outer radius, light noise) used to
# compare K-means against DBSCAN.
X, y = sklearn_datasets.make_circles(n_samples=500, noise=0.02, factor=0.5, random_state=111)

# KMeans for comparison, using two of the samples (k = 2) as initial centers.
# A copy is passed so `start_points` keeps the initial positions.
start_points = X[[1, 2], :]
centers, kmeans_labels = k_means(X, 2, start_points.copy())

# DBSCAN
dbscan_labels = dbscan(X, eps=0.1, min_samples=5)

# Plot KMeans and DBSCAN Results: raw data | K-means labels + centers | DBSCAN labels.
fig = make_subplots(rows=1, cols=3, subplot_titles=("Original Data", "K-Means Clustering", "DBSCAN Clustering"))
fig.add_trace(go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', name = "Data"), row=1, col=1)
# Hover text must use this cell's K-means result (`kmeans_labels`), matching
# the marker colors; a `labels` array left over from an earlier clustering of
# different data would show unrelated cluster ids.
fig.add_trace(go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', text = kmeans_labels, name = "K-Means", marker=dict(color=kmeans_labels)), row=1, col=2)
fig.add_trace(go.Scatter(x=centers[:, 0], y=centers[:, 1], mode='markers', marker=dict(size=10, color='blue', line=dict(width=2, color='DarkSlateGrey')), name = "KMeans Centers"), row=1, col=2)
fig.add_trace(go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', name = "DBSCAN", marker=dict(color=dbscan_labels)), row=1, col=3)
fig.update_layout(title='K-Means vs DBScan Clustering', height=600, width=1200, template='plotly_white', showlegend=True)
fig.show()
