# Lecture 11 – Clustering, Introduction to Probability

## DSC 40A, Fall 2021

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from kmeans_40a import kMeans

Here's a dataset:

In [None]:
# Small hand-made 2-D dataset: seven points with coordinates (x1, x2).
example_data = pd.DataFrame({
    'x1': [2, 3, -4, -8, -7, 5, -5],
    'x2': [1, 4, -5, -6, -2, 3, -7],
})
example_data

In [None]:
# Scatter the example points. Every point gets the same `size` value (1.0),
# so `size_max=15` is what controls the marker size.
px.scatter(
    example_data,
    x='x1',
    y='x2',
    size=np.ones(len(example_data)),
    size_max=15,
)

Let's visualize the execution of the k-Means Clustering algorithm on this dataset, with $k = 2$ and initial centroids $\mu_1 = (2, 1)$ and $\mu_2 = (3, 4)$.

In [None]:
# Set up k-means on the example data with k = 2, starting from the
# centroids (2, 1) and (3, 4) -- these coincide with the first two data points.
# NOTE(review): num_iters=5 is presumably the number of update steps that
# get run/animated -- confirm against kmeans_40a.
clustering = kMeans(
    example_data,
    k=2,
    num_iters=5,
    initial_centroids=np.array([[2, 1], [3, 4]]),
)

In [None]:
# Display the animation for the `clustering` object built in the previous cell.
# NOTE(review): presumably steps through each k-means iteration -- see kmeans_40a.
clustering.show_full_animation()

### New example dataset

In [None]:
# Load the second example dataset and divide its x2 column by 10.
# NOTE(review): the factor of 10 is not explained here -- presumably it brings
# x2 onto a scale comparable to x1; confirm.
X = pd.read_csv('data/k_means_data.csv').assign(x2=lambda raw: raw['x2'] / 10)
X

In [None]:
# Scatter plot of the loaded points: x1 on the horizontal axis, x2 on the vertical.
px.scatter(X, x='x1', y='x2')

### Initialization

Sometimes, all is well:

In [None]:
# Seed NumPy's global RNG so this run is repeatable.
# NOTE(review): no initial centroids are passed, so kMeans presumably draws
# them at random -- confirm in kmeans_40a.
np.random.seed(144)
lucky_run = kMeans(X, k=3, num_iters=3)
lucky_run.show_full_animation()

Sometimes, we're not so lucky...

In [None]:
# Same setup with a different seed (and num_iters=10 instead of 3), to show a
# less fortunate run.
# NOTE(review): presumably the seed changes the random initial centroids -- confirm.
np.random.seed(42)
unlucky_run = kMeans(X, k=3, num_iters=10)
unlucky_run.show_full_animation()

### Choosing a value of $k$

In [None]:
# Fix the seed of NumPy's global RNG so that this cell gives the same result
# each time it is run.
np.random.seed(1000)

# Candidate numbers of clusters: k = 1, 2, ..., 7.
ks = np.arange(1, 8)

# Collect one inertia value per k in a plain list and convert to an array once
# at the end; np.append returns a fresh copy of the whole array on every call,
# so growing an array inside the loop does needless re-copying.
inertia_values = []
for k in ks:
    clustering = kMeans(X, k=k)
    clustering.iterate()
    # assumes inertia() returns a single number for the fitted clustering -- TODO confirm
    inertia_values.append(clustering.inertia())

# float64 array, matching what appending onto an empty np.array([]) produces.
inertias = np.array(inertia_values, dtype=float)

In [None]:
# Line plot of inertia against k; the `labels` mapping replaces the default
# 'x' / 'y' axis titles with descriptive ones.
px.line(x=ks, y=inertias, labels={'x': 'k (number of clusters)', 'y': 'inertia'})