In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to imported source files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import sys
from sklearn.datasets import make_blobs
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio
import torch

# Seed reused below for make_blobs, the torch generators and KMeans.
STATE = 2023

# Synthetic dataset: number of blob centres, number of samples, per-blob std.
N_CLUST, N_SAMPLES, STD = 3, 100, 1.0

# Plot appearance shared by the figures in this notebook.
TEMPLATE = "ggplot2"
PALETTE = px.colors.qualitative.Plotly
MARKER = {"color": "black", "size": 14}  # marker style for the fitted-centroid trace
figKwargs = {
    "width": 600,
    "height": 500,
    "color_discrete_sequence": PALETTE,
}
pio.templates.default = TEMPLATE

# Put the parent directory on the import path; later cells import from `ml.*`,
# which presumably lives there.
sys.path.append("../")


In [3]:
# Generate the blobs, then standardise each feature column to zero mean and
# unit standard deviation before converting to a torch tensor.
X, y = make_blobs(
    n_samples=N_SAMPLES,
    centers=N_CLUST,
    cluster_std=STD,
    random_state=STATE,
)
feature_mean = X.mean(axis=0, keepdims=True)
feature_std = X.std(axis=0, keepdims=True)
X = torch.from_numpy((X - feature_mean) / feature_std)

# String labels make plotly treat the blob id as a discrete colour category.
color = y.astype(str)

fig = px.scatter(x=X[:, 0], y=X[:, 1], color=color, **figKwargs)
fig.show()


In [4]:
from ml.cluster.functional import init_centroids, plus_plus_init_centroids
from ml.distance.functional import eucledian_dist, eucledian_dist_loops


# Both initialisers receive the same seeded generator object, naive first.
generator = torch.manual_seed(STATE)
naive_init = init_centroids(X, N_CLUST, generator)
plus_plus_init = plus_plus_init_centroids(X, N_CLUST, generator)

# Trace styling for the two sets of starting centroids.
naive_kwargs = dict(marker=dict(size=14, color="cyan"), name="Naive init")
plus_plus_kwargs = dict(marker=dict(size=14, color="black"), name="Plus plus init")

# Overlay both sets of initial centroids on the coloured sample scatter.
fig = px.scatter(x=X[:, 0], y=X[:, 1], color=color, **figKwargs)
for init_points, trace_kwargs in (
    (naive_init, naive_kwargs),
    (plus_plus_init, plus_plus_kwargs),
):
    fig.add_trace(
        go.Scatter(
            x=init_points[:, 0],
            y=init_points[:, 1],
            mode="markers",
            **trace_kwargs
        )
    )
fig.show()


In [5]:
from ml.cluster import KMeans

# Fit the project's KMeans on the standardised samples and label every point.
clusterer = KMeans(N_CLUST, random_state=STATE)
clusterer.fit(X)
labels = clusterer.predict(X)

# Predicted labels as strings so plotly colours them as discrete categories.
point_colors = labels.numpy().astype("str")
centroid_trace = go.Scatter(
    x=clusterer.centroids[:, 0],
    y=clusterer.centroids[:, 1],
    mode="markers",
    name="centroids",
    marker=MARKER,
)

fig = px.scatter(x=X[:, 0], y=X[:, 1], color=point_colors, **figKwargs)
fig.add_trace(centroid_trace)
fig.show()


### Animation

In [6]:
import pandas as pd
import numpy as np

from ml.cluster.functional import (
    recalculate_centroids,
    find_closest_centroid,
    _k_means,
    init_centroids,
    plus_plus_init_centroids,
)
# Centroid initialiser used to seed the animation. To animate the plus-plus
# initialisation instead, uncomment the next line and remove the one after it.
# init_fn = plus_plus_init_centroids
init_fn = init_centroids
# Number of animation frames: the initial centroids plus max_iter - 1
# successive `_k_means` updates.
max_iter = 20

In [7]:
# Build the long-format table behind the k-means animation. Each of the
# `max_iter` frames contains every sample (coloured by its closest centroid at
# that iteration) plus that iteration's centroid positions.
if max_iter < 1:
    raise ValueError(f"max_iter must be >= 1 to build at least one frame, got {max_iter}")

# Frame 0 holds the initial centroids; each later frame applies `_k_means` to
# the previous centroids (presumably a single update step -- confirm in
# ml.cluster.functional).
all_centroids = [init_fn(X, N_CLUST, torch.manual_seed(STATE))]
df = pd.DataFrame(X.numpy(), columns=["x1", "x2"])
point_frames = []
for step in range(max_iter):
    if step > 0:
        all_centroids.append(_k_means(X, all_centroids[-1]))
    dists = eucledian_dist(X, all_centroids[-1])
    labels = find_closest_centroid(dists)
    # The frame id comes straight from the loop index, so this also works when
    # max_iter == 1 and does not depend on a variable leaking out of the loop.
    point_frames.append(
        df.assign(color=labels.numpy().astype(str), iter=float(step), size=1)
    )

# Flatten the centroid trajectory into one row per (iteration, centroid),
# ordered iteration-major to line up with the `iter` and `color` columns below.
all_centroids = torch.stack(all_centroids)
n_centroids = all_centroids.shape[1]
data = pd.DataFrame(
    all_centroids.reshape(-1, all_centroids.shape[-1]).numpy(),
    columns=["x1", "x2"],
)
data["iter"] = np.repeat(np.arange(max_iter, dtype=float), n_centroids)
data["size"] = 14  # drawn larger than the samples, whose "size" is 1
data["color"] = np.tile(np.arange(n_centroids), max_iter).astype(str)

# Samples first, centroids last; original row indices are kept as-is.
dfs = pd.concat(point_frames + [data])

# One palette entry per distinct cluster label; wrap around so more clusters
# than palette colours does not raise IndexError.
colormap = {k: PALETTE[int(k) % len(PALETTE)] for k in dfs["color"].unique()}
dfs.head()

Unnamed: 0,x1,x2,color,iter,size
0,1.053744,-1.320675,2,0.0,1
1,1.03043,-1.179708,2,0.0,1
2,-0.724333,0.080079,2,0.0,1
3,0.828432,-1.322086,2,0.0,1
4,-1.067982,-0.444162,2,0.0,1


In [8]:
# Animated scatter: one frame per "iter" value, marker size taken from the
# "size" column, colours fixed per cluster label via `colormap`.
animation_kwargs = dict(
    x="x1",
    y="x2",
    animation_frame="iter",
    size="size",
    color="color",
    color_discrete_map=colormap,
)
fig = px.scatter(dfs, **animation_kwargs, **figKwargs)
fig.show()