"""
===============================
K-means clustering - NumPy API
===============================
The :meth:`.argmin()` reduction supported by KeOps :mod:`LazyTensors <pykeops.common.lazy_tensor.LazyTensor>` allows us
to perform **bruteforce nearest neighbor search** with four lines of code.
It can thus be used to implement a **large-scale**
`K-means clustering <https://en.wikipedia.org/wiki/K-means_clustering>`_,
**without memory overflows**.
.. note::
For large and high dimensional datasets, this script
**is outperformed by its PyTorch counterpart**
which avoids transfers between CPU (host) and GPU (device) memories.
"""
#########################################################################
# Setup
# -----------------
# Standard imports:

import time

import numpy as np
from matplotlib import pyplot as plt

from pykeops.numpy import LazyTensor
from pykeops.numpy.utils import IsGpuAvailable

dtype = 'float32'  # May be 'float32' or 'float64'
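
##########################################################################
# Before diving into K-means, here is a minimal sketch (an addition, not
# part of the original example) of the four-line nearest neighbor search
# mentioned in the header: build two symbolic LazyTensors, form the matrix
# of squared distances, and reduce it with an argmin. The query points ``q``
# and reference points ``y`` below are hypothetical toy data:

q = np.random.randn(7, 2).astype(dtype)  # 7 hypothetical query points
y = np.random.randn(5, 2).astype(dtype)  # 5 hypothetical reference points
q_i = LazyTensor(q[:, None, :])          # (7, 1, 2) symbolic queries
y_j = LazyTensor(y[None, :, :])          # (1, 5, 2) symbolic references
nn = ((q_i - y_j) ** 2).sum(-1).argmin(axis=1).astype(int).reshape(7)  # nearest reference per query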
##########################################################################
# Simple implementation of the K-means algorithm:
def KMeans(x, K=10, Niter=10, verbose=True):
    N, D = x.shape  # Number of samples, dimension of the ambient space

    # K-means loop:
    # - x  is the point cloud,
    # - cl is the vector of class labels,
    # - c  is the cloud of cluster centroids.
    start = time.time()
    c = np.copy(x[:K, :])  # Simplistic random initialization
    x_i = LazyTensor(x[:, None, :])  # (Npoints, 1, D)

    for i in range(Niter):
        c_j = LazyTensor(c[None, :, :])  # (1, Nclusters, D)
        D_ij = ((x_i - c_j) ** 2).sum(-1)  # (Npoints, Nclusters) symbolic matrix of squared distances
        cl = D_ij.argmin(axis=1).astype(int).reshape(N)  # Points -> Nearest cluster

        Ncl = np.bincount(cl).astype(dtype)  # Class weights
        for d in range(D):  # Compute the cluster centroids with np.bincount:
            c[:, d] = np.bincount(cl, weights=x[:, d]) / Ncl

    end = time.time()

    if verbose:
        print("K-means example with {:,} points in dimension {:,}, K = {:,}:".format(N, D, K))
        print('Timing for {} iterations: {:.5f}s = {} x {:.5f}s\n'.format(
            Niter, end - start, Niter, (end - start) / Niter))

    return cl, c
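
##########################################################################
# The centroid update above may look cryptic: with a ``weights`` argument,
# ``np.bincount`` sums the weights per label, so each call returns the
# per-cluster sum of one coordinate, which we then divide by the cluster
# sizes ``Ncl``. A toy check with hypothetical labels (an addition, not
# part of the original script) makes this explicit:

toy_cl = np.array([0, 1, 0, 2])             # cluster labels of 4 points
toy_x = np.array([1.0, 2.0, 3.0, 4.0])      # one coordinate of each point
print(np.bincount(toy_cl, weights=toy_x))   # [4. 2. 4.]: per-cluster sums
print(np.bincount(toy_cl))                  # [2 1 1]:    cluster sizes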
###############################################################
# K-means in 2D
# ----------------------
# First experiment with N=10,000 points in dimension D=2, with K=50 classes:
#
N, D, K = 10000, 2, 50
###############################################################
# Define our dataset: a Gaussian blob, rescaled so that the points
# fall (mostly) within the unit square:

x = np.random.randn(N, D).astype(dtype) / 6 + .5
##############################################################
# Perform the computation:
cl, c = KMeans(x, K)
##############################################################
# Fancy display:
plt.figure(figsize=(8, 8))
plt.scatter(x[:, 0], x[:, 1], c=cl, s=30000 / len(x), cmap="tab10")
plt.scatter(c[:, 0], c[:, 1], c='black', s=50, alpha=.8)
plt.axis([0, 1, 0, 1]);
plt.tight_layout();
plt.show()
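
##############################################################
# As a quick sanity check (an addition, not part of the original script),
# we can measure the average squared distance from each point to its
# assigned centroid; this is the quantity that Lloyd's algorithm
# decreases at every iteration:

mse = ((x - c[cl]) ** 2).sum(-1).mean()  # c[cl] is the (N, D) array of assigned centroids
print("Mean squared distance to the assigned centroid: {:.5f}".format(mse))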
####################################################################
# K-means in dimension 100
# -------------------------
# Second experiment with N=1,000,000 points in dimension D=100, with K=1,000 classes:
if IsGpuAvailable():
    N, D, K = 1000000, 100, 1000
    x = np.random.randn(N, D).astype(dtype)
    cl, c = KMeans(x, K)