# KMeans

In [1]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [2]:
from sklearn.datasets import make_blobs

# Size of the synthetic dataset and of the blobs it is drawn from.
n_rows = 1_000_000
n_cols = 500
n_clusters_data = 200
cluster_std = 1.0
dtype = "float32"

# make_blobs creates a random dataset of isotropic gaussian blobs.
# The second return value is discarded; only the feature matrix is kept.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
)

# Cast the generated array to the configured dtype.
data = data.astype(dtype)

### Convert dataset to Spark DataFrame

In [3]:
from pyspark.sql import SparkSession

# `spark` is not defined anywhere in this notebook. getOrCreate() returns the
# session already provided by a PySpark-launched kernel when there is one, and
# otherwise creates a session so the cell also runs on a fresh, plain kernel.
spark = SparkSession.builder.getOrCreate()

# Single definition of the staging location used for both the write and the read.
parquet_path = "/tmp/kmeans_notebook_data.parquet"

# One row per sample; the "features" column holds that sample's feature array.
pd_data = pd.DataFrame({"features": list(data)})
df = spark.createDataFrame(pd_data)

# Round-trip through Parquet and rebind `df` to the file-backed DataFrame.
# NOTE(review): presumably done so later stages read from files rather than
# from the driver-local pandas data - confirm.
df.write.mode('overwrite').parquet(parquet_path)
df = spark.read.parquet(parquet_path)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


23/05/05 13:03:09 WARN TaskSetManager: Stage 0 contains a task of very large size (163091 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) KMeans estimator objects, demonstrating the common API

In [4]:
def build_kmeans_estimator(estimator_class, k=200, tol=1.0e-20, max_iter=15, features_col="features"):
    """Instantiate and configure a KMeans estimator.

    The same helper is used for any estimator class that exposes the
    ``setTol`` / ``setK`` / ``setFeaturesCol`` / ``setMaxIter`` setter chain.

    Args:
        estimator_class: Class to instantiate with no arguments.
        k: Number of clusters passed to ``setK``. Defaults to 200.
        tol: Convergence tolerance passed to ``setTol``. Defaults to 1.0e-20.
        max_iter: Iteration limit passed to ``setMaxIter``. Defaults to 15.
        features_col: Input column name passed to ``setFeaturesCol``.
            Defaults to "features".

    Returns:
        Whatever the final setter in the chain returns (the configured
        estimator, for classes whose setters return the instance).
    """
    return (
        estimator_class()
        .setTol(tol)
        .setK(k)
        .setFeaturesCol(features_col)
        .setMaxIter(max_iter)
    )

## Spark RAPIDS ML (GPU)

In [5]:
from spark_rapids_ml.clustering import KMeans
# Build the Spark RAPIDS ML estimator with the shared helper.
gpu_kmeans = build_kmeans_estimator(KMeans)

The estimator can be persisted and reloaded.

In [6]:
# Location the estimator is saved to and loaded back from.
estimator_path = "/tmp/kmeans-estimator"

In [7]:
# Save the not-yet-fitted estimator (overwriting any earlier save), then load it back.
gpu_kmeans.write().overwrite().save(estimator_path)
gpu_kmeans_loaded = KMeans.load(estimator_path)

### Fit

In [8]:
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()
# Fit using the estimator that was reloaded from disk.
gpu_model = gpu_kmeans_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

2023-05-05 13:03:31,954 - spark_rapids_ml.clustering.KMeans - INFO - Initializing cuml context
2023-05-05 13:03:36,016 - spark_rapids_ml.clustering.KMeans - INFO - Loading data into python worker memory
2023-05-05 13:03:50,240 - spark_rapids_ml.clustering.KMeans - INFO - Invoking cuml fit
500000000
2023-05-05 13:03:54,337 - spark_rapids_ml.clustering.KMeans - INFO - iterations: 16, inertia: 0.0
2023-05-05 13:03:54,422 - spark_rapids_ml.clustering.KMeans - INFO - Cuml fit complete


Fit took: 30.066962003707886 sec


                                                                                

In [9]:
# Show the K value carried by the reloaded estimator.
gpu_kmeans_loaded.getK()

200

In [10]:
# Cluster centers as plain Python lists, ordered lexicographically by coordinates.
sorted_clusters = sorted(center.tolist() for center in gpu_model.clusterCenters())

In [11]:
# Preview: first 10 coordinates of the first 2 sorted centers.
[center[:10] for center in sorted_clusters[:2]]

[[-9.976842880249023,
  -7.760704517364502,
  7.908307075500488,
  3.0621135234832764,
  2.5098681449890137,
  -5.862492084503174,
  -7.247006416320801,
  -5.204965114593506,
  -8.941619873046875,
  7.161828994750977],
 [-9.88961410522461,
  -4.136746406555176,
  0.37713441252708435,
  -3.5201406478881836,
  1.0691207647323608,
  -0.8834641575813293,
  7.633277416229248,
  5.797126770019531,
  7.815406322479248,
  -6.475502967834473]]

### Transform

In [12]:
# Location the fitted model is saved to and loaded back from.
model_path = "/tmp/kmeans-model"

In [13]:
# Persist the fitted model, overwriting any earlier save at model_path.
gpu_model.write().overwrite().save(model_path)

23/05/05 13:03:56 WARN TaskSetManager: Stage 9 contains a task of very large size (1935 KiB). The maximum recommended task size is 1000 KiB.


In [14]:
# Load the persisted model back from model_path, using a reader obtained from the fitted model.
gpu_model_loaded = gpu_model.read().load(model_path)

In [15]:
# Use the public clusterCenters() accessor, as the other cells do, instead of the
# cluster_centers_ attribute. Preview the first 10 coordinates of the first
# 2 sorted centers of the reloaded model.
[vec[0:10] for vec in sorted(center.tolist() for center in gpu_model_loaded.clusterCenters())[0:2]]

[[-9.976842880249023,
  -7.760704517364502,
  7.908307075500488,
  3.0621135234832764,
  2.5098681449890137,
  -5.862492084503174,
  -7.247006416320801,
  -5.204965114593506,
  -8.941619873046875,
  7.161828994750977],
 [-9.88961410522461,
  -4.136746406555176,
  0.37713441252708435,
  -3.5201406478881836,
  1.0691207647323608,
  -0.8834641575813293,
  7.633277416229248,
  5.797126770019531,
  7.815406322479248,
  -6.475502967834473]]

In [16]:
# Name the prediction column "transformed" and apply the reloaded model to df.
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [17]:
# Inspect the column types of the transformed DataFrame.
transformed_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- transformed: integer (nullable = true)



In [18]:
# Number of rows in the transformed DataFrame.
transformed_df.count()

1000000

In [19]:
# Display the first 10 rows with their predictions.
transformed_df.show(10)

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[-1.553751, 4.571...|         53|
|[5.4484706, -6.15...|        175|
|[9.699093, -6.353...|        157|
|[-4.8742313, -1.3...|        106|
|[4.446493, 1.5624...|         76|
|[-9.740857, -5.34...|        102|
|[9.084129, -5.561...|        146|
|[-6.112283, -4.64...|         99|
|[2.9649417, -4.11...|        156|
|[9.647127, -0.356...|         31|
+--------------------+-----------+
only showing top 10 rows



                                                                                

## Spark ML (CPU)

In [20]:
# Import under an alias so this cell does not rebind the name `KMeans`, which
# earlier cells use for the spark_rapids_ml estimator (e.g. `KMeans.load`).
from pyspark.ml.clustering import KMeans as SparkKMeans
# Build the Spark ML estimator with the same shared helper.
cpu_kmeans = build_kmeans_estimator(SparkKMeans)

Convert the array-typed SQL column to the VectorUDT DataFrame expected by Spark ML algorithms. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames, in addition to the array-type DataFrame used above and a scalar-column format; see the docs.)

In [21]:
from pyspark.ml.functions import array_to_vector

In [22]:
# Convert the array-typed "features" column to a vector column, keeping the same name.
features_as_vector = array_to_vector(df["features"]).alias("features")
vector_df = df.select(features_as_vector)

### Fit

In [23]:
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()
# Fit the Spark ML estimator on the vector-typed DataFrame.
cpu_kmeans_model = cpu_kmeans.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")



Fit took: 207.92213225364685 sec




In [24]:
# Check the Python type of a single cluster center returned by the fitted model.
type(cpu_kmeans_model.clusterCenters()[0])

numpy.ndarray

In [25]:
# Cluster centers as plain Python lists, ordered lexicographically by coordinates.
sorted_cpu_cluster_centers = sorted(
    center.tolist() for center in cpu_kmeans_model.clusterCenters()
)
# Preview: first 10 coordinates of the first 2 sorted centers.
[center[:10] for center in sorted_cpu_cluster_centers[:2]]

[[-9.976832350349428,
  -7.760703904533386,
  7.908303107261658,
  3.062114779834077,
  2.5098655241448435,
  -5.862500883340836,
  -7.247006791496277,
  -5.204970568275452,
  -8.941602145195008,
  7.161824905586243],
 [-9.889609041881561,
  -4.1367452923536305,
  0.37713481084262607,
  -3.52013919489719,
  1.069119877723897,
  -0.8834634504925518,
  7.633273174285889,
  5.797124461078644,
  7.815398098993302,
  -6.475500747919083]]

### Transform

In [26]:
# Name the prediction column "transformed" and apply the fitted model to vector_df.
spark_transformed = cpu_kmeans_model.setPredictionCol("transformed").transform(vector_df)

In [27]:
# Count the rows whose "transformed" value is non-negative.
spark_transformed.filter(spark_transformed.transformed >= 0).count()

                                                                                

1000000

In [28]:
# Display the first 10 rows with their predictions.
spark_transformed.show(10)

+--------------------+-----------+
|            features|transformed|
+--------------------+-----------+
|[-1.5537509918212...|        147|
|[5.44847059249877...|         68|
|[9.69909286499023...|        153|
|[-4.8742313385009...|         81|
|[4.44649314880371...|        117|
|[-9.7408571243286...|        115|
|[9.08412933349609...|         88|
|[-6.1122832298278...|        167|
|[2.96494174003601...|          4|
|[9.64712715148925...|        113|
+--------------------+-----------+
only showing top 10 rows

