# KMeans

In [0]:
import numpy as np
import pandas as pd
import time

### Create synthetic dataset

In [0]:
from sklearn.datasets import make_blobs

# Size and cluster structure of the synthetic dataset.
n_rows = 1_000_000
n_cols = 500
n_clusters_data = 400
cluster_std = 1.0
dtype = 'float32'

# make_blobs creates a random dataset of isotropic gaussian blobs;
# the second return value is discarded.
data, _ = make_blobs(
    n_samples=n_rows,
    n_features=n_cols,
    centers=n_clusters_data,
    cluster_std=cluster_std,
    random_state=0,
)
data = data.astype(dtype)

### Convert dataset to Spark DataFrame

In [0]:
# Each row of `data` becomes one entry of a single "features" column.
pd_data = pd.DataFrame({"features": list(data)})

# Write the dataset to parquet, then read it back so `df` is backed by the file.
parquet_path = "/tmp/kmeans_notebook_data.parquet"
spark.createDataFrame(pd_data).write.mode('overwrite').parquet(parquet_path)
df = spark.read.parquet(parquet_path)

### We will use this function to build both the Spark RAPIDS ML (GPU) and Spark ML (CPU) KMeans estimator objects, demonstrating the common API

In [0]:
def build_kmeans_estimator(
    estimator_class,
    k=200,
    max_iter=15,
    tol=1.0e-20,
    features_col="features",
    seed=None,
):
    """Instantiate and configure a KMeans estimator through its setter API.

    The same setters are called regardless of which class is passed in, so
    the function can configure any estimator class exposing them.

    Args:
        estimator_class: Zero-argument callable returning an estimator that
            provides chainable ``setTol``, ``setK``, ``setFeaturesCol`` and
            ``setMaxIter`` setters (and ``setSeed`` when ``seed`` is given).
        k: Number of clusters to fit.
        max_iter: Maximum number of iterations.
        tol: Convergence tolerance.
        features_col: Name of the input features column.
        seed: Optional random seed. When ``None`` (the default) ``setSeed``
            is not called and the estimator keeps its own default seeding.

    Returns:
        The configured estimator instance.
    """
    estimator = (
        estimator_class()
        .setTol(tol)
        .setK(k)
        .setFeaturesCol(features_col)
        .setMaxIter(max_iter)
    )
    # Only touch the seed when explicitly requested, so the default call
    # behaves exactly as before.
    if seed is not None:
        estimator = estimator.setSeed(seed)
    return estimator

## Spark RAPIDS ML (GPU)

In [0]:
# Build the Spark RAPIDS ML estimator with the shared builder function.
from spark_rapids_ml.clustering import KMeans

gpu_kmeans = build_kmeans_estimator(estimator_class=KMeans)

The estimator can be persisted and reloaded.

In [0]:
# Location where the unfitted GPU estimator is saved and reloaded from.
estimator_path = "/tmp/kmeans-estimator"

In [0]:
# Persist the estimator, replacing any earlier save, then load it back.
estimator_writer = gpu_kmeans.write().overwrite()
estimator_writer.save(estimator_path)
gpu_kmeans_loaded = KMeans.load(estimator_path)

### Fit

In [0]:
# Time the GPU fit. perf_counter() is a monotonic, high-resolution clock,
# so system clock adjustments cannot distort the measured duration.
start_time = time.perf_counter()
gpu_model = gpu_kmeans_loaded.fit(df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [0]:
# Display the K value held by the reloaded estimator
# (build_kmeans_estimator configured it through setK).
gpu_kmeans_loaded.getK()

In [0]:
# Convert each center to a plain list and sort them, giving a deterministic
# order for inspection.
sorted_clusters = sorted(center.tolist() for center in gpu_model.clusterCenters())

In [0]:
# Preview: first 10 coordinates of the first two sorted centers.
[center[:10] for center in sorted_clusters[:2]]

### Transform

In [0]:
# Location where the fitted GPU model is saved and reloaded from.
model_path = "/tmp/kmeans-model"

In [0]:
# Persist the fitted model to model_path, replacing any earlier save.
gpu_model.write().overwrite().save(model_path)

In [0]:
# Reload the persisted model from model_path using the model's own reader.
gpu_model_loaded = gpu_model.read().load(model_path)

In [0]:
# Same preview as for the fitted model, taken from the reloaded model:
# first 10 coordinates of the first two sorted centers.
[center[:10] for center in sorted(gpu_model_loaded.cluster_centers_)[:2]]

In [0]:
# Set the prediction column name to "transformed" and apply the reloaded
# GPU model to df.
transformed_df = gpu_model_loaded.setPredictionCol("transformed").transform(df)

In [0]:
# Print the schema of the GPU transform output.
transformed_df.printSchema()

In [0]:
# Row count of the GPU transform output.
transformed_df.count()

In [0]:
# Display 10 rows of the GPU transform output.
transformed_df.show(10)

## Spark ML (CPU)

In [0]:
# Import under an alias so the Spark ML class does not shadow the Spark RAPIDS ML
# `KMeans` imported earlier; otherwise re-running the estimator-loading cell
# after this one would silently resolve `KMeans` to the CPU class.
from pyspark.ml.clustering import KMeans as SparkKMeans

cpu_kmeans = build_kmeans_estimator(SparkKMeans)

Convert the array-typed DataFrame to the VectorUDT DataFrame expected by Spark ML algorithms. (Note: Spark RAPIDS ML also accepts VectorUDT DataFrames, in addition to the array-typed DataFrame used above and a scalar-column format; see the docs.)

In [0]:
from pyspark.ml.functions import array_to_vector

In [0]:
# Convert the "features" column with array_to_vector, keeping the same column name.
features_as_vector = array_to_vector(df["features"]).alias("features")
vector_df = df.select(features_as_vector)

### Fit

In [0]:
# Time the CPU fit. perf_counter() is a monotonic, high-resolution clock,
# so system clock adjustments cannot distort the measured duration.
start_time = time.perf_counter()
cpu_kmeans_model = cpu_kmeans.fit(vector_df)
print(f"Fit took: {time.perf_counter() - start_time} sec")

In [0]:
# Inspect the Python type of a single cluster center returned by the CPU model.
type(cpu_kmeans_model.clusterCenters()[0])

In [0]:
# Sort the CPU centers as plain lists, then preview the first 10 coordinates
# of the first two.
sorted_cpu_cluster_centers = sorted(
    center.tolist() for center in cpu_kmeans_model.clusterCenters()
)
[center[:10] for center in sorted_cpu_cluster_centers[:2]]

### Transform

In [0]:
# Set the prediction column name to "transformed" and apply the fitted
# CPU model to vector_df.
spark_transformed = cpu_kmeans_model.setPredictionCol("transformed").transform(vector_df)

In [0]:
# Count the rows whose "transformed" value is non-negative.
non_negative_prediction = spark_transformed["transformed"] >= 0
spark_transformed.filter(non_negative_prediction).count()

In [0]:
# Display 10 rows of the CPU transform output.
spark_transformed.show(10)