In [16]:
from pyspark.sql import SparkSession

# Create the notebook's SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Python Spark K-means example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [17]:
# Load the iris CSV into a DataFrame. The first row is read as the header and
# column types are inferred from the data. The `header` option is set once
# here; it was previously also repeated as a keyword argument to load().
df = (
    spark.read.format('csv')
    .options(header='true', inferschema='true')
    .load("data/iris.csv")
)

In [18]:
# Preview the first five rows (long cell values truncated), then print the
# column names and types of the loaded DataFrame.
df.show(n=5, truncate=True)
df.printSchema()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



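You can also get summary statistics from the DataFrame. Note that the mean and standard deviation are only computed for numerical columns; for string columns such as `species` they are null, and only count, min, and max are reported.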

In [19]:
# Display the summary statistics (count, mean, stddev, min, max) per column.
df.describe().show(n=20, truncate=True)

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|     null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|     null|
|    min|               4.3|                2.0|               1.0|               0.1|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [20]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

from pyspark.ml.feature import VectorAssembler

# Pack every column except the last one into a single vector column named
# "features" and keep only that column. The last column (the string `species`
# label in the schema printed earlier) is not used for clustering.
# VectorAssembler does this on the DataFrame directly, instead of mapping each
# row through a Python lambda on the underlying RDD.
transformed = (
    VectorAssembler(inputCols=df.columns[:-1], outputCol="features")
    .transform(df)
    .select("features")
)
transformed.show(5)

                                                                                

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
+-----------------+
only showing top 5 rows



[Stage 48:>                                                         (0 + 1) / 1]                                                                                

Deal With Categorical Variables

In [21]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Fit a VectorIndexer on the "features" column (maxCategories=4) and write the
# result to "indexedFeatures".
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(transformed)

# Apply the fitted indexer and preview the first five rows.
data = featureIndexer.transform(transformed)
data.show(n=5, truncate=True)

[Stage 50:>                                                         (0 + 1) / 1]                                                                                

+-----------------+-----------------+
|         features|  indexedFeatures|
+-----------------+-----------------+
|[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]|
+-----------------+-----------------+
only showing top 5 rows



Since clustering algorithms, including k-means, use distance-based measurements to determine the similarity between data points, it is strongly recommended to standardize the data to have a mean of zero and a standard deviation of one.



### StandardScaler

When your dataset has an uneven distribution of values (for example, some feature columns have values in the millions while others range between 0 and 1), you should normalize all columns to a similar range of values. Spark's StandardScaler is one such tool.
It transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation and/or zero mean. It takes parameters:

withStd: True by default. Scales the data to unit standard deviation.
withMean: False by default. Centers the data with mean before scaling. It will build a dense output, so take care 
when applying to sparse input.

StandardScaler is an Estimator which can be fit on a dataset to produce a StandardScalerModel; this amounts to computing summary statistics. The model can then transform a Vector column in a dataset to have unit standard deviation and/or zero mean features.


In [22]:
from pyspark.ml.feature import StandardScaler

# Standardize "indexedFeatures" into "scaledFeatures". Both withStd and
# withMean are enabled, so each feature is centered on its mean and scaled to
# unit standard deviation.
scaler = StandardScaler(
    inputCol="indexedFeatures",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)
# Fitting produces the StandardScalerModel that is applied below.
scalerModel = scaler.fit(data)
scaledData = scalerModel.transform(data)
scaledData.show(n=3, truncate=False)



[Stage 52:>                                                         (0 + 1) / 1]                                                                                

+-----------------+-----------------+---------------------------------------------------------------------------------+
|features         |indexedFeatures  |scaledFeatures                                                                   |
+-----------------+-----------------+---------------------------------------------------------------------------------+
|[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]|[-0.8976738791967643,1.0286112808972372,-1.3367940202882502,-1.308592819437957]  |
|[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]|[-1.1392004834649512,-0.12454037930145648,-1.3367940202882502,-1.308592819437957]|
|[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]|[-1.3807270877331392,0.33672028477802146,-1.393469854952817,-1.308592819437957]  |
+-----------------+-----------------+---------------------------------------------------------------------------------+
only showing top 3 rows



### Clustering is unsupervised learning, so no train/test split is necessary

In [23]:
from pyspark.ml.clustering import KMeans, KMeansModel

# k-means with k=3 on the standardized vectors. The assigned cluster id is
# written to the "cluster" column.
kmeans = (
    KMeans()
    .setK(3)
    .setSeed(42)  # fixed seed so a fresh kernel reproduces the same clustering
    .setFeaturesCol("scaledFeatures")
    .setPredictionCol("cluster")
)

# Wrap the single k-means stage in a Pipeline; the fitted k-means model is
# then available as model.stages[0].
pipeline = Pipeline(stages=[kmeans])

model = pipeline.fit(scaledData)

# Append the "cluster" prediction column to the scaled data.
cluster = model.transform(scaledData)

                                                                                

In [24]:
# Preview the first five rows, including the new "cluster" column.
cluster.show(n=5)

+-----------------+-----------------+--------------------+-------+
|         features|  indexedFeatures|      scaledFeatures|cluster|
+-----------------+-----------------+--------------------+-------+
|[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]|[-0.8976738791967...|      1|
|[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]|[-1.1392004834649...|      1|
|[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]|[-1.3807270877331...|      1|
|[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]|[-1.5014903898672...|      1|
|[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]|[-1.0184371813308...|      1|
+-----------------+-----------------+--------------------+-------+
only showing top 5 rows



In [25]:
# Show the original (unscaled) feature vectors next to their assigned cluster,
# without truncating the vector text.
(
    cluster
    .select("features", "cluster")
    .show(n=20, truncate=False)
)

+-----------------+-------+
|features         |cluster|
+-----------------+-------+
|[5.1,3.5,1.4,0.2]|1      |
|[4.9,3.0,1.4,0.2]|1      |
|[4.7,3.2,1.3,0.2]|1      |
|[4.6,3.1,1.5,0.2]|1      |
|[5.0,3.6,1.4,0.2]|1      |
|[5.4,3.9,1.7,0.4]|1      |
|[4.6,3.4,1.4,0.3]|1      |
|[5.0,3.4,1.5,0.2]|1      |
|[4.4,2.9,1.4,0.2]|1      |
|[4.9,3.1,1.5,0.1]|1      |
|[5.4,3.7,1.5,0.2]|1      |
|[4.8,3.4,1.6,0.2]|1      |
|[4.8,3.0,1.4,0.1]|1      |
|[4.3,3.0,1.1,0.1]|1      |
|[5.8,4.0,1.2,0.2]|1      |
|[5.7,4.4,1.5,0.4]|1      |
|[5.4,3.9,1.3,0.4]|1      |
|[5.1,3.5,1.4,0.3]|1      |
|[5.7,3.8,1.7,0.3]|1      |
|[5.1,3.8,1.5,0.3]|1      |
+-----------------+-------+
only showing top 20 rows



In [26]:
# The pipeline has a single stage, so stages[0] is the fitted k-means model.
# It was fit on the "scaledFeatures" column, so the centers displayed here are
# in that standardized space, not in the original measurement units.
model.stages[0].clusterCenters()

[array([-0.0113575 , -0.86997056,  0.37562584,  0.31061296]),
 array([-1.01119138,  0.83949441, -1.30052149, -1.25093786]),
 array([1.16353612, 0.15326434, 0.99979607, 1.02619471])]

https://en.wikipedia.org/wiki/Silhouette_(clustering)

In [27]:
from pyspark.ml.evaluation import ClusteringEvaluator
# Evaluate clustering by computing Silhouette score
# Score the clustering on the standardized features, using the "cluster"
# column as the predicted assignment.
evaluator = ClusteringEvaluator(
    featuresCol='scaledFeatures',
    predictionCol='cluster',
)

silhouette = evaluator.evaluate(cluster)
print("Silhouette with squared euclidean distance = " + str(silhouette))


                                                                                

Silhouette with squared euclidean distance = 0.6535875501205956
