In [41]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise start a new one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option -- confirm whether it can be dropped.
spark = (
    SparkSession.builder
    .appName("Python Spark K-means example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [42]:
# Load the wine dataset from CSV. The first line is treated as the header and
# column types are inferred from the data. The header option is set once here;
# the original also repeated it as a keyword argument to load().
df = (
    spark.read.format('csv')
    .options(header='true', inferschema='true')
    .load("data/wine_dataset.csv")
)

In [43]:
# Preview the first 3 rows of the loaded DataFrame.
df.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|style|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|  red|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|  red|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|  red|
+-------------+----------------+-----------+--------------+-----

In [44]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Pack every column except the last one into a single dense `features` vector.
# The last column is carried along as `label` and then dropped by the select,
# so only the feature vectors are kept for clustering.
transformed = (
    df.rdd
    .map(lambda row: [Vectors.dense(row[:-1]), row[-1]])
    .toDF(['features', 'label'])
    .select("features")
)
transformed.show(5)

                                                                                

+--------------------+
|            features|
+--------------------+
|[7.4,0.7,0.0,1.9,...|
|[7.8,0.88,0.0,2.6...|
|[7.8,0.76,0.04,2....|
|[11.2,0.28,0.56,1...|
|[7.4,0.7,0.0,1.9,...|
+--------------------+
only showing top 5 rows



### StandardScaler is not needed for BisectingKMeans here; the original, unscaled DataFrame `transformed` is used as-is.

### Although BisectingKMeans can be run over a range of K values starting from 1 to look for the best K, it is not guaranteed to converge on a best K within the given range.

### The helper function `RunBisectKM` below fits BisectingKMeans for every K from 2 up to a maximum K and picks the best K as the one whose silhouette score is highest.



In [53]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def RunBisectKM(featuresCol='features', initK=20, dataSet=None, seed=None):
    """Pick the number of clusters for BisectingKMeans by silhouette score.

    Fits one BisectingKMeans model for every k in ``2..initK`` on ``dataSet``,
    scores each clustering with a silhouette ``ClusteringEvaluator`` and keeps
    the k with the highest score (the smallest such k on ties).

    Args:
        featuresCol: Name of the vector column holding the features.
        initK: Largest k to try (inclusive); must be at least 2.
        dataSet: DataFrame containing ``featuresCol``. Required. The previous
            default referenced a notebook variable (``scaledData``) that is not
            defined anywhere, which made the ``def`` itself fail on a fresh
            kernel.
        seed: Optional random seed applied to every estimator so the search is
            reproducible. When ``None`` the estimator's own default is used.

    Returns:
        An *unfitted* ``BisectingKMeans`` estimator configured with the best k;
        the caller still has to call ``fit`` on it.

    Raises:
        ValueError: If ``dataSet`` is missing or ``initK`` is smaller than 2.
    """
    # Validate up front so the caller gets a clear message instead of a
    # NameError at definition time or an IndexError after the search loop.
    if dataSet is None:
        raise ValueError("dataSet is required: pass the DataFrame that holds the feature vectors")
    if initK < 2:
        raise ValueError("initK must be at least 2, got {}".format(initK))

    def build_estimator(k):
        # Single place that configures the estimator, so the search loop and
        # the returned estimator can never drift apart.
        estimator = BisectingKMeans(featuresCol=featuresCol, k=k, minDivisibleClusterSize=1)
        if seed is not None:
            estimator.setSeed(seed)
        return estimator

    # evaluator returns the silhouette score of a clustered DataFrame
    evaluator = ClusteringEvaluator(featuresCol=featuresCol, metricName='silhouette')

    # Map each candidate k to its silhouette score.
    silhouette_scores = {}
    for K in range(2, initK + 1):
        clustered = build_estimator(K).fit(dataSet).transform(dataSet)
        silhouette_scores[K] = evaluator.evaluate(clustered)

    # Keys were inserted in ascending order and max() returns the first
    # maximal key, so ties resolve to the smallest k, as before.
    bestK = max(silhouette_scores, key=silhouette_scores.get)
    return build_estimator(bestK)

In [54]:
# Search k = 2..20 on `transformed` for the k with the highest silhouette score.
# RunBisectKM returns an unfitted estimator, so it is fitted here.
bKM=RunBisectKM(featuresCol='features', initK=20, dataSet=transformed)
model=bKM.fit(transformed)

                                                                                

In [55]:
# Apply the fitted model; the result adds a `prediction` column with each row's cluster id.
predictions=model.transform(transformed)

In [56]:
# Preview the first 5 rows together with their assigned cluster.
predictions.show(5)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[7.4,0.7,0.0,1.9,...|         0|
|[7.8,0.88,0.0,2.6...|         0|
|[7.8,0.76,0.04,2....|         0|
|[11.2,0.28,0.56,1...|         0|
|[7.4,0.7,0.0,1.9,...|         0|
+--------------------+----------+
only showing top 5 rows



[Stage 23063:>                                                      (0 + 1) / 1]                                                                                

In [57]:
# List the distinct cluster ids that were actually assigned.
predictions.select("prediction").distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         0|
+----------+



[Stage 23064:>                                                      (0 + 1) / 1]                                                                                

In [58]:
# Score the final clustering with a ClusteringEvaluator built with default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = {}".format(silhouette))

# Print the center vector of each cluster in the fitted model, one per line.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

[Stage 23068:>                                                      (0 + 1) / 1]                                                                                

Silhouette with squared euclidean distance = 0.6886171250780977
Cluster Centers: 
[7.57680277e+00 4.02958512e-01 2.94013829e-01 3.20113599e+00
 6.47797168e-02 1.92160026e+01 6.60622325e+01 9.94579030e-01
 3.25098452e+00 5.71241357e-01 1.07755888e+01 5.81330260e+00]
[6.91325145e+00 2.87182081e-01 3.40549133e-01 7.32936416e+00
 4.88852601e-02 4.00007225e+01 1.57086416e+02 9.94869611e-01
 3.19067341e+00 5.01086705e-01 1.02469374e+01 5.81358382e+00]
