In [21]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark K-means example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [24]:
df = spark.read.format('csv').\
                       options(header='true', \
                       inferschema='true').\
            load("data/wine_dataset.csv",header=True);

In [25]:
df.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|style|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|  red|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|  red|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|  red|
+-------------+----------------+-----------+--------------+-----

In [26]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

transformed=df.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label']).select("features")
transformed.show(5)

[Stage 5660:>                                                       (0 + 1) / 1]                                                                                

+--------------------+
|            features|
+--------------------+
|[7.4,0.7,0.0,1.9,...|
|[7.8,0.88,0.0,2.6...|
|[7.8,0.76,0.04,2....|
|[11.2,0.28,0.56,1...|
|[7.4,0.7,0.0,1.9,...|
+--------------------+
only showing top 5 rows



In [27]:
from pyspark.ml.feature import StandardScaler

In [28]:
scaler = StandardScaler(inputCol="features",outputCol="scaledFeatures",withStd=True, withMean=True)
scalerModel = scaler.fit(transformed)
#Normalize each feature to have unit standard deviation. 
scaledData = scalerModel.transform(transformed) 
scaledData.show(n=3)

[Stage 5662:>                                                       (0 + 1) / 1]                                                                                

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[7.4,0.7,0.0,1.9,...|[0.14246230020601...|
|[7.8,0.88,0.0,2.6...|[0.45100101079840...|
|[7.8,0.76,0.04,2....|[0.45100101079840...|
+--------------------+--------------------+
only showing top 3 rows



In [35]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def RunBisectKM(featuresCol='features', initK=20, dataSet=scaledData):
    silhouette_scores=dict()
    evaluator = ClusteringEvaluator(featuresCol=featuresCol, metricName='silhouette')
    for K in range(2,initK):
        BKMeans_=BisectingKMeans(featuresCol=featuresCol, k=K, minDivisibleClusterSize =1)
        BKMeans_fit=BKMeans_.fit(dataSet)
        BKMeans_transform=BKMeans_fit.transform(dataSet) 
        evaluation_score=evaluator.evaluate(BKMeans_transform)
        silhouette_scores[K]=evaluation_score
    bestK=[key for key, value in silhouette_scores.items() if value==max(silhouette_scores.values())][0]
    return BisectingKMeans(featuresCol=featuresCol, k=bestK, minDivisibleClusterSize =1)

In [36]:
bKM=RunBisectKM(featuresCol='features', initK=20, dataSet=scaledData)
model=bKM.fit(scaledData)

                                                                                

In [37]:
predictions=model.transform(scaledData)

In [38]:
predictions.show(5)

+--------------------+--------------------+----------+
|            features|      scaledFeatures|prediction|
+--------------------+--------------------+----------+
|[7.4,0.7,0.0,1.9,...|[0.14246230020601...|         0|
|[7.8,0.88,0.0,2.6...|[0.45100101079840...|         0|
|[7.8,0.76,0.04,2....|[0.45100101079840...|         0|
|[11.2,0.28,0.56,1...|[3.07358005083372...|         0|
|[7.4,0.7,0.0,1.9,...|[0.14246230020601...|         0|
+--------------------+--------------------+----------+
only showing top 5 rows



In [39]:
predictions.select("prediction").distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         0|
+----------+



In [40]:
# Evaluate clustering by computing Silhouette score 
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = {}".format(silhouette))
# Shows the result. 
print("Cluster Centers: ") 
centers = model.clusterCenters()
for i in centers:
    print(i)

[Stage 12725:>                                                      (0 + 1) / 1]                                                                                

Silhouette with squared euclidean distance = 0.6857870702274987
Cluster Centers: 
[7.53956674e+00 3.94329404e-01 2.95489009e-01 3.34114368e+00
 6.38212807e-02 1.94424976e+01 6.97107359e+01 9.94454995e-01
 3.24803759e+00 5.64593820e-01 1.07776946e+01 5.82924498e+00]
[6.91374330e+00 2.87223049e-01 3.41637880e-01 7.44673913e+00
 4.88454437e-02 4.08683740e+01 1.59826236e+02 9.94937181e-01
 3.19082192e+00 5.01122692e-01 1.02243171e+01 5.81596188e+00]
