In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('kmeans').getOrCreate()

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
data = spark.read.format('libsvm').load('../data/sample_kmeans_data.txt')

In [5]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
data_features = data.select('features')
data_features.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [7]:
kmeans = KMeans(k=2, seed=42)

In [8]:
kmeans_model = kmeans.fit(data_features)

In [9]:
centers = kmeans_model.clusterCenters()
print(f'Cluster centers: {centers}')

Cluster centers: [array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]


In [10]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [11]:
evaluator = ClusteringEvaluator()

In [12]:
predictions = kmeans_model.transform(data)

In [13]:
predictions.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         0|
|  1.0|(3,[0,1,2],[0.1,0...|         0|
|  2.0|(3,[0,1,2],[0.2,0...|         0|
|  3.0|(3,[0,1,2],[9.0,9...|         1|
|  4.0|(3,[0,1,2],[9.1,9...|         1|
|  5.0|(3,[0,1,2],[9.2,9...|         1|
+-----+--------------------+----------+



In [14]:
silhouette = evaluator.evaluate(predictions)
print(f'Silhouette with squared euclidean distance = {silhouette}')

Silhouette with squared euclidean distance = 0.9997530305375207


In [15]:
seeds_data = spark.read.csv('../data/seeds_dataset.csv', inferSchema=True, header=True)

In [16]:
seeds_data.printSchema()
seeds_data.describe().show()
seeds_data.show(5)

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210|                  210|                210|
|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533

In [17]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, VectorAssembler

In [18]:
assembler = VectorAssembler(inputCols=seeds_data.columns, outputCol='features')

seeds_data = assembler.transform(seeds_data)

In [19]:
seeds_data.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.17

In [20]:
scaler = StandardScaler(inputCol='features', outputCol='features_scaled')

seeds_data = scaler.fit(seeds_data).transform(seeds_data)

In [21]:
seeds_data.select('features', 'features_scaled').show(5, truncate=False)

+---------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|features                                                 |features_scaled                                                                                                                   |
+---------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|[15.26,14.84,0.871,5.763,3.312,2.221,5.22]               |[5.244527953320284,11.363299389287777,36.860833906302894,13.007165541092315,8.76852883087142,1.4771618831975104,10.62097073949694]|
|[14.88,14.57,0.8811,5.553999999999999,3.333,1.018,4.956] |[5.113930271651758,11.156554723849252,37.28826722714521,12.53544983779745,8.824126386864265,0.6770602418257837,10.08381819634997] |
|[14.29,14.09,0.905,5.291,3.3369999999999997,

In [22]:
kmeans = KMeans(k=3, seed=42)
kmeans_model = kmeans.fit(seeds_data)

In [23]:
centers = kmeans_model.clusterCenters()
print(f'Cluster centers: {centers}')

Cluster centers: [array([18.72180328, 16.29737705,  0.88508689,  6.20893443,  3.72267213,
        3.60359016,  6.06609836]), array([11.96441558, 13.27480519,  0.8522    ,  5.22928571,  2.87292208,
        4.75974026,  5.08851948]), array([14.64847222, 14.46041667,  0.87916667,  5.56377778,  3.27790278,
        2.64893056,  5.19231944])]


In [24]:
predictions = kmeans_model.transform(seeds_data)
predictions.show(10)

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+----------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|     features_scaled|prediction|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+----------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|         2|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|         2|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|         2|
|13.84|    13.94|     0.8955

In [25]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   77|
|         2|   72|
|         0|   61|
+----------+-----+



In [26]:
silhouette = evaluator.evaluate(predictions)
print(f'Silhouette with squared euclidean distance = {silhouette}')

Silhouette with squared euclidean distance = 0.663217344600641
