In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('kmeans').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/25 20:54:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
from math import sqrt

In [5]:
dataset = spark.read.format('libsvm').load('data/sample_kmeans_data.txt')
print(dataset.count(), len(dataset.columns))
dataset.show()

23/02/25 20:54:14 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


                                                                                

6 2
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
final_data = dataset.select('features')
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



### Model

In [7]:
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(final_data)
centers = model.clusterCenters()
print(centers)

[array([9.1, 9.1, 9.1]), array([0.05, 0.05, 0.05]), array([0.2, 0.2, 0.2])]


In [8]:
results = model.transform(final_data)
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



In [18]:
# # Evaluate clustering by computing Within Set Sum of Squared Errors
# def error(point):
#     center = model.centers[model.predict(point)]
#     return sqrt(sum([x**2 for x in (point - center)]))

# WSSSE = final_data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
# print("Within Set Sum of Squared Error = " + str(WSSSE))

# 53 Clustering Example Code Along

In [9]:
from pyspark.sql import SparkSession

In [10]:
spark = SparkSession.builder.appName('cluster').getOrCreate()

23/02/25 20:54:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [11]:
dataset = spark.read.csv('data/seeds_dataset.csv', inferSchema=True, header=True)
print(dataset.count(), len(dataset.columns))
dataset.show(5)
dataset.printSchema()

210 7
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.175|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
only showing top 5 rows

root
 

In [12]:
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [13]:
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [14]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=dataset.columns, outputCol='features')
final_data = assembler.transform(dataset)
final_data.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.17

### Before Scaling

In [15]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(final_data)
centers = model.clusterCenters()
print(centers)

[array([14.81910448, 14.53716418,  0.88052239,  5.59101493,  3.29935821,
        2.70658209,  5.21753731]), array([11.98865854, 13.28439024,  0.85273659,  5.22742683,  2.88008537,
        4.58392683,  5.0742439 ]), array([18.72180328, 16.29737705,  0.88508689,  6.20893443,  3.72267213,
        3.60359016,  6.06609836])]


### Scale

In [16]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(final_data)
scaled_final_data = scaler_model.transform(final_data)
scaled_final_data.head(5)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621])),
 Row(area=14.88, perimeter=14.57, compactness=0.8811, length_of_kernel=5.553999999999999, width_of_kernel=3.333, asymmetry_coefficient=1.018, length_of_groove=4.956, features=DenseVector([14.88, 14.57, 0.8811, 5.554, 3.333, 1.018, 4.956]), scaledFeatures=DenseVector([5.1139, 11.1566, 37.2883, 12.5354, 8.8241, 0.6771, 10.0838])),
 Row(area=14.29, perimeter=14.09, compactness=0.905, length_of_kernel=5.291, width_of_kernel=3.3369999999999997, asymmetry_coefficient=2.699, length_of_groove=4.825, features=DenseVector([14.29, 14.09, 0.905, 5.291, 3.337, 2.699, 4.825]), scaledFeatures=DenseVector([4.9112, 10.789, 38.2997, 11.9419, 8.8347, 1.7951, 9.8173])),
 Row(area=13.84, perimeter=

### Predict Scaled

In [27]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(featuresCol='scaledFeatures', k=3)
model = kmeans.fit(scaled_final_data)
centers = model.clusterCenters()
print(model.summary.trainingCost)
print(centers)

428.6082011872446
[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
        1.80061978, 10.41913733]), array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
        3.15410901, 10.38031464])]


In [19]:
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator()
prediction = model.transform(scaled_final_data)
silhouette = evaluator.evaluate(prediction)
print(silhouette)
prediction.select('prediction').show()

0.6300001033389961
+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
+----------+
only showing top 20 rows



### Compare

In [39]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

for i in range(2,8):
    kmeans = KMeans(featuresCol='scaledFeatures', k=i)
    model = kmeans.fit(scaled_final_data)
    centers = model.clusterCenters()
    # print(centers)

    evaluator = ClusteringEvaluator()
    prediction = model.transform(scaled_final_data)
    silhouette = evaluator.evaluate(prediction)
    print(i, silhouette, model.summary.trainingCost)

print(prediction.select('prediction').groupBy('prediction').count().show(truncate=False))

2 0.709845635070088 656.0328395397873
3 0.6300001033389961 428.6082011872446
4 0.49894462346666957 370.44428208908664
5 0.35860684299522133 330.5785888229312
6 0.40117388050151653 288.72206849763154
7 0.23492817609483754 276.4910988286958
+----------+-----+
|prediction|count|
+----------+-----+
|1         |31   |
|6         |15   |
|3         |45   |
|5         |20   |
|4         |38   |
|2         |36   |
|0         |25   |
+----------+-----+

None


In [40]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

for i in range(2,8):
    kmeans = KMeans(featuresCol='features', k=i)
    model = kmeans.fit(final_data)
    centers = model.clusterCenters()
    # print(centers)

    evaluator = ClusteringEvaluator()
    prediction = model.transform(final_data)
    silhouette = evaluator.evaluate(prediction)
    print(i, silhouette, model.summary.trainingCost)

print(prediction.select('prediction').groupBy('prediction').count().show(truncate=False))

2 0.7130696287471148 1011.965714423923
3 0.6583884755012417 588.7827507911767
4 0.5952568216369759 526.4477715556183
5 0.5362045150386681 400.6842435662612
6 0.5377008391909751 323.8708673531865
7 0.5441231096866248 280.5003686112428
+----------+-----+
|prediction|count|
+----------+-----+
|1         |15   |
|6         |33   |
|3         |12   |
|5         |28   |
|4         |25   |
|2         |47   |
|0         |50   |
+----------+-----+

None
