# K Means

In [1]:
import findspark, pyspark
from pyspark.sql import SparkSession
findspark.init()
spark = SparkSession.builder.appName("kmeans").getOrCreate()

24/04/03 16:36:18 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/03 16:36:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 16:36:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
iris = spark.read.csv("../0_data/iris.csv", header=True, inferSchema=True, sep=",")
print(iris.count())
iris.show(5)

150
+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [4]:
vector_assembler = VectorAssembler(inputCols=["sepallength", "sepalwidth", "petallength", "petalwidth"], outputCol="features")
iris_assembled = vector_assembler.transform(iris)
iris_assembled.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 5 rows



In [5]:
ind = StringIndexer(inputCol="class", outputCol="dependant")
iris_assembled = ind.fit(iris_assembled).transform(iris_assembled)
iris_assembled.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+---------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|dependant|
+-----------+----------+-----------+----------+-----------+-----------------+---------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|      0.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|      0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|      0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|      0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|      0.0|
+-----------+----------+-----------+----------+-----------+-----------------+---------+
only showing top 5 rows



In [6]:
iris_assembled = iris_assembled.withColumn("dependant", col("dependant").cast(IntegerType()))
iris_assembled.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+---------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|dependant|
+-----------+----------+-----------+----------+-----------+-----------------+---------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|        0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|        0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|        0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|        0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|        0|
+-----------+----------+-----------+----------+-----------+-----------------+---------+
only showing top 5 rows



In [7]:
km = KMeans(k=3,
            maxIter=100,
            predictionCol="prediction",
            featuresCol="features",
            seed=42)
model = km.fit(iris_assembled)

In [8]:
groups = model.transform(iris_assembled)

In [9]:
groups.show(150)

+-----------+----------+-----------+----------+---------------+-----------------+---------+----------+
|sepallength|sepalwidth|petallength|petalwidth|          class|         features|dependant|prediction|
+-----------+----------+-----------+----------+---------------+-----------------+---------+----------+
|        5.1|       3.5|        1.4|       0.2|    Iris-setosa|[5.1,3.5,1.4,0.2]|        0|         0|
|        4.9|       3.0|        1.4|       0.2|    Iris-setosa|[4.9,3.0,1.4,0.2]|        0|         0|
|        4.7|       3.2|        1.3|       0.2|    Iris-setosa|[4.7,3.2,1.3,0.2]|        0|         0|
|        4.6|       3.1|        1.5|       0.2|    Iris-setosa|[4.6,3.1,1.5,0.2]|        0|         0|
|        5.0|       3.6|        1.4|       0.2|    Iris-setosa|[5.0,3.6,1.4,0.2]|        0|         0|
|        5.4|       3.9|        1.7|       0.4|    Iris-setosa|[5.4,3.9,1.7,0.4]|        0|         0|
|        4.6|       3.4|        1.4|       0.3|    Iris-setosa|[4.6,3.4,1

In [10]:
classes = groups.select("dependant").collect()
grouped = groups.select("prediction").collect()
cm = confusion_matrix(classes, grouped)
print(cm)

[[50  0  0]
 [ 0  3 47]
 [ 0 36 14]]


In [11]:
accuracy_score = (cm[0, 1] + cm[1, 0] + cm[2, 2] / 150)
print(f"accuracy_score: {accuracy_score}")

accuracy_score: 0.09333333333333334
