# Imports

In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is portable across machines; fall back to the original
# Windows path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', 'C:/spark'))

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Create a session

In [3]:
# Reuse the active session if one exists, otherwise start one named 'clustering'.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

# Read data

In [4]:
# The first CSV row holds the column names; column types are inferred from the
# values rather than being read as strings.
data = spark.read.csv(
    '../../data/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Check the inferred column names and types before building features.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
# Peek at the first record (returned as a one-element list of Row objects).
data.head(n=1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

# Preprocessing

In [7]:
# Column names that will be used as the clustering features.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [8]:
# Every column of `data` is used as an input feature: pack them all into a
# single vector column called 'features', the layout Spark ML estimators expect.
assembler = VectorAssembler(outputCol='features', inputCols=data.columns)
final_data = assembler.transform(data)

In [9]:
# Confirm that the assembled 'features' vector column was appended.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Scale each feature before clustering so that no single measurement
# dominates the distance computation purely because of its magnitude.
# With the default settings used here, StandardScaler divides by the standard
# deviation without centering on the mean.
scaler = StandardScaler(inputCol='features',
                        outputCol='scaledFeatures')

# `final_data` is overwritten below, so a second run of this cell would find
# the output column already present and fail. Dropping it first (a no-op when
# the column does not exist) makes the cell safe to re-run.
final_data = final_data.drop(scaler.getOutputCol())

scaler_model = scaler.fit(final_data)
final_data = scaler_model.transform(final_data)

In [11]:
# Compare the raw 'features' vector with its 'scaledFeatures' counterpart
# for the first record.
final_data.head(n=1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

# Model

In [12]:
# K-means with three clusters on the scaled feature vectors.
# Centroid initialisation is random, so the seed is fixed to make cluster
# assignments, centers and the training cost reproducible across kernel
# restarts.
kmeans = KMeans(featuresCol='scaledFeatures',
                k=3,
                seed=42)

In [13]:
# Fit the clustering model; it reads the 'scaledFeatures' column configured on `kmeans`.
model = kmeans.fit(final_data)

In [18]:
# Report the training cost of the fitted model under the label WSSSE
# (label and value on separate lines).
print('WSSSE', model.summary.trainingCost, sep='\n')

WSSSE
428.76536612896285


In [19]:
# One center per cluster; the model was fit on 'scaledFeatures', so the
# coordinates are in the scaled feature space, not the original units.
centers = model.clusterCenters()

In [20]:
# Show the cluster centers (a list of arrays, one per cluster).
print(centers)

[array([ 4.9360523 , 10.94499696, 37.33487983, 12.40173794,  8.61516278,
        1.7804233 , 10.36535821]), array([ 6.32636687, 12.38115343, 37.39222755, 13.9206997 ,  9.75485787,
        2.41428142, 12.28078861]), array([ 4.078007  , 10.15076404, 35.87686106, 11.81860981,  7.5430707 ,
        3.17727834, 10.39174095])]


In [23]:
# Assign each row to a cluster and display only the resulting cluster index.
(
    model
    .transform(final_data)
    .select('prediction')
    .show()
)

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

