In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('codeex').getOrCreate()

In [4]:
data = spark.read.csv('seeds_dataset.csv', header=True,inferSchema=True)

In [6]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [8]:
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [9]:
from pyspark.ml.clustering import KMeans

In [10]:
from pyspark.ml.feature import VectorAssembler

In [11]:
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [12]:
assembler = VectorAssembler(inputCols=data.columns,outputCol='features')

In [13]:
dataset = assembler.transform(data)

In [15]:
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [16]:
from pyspark.ml.feature import StandardScaler

In [17]:
scaler =  StandardScaler(inputCol='features',outputCol='scaledFeatures')

In [18]:
model = scaler.fit(dataset)

In [19]:
dataset = model.transform(dataset)

In [21]:
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [28]:
kmean = KMeans(featuresCol='scaledFeatures',k=3)

In [29]:
model = kmean.fit(dataset)

In [30]:
model.computeCost(dataset)

428.60820118716356

In [32]:
centers = model.clusterCenters()

In [33]:
centers

[array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
         3.15410901, 10.38031464]),
 array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
         1.80061978, 10.41913733]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107])]

In [37]:
model.transform(dataset).select('prediction').show()

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
+----------+
only showing top 20 rows

