In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

In [2]:
# Start the Spark session used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName('SparkCluster')
    .getOrCreate()
)

In [11]:
# Read the seeds measurements: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    path='seeds_dataset.csv',
    inferSchema=True,
    header=True,
)
# Print the inferred schema so the column types can be checked by eye.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length of kernel: double (nullable = true)
 |-- width of kernel: double (nullable = true)
 |-- asymmetry coefficient: double (nullable = true)
 |-- length of kernel groove: double (nullable = true)



In [12]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [13]:
# List the available column names; the feature subset for clustering is picked from these.
df.columns

['area',
 'perimeter',
 'compactness',
 'length of kernel',
 'width of kernel',
 'asymmetry coefficient',
 'length of kernel groove']

In [14]:
# Combine the three chosen measurements into a single vector column,
# 'independentFeatures', which the scaler reads downstream.
featureAssembler = (
    VectorAssembler()
    .setInputCols(['area', 'perimeter', 'length of kernel'])
    .setOutputCol('independentFeatures')
)
# df2 = original columns plus the assembled vector column.
df2 = featureAssembler.transform(df)

In [18]:
# Keep only the three raw inputs and their assembled vector; the other
# measurement columns are dropped from here on.
df3 = df2.select(
    'area',
    'perimeter',
    'length of kernel',
    'independentFeatures',
)

In [19]:
from pyspark.ml.feature import StandardScaler

# Scaler that reads the assembled vector and writes its rescaled version to
# 'scaledFeatures'; every other setting is left at the library default.
scaler = StandardScaler(
    inputCol='independentFeatures',
    outputCol='scaledFeatures',
)

In [23]:
# Fit the scaler on df3 and immediately apply it to the same rows.
# NOTE: despite its name, `scaledModel` holds the transformed DataFrame
# (df3 plus the 'scaledFeatures' column), not the fitted scaler model.
scaledModel = (
    scaler
    .fit(df3)
    .transform(df3)
)

### Train and Evaluate

In [29]:
# Cluster the scaled feature vectors into k=3 groups.
# k-means starts from randomly chosen centroids, so an explicit seed is set to
# keep the cluster ids and centers stable across kernel restarts. Without it,
# results may vary from run to run.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)
# Fit on the scaled DataFrame; `model` is the fitted KMeansModel used below.
model = kmeans.fit(scaledModel)

In [30]:
# Score the same rows the model was trained on; the result is the scaled
# DataFrame with the model's cluster-assignment column appended.
pred = model.transform(dataset=scaledModel)

In [31]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Point the evaluator at the column the model was actually trained on.
# The default feature column name is 'features', which does not exist in
# `pred`, so `eval.evaluate(pred)` would fail without this argument.
# NOTE(review): the name `eval` shadows Python's builtin `eval`; it is kept so
# that any cell already referring to it keeps working — consider renaming.
eval = ClusteringEvaluator(featuresCol='scaledFeatures')

In [36]:
# Fetch the fitted centroids. The model was trained on 'scaledFeatures', so the
# coordinates are in scaled units, not the raw measurement units; components
# presumably follow the assembler's input order
# (area, perimeter, length of kernel) — confirm.
centers = model.clusterCenters() 
# Print one centroid per line.
for center in centers:
    print(center)

[ 5.14055169 11.18463116  7.80398591]
[ 6.44425095 12.4924826   8.64124309]
[ 4.15024895 10.19509081  7.05287611]
