In [1]:
from pyspark import SparkContext  # import kept: later cells may still reference the class
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session in a single step.
# Constructing SparkContext(master='local') directly raises
# "ValueError: Cannot run multiple SparkContexts at once" when this cell is
# re-run or a context already exists; getOrCreate() makes the cell idempotent.
# NOTE(review): 'spark.some.config.option' looks like a placeholder setting — confirm
# whether it is needed at all.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Pyspark cluster')
    .config('spark.some.config.option', 'some-value')
    .getOrCreate()
)

# Expose the session's underlying context under the original name `sc`.
sc = spark.sparkContext

# 1. K-means on sample data

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors


In [4]:
# Three 2-D sample points; each becomes a one-column row holding a dense vector.
sample_points = [[0.1, 0.2], [0.4, 0.6], [0.8, 0.5]]
data = [(Vectors.dense(point),) for point in sample_points]

df = spark.createDataFrame(data, ['features'])
df.show()

+---------+
| features|
+---------+
|[0.1,0.2]|
|[0.4,0.6]|
|[0.8,0.5]|
+---------+



In [5]:
# Configure a two-cluster k-means estimator with a fixed seed, then fit it to
# the sample points.
kmeans = KMeans().setK(2).setSeed(123)
kmodel = kmeans.fit(df)

# Cluster centres found by the fitted model (one entry per cluster).
centres = kmodel.clusterCenters()

In [6]:
# Number of cluster centres the fitted model produced.
len(centres)

2

In [7]:
# Append the model's cluster assignment ('prediction') to every input row.
transformed = kmodel.transform(df)

# Show each point next to the cluster it was assigned to.
assignments = transformed.select('features', 'prediction')
assignments.show()

+---------+----------+
| features|prediction|
+---------+----------+
|[0.1,0.2]|         1|
|[0.4,0.6]|         0|
|[0.8,0.5]|         0|
+---------+----------+



# 2. PySpark clustering on the Iris data

In [8]:
# Load the Iris CSV: the first line is the header, and column types are
# inferred from the data.
iris_reader = spark.read.options(header=True, inferSchema=True)
iris_df = iris_reader.csv('Iris-data.csv')
iris_df.show(5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [9]:
# Remove the 'Id' column so only the four measurement columns and 'Species' remain.
iris_df=iris_df.drop('Id')

In [10]:
from pyspark.ml.linalg import Vectors


def _split_features_and_species(row):
    """Return (dense vector of every field except the last, the last field)."""
    return (Vectors.dense(row[:-1]), row[-1])


# Pack the measurement columns into one vector column and keep the trailing
# species column next to it.
iris_pairs = iris_df.rdd.map(_split_features_and_species)
iris_df2 = iris_pairs.toDF(['features', 'species'])
iris_df2.show(5)

+-----------------+-----------+
|         features|    species|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
+-----------------+-----------+
only showing top 5 rows



In [13]:
from pyspark.ml.feature import StringIndexer

# Encode the species names as numeric indices in a new 'label' column.
strindexer = StringIndexer(inputCol='species', outputCol='label')
strindexer_model = strindexer.fit(iris_df2)
iris_df3 = strindexer_model.transform(iris_df2)
iris_df3.show(5)

+-----------------+-----------+-----+
|         features|    species|label|
+-----------------+-----------+-----+
|[5.1,3.5,1.4,0.2]|Iris-setosa|  0.0|
|[4.9,3.0,1.4,0.2]|Iris-setosa|  0.0|
|[4.7,3.2,1.3,0.2]|Iris-setosa|  0.0|
|[4.6,3.1,1.5,0.2]|Iris-setosa|  0.0|
|[5.0,3.6,1.4,0.2]|Iris-setosa|  0.0|
+-----------------+-----------+-----+
only showing top 5 rows



In [14]:
# Drop the string 'species' column; the numeric 'label' column added by the
# StringIndexer is kept alongside 'features'.
iris_df4=iris_df3.drop('species')

In [15]:
# Preview the frame used for clustering: feature vector plus numeric label.
iris_df4.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows



In [16]:
# Three-cluster k-means with a fixed seed. The column names are spelled out to
# make clear that only 'features' feeds the model and that the assignments are
# written to 'prediction' (both are the estimator's defaults).
iris_kmeans = KMeans(
    featuresCol='features',
    predictionCol='prediction',
    k=3,
    seed=1234,
)
iris_model = iris_kmeans.fit(iris_df4)

# Attach the cluster assignment to every row and preview the result.
transformed_iris = iris_model.transform(iris_df4)
transformed_iris.show(5)

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[5.1,3.5,1.4,0.2]|  0.0|         1|
|[4.9,3.0,1.4,0.2]|  0.0|         1|
|[4.7,3.2,1.3,0.2]|  0.0|         1|
|[4.6,3.1,1.5,0.2]|  0.0|         1|
|[5.0,3.6,1.4,0.2]|  0.0|         1|
+-----------------+-----+----------+
only showing top 5 rows



In [17]:
# Count rows for every (label, prediction) pair to see how the k-means cluster
# ids line up with the indexed species labels.
transformed_iris.groupBy(['label','prediction']).count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|         0|   48|
|  1.0|         2|    2|
|  2.0|         2|   36|
|  2.0|         0|   14|
|  0.0|         1|   50|
+-----+----------+-----+



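**The groupBy output above shows that cluster ids 0 and 1 are swapped relative to the label indices: every label-0 row falls in cluster 1, and most label-1 rows fall in cluster 0. So we need to swap the 0s and 1s in the prediction column before comparing it with the label.**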

In [19]:
from pyspark.ml.feature import IndexToString

# Translate the numeric 'label' column back into a readable 'species' column.
indextostr = IndexToString().setInputCol('label').setOutputCol('species')
transformed_iris_idx = indextostr.transform(transformed_iris)
transformed_iris_idx.show(5)

+-----------------+-----+----------+-----------+
|         features|label|prediction|    species|
+-----------------+-----+----------+-----------+
|[5.1,3.5,1.4,0.2]|  0.0|         1|Iris-setosa|
|[4.9,3.0,1.4,0.2]|  0.0|         1|Iris-setosa|
|[4.7,3.2,1.3,0.2]|  0.0|         1|Iris-setosa|
|[4.6,3.1,1.5,0.2]|  0.0|         1|Iris-setosa|
|[5.0,3.6,1.4,0.2]|  0.0|         1|Iris-setosa|
+-----------------+-----+----------+-----------+
only showing top 5 rows



In [41]:
# Map every k-means cluster id onto the label index it mostly contains.
# Cluster ids are arbitrary (they can change with the seed or the Spark
# version), so the mapping is derived from the data by majority vote instead of
# hard-coding a 0 <-> 1 swap read off one particular run.
# Note: two clusters can map to the same label if one species dominates both.
from pyspark.sql import functions as f

# One row per (prediction, label) pair with its frequency. The result has at
# most (clusters x labels) rows, so collecting it to the driver is cheap.
pair_counts = transformed_iris_idx.groupBy('prediction', 'label').count().collect()

# Majority vote per cluster. Sorting first makes ties deterministic: the lowest
# label index wins because only a strictly larger count replaces the current best.
cluster_to_label = {}
best_count = {}
for row in sorted(pair_counts, key=lambda r: (r['prediction'], r['label'])):
    cluster = row['prediction']
    if row['count'] > best_count.get(cluster, 0):
        best_count[cluster] = row['count']
        cluster_to_label[cluster] = int(row['label'])

# Build one chained WHEN expression: prediction == cluster -> mapped label.
preds_col = None
for cluster, label in sorted(cluster_to_label.items()):
    is_cluster = f.col('prediction') == cluster
    if preds_col is None:
        preds_col = f.when(is_cluster, label)
    else:
        preds_col = preds_col.when(is_cluster, label)

# An empty input frame yields no clusters; fall back to a typed NULL column so
# the select below still succeeds.
if preds_col is None:
    preds_col = f.lit(None).cast('int')

# Keep every existing column and append the remapped cluster id as 'preds'.
true_preds = transformed_iris_idx.select('*', preds_col.alias('preds'))



In [42]:
# Display the leading rows, including the remapped 'preds' column, to check
# the swap visually.
true_preds.show()

+-----------------+-----+----------+-----------+-----+
|         features|label|prediction|    species|preds|
+-----------------+-----+----------+-----------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.9,3.0,1.4,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.7,3.2,1.3,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.6,3.1,1.5,0.2]|  0.0|         1|Iris-setosa|    0|
|[5.0,3.6,1.4,0.2]|  0.0|         1|Iris-setosa|    0|
|[5.4,3.9,1.7,0.4]|  0.0|         1|Iris-setosa|    0|
|[4.6,3.4,1.4,0.3]|  0.0|         1|Iris-setosa|    0|
|[5.0,3.4,1.5,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.4,2.9,1.4,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.9,3.1,1.5,0.1]|  0.0|         1|Iris-setosa|    0|
|[5.4,3.7,1.5,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.8,3.4,1.6,0.2]|  0.0|         1|Iris-setosa|    0|
|[4.8,3.0,1.4,0.1]|  0.0|         1|Iris-setosa|    0|
|[4.3,3.0,1.1,0.1]|  0.0|         1|Iris-setosa|    0|
|[5.8,4.0,1.2,0.2]|  0.0|         1|Iris-setosa|    0|
|[5.7,4.4,

## Accuracy

In [43]:
# Rows where the remapped cluster id disagrees with the indexed species label.
is_mismatch = true_preds['label'] != true_preds['preds']
train_err = true_preds.where(is_mismatch).count()

# Total number of rows that were scored.
total = true_preds.count()

In [47]:
# Accuracy as a percentage: share of rows whose remapped cluster equals the label.
error_rate = train_err / total
accuracy_pct = (1 - error_rate) * 100
print('accuracy:', str(accuracy_pct))


accuracy: 89.33333333333333
