# K-Means Clustering Spark - Iris Dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Set up the environment for using pyspark.
# findspark.init() runs before the pyspark imports in the next cell;
# presumably it makes the local Spark installation importable -- confirm
# against the findspark documentation.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

In [None]:
# Build (or reuse) the Spark session for this notebook and keep a handle
# on its SparkContext so the log level can be adjusted.
spark = (
    SparkSession.builder
    .appName("K-Means Clustering Iris")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

In [None]:
# Load the iris CSV (comma separated values). The file is read without a
# header row and Spark is asked to infer the column types.
sdf = (
    spark.read.format('csv')
    .option('header', 'false')
    .option('inferSchema', 'true')
    .load('../datasets/iris.csv')
)

In [None]:
# Print the column names and types of the DataFrame read from the CSV.
sdf.printSchema()

In [None]:
# Preview the first three rows.
sdf.show(3)

## Rename Columns

In [None]:
# Names to assign to the five columns; toDF applies them positionally.
cols = ['Sepal_Len', 'Sepal_Width', 'Petal_Len', 'Petal_Width', 'class']

In [None]:
# Rename the columns positionally using the names in cols.
sdf = sdf.toDF(*cols)

In [None]:
# Preview three rows to confirm the new column names.
sdf.show(3)

## Explore Data

In [None]:
# List the distinct values of the 'class' column.
classes = sdf.select('class').distinct()
classes.show()

In [None]:
# Count the rows in each class and bring the result back to the driver.
sdf.groupby('class').count().collect()

In [None]:
# Preview the first three rows.
sdf.show(3)

In [None]:
# Feature columns: every entry of cols except the last one ('class').
feat_cols = cols[:-1]
feat_cols

In [None]:
# Encode the string 'class' column as a numeric index stored in a new
# 'classNumber' column. VectorAssembler is imported here as well; it is
# used in a later cell to build the feature vector.
from pyspark.ml.feature import VectorAssembler, StringIndexer
strIdx = StringIndexer(inputCol="class", outputCol="classNumber")
strIdxModel = strIdx.fit(sdf)
sdf = strIdxModel.transform(sdf)

In [None]:
# Show which numeric index goes with each class, along with the row counts.
sdf.groupby('classNumber', 'class').count().collect()

In [None]:
# Pack the columns listed in feat_cols into a single vector column named
# 'features', then keep only the label columns and that vector.
vassemb = VectorAssembler(inputCols=feat_cols, outputCol='features')
ndf = vassemb.transform(sdf).select('class', 'classNumber', 'features')

# To inspect the untruncated vectors, run: ndf.show(3, truncate=False)
ndf.printSchema()

In [None]:
# Preview the first three rows of the assembled DataFrame.
ndf.show(3)

## WCSS - Within Cluster Sum of Squares<br>

<span style="font-family:times, serif; font-size:16pt; font-weight:bold">
<ul>
    <li>Within a cluster, take the distance of each point from the centroid, square it, and then add the squares up</li>
    <li>WCSS will help in determining the goodness of fit</li>
</ul>
</span>

In [None]:
from pyspark.ml.clustering import KMeans
import numpy as np

# Within-cluster sum of squares (WCSS) for each candidate k. The array is
# indexed by k itself, so slots 0 and 1 are never filled and stay at zero.
cost = np.zeros(5)
for k in range(2, 5):
    kmeans = (
        KMeans()
        .setK(k)
        .setSeed(100)  # fixed seed so every run uses the same initialisation
        .setFeaturesCol("features")
        .setPredictionCol("cluster")
    )

    model = kmeans.fit(ndf)

    # KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
    # Spark 3.0. The training summary reports the WCSS on the fitted data,
    # so prefer it and fall back to computeCost() only on older Spark
    # versions whose summary has no trainingCost attribute.
    summary = model.summary
    if hasattr(summary, "trainingCost"):
        cost[k] = summary.trainingCost
    else:
        cost[k] = model.computeCost(ndf)

## Elbow Method

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sbs
from matplotlib.ticker import MaxNLocator

# Plot the cost recorded for k = 2, 3, 4 against k.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(2, 5), cost[2:5])
ax.set(xlabel='k', ylabel='cost', title='Elbow Method')
# k is a count, so force integer tick marks on the x axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

## Apply KMeans and create groupings

In [None]:
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator

# Instantiate the algorithm with three clusters. The seed is fixed (same
# value as the WCSS loop earlier in the notebook) so the cluster
# assignments are reproducible from run to run.
kmeans = KMeans() \
          .setK(3) \
          .setSeed(100) \
          .setFeaturesCol("features")\
          .setPredictionCol("prediction")

# Train Model
model = kmeans.fit(ndf)

# Assign every row to a cluster; the cluster index is written to the
# 'prediction' column of the returned DataFrame.
predictions = model.transform(ndf)

In [None]:
# Print the columns of the DataFrame returned by model.transform.
predictions.printSchema()

## Explore Results<br>
<span style="font-family:times, serif; font-size:16pt; font-weight:bold">
<ul>
<li>Display the class and the predicted cluster</li>
<li>Cluster numbers and flower types are not going to be the same</li>
<li>The idea of K-Means is to group data, not to make predictions</li>
</ul>
</span>

In [None]:
# Show the class, its numeric index and the assigned cluster side by side
# for up to 150 rows.
predictions.select(['class', 'classNumber', 'prediction']).show(150)

In [None]:
# Count rows per flower class; each class is expected to have 50 rows.
predictions.groupby('class').count().collect()

In [None]:
# Count rows per assigned cluster, for comparison with the per-class counts.
predictions.groupby('prediction').count().collect()