# K-Means Clustering Spark

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Set up the environment for using pyspark.
# findspark.init() has to run before the pyspark imports in the following
# cell; presumably it locates the local Spark installation and puts
# pyspark on sys.path -- confirm against the findspark documentation.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Create the application context: build (or reuse) a SparkSession for this
# notebook, expose its SparkContext as `sc`, and set the context's log level.
session_builder = SparkSession.builder.appName("K-Means Clustering Example")
spark = session_builder.getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("Error")

## Load Data

In [None]:
# Read the dataset file, which is in CSV (comma separated values) format.
# The first line is treated as the header and column types are inferred.
csv_reader = spark.read.format('csv').options(header='true', inferSchema='true')
sdf = csv_reader.load('../datasets/Mall_Customers.csv')

In [None]:
# Preview the first 10 rows of the loaded Spark DataFrame.
sdf.show(10)

In [None]:
# Keep a pandas copy of the DataFrame as loaded; it is indexed by column
# position later in the notebook when the clusters are plotted.
pd_df = sdf.toPandas()

In [None]:
# Remove the CustomerID column from the Spark DataFrame so it is not
# available as a clustering feature.
sdf = sdf.drop('CustomerID')

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string-valued 'Genre' column as a numeric 'Genre_numeric' column.
genre_indexer = StringIndexer(inputCol='Genre', outputCol="Genre_numeric")
indexer = genre_indexer.fit(sdf)   # fitted model: learns the label -> index mapping
sdf = indexer.transform(sdf)

# Show the original and encoded columns side by side to check the mapping.
sdf.select('Genre', 'Genre_numeric').show()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns that are combined into the single 'features' vector column
# consumed by the clustering steps.
feature_columns = [
    'Genre_numeric',
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
]
vassemb = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled vector column and preview the first 3 rows.
ndf = vassemb.transform(sdf).select(['features'])
ndf.show(3)

## WCSS - Within Cluster Sum of Squares<br>

<span style="font-family:times, serif; font-size:16pt; font-style:bold">
<ul>
    <li>Within each cluster, take the distance of every point from the centroid, square it, and then add the squared distances up</li>
    <li>WCSS will help in determining the goodness of fit</li>
</ul>
</span>

In [None]:
from pyspark.ml.clustering import KMeans
import numpy as np

# cost[k] holds the within-cluster sum of squares (WCSS) for a model fitted
# with k clusters. The array is indexed directly by k, so entries 0 and 1
# stay at zero and only indices 2..9 are filled.
cost = np.zeros(10)
for k in range(2, 10):
    # Fixed seed so every k is fitted reproducibly.
    kmeans = KMeans()\
            .setK(k)\
            .setSeed(1) \
            .setFeaturesCol("features")\
            .setPredictionCol("cluster")

    model = kmeans.fit(ndf)

    # KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
    # Spark 3.0. Use it when the installed Spark still provides it, otherwise
    # read the cost on the training data from the model's training summary.
    if hasattr(model, "computeCost"):
        cost[k] = model.computeCost(ndf)
    else:
        cost[k] = model.summary.trainingCost

## Elbow Method

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sbs
from matplotlib.ticker import MaxNLocator

# Plot the cost recorded for each k; the "elbow" of the curve suggests a
# suitable number of clusters.
k_values = range(2,10)
fig, ax = plt.subplots(1,1, figsize =(8,6))
ax.plot(k_values, cost[2:10])
ax.set(xlabel='k', ylabel='cost', title='Elbow Method')
# k is a count, so restrict the x-axis ticks to integers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

## Apply KMeans and make predictions (Groupings)

In [None]:
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator

# Fit the final model with k = 5.
# The seed is pinned (same value as in the elbow-method sweep) because the
# rest of the notebook attaches a meaning to each cluster index; without an
# explicit seed the numbering of the clusters may change from run to run.
kmeans = KMeans() \
          .setK(5) \
          .setSeed(1) \
          .setFeaturesCol("features")\
          .setPredictionCol("prediction")

model = kmeans.fit(ndf)

# Add the assigned cluster index to every row as the "prediction" column.
predictions = model.transform(ndf)

In [None]:
# Print the schema of the DataFrame returned by model.transform().
predictions.printSchema()

In [None]:
# Print the centre of every cluster found by the fitted model, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centre_vector in centers:
    print(centre_vector)

# Clustering evaluator created with its default parameters.
evaluator = ClusteringEvaluator()

In [None]:
# ClusteringEvaluator's default metric is the silhouette score, computed with
# the squared Euclidean distance measure -- the value is NOT itself a distance.
# The variable name `sqd` is kept so any later cell that reads it still works.
sqd = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(sqd))

In [None]:
# Preview the rows together with their assigned cluster ("prediction" column).
predictions.show()

In [None]:
# Collect the cluster assignments on the driver as a NumPy array.
prediction_pdf = predictions.select('prediction').toPandas()
y_kmeans = prediction_pdf.values

In [None]:
#y_kmeans

In [None]:
# Collapse the array to one dimension so it can be compared against a
# cluster index to build a per-row boolean mask.
y_kmeans = y_kmeans.flatten()

In [None]:
# Display the cluster labels.
y_kmeans

## Clusters
<span style="font-family:times, serif; font-size:16pt; font-style:bold">
<ul>
    <li><b>Careful</b> - High income and low spending score - Cluster 0</li>
    <li><b>Standard</b> - Average income and average spending score - Cluster 1</li>
    <li><b>Target</b> - High income and high spending score - Cluster 2</li>
    <li><b>Careless</b> - Low income and high spending score - Cluster 3</li>
    <li><b>Sensible</b> - Low income and low spending score - Cluster 4</li>
    </ul>
</span>

In [None]:
# Select the two columns to plot by position.
# NOTE(review): presumably positions 3 and 4 are 'Annual Income (k$)' and
# 'Spending Score (1-100)' (they match the axis labels of the cluster plot);
# confirm against the column order of the CSV.
X = pd_df.iloc[:, [3, 4]].values

In [None]:
plt.figure(figsize = (10,8))

# One colour per cluster index; the position in the list is the cluster
# index, and the legend label is derived from it ('Cluster 0' .. 'Cluster 4').
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']

for cluster_id, colour in enumerate(cluster_colours):
    # Boolean mask selecting the rows assigned to this cluster.
    in_cluster = y_kmeans == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100,
                c = colour, label = f'Cluster {cluster_id}')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()