# K-Means Clustering in spark

In [2]:
from pyspark.mllib.clustering import KMeans
from numpy import array, random
from math import sqrt
from pyspark import SparkConf, SparkContext
from sklearn.preprocessing import scale

In [14]:
sc = SparkContext.getOrCreate()
k = 5

In [31]:
def createClusteredData(N, k):
    random.seed(10)
    pointsPerCluster = float(N)/k
    x = []
    for i in range(k):
        incomeCentroid = random.uniform(20000.0, 200000.0)
        ageCentroid = random.uniform(20.0, 70.0)
        for j in range(int(pointsPerCluster)):
            x.append([random.normal(incomeCentroid, 10000.0), random.normal(ageCentroid, 2.0)])
    x = array(x)
    return x 

In [32]:
# x = []
# for j in range(int(5)):
#     x.append([random.normal(40000, 10000.0), random.normal(50, 2.0)])
# x

In [35]:
# Load the data; note I am normalizing it with scale() - very important!
data = sc.parallelize(scale(createClusteredData(100, K)))
data.take(5)

[array([ 1.74252979, -1.28173018]),
 array([ 1.09846556, -1.3608634 ]),
 array([ 1.58352768, -1.43868866]),
 array([ 1.50387012, -1.34807673]),
 array([ 1.44539138, -1.37903932])]

In [37]:
# Build the model (cluster the data)
clusters = KMeans.train(data, K, maxIterations=10, initializationMode="random")

In [39]:
# Print out the cluster assignments
resultRDD = data.map(lambda point: clusters.predict(point))

In [42]:
resultRDD.take(50)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [40]:
print("Counts by value:")
counts = resultRDD.countByValue()
print(counts)

Counts by value:
defaultdict(<class 'int'>, {0: 20, 1: 20, 2: 23, 3: 20, 4: 17})


In [44]:
print("Cluster assignments:")
results = resultRDD.collect()
print(results)

Cluster assignments:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4]


In [45]:
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

In [46]:
WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

Within Set Sum of Squared Error = 20.385885709661075
