In [123]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook, or reuse one that is already active.
# NOTE(review): the "spark.some.config.option" entry looks like the placeholder
# from the Spark documentation example — confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark K-means example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [124]:
# Load the wine CSV: the first line is treated as the header and column types
# are inferred from the data.
df = (
    spark.read.format("csv")
    .options(header="true", inferschema="true")
    .load("data/wine_dataset.csv", header=True)
)

In [125]:
# Preview the first three rows to check the parsed header and inferred column types.
df.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|style|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|  red|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|  red|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|  red|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+

In [126]:
#df=df.select("longitude", "latitude", "median_income", "ocean_proximity")
#df.select("ocean_proximity").distinct().show()

In [127]:
#long=[]
#lad=[]
#for x in df.collect():
#    long.append(x[0])
#    lad.append(x[1])

In [128]:
#import numpy as np
#import pandas as pd
#import matplotlib.pyplot as plt
#%matplotlib inline
#plt.figure(figsize = (10, 8))
#plt.scatter(long, lad);


In [129]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Pack every column except the last one into a dense feature vector.
# The last column is carried along as 'label' during the conversion and then
# dropped again, so only 'features' remains.
# NOTE(review): per the preview above the last column is `style`, which means
# `quality` is part of the feature vector — confirm that is intended.
transformed = (
    df.rdd
    .map(lambda row: [Vectors.dense(row[:-1]), row[-1]])
    .toDF(["features", "label"])
    .select("features")
)
transformed.show(5)

+--------------------+
|            features|
+--------------------+
|[7.4,0.7,0.0,1.9,...|
|[7.8,0.88,0.0,2.6...|
|[7.8,0.76,0.04,2....|
|[11.2,0.28,0.56,1...|
|[7.4,0.7,0.0,1.9,...|
+--------------------+
only showing top 5 rows



[Stage 13525:>                                                      (0 + 1) / 1]                                                                                

In [130]:
from pyspark.ml.feature import StandardScaler

In [131]:
# Standardize every feature: withMean=True centers each column and withStd=True
# scales it to unit standard deviation.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)
scalerModel = scaler.fit(transformed)

# The transform keeps the original 'features' column and adds 'scaledFeatures'.
scaledData = scalerModel.transform(transformed)
scaledData.show(n=3)

[Stage 13526:>                                                      (0 + 1) / 1]                                                                                

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[7.4,0.7,0.0,1.9,...|[0.14246230020601...|
|[7.8,0.88,0.0,2.6...|[0.45100101079840...|
|[7.8,0.76,0.04,2....|[0.45100101079840...|
+--------------------+--------------------+
only showing top 3 rows



In [132]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import GaussianMixture

def RunBisectKM(featuresCol='features', initK=20, dataSet=scaledData, seed=None):
    """Choose k by silhouette score and return an unfitted KMeans for that k.

    Despite the name, this function uses plain ``KMeans`` (not
    ``BisectingKMeans``). For every k in ``range(2, initK)`` it fits a KMeans
    model on ``dataSet``, scores the resulting assignment with
    ``ClusteringEvaluator`` (squared Euclidean distance) and keeps the k with
    the highest score. Ties go to the smallest k.

    Parameters
    ----------
    featuresCol : str
        Name of the vector column used both for clustering and for scoring.
    initK : int
        Exclusive upper bound of the k values tried; must be greater than 2.
    dataSet : DataFrame
        Data containing ``featuresCol``. The default is bound to the
        ``scaledData`` object that exists when this function is defined.
    seed : int, optional
        Seed forwarded to every KMeans instance, including the returned one.
        When ``None``, KMeans' own default seeding is used.

    Returns
    -------
    KMeans
        An unfitted estimator configured with ``featuresCol`` and the best k.

    Raises
    ------
    ValueError
        If ``initK`` is not greater than 2, i.e. there is no k to evaluate.
    """
    if initK <= 2:
        raise ValueError(
            "initK must be greater than 2 so that at least k=2 is evaluated, got {}".format(initK)
        )

    # Shared estimator settings: cluster on the same column the evaluator scores,
    # and only pass a seed when the caller supplied one.
    estimator_kwargs = {"featuresCol": featuresCol}
    if seed is not None:
        estimator_kwargs["seed"] = seed

    evaluator = ClusteringEvaluator(featuresCol=featuresCol, distanceMeasure='squaredEuclidean')

    silhouette_scores = {}
    for K in range(2, initK):
        clustered = KMeans(k=K, **estimator_kwargs).fit(dataSet).transform(dataSet)
        silhouette_scores[K] = evaluator.evaluate(clustered)

    # max() returns the first maximal key in insertion (ascending k) order,
    # which matches the previous "first k with the best score" behavior.
    bestK = max(silhouette_scores, key=silhouette_scores.get)
    return KMeans(k=bestK, **estimator_kwargs)

In [133]:
# Search k = 2..19 by silhouette score, then fit the selected KMeans estimator.
# NOTE(review): both the search and the fit use the raw 'features' column; the
# 'scaledFeatures' column present in scaledData is not used here — confirm
# that clustering on unscaled features is intended.
bKM=RunBisectKM(featuresCol='features', initK=20, dataSet=scaledData)
model=bKM.fit(scaledData)

                                                                                

In [134]:
# Append each row's assigned cluster index as a 'prediction' column.
predictions=model.transform(scaledData)

In [135]:
# Preview the first five rows together with their cluster assignments.
predictions.show(5)

+--------------------+--------------------+----------+
|            features|      scaledFeatures|prediction|
+--------------------+--------------------+----------+
|[7.4,0.7,0.0,1.9,...|[0.14246230020601...|         0|
|[7.8,0.88,0.0,2.6...|[0.45100101079840...|         0|
|[7.8,0.76,0.04,2....|[0.45100101079840...|         0|
|[11.2,0.28,0.56,1...|[3.07358005083372...|         0|
|[7.4,0.7,0.0,1.9,...|[0.14246230020601...|         0|
+--------------------+--------------------+----------+
only showing top 5 rows



In [136]:
# List the distinct cluster ids that were actually assigned to rows.
predictions.select("prediction").distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         0|
+----------+



In [137]:
# Score the final clustering. ClusteringEvaluator() is left at its defaults:
# silhouette metric, 'features' and 'prediction' columns, squared Euclidean distance.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
centers = model.clusterCenters()

# Report the score, followed by one centroid per cluster of the fitted model.
print("Silhouette with squared euclidean distance = {}".format(silhouette))
print("Cluster Centers: ")
for center in centers:
    print(center)

[Stage 14580:>                                                      (0 + 1) / 1]                                                                                

Silhouette with squared euclidean distance = 0.6891948370236831
Cluster Centers: 
[ 7.59835334  0.40544768  0.29165352  3.13166381  0.06525729 18.59571184
 64.81783877  0.99457328  3.25336878  0.5701578  10.76679245  5.81303602]
[6.90813791e+00 2.87036572e-01 3.41208822e-01 7.28183975e+00
 4.83257956e-02 4.00135399e+01 1.56531965e+02 9.94814506e-01
 3.19086265e+00 4.99919040e-01 1.02581649e+01 5.82049135e+00]


In [138]:
# Interactive API lookup for the evaluator's parameters and defaults.
# NOTE(review): exploratory cell; consider dropping it from the final notebook.
help(ClusteringEvaluator)

Help on class ClusteringEvaluator in module pyspark.ml.evaluation:

class ClusteringEvaluator(JavaEvaluator, pyspark.ml.param.shared.HasPredictionCol, pyspark.ml.param.shared.HasFeaturesCol, pyspark.ml.param.shared.HasWeightCol, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  ClusteringEvaluator(*, predictionCol: str = 'prediction', featuresCol: str = 'features', metricName: 'ClusteringEvaluatorMetricType' = 'silhouette', distanceMeasure: str = 'squaredEuclidean', weightCol: Optional[str] = None)
 |  
 |  Evaluator for Clustering results, which expects two input
 |  columns: prediction and features. The metric computes the Silhouette
 |  measure using the squared Euclidean distance.
 |  
 |  The Silhouette is a measure for the validation of the consistency
 |  within clusters. It ranges between 1 and -1, where a value close to
 |  1 means that the points in a cluster are close to the other points
 |  in the same cluster and far from the points of the other clusters.

23/05/21 03:39:42 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 7202858 ms exceeds timeout 120000 ms
23/05/21 03:39:42 WARN SparkContext: Killing executors is not supported by current scheduler.
