In [1]:
import findspark 
findspark.init("/Users/valentinaporcu/spark/spark-2.4.1-bin-hadoop2.7")
import pyspark 
from pyspark.sql import DataFrameNaFunctions 
from pyspark.sql.functions import lit 
from pyspark.ml.feature import StringIndexer  
from pyspark.ml import Pipeline 
from pyspark.sql import SparkSession
from pyspark.sql import functions
import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.appName('KMeans').getOrCreate()

In [4]:
# carichiamo i dati

In [5]:
df = spark.read.csv("/Users/valentinaporcu/Dropbox/topic/12.\ Guida\ ai\ Big\ Data\ con\ Python/codice\ -\ guida\ ai\ big\ data\ con\ Python/Sezione\ 4/iris.csv",
                    inferSchema=True, header=True)

In [6]:
df.head()

Row(_c0=1, Sepal_Length=5.1, Sepal_Width=3.5, Petal_Length=1.4, Petal_Width=0.2, Species='setosa')

In [7]:
df = df.drop('Species')

In [8]:
df.take(2)

[Row(_c0=1, Sepal_Length=5.1, Sepal_Width=3.5, Petal_Length=1.4, Petal_Width=0.2),
 Row(_c0=2, Sepal_Length=4.9, Sepal_Width=3.0, Petal_Length=1.4, Petal_Width=0.2)]

In [9]:
# formattiamo i dati

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
df.columns

['_c0', 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']

In [12]:
assembler = VectorAssembler(inputCols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'], 
                            outputCol = 'features')

In [13]:
output = assembler.transform(df)

In [14]:
output.show()

+---+------------+-----------+------------+-----------+-----------------+
|_c0|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|         features|
+---+------------+-----------+------------+-----------+-----------------+
|  1|         5.1|        3.5|         1.4|        0.2|[5.1,3.5,1.4,0.2]|
|  2|         4.9|        3.0|         1.4|        0.2|[4.9,3.0,1.4,0.2]|
|  3|         4.7|        3.2|         1.3|        0.2|[4.7,3.2,1.3,0.2]|
|  4|         4.6|        3.1|         1.5|        0.2|[4.6,3.1,1.5,0.2]|
|  5|         5.0|        3.6|         1.4|        0.2|[5.0,3.6,1.4,0.2]|
|  6|         5.4|        3.9|         1.7|        0.4|[5.4,3.9,1.7,0.4]|
|  7|         4.6|        3.4|         1.4|        0.3|[4.6,3.4,1.4,0.3]|
|  8|         5.0|        3.4|         1.5|        0.2|[5.0,3.4,1.5,0.2]|
|  9|         4.4|        2.9|         1.4|        0.2|[4.4,2.9,1.4,0.2]|
| 10|         4.9|        3.1|         1.5|        0.1|[4.9,3.1,1.5,0.1]|
| 11|         5.4|        3.7|        

In [15]:
# effettuiamo lo scaling

In [16]:
from pyspark.ml.feature import StandardScaler

In [17]:
scaler = StandardScaler(inputCol="features", 
                        outputCol="scaledFeatures", 
                        withStd=True, withMean=False)

In [18]:
scalerModel = scaler.fit(output)

In [19]:
output = scalerModel.transform(output)

In [20]:
# creiamo il modello

In [3]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [21]:
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3)
model3 = kmeans3.fit(output)

In [22]:
# computiamo le predizioni

In [23]:
predictions3 = model3.transform(output)

In [24]:
# computiamo il Silhouette score 

In [25]:
evaluator = ClusteringEvaluator()

In [26]:
silhouette = evaluator.evaluate(predictions3)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.6498745933245135


In [27]:
# visualizziamo i risultati

In [29]:
centers = model3.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[8.08674985 7.02050171 3.06927278 2.5427526 ]
[6.05788156 7.91761264 0.83006151 0.32128819]
[6.8887588  6.04493327 2.38782168 1.74828502]


In [30]:
# modelli alternativi

In [31]:
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2)
model2 = kmeans2.fit(output)

predictions2 = model2.transform(output)

silhouette = evaluator.evaluate(predictions2)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.8465640330714044


In [32]:
kmeans4 = KMeans(featuresCol='scaledFeatures', k=4)
model4 = kmeans4.fit(output)

predictions4 = model4.transform(output)

silhouette = evaluator.evaluate(predictions4)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.6020350846753646


In [None]:
predictions3.take(150)