In [1]:
%%init_spark
# Spark master URL: "local[4]".
launcher.master = "local[4]"
# Extra Maven coordinates to load: the Vegas plotting library and its Spark
# integration, both the Scala 2.11 artifacts at version 0.3.11.
launcher.packages = [
    "org.vegas-viz:vegas_2.11:0.3.11",
    "org.vegas-viz:vegas-spark_2.11:0.3.11",
]

In [2]:
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.util.KMeansDataGenerator

// Generate the synthetic data set: 1000 two-dimensional points for k = 5,
// with scaling factor r = 5, in a single partition.
// Each coordinate array is wrapped in a dense ML vector and then in a
// one-field Row so that a DataFrame can be built from the RDD.
val donneesGenerees = KMeansDataGenerator
  .generateKMeansRDD(sc, numPoints = 1000, k = 5, d = 2, r = 5.0, numPartitions = 1)
  .map(coords => Row(Vectors.dense(coords)))
// Peek at the first two rows
donneesGenerees.take(2)

Initializing Scala interpreter ...

Spark Web UI available at http://wifi-auditeur-368.cnam.fr:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[4], app id = local-1583866992432)
SparkSession available as 'spark'


import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.util.KMeansDataGenerator
donneesGenerees: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[3] at map at <console>:32
res0: Array[org.apache.spark.sql.Row] = Array([[6.851431892838329,5.516447693896727]], [[-4.295988305021189,-4.831902144635173]])


In [3]:
// Build a DataFrame from the RDD of rows
import org.apache.spark.sql.types._
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

// Schema with a single nullable column, "features", holding an ML vector
val schemaVecteurs = StructType(
  StructField("features", VectorType, nullable = true) :: Nil
)
// The DataFrame is cached; it is reused by every k-means fit further down
val vecteursGroupesDF = spark
  .createDataFrame(donneesGenerees, schemaVecteurs)
  .cache()
// Display the first rows without truncating the vector values
vecteursGroupesDF.show(truncate = false)

+-----------------------------------------+
|features                                 |
+-----------------------------------------+
|[6.851431892838329,5.516447693896727]    |
|[-4.295988305021189,-4.831902144635173]  |
|[2.466261424212494,4.360426802469256]    |
|[-3.6929582575126862,-6.342163511120067] |
|[0.40109889970226975,6.603993964561202]  |
|[7.459651912743491,4.835501449876057]    |
|[-4.700311848077538,-6.473414666742812]  |
|[1.7714851259762026,2.820426332188302]   |
|[-2.6229150593060164,-6.2560724387559725]|
|[-0.2488022191305468,8.303237208697512]  |
|[7.09527856586733,5.391719071881441]     |
|[-4.1093962887920155,-4.694216844940949] |
|[2.432021563423027,2.493119849340497]    |
|[-2.3592814771450565,-7.270336546373527] |
|[0.15241818421045839,6.509821155247765]  |
|[7.4662523011015445,4.436875066957234]   |
|[-5.438322877988338,-4.9668282822927665] |
|[1.8326398476752015,2.750501464430018]   |
|[-1.8619822493373936,-7.944879647731648] |
|[-0.466115292913019,6.697896784

import org.apache.spark.sql.types._
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
schemaVecteurs: org.apache.spark.sql.types.StructType = StructType(StructField(features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true))
vecteursGroupesDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [features: vector]


In [4]:
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.udf
// Column UDFs that pull one scalar component out of an ML vector.
// They use the vector's indexed accessor rather than `toArray(i)`: for a
// sparse vector `toArray` builds a full dense copy on every row just to read
// a single element, whereas `v(i)` reads the component directly.

// Extracts the first component (index 0) of a vector into a Double column
val first = udf((v: Vector) => v(0))
// Extracts the second component (index 1) of a vector into a Double column
val second = udf((v: Vector) => v(1))

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.udf
first: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))
second: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


In [33]:
// Vegas plotting set-up.
// The implicit renderer receives each chart's HTML and prints it behind a
// "%html " prefix.
// NOTE(review): presumably the notebook front end treats that prefix as an
// HTML display directive; confirm for the kernel in use.
implicit val render: vegas.render.ShowHTML =
  vegas.render.ShowHTML(html => print("%html " + html))

import vegas._
import vegas.data.External._
import vegas.sparkExt._

render: vegas.render.ShowHTML = <function1>
import vegas._
import vegas.data.External._
import vegas.sparkExt._


In [34]:
// Scatter plot of the raw data.
// Unpack the two vector components into plain "x" and "y" columns for plotting.
val points = vecteursGroupesDF
  .withColumn("x", first($"features"))
  .withColumn("y", second($"features"))

// One point per row, both axes quantitative
Vegas("Données initiales")
  .withDataFrame(points)
  .mark(Point)
  .encodeX("x", Quant)
  .encodeY("y", Quant)
  .show

points: org.apache.spark.sql.DataFrame = [features: vector, x: double ... 1 more field]


In [12]:
// K-means with the "k-means||" initialisation (no setInitMode call is made)
import org.apache.spark.ml.clustering.KMeans

// Configure the estimator: 5 clusters, at most 200 iterations, seed 2
val kmeans = new KMeans()
  .setK(5)
  .setMaxIter(200)
  .setSeed(2L)
// Fit the model on the cached feature vectors
val modele = kmeans.fit(vecteursGroupesDF)

import org.apache.spark.ml.clustering.KMeans
kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_7d7961645e07
modele: org.apache.spark.ml.clustering.KMeansModel = kmeans_7d7961645e07


In [13]:
// Evaluate the clustering by the sum of within-cluster inertias
// NOTE(review): KMeansModel.computeCost is marked deprecated as of Spark 2.4.0
// (ClusteringEvaluator / summary.trainingCost are the suggested replacements);
// confirm before upgrading Spark.
val wsse = modele.computeCost(vecteursGroupesDF)

// Print the cluster centres, one per line
for (centre <- modele.clusterCenters) println(centre)

[-0.012568296756314948,7.526532076239347]
[-3.09577594145568,-7.575565466974656]
[-3.6775192699330796,-5.739050211500434]
[6.476504032626164,4.539774830579849]
[2.2726021783591652,3.5068947661773584]


wsse: Double = 977.923902493241


In [14]:
// Find the cluster index of every data point: transform appends a
// "prediction" column next to "features".
val resultat = modele.transform(vecteursGroupesDF)
// Show the first five rows
resultat.show(numRows = 5)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[6.85143189283832...|         3|
|[-4.2959883050211...|         2|
|[2.46626142421249...|         4|
|[-3.6929582575126...|         2|
|[0.40109889970226...|         0|
+--------------------+----------+
only showing top 5 rows



resultat: org.apache.spark.sql.DataFrame = [features: vector, prediction: int]


In [15]:
// Scatter plot of the clustered data.
// Unpack the two vector components into "x" and "y" columns for plotting.
val points = resultat
  .withColumn("x", first($"features"))
  .withColumn("y", second($"features"))

// Colour each point by its predicted cluster, treated as a nominal value
Vegas("K-Means")
  .withDataFrame(points)
  .mark(Point)
  .encodeX("x", Quant)
  .encodeY("y", Quant)
  .encodeColor("prediction", Nom)
  .show

points: org.apache.spark.sql.DataFrame = [features: vector, prediction: int ... 2 more fields]


In [32]:
// Run k-means again, this time asking for 6 clusters with seed 1
val kmeans = new KMeans()
  .setK(6)
  .setMaxIter(200)
  .setSeed(1L)
val modele = kmeans.fit(vecteursGroupesDF)
// Attach the predicted cluster index to every point
val resultat = modele.transform(vecteursGroupesDF)
// Unpack the vector components into "x" / "y" columns for plotting
val points = resultat
  .withColumn("x", first($"features"))
  .withColumn("y", second($"features"))
// Scatter plot coloured by predicted cluster
Vegas("K-Means")
  .withDataFrame(points)
  .mark(Point)
  .encodeX("x", Quant)
  .encodeY("y", Quant)
  .encodeColor("prediction", Nom)
  .show
// Evaluate the clustering by the sum of within-cluster inertias
val wsse = modele.computeCost(vecteursGroupesDF)

kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_544d3ba640ce
modele: org.apache.spark.ml.clustering.KMeansModel = kmeans_544d3ba640ce
resultat: org.apache.spark.sql.DataFrame = [features: vector, prediction: int]
points: org.apache.spark.sql.DataFrame = [features: vector, prediction: int ... 2 more fields]
wsse: Double = 873.2579094097384


In [21]:
// K-means with random initialisation: 5 clusters, at most 200 iterations,
// seed 2, init mode explicitly set to "random".
val kmeans = new KMeans()
  .setInitMode("random")
  .setK(5)
  .setMaxIter(200)
  .setSeed(2L)
val modele = kmeans.fit(vecteursGroupesDF)
// Attach the predicted cluster index to every point
val resultat = modele.transform(vecteursGroupesDF)
// Unpack the vector components into "x" / "y" columns for plotting
val points = resultat
  .withColumn("x", first($"features"))
  .withColumn("y", second($"features"))
// Scatter plot coloured by predicted cluster
Vegas("K-Means")
  .withDataFrame(points)
  .mark(Point)
  .encodeX("x", Quant)
  .encodeY("y", Quant)
  .encodeColor("prediction", Nom)
  .show
// Evaluate the clustering by the sum of within-cluster inertias
val wsse = modele.computeCost(vecteursGroupesDF)

kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_b3e34e20ead8
modele: org.apache.spark.ml.clustering.KMeansModel = kmeans_b3e34e20ead8
resultat: org.apache.spark.sql.DataFrame = [features: vector, prediction: int]
points: org.apache.spark.sql.DataFrame = [features: vector, prediction: int ... 2 more fields]
wsse: Double = 2986.7590732647636


In [52]:
import org.apache.spark.ml.feature.ElementwiseProduct
// Per-component scaling weights: the first component is multiplied by 1,
// the second by 3.
val transformingVector = Vectors.dense(1.0, 3.0)
// Element-wise product of "features" with the weights, written to a new
// "transformedVector" column.
val transformer = new ElementwiseProduct()
  .setInputCol("features")
  .setOutputCol("transformedVector")
  .setScalingVec(transformingVector)

// Apply the transformer to the whole DataFrame, then keep only the new column
val vecteursGroupesDF_tmp = transformer.transform(vecteursGroupesDF)
val trans_vecteursGroupesDF = vecteursGroupesDF_tmp.select("transformedVector")
trans_vecteursGroupesDF.show()

+--------------------+
|   transformedVector|
+--------------------+
|[6.85143189283832...|
|[-4.2959883050211...|
|[2.46626142421249...|
|[-3.6929582575126...|
|[0.40109889970226...|
|[7.45965191274349...|
|[-4.7003118480775...|
|[1.77148512597620...|
|[-2.6229150593060...|
|[-0.2488022191305...|
|[7.09527856586733...|
|[-4.1093962887920...|
|[2.43202156342302...|
|[-2.3592814771450...|
|[0.15241818421045...|
|[7.46625230110154...|
|[-5.4383228779883...|
|[1.83263984767520...|
|[-1.8619822493373...|
|[-0.4661152929130...|
+--------------------+
only showing top 20 rows



import org.apache.spark.ml.feature.ElementwiseProduct
transformingVector: org.apache.spark.ml.linalg.Vector = [1.0,3.0]
transformer: org.apache.spark.ml.feature.ElementwiseProduct = elemProd_1d482d3d6c6f
vecteursGroupesDF_tmp: org.apache.spark.sql.DataFrame = [features: vector, transformedVector: vector]
trans_vecteursGroupesDF: org.apache.spark.sql.DataFrame = [transformedVector: vector]


In [60]:
// K-means on the rescaled vectors: 5 clusters, at most 200 iterations, seed 2,
// reading its input from the "transformedVector" column.
// setInitMode is not called here, so the initialisation mode is left at the
// estimator's default.
val kmeans = new KMeans()
  .setFeaturesCol("transformedVector")
  .setK(5)
  .setMaxIter(200)
  .setSeed(2L)
val modele = kmeans.fit(trans_vecteursGroupesDF)
// Attach the predicted cluster index to every rescaled point
val resultat = modele.transform(trans_vecteursGroupesDF)
// Unpack the rescaled vector components into "x" / "y" columns for plotting
val points = resultat
  .withColumn("x", first($"transformedVector"))
  .withColumn("y", second($"transformedVector"))
// Scatter plot coloured by predicted cluster
Vegas("K-Means")
  .withDataFrame(points)
  .mark(Point)
  .encodeX("x", Quant)
  .encodeY("y", Quant)
  .encodeColor("prediction", Nom)
  .show
// Evaluate the clustering by the sum of within-cluster inertias
val wsse = modele.computeCost(trans_vecteursGroupesDF)

kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_b68d4838f4df
modele: org.apache.spark.ml.clustering.KMeansModel = kmeans_b68d4838f4df
resultat: org.apache.spark.sql.DataFrame = [transformedVector: vector, prediction: int]
points: org.apache.spark.sql.DataFrame = [transformedVector: vector, prediction: int ... 2 more fields]
wsse: Double = 6914.1401525601805
