## Package ml-classification (org.apache.spark.ml.classification)
L'objectif est de présenter la bibliothèque de classification de Spark ML (Scala).

**Note :** L'accent est mis sur la démarche plutôt que sur la recherche du meilleur modèle.

## 1. Préparation des données

In [2]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

// Read the Iris CSV (header row, comma-delimited) and let Spark infer the column types.
val dfmIris = spark.read
  .option("header", true)
  .option("inferSchema", true)
  .option("delimiter", ",")
  .csv("../data/iris.txt")

// Binary target: 1.0 when the class is Iris-virginica, 0.0 for every other class.
val dfmIrisLabeled = {
  val isVirginica = col("classe") === "Iris-virginica"
  dfmIris.withColumn("label", when(isVirginica, 1.0).otherwise(0.0))
}

// Column names shared by the assembler, the evaluator and every classifier cell.
val features = Array("sepal_l", "sepal_w", "petal_l", "petal_w")
val label = "label"
val predCol = "labelPredictCol"
val featureName = "features"

// Pack the input feature columns into a single vector column.
val assembler = new VectorAssembler()
  .setOutputCol(featureName)
  .setInputCols(features)
val dfmIrisAssembled = assembler.transform(dfmIrisLabeled)

// 80% / 20% train/test split; the fixed seed keeps the split repeatable.
val Array(training, test) = dfmIrisAssembled.randomSplit(Array(0.8, 0.2), seed = 34512)

// Evaluator comparing the label column with the prediction column using the F1 metric.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("f1")
  .setLabelCol(label)
  .setPredictionCol(predCol)

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
dfmIris: org.apache.spark.sql.DataFrame = [sepal_l: double, sepal_w: double ... 3 more fields]
dfmIrisLabeled: org.apache.spark.sql.DataFrame = [sepal_l: double, sepal_w: double ... 4 more fields]
features: Array[String] = Array(sepal_l, sepal_w, petal_l, petal_w)
label: String = label
predCol: String = labelPredictCol
featureName: String = features
assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_8adb92dee6d7, handleInvalid=error, numInputCols=4
dfmIrisAssembled: org.apache.spark.sql.DataFrame = [sepal_l: double, sepal_w: double ... 5 more ...


## 2. Tests d'algorithmes de classification

### A. Régression logistique

In [3]:
import org.apache.spark.ml.classification.LogisticRegression

// Logistic-regression estimator wired to the shared label / features / prediction columns.
val lr = new LogisticRegression()
  .setFeaturesCol(featureName)
  .setLabelCol(label)
  .setPredictionCol(predCol)

// Hyper-parameter grid: every combination of maxIter, regParam and elasticNetParam below.
val paramGrid = {
  val grid = new ParamGridBuilder()
  grid.addGrid(lr.maxIter, Array(50, 100))
  grid.addGrid(lr.regParam, Array(0.1, 0.3, 0.7))
  grid.addGrid(lr.elasticNetParam, Array(0.5, 0.8))
  grid.build()
}

// Model selection on a train/validation split: 80% of `training` is used for fitting,
// the remainder for scoring each grid point with `evaluator`; two fits run in parallel.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEstimatorParamMaps(paramGrid)
  .setEvaluator(evaluator)
  .setParallelism(2)
  .setTrainRatio(0.8)

// Fit on the training set and keep the best model found over the grid.
val model = trainValidationSplit.fit(training)

// Score the held-out test set.
val predictions = model.transform(test)

// F1-score measured on the test predictions.
val f1Score = evaluator.evaluate(predictions)

print(s"F1-Score = ${f1Score}\n")

F1-Score = 0.9681165936083325


import org.apache.spark.ml.classification.LogisticRegression
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_e651db27e6cc
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_e651db27e6cc-elasticNetParam: 0.5,
	logreg_e651db27e6cc-maxIter: 50,
	logreg_e651db27e6cc-regParam: 0.1
}, {
	logreg_e651db27e6cc-elasticNetParam: 0.5,
	logreg_e651db27e6cc-maxIter: 50,
	logreg_e651db27e6cc-regParam: 0.3
}, {
	logreg_e651db27e6cc-elasticNetParam: 0.5,
	logreg_e651db27e6cc-maxIter: 50,
	logreg_e651db27e6cc-regParam: 0.7
}, {
	logreg_e651db27e6cc-elasticNetParam: 0.8,
	logreg_e651db27e6cc-maxIter: 50,
	logreg_e651db27e6cc-regParam: 0.1
}, {
	logreg_e651db27e6cc-elasticNetParam: 0.8,
	logreg_e651db27e6cc-maxIter: 50,
	logreg_e651db27e6cc-regPa...


### B. Machines à vecteurs de support (SVM)

In [4]:
import org.apache.spark.ml.classification.LinearSVC

// Linear SVM estimator wired to the shared label / features / prediction columns.
val lsvc = new LinearSVC()
  .setFeaturesCol(featureName)
  .setLabelCol(label)
  .setPredictionCol(predCol)

// Hyper-parameter grid: every combination of maxIter and regParam below.
val paramGrid = {
  val grid = new ParamGridBuilder()
  grid.addGrid(lsvc.maxIter, Array(50, 100))
  grid.addGrid(lsvc.regParam, Array(0.1, 0.3, 0.7))
  grid.build()
}

// Model selection on a train/validation split: 80% of `training` is used for fitting,
// the remainder for scoring each grid point with `evaluator`; two fits run in parallel.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lsvc)
  .setEstimatorParamMaps(paramGrid)
  .setEvaluator(evaluator)
  .setParallelism(2)
  .setTrainRatio(0.8)

// Fit on the training set and keep the best model found over the grid.
val model = trainValidationSplit.fit(training)

// Score the held-out test set.
val predictions = model.transform(test)

// F1-score measured on the test predictions.
val f1Score = evaluator.evaluate(predictions)

print(s"F1-Score = ${f1Score}\n")

// Show the first rows of true label next to predicted label.
predictions.select(label, predCol).show(6)

F1-Score = 0.9368035190615837
+-----+---------------+
|label|labelPredictCol|
+-----+---------------+
|  0.0|            0.0|
|  0.0|            0.0|
|  0.0|            0.0|
|  0.0|            0.0|
|  0.0|            0.0|
|  0.0|            0.0|
+-----+---------------+
only showing top 6 rows



import org.apache.spark.ml.classification.LinearSVC
lsvc: org.apache.spark.ml.classification.LinearSVC = linearsvc_77417142e215
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linearsvc_77417142e215-maxIter: 50,
	linearsvc_77417142e215-regParam: 0.1
}, {
	linearsvc_77417142e215-maxIter: 100,
	linearsvc_77417142e215-regParam: 0.1
}, {
	linearsvc_77417142e215-maxIter: 50,
	linearsvc_77417142e215-regParam: 0.3
}, {
	linearsvc_77417142e215-maxIter: 100,
	linearsvc_77417142e215-regParam: 0.3
}, {
	linearsvc_77417142e215-maxIter: 50,
	linearsvc_77417142e215-regParam: 0.7
}, {
	linearsvc_77417142e215-maxIter: 100,
	linearsvc_77417142e215-regParam: 0.7
})
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_e27581fa7266
model: org.ap...


### C. Arbre de décision 

In [5]:
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}

// Decision-tree estimator wired to the shared label / features / prediction columns.
val tree = new DecisionTreeClassifier()
  .setLabelCol(label)
  .setFeaturesCol(featureName)
  .setPredictionCol(predCol)

// Hyper-parameter grid: every combination of maxDepth, impurity and minInstancesPerNode below.
val paramGrid = new ParamGridBuilder()
  .addGrid(tree.maxDepth, Array(7, 10))
  .addGrid(tree.impurity, Array("entropy", "gini"))
  .addGrid(tree.minInstancesPerNode, Array(3, 5))
  .build()

// Model selection on a train/validation split: 80% of `training` is used for fitting,
// the remainder for scoring each grid point with `evaluator`; two fits run in parallel.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(tree)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)
  .setParallelism(2)

// Fit on the training set and keep the best model found over the grid.
val model = trainValidationSplit.fit(training)

// Score the held-out test set.
val predictions = model.transform(test)

// F1-score measured on the test predictions.
val f1Score = evaluator.evaluate(predictions)

print(s"F1-Score = ${f1Score}\n")

// Print the learned decision rules.
// `bestModel` is not statically typed as a decision tree, so match on the concrete class
// instead of casting blindly: if the estimator above is ever swapped, this fails with an
// explicit message rather than a bare ClassCastException.
val treeModel = model.bestModel match {
  case dt: DecisionTreeClassificationModel => dt
  case other =>
    throw new IllegalStateException(
      s"Expected a DecisionTreeClassificationModel but got ${other.getClass.getName}")
}
println(s"Regles de prediction :\n ${treeModel.toDebugString}")

F1-Score = 0.9681165936083325
Regles de prediction :
 DecisionTreeClassificationModel: uid=dtc_853c5be9b498, depth=2, numNodes=5, numClasses=2, numFeatures=4
  If (feature 2 <= 4.75)
   Predict: 0.0
  Else (feature 2 > 4.75)
   If (feature 3 <= 1.75)
    Predict: 0.0
   Else (feature 3 > 1.75)
    Predict: 1.0



import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
tree: org.apache.spark.ml.classification.DecisionTreeClassifier = dtc_853c5be9b498
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	dtc_853c5be9b498-impurity: entropy,
	dtc_853c5be9b498-maxDepth: 7,
	dtc_853c5be9b498-minInstancesPerNode: 3
}, {
	dtc_853c5be9b498-impurity: gini,
	dtc_853c5be9b498-maxDepth: 7,
	dtc_853c5be9b498-minInstancesPerNode: 3
}, {
	dtc_853c5be9b498-impurity: entropy,
	dtc_853c5be9b498-maxDepth: 10,
	dtc_853c5be9b498-minInstancesPerNode: 3
}, {
	dtc_853c5be9b498-impurity: gini,
	dtc_853c5be9b498-maxDepth: 10,
	dtc_853c5be9b498-minInstancesPerNode: 3
}, {
	dtc_853c5be9b498-impurity: entropy,
	dtc_853c5be9b498-maxDepth: 7,
	dtc_853c...


### D. Forêts aléatoires

In [6]:
import org.apache.spark.ml.classification.RandomForestClassifier

// Random-forest estimator wired to the shared label / features / prediction columns.
val rf = new RandomForestClassifier()
  .setFeaturesCol(featureName)
  .setLabelCol(label)
  .setPredictionCol(predCol)

// Hyper-parameter grid: every combination of the six parameters below.
val paramGrid = {
  val grid = new ParamGridBuilder()
  grid.addGrid(rf.maxDepth, Array(5, 8, 12))
  grid.addGrid(rf.impurity, Array("gini", "entropy"))
  grid.addGrid(rf.subsamplingRate, Array(0.6, 0.8))
  grid.addGrid(rf.minInstancesPerNode, Array(3, 5))
  grid.addGrid(rf.numTrees, Array(10, 20, 40))
  grid.addGrid(rf.featureSubsetStrategy, Array("sqrt", "log2"))
  grid.build()
}

// Model selection on a train/validation split: 80% of `training` is used for fitting,
// the remainder for scoring each grid point with `evaluator`; two fits run in parallel.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(rf)
  .setEstimatorParamMaps(paramGrid)
  .setEvaluator(evaluator)
  .setParallelism(2)
  .setTrainRatio(0.8)

// Fit on the training set and keep the best model found over the grid.
val model = trainValidationSplit.fit(training)

// Score the held-out test set.
val predictions = model.transform(test)

// F1-score measured on the test predictions.
val f1Score = evaluator.evaluate(predictions)

print(s"F1-Score = ${f1Score}\n")

F1-Score = 0.9681165936083325


import org.apache.spark.ml.classification.RandomForestClassifier
rf: org.apache.spark.ml.classification.RandomForestClassifier = rfc_dc0510037180
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	rfc_dc0510037180-featureSubsetStrategy: sqrt,
	rfc_dc0510037180-impurity: gini,
	rfc_dc0510037180-maxDepth: 5,
	rfc_dc0510037180-minInstancesPerNode: 3,
	rfc_dc0510037180-numTrees: 10,
	rfc_dc0510037180-subsamplingRate: 0.6
}, {
	rfc_dc0510037180-featureSubsetStrategy: sqrt,
	rfc_dc0510037180-impurity: gini,
	rfc_dc0510037180-maxDepth: 5,
	rfc_dc0510037180-minInstancesPerNode: 3,
	rfc_dc0510037180-numTrees: 10,
	rfc_dc0510037180-subsamplingRate: 0.8
}, {
	rfc_dc0510037180-featureSubsetStrategy: sqrt,
	rfc_dc0510037180-impurity: entropy,
	rfc_dc0510037180-maxDep...


**Références :**  
[Documentation Spark](https://spark.apache.org/docs/3.0.0/index.html)