## Package ml-regression (org.apache.spark.ml.regression)

L'objectif est de faire une présentation de la librairie de régression de Spark ML (Scala).

**Note :** Je mets plutôt l'accent sur la démarche et non sur la recherche d'un meilleur modèle.

## 1. Préparation des données

In [2]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.RegressionEvaluator

// Charger les données
val dfmOzone = spark.read.option("header",  true)
                   .option("inferSchema",  true)
                   .option("delimiter", " ").csv("../data/ozone.txt")
dfmOzone.show(3)

// Transformer les «strings» en variables catégorielles
val ventIndexer= new StringIndexer().setInputCol("vent").setOutputCol("ventIndexed")
val pluieIndexer= new StringIndexer().setInputCol("pluie").setOutputCol("pluieIndexed")

val features = Array("T9", "T12", "T15", "Ne9", "Ne12", "Ne15", "Vx9", 
                     "Vx12", "Vx15", "maxO3v", "ventIndexed", "pluieIndexed")
val label = "maxO3"
val predCol = "maxO3PredictCol"
val featureName = "features"

// Assembler les features
val assembler = new VectorAssembler().setInputCols(features).setOutputCol(featureName)

// Evaluer la performance des modèles avec le RMSE
val evaluator = new RegressionEvaluator() 
                  .setLabelCol(label) 
                  .setPredictionCol(predCol) 
                  .setMetricName("rmse")

// Séparer les données en train et test
val Array(training, test) = dfmOzone.randomSplit(Array(0.8, 0.2), seed = 34512)

+--------+-----+----+----+----+---+----+----+-------+-------+-------+------+----+-----+
|      id|maxO3|  T9| T12| T15|Ne9|Ne12|Ne15|    Vx9|   Vx12|   Vx15|maxO3v|vent|pluie|
+--------+-----+----+----+----+---+----+----+-------+-------+-------+------+----+-----+
|20010601|   87|15.6|18.5|18.4|  4|   4|   8| 0.6946|-1.7101|-0.6946|    84|Nord|  Sec|
|20010602|   82|17.0|18.4|17.7|  5|   5|   7|-4.3301|   -4.0|   -3.0|    87|Nord|  Sec|
|20010603|   92|15.3|17.6|19.5|  2|   5|   4| 2.9544| 1.8794| 0.5209|    82| Est|  Sec|
+--------+-----+----+----+----+---+----+----+-------+-------+-------+------+----+-----+
only showing top 3 rows



import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.RegressionEvaluator
dfmOzone: org.apache.spark.sql.DataFrame = [id: int, maxO3: int ... 12 more fields]
ventIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_49bbbeb16e27
pluieIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_07e1c79e214a
features: Array[String] = Array(T9, T12, T15, Ne9, Ne12, Ne15, Vx9, Vx12, Vx15, maxO3v, ventIndexed, pluieIndexed)
label: String = maxO3
predCol: String = maxO3PredictCol
featureName: String = features
assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: ui...


## 2. Tests d'algorithmes de régression

### A. Régression linéaire

In [3]:
import org.apache.spark.ml.regression.LinearRegression

// Regression linéaire
val lr = new LinearRegression()
            .setLabelCol(label)
            .setFeaturesCol(featureName)
            .setPredictionCol(predCol)

// Grille de recherche pour trouver le meilleur modèle
val paramGrid = new ParamGridBuilder()
  .addGrid(lr.maxIter, Array(50, 100))
  .addGrid(lr.regParam,  Array(0.1, 0.3, 0.7))
  .build()

// Pipeline Préprocessing + Modélisation
val pipeline = new Pipeline().setStages(
    Array(ventIndexer, pluieIndexer, assembler, lr))

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)
  .setParallelism(2)

// Fitting du modèle
val model = pipeline.fit(training)

// Prediction en test
val predictions = model.transform(test)

predictions.select(label, predCol).show(5)

// Calcule du RMSE sur le test
val rmse = evaluator.evaluate(predictions)

print(s"RMSE = ${rmse}")

+-----+-----------------+
|maxO3|  maxO3PredictCol|
+-----+-----------------+
|   79|64.43351057025137|
|  101| 87.9941163334745|
|   57|77.77178041641294|
|   71|68.31067713916936|
|   56|66.03368854566439|
+-----+-----------------+
only showing top 5 rows

RMSE = 16.88460042937964

import org.apache.spark.ml.regression.LinearRegression
lr: org.apache.spark.ml.regression.LinearRegression = linReg_0e4699bf5c49
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linReg_0e4699bf5c49-maxIter: 50,
	linReg_0e4699bf5c49-regParam: 0.1
}, {
	linReg_0e4699bf5c49-maxIter: 50,
	linReg_0e4699bf5c49-regParam: 0.3
}, {
	linReg_0e4699bf5c49-maxIter: 50,
	linReg_0e4699bf5c49-regParam: 0.7
}, {
	linReg_0e4699bf5c49-maxIter: 100,
	linReg_0e4699bf5c49-regParam: 0.1
}, {
	linReg_0e4699bf5c49-maxIter: 100,
	linReg_0e4699bf5c49-regParam: 0.3
}, {
	linReg_0e4699bf5c49-maxIter: 100,
	linReg_0e4699bf5c49-regParam: 0.7
})
pipeline: org.apache.spark.ml.Pipeline = pipeline_3da9d3a9747b
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tv...


### B. Arbre de décision

In [4]:
import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}

// Arbre de décision
val tree = new DecisionTreeRegressor()
            .setLabelCol(label)
            .setFeaturesCol(featureName)
            .setPredictionCol(predCol)

// Grille de recherche pour trouver le meilleur modèle
val paramGrid = new ParamGridBuilder()
  .addGrid(tree.maxDepth, Array(7, 10))
  .addGrid(tree.minInstancesPerNode, Array(3, 5))
  .build()

// Pipeline Préprocessing + Modélisation
val pipeline = new Pipeline().setStages(
    Array(ventIndexer, pluieIndexer, assembler, tree))

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)
  .setParallelism(2)

// Fitting du modèle
val model = trainValidationSplit.fit(training)

// Prediction en test
val predictions = model.transform(test)

// Calcule du RMSE sur le test
val rmse = evaluator.evaluate(predictions)

print(s"RMSE = ${rmse}\n")

predictions.select(label, predCol).show(6)

// Afficher l'arbre de décision
val treeModel = model.bestModel
                     .asInstanceOf[PipelineModel]
                     .stages
                     .last
                     .asInstanceOf[DecisionTreeRegressionModel]
println(s"Regles de prediction :\n ${treeModel.toDebugString}")

RMSE = 20.85954457796239
+-----+---------------+
|maxO3|maxO3PredictCol|
+-----+---------------+
|   79|           73.6|
|  101|           81.0|
|   57|           78.0|
|   71|          105.0|
|   56|           57.2|
|   67|           61.0|
+-----+---------------+
only showing top 6 rows

Regles de prediction :
 DecisionTreeRegressionModel: uid=dtr_a346119917b6, depth=9, numNodes=47, numFeatures=12
  If (feature 0 <= 20.9)
   If (feature 9 <= 91.0)
    If (feature 4 <= 4.5)
     If (feature 10 in {1.0,2.0})
      Predict: 81.0
     Else (feature 10 not in {1.0,2.0})
      Predict: 105.0
    Else (feature 4 > 4.5)
     If (feature 11 in {1.0})
      If (feature 2 <= 21.85)
       If (feature 3 <= 7.5)
        If (feature 5 <= 3.5)
         Predict: 73.0
        Else (feature 5 > 3.5)
         If (feature 5 <= 7.5)
          If (feature 8 <= -4.962)
           Predict: 64.5
          Else (feature 8 > -4.962)
           Predict: 61.0
         Else (feature 5 > 7.5)
          Predict: 69.

import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}
tree: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_a346119917b6
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	dtr_a346119917b6-maxDepth: 7,
	dtr_a346119917b6-minInstancesPerNode: 3
}, {
	dtr_a346119917b6-maxDepth: 10,
	dtr_a346119917b6-minInstancesPerNode: 3
}, {
	dtr_a346119917b6-maxDepth: 7,
	dtr_a346119917b6-minInstancesPerNode: 5
}, {
	dtr_a346119917b6-maxDepth: 10,
	dtr_a346119917b6-minInstancesPerNode: 5
})
pipeline: org.apache.spark.ml.Pipeline = pipeline_58f48336124f
trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_3bd8307310f6
model: org.apache.spark.ml.tuning.TrainValidationSplitModel = TrainValidationSplitModel...


### C. Forêts aléatoires

In [5]:
import org.apache.spark.ml.regression.RandomForestRegressor

// Random Forest
val rf = new RandomForestRegressor()
            .setLabelCol(label)
            .setFeaturesCol(featureName)
            .setPredictionCol(predCol)

// Grille de recherche pour trouver le meilleur modèle
val paramGrid = new ParamGridBuilder()
  .addGrid(rf.maxDepth, Array(7, 12))
  .addGrid(rf.minInstancesPerNode, Array(3, 5))
  .addGrid(rf.numTrees, Array(30, 50))
  .addGrid(rf.subsamplingRate, Array(0.6, 0.8))
  .addGrid(rf.featureSubsetStrategy, Array("sqrt", "log2"))
  .build()

// Pipeline Préprocessing + Modélisation
val pipeline = new Pipeline().setStages(
    Array(ventIndexer, pluieIndexer, assembler, rf))

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)
  .setParallelism(2)

// Fitting du modèle
val model = trainValidationSplit.fit(training)

// Prediction en test
val predictions = model.transform(test)

// Calcule du RMSE sur le test
val rmse = evaluator.evaluate(predictions)

print(s"RMSE = ${rmse}\n")

predictions.select(label, predCol).show(6)

RMSE = 16.636849682565877
+-----+-----------------+
|maxO3|  maxO3PredictCol|
+-----+-----------------+
|   79|73.41388888888888|
|  101|84.87833333333334|
|   57|86.22619047619047|
|   71|75.96499999999999|
|   56| 72.4838888888889|
|   67|68.92888888888889|
+-----+-----------------+
only showing top 6 rows



import org.apache.spark.ml.regression.RandomForestRegressor
rf: org.apache.spark.ml.regression.RandomForestRegressor = rfr_e5ab9f043193
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	rfr_e5ab9f043193-featureSubsetStrategy: sqrt,
	rfr_e5ab9f043193-maxDepth: 7,
	rfr_e5ab9f043193-minInstancesPerNode: 3,
	rfr_e5ab9f043193-numTrees: 30,
	rfr_e5ab9f043193-subsamplingRate: 0.6
}, {
	rfr_e5ab9f043193-featureSubsetStrategy: sqrt,
	rfr_e5ab9f043193-maxDepth: 7,
	rfr_e5ab9f043193-minInstancesPerNode: 3,
	rfr_e5ab9f043193-numTrees: 30,
	rfr_e5ab9f043193-subsamplingRate: 0.8
}, {
	rfr_e5ab9f043193-featureSubsetStrategy: log2,
	rfr_e5ab9f043193-maxDepth: 7,
	rfr_e5ab9f043193-minInstancesPerNode: 3,
	rfr_e5ab9f043193-numTrees: 30,
	rfr_e5ab9f043193-subsamplingRate:...


### D. Gradient Boosting Trees

In [6]:
import org.apache.spark.ml.regression.GBTRegressor

// Gradient Boosting Trees
val gbt = new GBTRegressor()
            .setLabelCol(label)
            .setFeaturesCol(featureName)
            .setPredictionCol(predCol)

// Grille de recherche pour trouver le meilleur modèle
val paramGrid = new ParamGridBuilder()
  .addGrid(gbt.maxDepth, Array(5, 8, 12))
  .addGrid(gbt.minInstancesPerNode, Array(3, 5))
  .addGrid(gbt.maxIter, Array(20, 30, 50))   
  .addGrid(gbt.subsamplingRate, Array(0.6, 0.8))   
  .build()

// Pipeline Préprocessing + Modélisation
val pipeline = new Pipeline().setStages(
    Array(ventIndexer, pluieIndexer, assembler, gbt))

val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)
  .setParallelism(2)

// Fitting du modèle
val model = trainValidationSplit.fit(training)

// Prediction en test
val predictions = model.transform(test)

// Calcul du RMSE sur le test
val rmse = evaluator.evaluate(predictions)

print(s"RMSE = ${rmse}\n")

predictions.select(label, predCol).show(6)

RMSE = 25.993742739712264
+-----+-----------------+
|maxO3|  maxO3PredictCol|
+-----+-----------------+
|   79|75.48334680642589|
|  101|82.07077008491203|
|   57|72.19845484060436|
|   71|71.40835345982528|
|   56|53.40715241951024|
|   67| 60.1553979926048|
+-----+-----------------+
only showing top 6 rows



import org.apache.spark.ml.regression.GBTRegressor
gbt: org.apache.spark.ml.regression.GBTRegressor = gbtr_b4a185598b7c
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	gbtr_b4a185598b7c-maxDepth: 5,
	gbtr_b4a185598b7c-maxIter: 20,
	gbtr_b4a185598b7c-minInstancesPerNode: 3,
	gbtr_b4a185598b7c-subsamplingRate: 0.6
}, {
	gbtr_b4a185598b7c-maxDepth: 5,
	gbtr_b4a185598b7c-maxIter: 20,
	gbtr_b4a185598b7c-minInstancesPerNode: 5,
	gbtr_b4a185598b7c-subsamplingRate: 0.6
}, {
	gbtr_b4a185598b7c-maxDepth: 8,
	gbtr_b4a185598b7c-maxIter: 20,
	gbtr_b4a185598b7c-minInstancesPerNode: 3,
	gbtr_b4a185598b7c-subsamplingRate: 0.6
}, {
	gbtr_b4a185598b7c-maxDepth: 8,
	gbtr_b4a185598b7c-maxIter: 20,
	gbtr_b4a185598b7c-minInstancesPerNode: 5,
	gbtr_b4a185598b7c-subsamplin...


**Références :**  
[Documentation Spark](https://spark.apache.org/docs/3.0.0/index.html)