## Regression Pipeline and model tuning

In [0]:
df = spark.read.format('parquet').load("/databricks-datasets/definitive-guide/data/regression")
df.show()

+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  2.0|
| [2.0,1.1,1.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
| [2.0,4.1,1.0]|  2.0|
+--------------+-----+



### 1. Linear Regression
#### 1.1 Estimator is a model

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [0]:
# for pipeline, just 2 modification: featuresCol='features_2' and .setEstimator(pipe)
lr = LinearRegression(featuresCol='features', labelCol="label") # .setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
# Controls regularization to prevent overfitting. You should test out different values for the regularization parameter to find the optimal value for your problem. The default is 0.1.


print(lr.explainParams())

# https://spark.apache.org/docs/1.5.2/ml-linear-methods.html

# regParam : Regularization works by adding a penalty term to the loss function of the model during training, which discourages the model from fitting the training data too closely

# elasticNetParam corresponds to α and regParam corresponds to λ : if a linear regression model is trained with the elastic net parameter α set to 1, it is equivalent to a Lasso model. On the other hand, if α is set to 0, the trained model reduces to a ridge regression model.

params = (ParamGridBuilder()
         .addGrid(lr.regParam, [ 0, 0.01, 0.1, 1.0])
         .addGrid(lr.elasticNetParam, [0, 0.5, 1])
         .addGrid(lr.maxIter, [10, 100, 200])
         .build()) # pay attention you must write .build()

params

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of itera

In [0]:
lr.fit(df)

lr.fit(df).transform(df).show() # juste pour montrer que le nom de colonne est bien "prediction"

+--------------+-----+-------------------+
|      features|label|         prediction|
+--------------+-----+-------------------+
|[3.0,10.1,3.0]|  2.0| 2.2944785276073647|
| [2.0,1.1,1.0]|  1.0|    1.1963190184049|
|[1.0,0.1,-1.0]|  0.0|0.14723926380368363|
|[1.0,0.1,-1.0]|  0.0|0.14723926380368363|
| [2.0,4.1,1.0]|  2.0| 1.2147239263803682|
+--------------+-----+-------------------+



In [0]:
evaluator = (RegressionEvaluator()
             .setMetricName("rmse") # there is no trick that could be used to extract more than one metric, you can also use RegressionMetrics (below) directly
             .setPredictionCol("prediction")
             .setLabelCol("label")
            )

In [0]:
cv = (CrossValidator() # parallelism=4
#       .setEstimator(lr)
      .setEstimator(pip)
      .setEstimatorParamMaps(params)
      .setEvaluator(evaluator)
      .setNumFolds(3) # ne peut pas mettre 5 cela pose prob aux algo arbres
     )

lrModel = cv.fit(df) # lrModel is pipline version

lrM = cv.fit(df) # lrM is model version
best_M = lrModel.bestModel
bestM = lrM.bestModel




**Evaluation of models** <br/>
**Get information about best model** <br/>
**Une fois tuning, comment on sait les valeurs de hyperparamètres plus performant**

In [0]:
# Quand Estimator est un model

summary = best_M.summary
summary. # autocompletion

print(lrM.avgMetrics)

best_M.extractParamMap()

best_M._java_obj.getRegParam()
best_M._java_obj.getElasticNetParam()
best_M._java_obj.getMaxIter()



[0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7633967808175365, 0.7694086147401896, 0.7694086147401896, 0.7694086147401896, 0.7706423662550973, 0.7706423662550973, 0.7706423662550973, 0.7717036897194441, 0.7717036897194441, 0.7717036897194441, 0.8071258046048405, 0.8071258046048405, 0.8071258046048405, 0.830129141021405, 0.830129141021405, 0.830129141021405, 0.8464658722622808, 0.8464658722622808, 0.8464658722622808, 0.9783930063824496, 0.9783930063824496, 0.9783930063824496, 1.2608692451675465, 1.2608692451675465, 1.2608692451675465, 1.410683602522959, 1.410683602522959, 1.410683602522959]


#### 1.2 Estimator is a Pipeline

In [0]:
from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features").setOutputCol("features_2")

sScaler.fit(df).transform(df).show(5, False)

+--------------+-----+-------------------------------------------------------------+
|features      |label|features_2                                                   |
+--------------+-----+-------------------------------------------------------------+
|[3.0,10.1,3.0]|2.0  |[3.5856858280031805,2.3805928299947103,1.7928429140015902]   |
|[2.0,1.1,1.0] |1.0  |[2.390457218668787,0.25927248643506745,0.5976143046671968]   |
|[1.0,0.1,-1.0]|0.0  |[1.1952286093343936,0.023570226039551587,-0.5976143046671968]|
|[1.0,0.1,-1.0]|0.0  |[1.1952286093343936,0.023570226039551587,-0.5976143046671968]|
|[2.0,4.1,1.0] |2.0  |[2.390457218668787,0.966379267621615,0.5976143046671968]     |
+--------------+-----+-------------------------------------------------------------+



In [0]:
from pyspark.ml.pipeline import Pipeline
pip = Pipeline().setStages([sScaler, lr])

pip = Pipeline().setStages([sScaler, glr])

In [0]:
# when Estimator is a Pipeline

# plus simple et moins elegant, quand Estimator est un Pipeline ou un model

lrModel.avgMetrics
list(zip(lrModel.avgMetrics, params))

Out[91]: [(0.7633967808175326,
  {Param(parent='GBTRegressor_35e6fb3c0032', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5,
   Param(parent='GBTRegressor_35e6fb3c0032', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.01}),
 (0.7633967808175326,
  {Param(parent='GBTRegressor_35e6fb3c0032', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5,
   Param(parent='GBTRegressor_35e6fb3c0032', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.1}),
 (0.7633967808175326,
  {Param(parent='GBTRegressor_35e6fb3c0032', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 

In [0]:
# when Estimator is a Pipeline (Optional)
from pyspark.mllib.evaluation import RegressionMetrics
out = (bestModel.transform(df)
       .select("prediction", "label").rdd.map(lambda x:(float(x[0]), float(x[1])))
      )

# how to show a Rdd
dataColl =out.collect()
for row in dataColl:
    print(row[0] , row[1])

metrics = RegressionMetrics(out)
print(metrics.meanSquaredError)
print(metrics.rootMeanSquaredError)
print(metrics.r2)
print(metrics.meanAbsoluteError)
print(metrics.explainedVariance)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-1550011534196551>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# when Estimator is a Pipeline (Optional)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;32mfrom[0m [0mpyspark[0m[0;34m.[0m[0mmllib[0m[0;34m.[0m[0mevaluation[0m [0;32mimport[0m [0mRegressionMetrics[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m out = (bestModel.transform(df)
[0m[1;32m      4[0m        [0;34m.[0m[0mselect[0m[0;34m([0m[0;34m"prediction"[0m[0;34m,[0m [0;34m"label"[0m[0;34m)[0m[0;34m.[0m[0mrdd[0m[0;34m.[0m[0mmap[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m[0;34m([0m[0mfloat[0m[0;34m([0m[0mx[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mfloat[0m[0;34m([0m[0mx[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;

#### 1.3 Persisting and applying model (Pipeline version)

In [0]:
# Persisting
model_path = '/tmp/mllib-persistence-example'
best_M.write().overwrite().save(model_path + '/lr_model')

In [0]:
# loading and applying
from pyspark.ml.pipeline import PipelineModel
lr_model_loaded = PipelineModel.load(model_path + '/lr_model')

lr_model_loaded

lr_model_loaded.transform(df).show()

+--------------+-----+--------------------+-------------------+
|      features|label|          features_2|         prediction|
+--------------+-----+--------------------+-------------------+
|[3.0,10.1,3.0]|  2.0|[3.58568582800318...| 2.2944785276073647|
| [2.0,1.1,1.0]|  1.0|[2.39045721866878...|    1.1963190184049|
|[1.0,0.1,-1.0]|  0.0|[1.19522860933439...|0.14723926380368363|
|[1.0,0.1,-1.0]|  0.0|[1.19522860933439...|0.14723926380368363|
| [2.0,4.1,1.0]|  2.0|[2.39045721866878...| 1.2147239263803682|
+--------------+-----+--------------------+-------------------+



Persisting and applying model (model version)

In [0]:
bestM.write().overwrite().save(model_path + '/lr_model_model_version')

In [0]:
# loading and applying
from pyspark.ml.regression import LinearRegressionModel 
lr_model_loaded_mv = LinearRegressionModel.load(model_path + '/lr_model_model_version')

lr_model_loaded_mv

lr_model_loaded_mv.transform(df).show()

+--------------+-----+-------------------+
|      features|label|         prediction|
+--------------+-----+-------------------+
|[3.0,10.1,3.0]|  2.0| 2.2944785276073647|
| [2.0,1.1,1.0]|  1.0|    1.1963190184049|
|[1.0,0.1,-1.0]|  0.0|0.14723926380368363|
|[1.0,0.1,-1.0]|  0.0|0.14723926380368363|
| [2.0,4.1,1.0]|  2.0| 1.2147239263803682|
+--------------+-----+-------------------+



### Other models : glr, rf, boosting

In [0]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = (GeneralizedLinearRegression(featuresCol="features_2", labelCol="label")
       .setFamily("gaussian")
       .setLink("identity")
      )
pipeline = Pipeline().setStages([sScaler, glr])
params = (ParamGridBuilder()
          .addGrid(glr.regParam, [0, 0.5, 1])
          .addGrid(glr.maxIter, [5, 10]) # la possibilite de tuning plusieurs params
          .build()
         )
params

# Link function, who maps a non-linear relationship to a linear one, which means you can fit a linear model to the data
       # For the linear regression model, the link function is called the identity link function, because no transformation is needed to get from the linear regression parameters on the right-hand side of the equation to the normal distribution.

Out[86]: [{Param(parent='GeneralizedLinearRegression_26274df8e630', name='regParam', doc='regularization parameter (>= 0).'): 0.0,
  Param(parent='GeneralizedLinearRegression_26274df8e630', name='maxIter', doc='max number of iterations (>= 0).'): 5},
 {Param(parent='GeneralizedLinearRegression_26274df8e630', name='regParam', doc='regularization parameter (>= 0).'): 0.0,
  Param(parent='GeneralizedLinearRegression_26274df8e630', name='maxIter', doc='max number of iterations (>= 0).'): 10},
 {Param(parent='GeneralizedLinearRegression_26274df8e630', name='regParam', doc='regularization parameter (>= 0).'): 0.5,
  Param(parent='GeneralizedLinearRegression_26274df8e630', name='maxIter', doc='max number of iterations (>= 0).'): 5},
 {Param(parent='GeneralizedLinearRegression_26274df8e630', name='regParam', doc='regularization parameter (>= 0).'): 0.5,
  Param(parent='GeneralizedLinearRegression_26274df8e630', name='maxIter', doc='max number of iterations (>= 0).'): 10},
 {Param(parent='Gener

In [0]:
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol='features_2')
print(rf.explainParams())

pipeline = Pipeline().setStages([sScaler, rf])
params = (ParamGridBuilder().addGrid(rf.numTrees, [10, 20, 50, 100])
          .addGrid(rf.maxDepth, [5, 10])
#           .addGrid(rf.maxBins, [32, 64])
          .build()
         )

params

bootstrap: Whether bootstrap samples are used when building trees. (default: True)
cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the featur

In [0]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol='features_2')
print(gbt.explainParams())

pipeline = Pipeline().setStages([sScaler, gbt])
params = (ParamGridBuilder()
          .addGrid(gbt.maxDepth, [5, 10])
          .addGrid(gbt.stepSize, [0.01, 0.1, 0.2])
          .build()
         )

params

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 

In [0]:
# la partie suivante est optionnelle

import os
os.getcwd()
# os.chdir('/mllib-persistence-example/model_regression') # this does not work, we can acesse /tmp but not 
os.listdir()

In [0]:
dbutils.fs.cp("/tmp/mllib-persistence-example/model_regression", '/FileStore/my-model/model_regression', True)


Out[76]: True

In [0]:
dbutils.fs.cp('/FileStore/my-model', '/tmp/mllib-persistence-example/model_4', True)

dbutils.fs.ls("FileStore/my-model/")
dbutils.fs.ls("/tmp/mllib-persistence-example/")

dbutils.fs.rm("/tmp/mllib-persistence-example/model_4", True)

dbutils.fs.rm('/FileStore/my-model/stages',True)

Out[83]: [FileInfo(path='dbfs:/FileStore/my-model/model_regression/', name='model_regression/', size=0, modificationTime=0)]