# Chapter 10: Machine Learning with MLlib - Hyperparameter Tuning and Tree-Based Methods
Christoph Windheuser    
July, 2022   
Python examples of chapter 10 (page 307 ff) in the book *Learning Spark*

In [30]:
# Import required python spark libraries
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
# Create (or reuse) the SparkSession used by every cell below.
# NOTE(review): the original author reports that this step requires internet access
# and throws an error when executed offline — not verified here.
from pyspark.sql import SparkSession  # explicit import; do not rely on the star-imports to provide it

spark = (
    SparkSession.builder
    .appName("Chapter_10")
    .getOrCreate()
)


# Reading and Preparing the Data

In [6]:
# Load the cleaned SF Airbnb listings from parquet
filePath = "../DB_Spark/LearningSparkV2/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)

# 80/20 train/test split, seeded with 42
trainDF, testDF = airbnbDF.randomSplit(weights=[0.8, 0.2], seed=42)

# Count both splits (training set first), then report them
numTrainRows = trainDF.count()
numTestRows = testDF.count()
print(f"There are {numTrainRows} rows in the training set, and {numTestRows} rows in the test set.")


There are 5780 rows in the training set, and 1366 rows in the test set.


In [16]:
# Every string-typed column is treated as categorical and gets an "<name>Index" output column.
categoricalCols = [name for name, dtype in trainDF.dtypes if dtype == "string"]
indexOutputCols = [f"{name}Index" for name in categoricalCols]

# No one-hot-encoded output columns are defined here; only the index columns are produced.
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)


# Build a Decision Tree Model

In [7]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor with "price" as the label column;
# every other parameter is left at its default at this point.
dt = DecisionTreeRegressor(
    labelCol="price",
)


In [9]:
# Keep only the numeric ("double") columns and exclude "price", which is our label.
# Logical `and` is used (not bitwise `&`) because both operands are plain Python booleans.
numericCols = [
    field
    for (field, dataType) in trainDF.dtypes
    if dataType == "double" and field != "price"
]


In [13]:
# Feature vector layout: the indexed categorical columns (StringIndexer outputs)
# come first, followed by the numeric columns.
assemblerInputs = [*indexOutputCols, *numericCols]
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)


In [17]:
# Pipeline order: index the categorical columns, assemble the feature vector,
# then train the decision tree.
stages = [
    stringIndexer,
    vecAssembler,
    dt,
]
pipeline = Pipeline(stages=stages)


### Tuning the Hyperparameter MaxBins
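Without the call `dt.setMaxBins(40)`, training would throw an error because the default `maxBins = 32` is too small: `maxBins` must be at least as large as the number of distinct values of every categorical feature. Setting `maxBins` to 40 resolves this problem.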

In [20]:
# Raise maxBins on the decision tree first: without this call, fit() throws an error.
dt.setMaxBins(40)

# Run the training: fit all pipeline stages on the training set.
pipelineModel = pipeline.fit(trainDF)

### Let us inspect the learned model and its if-then-else rules

In [21]:
# The decision tree is the last pipeline stage, so its fitted model is the
# last entry of the fitted pipeline's stages.
fittedStages = pipelineModel.stages
dtModel = fittedStages[-1]

# Print the learned tree as nested if/else rules
print(dtModel.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_0a4c13755898, depth=5, numNodes=47, numFeatures=33
  If (feature 12 <= 2.5)
   If (feature 12 <= 1.5)
    If (feature 5 in {1.0,2.0})
     If (feature 4 in {0.0,1.0,3.0,5.0,9.0,10.0,11.0,13.0,14.0,16.0,18.0,24.0})
      If (feature 3 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0})
       Predict: 104.23992784125075
      Else (feature 3 not in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0})
       Predict: 250.7111111111111
     Else (feature 4 not in {0.0,1.0,3.0,5.0,9.0,10.0,11.0,13.0,14.0,16.0,18.0,24.0})
      If (feature 3 in {0.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,27.0,33.0,35.0})
       Predict: 151.94179894179894
      Else (feat

### Extract the Feature Importance Scores to find the most important features

In [27]:
# Pair each assembler input column with its importance score from the fitted tree
importancePairs = zip(vecAssembler.getInputCols(), dtModel.featureImportances)
featureImp = pd.DataFrame(list(importancePairs), columns=["feature", "importance"])

# Most important features first
featureImp.sort_values("importance", ascending=False)


Unnamed: 0,feature,importance
12,bedrooms,0.283406
1,cancellation_policyIndex,0.167893
2,instant_bookableIndex,0.140081
4,property_typeIndex,0.128179
15,number_of_reviews,0.126233
3,neighbourhood_cleansedIndex,0.0562
9,longitude,0.03881
14,minimum_nights,0.029473
13,beds,0.015218
5,room_typeIndex,0.010905


### Calculating RMSE and R<sup>2</sup> for the Decision Tree Model

In [29]:
# Apply the fitted pipeline to the test set and show actual vs. predicted price
predDF = pipelineModel.transform(testDF)
predDF.select(["price", "prediction"]).show(n=10)


+------+------------------+
| price|        prediction|
+------+------------------+
|  85.0|131.96658097686375|
|  45.0|104.23992784125075|
|  70.0|104.23992784125075|
| 128.0|104.23992784125075|
| 159.0|104.23992784125075|
| 250.0| 290.8357933579336|
|  99.0| 205.5814889336016|
|  95.0|131.96658097686375|
| 100.0|104.23992784125075|
|2010.0| 205.5814889336016|
+------+------------------+
only showing top 10 rows



In [31]:
# Root-mean-squared error of the predictions against the true price
regressionEvaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse",
)

rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")


RMSE is 385.9


In [32]:
# R^2 of the predictions against the true price (same columns as for RMSE, different metric)
regressionEvaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="r2",
)

r2 = regressionEvaluator.evaluate(predDF)
print(f"R^2 is {r2}")

R^2 is -1.5696388432265533


### Results
The Decision Tree model performs worse than the linear regression models and even worse than our baseline model based on the average:    

Model | RMSE <br/> *(lower is better)* | R<sup>2</sup> <br/> *(higher is better)*
:---|:---|:---
Baseline (avg.) | 240.8 | -0.001
linear price    | 220.6 | 0.160
log price       | 208.2 | 0.252
Decision Tree   | 385.9 | -1.570


# Random Forests
Page 313

In [33]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest on the same "price" label; maxBins is set to 40 up front
# and the seed is fixed at 42.
rf = RandomForestRegressor(
    labelCol="price",
    maxBins=40,
    seed=42,
)


In [34]:
# Same preprocessing stages as before, now ending in the random forest.
# Note: this rebinds the name `pipeline`.
pipeline = Pipeline(
    stages=[
        stringIndexer,
        vecAssembler,
        rf,
    ]
)

In [35]:
from pyspark.ml.tuning import ParamGridBuilder

# Search grid: 3 values of maxDepth x 2 values of numTrees
gridBuilder = ParamGridBuilder()
gridBuilder = gridBuilder.addGrid(rf.maxDepth, [2, 4, 6])
gridBuilder = gridBuilder.addGrid(rf.numTrees, [10, 100])
paramGrid = gridBuilder.build()


In [36]:
# RMSE evaluator comparing the "prediction" column against the "price" label
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="price",
)


### Training our model with k-fold cross-validation

In [41]:
from pyspark.ml.tuning import CrossValidator
import time

# 3-fold cross-validation of the whole pipeline over every parameter map in
# paramGrid, scored with `evaluator` and seeded with 42.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    seed=42,
)

# Measure the elapsed time with the monotonic perf_counter clock; time.time() is
# wall-clock time and can jump if the system clock is adjusted during the run.
start_time = time.perf_counter()
cvModel = cv.fit(trainDF)
print("--- Command took %s seconds ---" % (time.perf_counter() - start_time))


--- 22.146804809570312 seconds ---


### Inspect the Training Results

In [39]:
list(zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics))

[({Param(parent='RandomForestRegressor_dc8c089f32e3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2,
   Param(parent='RandomForestRegressor_dc8c089f32e3', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  291.18226409247836),
 ({Param(parent='RandomForestRegressor_dc8c089f32e3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2,
   Param(parent='RandomForestRegressor_dc8c089f32e3', name='numTrees', doc='Number of trees to train (>= 1).'): 100},
  286.7714750274078),
 ({Param(parent='RandomForestRegressor_dc8c089f32e3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4,
   Param(parent='RandomForestRegressor_dc8c089f32e3', nam

### Parallelize Training in Spark
Page 320

In [42]:
# Re-run the same cross-validation with parallelism set to 4.
# setParallelism() updates the existing `cv` object before fit() is called on it.
# Elapsed time uses the monotonic perf_counter clock rather than wall-clock time.time().
start_time = time.perf_counter()
cvModel    = cv.setParallelism(4).fit(trainDF)
print("--- Command took %s seconds ---" % (time.perf_counter() - start_time))


--- Command took 13.251630067825317 seconds ---


### Further Improve Performance by putting the cross-validator inside the pipeline

In [44]:
# Here the cross-validator wraps only the random forest and becomes the last
# pipeline stage, instead of wrapping the whole pipeline as before.
cv = CrossValidator(
    estimator=rf,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    parallelism=4,
    seed=42,
)

# Note: this rebinds `pipeline` (and `pipelineModel` below) from the earlier cells.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, cv])

# Elapsed time uses the monotonic perf_counter clock rather than wall-clock time.time().
start_time    = time.perf_counter()
pipelineModel = pipeline.fit(trainDF)
print("--- Command took %s seconds ---" % (time.perf_counter() - start_time))


--- Command took 11.37712812423706 seconds ---
