In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Ch04").getOrCreate()
spark.conf.set("spark.driver.memory", "4g")
sc = spark.sparkContext

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

In [3]:
colNames = ["Elevation", "Aspect", "Slope",
"Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
"Horizontal_Distance_To_Roadways",
"Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
"Horizontal_Distance_To_Fire_Points"]
for i in range(4):
    colNames += ["Wilderness_Area_"+str(i),]
for i in range(40):
    colNames += ["Soil_Type_"+str(i),]
colNames += ["Cover_Type",]

In [4]:
schema = StructType()
for name in colNames:
    if name == "Cover_Type":
        schema.add(StructField(name, DoubleType(), True))
    else:
        schema.add(StructField(name, IntegerType(), True))

In [5]:
data = spark.read.csv("covtype.data", header=False, schema=schema)

In [6]:
data.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_0: integer (nullable = true)
 |-- Wilderness_Area_1: integer (nullable = true)
 |-- Wilderness_Area_2: integer (nullable = true)
 |-- Wilderness_Area_3: integer (nullable = true)
 |-- Soil_Type_0: integer (nullable = true)
 |-- Soil_Type_1: integer (nullable = true)
 |-- Soil_Type_2: integer (nullable = true)
 |-- Soil_Type_3: integer (nullable = true)
 |-- Soil_Type_4: integer (nullable = true)
 |-- Soil_Type_5: integer (nullable = true)
 |-- Soil_Type

In [7]:
data.take(1)

[Row(Elevation=2596, Aspect=51, Slope=3, Horizontal_Distance_To_Hydrology=258, Vertical_Distance_To_Hydrology=0, Horizontal_Distance_To_Roadways=510, Hillshade_9am=221, Hillshade_Noon=232, Hillshade_3pm=148, Horizontal_Distance_To_Fire_Points=6279, Wilderness_Area_0=1, Wilderness_Area_1=0, Wilderness_Area_2=0, Wilderness_Area_3=0, Soil_Type_0=0, Soil_Type_1=0, Soil_Type_2=0, Soil_Type_3=0, Soil_Type_4=0, Soil_Type_5=0, Soil_Type_6=0, Soil_Type_7=0, Soil_Type_8=0, Soil_Type_9=0, Soil_Type_10=0, Soil_Type_11=0, Soil_Type_12=0, Soil_Type_13=0, Soil_Type_14=0, Soil_Type_15=0, Soil_Type_16=0, Soil_Type_17=0, Soil_Type_18=0, Soil_Type_19=0, Soil_Type_20=0, Soil_Type_21=0, Soil_Type_22=0, Soil_Type_23=0, Soil_Type_24=0, Soil_Type_25=0, Soil_Type_26=0, Soil_Type_27=0, Soil_Type_28=1, Soil_Type_29=0, Soil_Type_30=0, Soil_Type_31=0, Soil_Type_32=0, Soil_Type_33=0, Soil_Type_34=0, Soil_Type_35=0, Soil_Type_36=0, Soil_Type_37=0, Soil_Type_38=0, Soil_Type_39=0, Cover_Type=5.0)]

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
(trainData, testData) = data.randomSplit([0.9, 0.1])

In [10]:
inputCols = trainData.drop('Cover_Type').columns

In [11]:
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
assembledTrainData = assembler.transform(trainData)
assembledTrainData.select('featureVector').show(truncate=False)

+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,5,6,7,8,9,13,18],[1874.0,18.0,14.0,90.0,208.0,209.0,135.0,793.0,1.0,1.0])                 |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,14],[1889.0,28.0,22.0,150.0,23.0,120.0,205.0,185.0,108.0,759.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1889.0,353.0,30.0,95.0,39.0,67.0,153.0,172.0,146.0,600.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")
model = classifier.fit(assembledTrainData)

print(model.toDebugString)
print(model.featureImportances)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4ea6ade6a1df942d064f) of depth 5 with 63 nodes
  If (feature 0 <= 3035.5)
   If (feature 0 <= 2574.5)
    If (feature 10 <= 0.5)
     If (feature 0 <= 2456.5)
      If (feature 3 <= 15.0)
       Predict: 4.0
      Else (feature 3 > 15.0)
       Predict: 3.0
     Else (feature 0 > 2456.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
    Else (feature 10 > 0.5)
     If (feature 22 <= 0.5)
      If (feature 9 <= 4541.5)
       Predict: 2.0
      Else (feature 9 > 4541.5)
       Predict: 2.0
     Else (feature 22 > 0.5)
      If (feature 5 <= 1021.0)
       Predict: 2.0
      Else (feature 5 > 1021.0)
       Predict: 1.0
   Else (feature 0 > 2574.5)
    If (feature 0 <= 2921.5)
     If (feature 15 <= 0.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
     Else (feature 15 > 0.5)
      If (feature 9 <= 1625.0)
       Predict: 

In [13]:
predictions = model.transform(assembledTrainData)
predictions.select(["Cover_Type", "prediction", "probability"]).show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                    |
+----------+----------+-----------------------------------------------------------------------------------------------+
|6.0       |3.0       |[0.0,0.0,0.03572755208860973,0.6286428257859724,0.04937020902981799,0.0,0.2862594130955999,0.0]|
|6.0       |4.0       |[0.0,0.0,0.0444064901793339,0.28223740392826646,0.4269854824935952,0.0,0.24637062339880444,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03572755208860973,0.6286428257859724,0.04937020902981799,0.0,0.2862594130955999,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03572755208860973,0.6286428257859724,0.04937020902981799,0.0,0.2862594130955999,0.0]|
|6.0       |3.0       |[0.0,0.0,0.03572755208860973,0.6286428257859724,0.04937020902981799,0.0,0.2862594130955999,0.0]|
|6.0       |3.0       |[0.0,0.0,0.035727

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")

In [16]:
evaluator.setMetricName("accuracy").evaluate(predictions)

0.6985440542665465

In [17]:
evaluator.setMetricName("f1").evaluate(predictions)

0.6801324470946156

In [17]:
#### confusion matrix - not supported in PySpark ML library

In [18]:
from pyspark.ml import Pipeline

inputCols = trainData.columns[:-1]
assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")
classifier = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, classifier])

In [22]:
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder()\
    .addGrid(classifier, ["gini", "entropy"])\
    .addGrid(classifier, [1, 20])\
    .addGrid(classifier, [40, 300])\
    .addGrid(classifier, [0.0, 0.05])\
    .build()

In [23]:
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy")
multiclassEval.evaluate(predictions)

0.6985440542665465

In [24]:
from pyspark.ml.tuning import TrainValidationSplit

validator = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9)
validatorModel = validator.fit(trainData)

In [25]:
bestModel = validatorModel.bestModel

In [26]:
bestModel.stages[-1].extractParamMap()

{Param(parent='DecisionTreeClassifier_433abbefc9420553705f', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='DecisionTreeClassifier_433abbefc9420553705f', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='DecisionTreeClassifier_433abbefc9420553705f', name='featuresCol', doc='features column name'): 'featureVector',
 Param(parent='DecisionTreeClassifier_433abbefc9420553705f', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',
 Param(parent='DecisionTreeClassifier_433abbefc9420553705f', na

In [27]:
paramsAndMetrics = validatorModel.validationMetrics
paramsAndMetrics

[0.6981244019138756, 0.6981244019138756]

In [28]:
multiclassEval.evaluate(bestModel.transform(testData))

0.7016941588424106

#### undoing the one-hot encoding

In [29]:
wildernessCols = []
for i in range(4):
    wildernessCols += ["Wilderness_Area_"+str(i),]

In [30]:
wildernessAssembler = VectorAssembler(
    inputCols=wildernessCols,
    outputCol="wilderness")

In [31]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType, StructType

unhotudf = udf(lambda x: float(x.toArray().nonzero()[0]), DoubleType())

In [32]:
withWilderness = wildernessAssembler.transform(data)
withWilderness = withWilderness\
    .drop(*wildernessCols)\
    .withColumn("wilderness", unhotudf(withWilderness['wilderness']))
withWilderness.take(1)

[Row(Elevation=2596, Aspect=51, Slope=3, Horizontal_Distance_To_Hydrology=258, Vertical_Distance_To_Hydrology=0, Horizontal_Distance_To_Roadways=510, Hillshade_9am=221, Hillshade_Noon=232, Hillshade_3pm=148, Horizontal_Distance_To_Fire_Points=6279, Soil_Type_0=0, Soil_Type_1=0, Soil_Type_2=0, Soil_Type_3=0, Soil_Type_4=0, Soil_Type_5=0, Soil_Type_6=0, Soil_Type_7=0, Soil_Type_8=0, Soil_Type_9=0, Soil_Type_10=0, Soil_Type_11=0, Soil_Type_12=0, Soil_Type_13=0, Soil_Type_14=0, Soil_Type_15=0, Soil_Type_16=0, Soil_Type_17=0, Soil_Type_18=0, Soil_Type_19=0, Soil_Type_20=0, Soil_Type_21=0, Soil_Type_22=0, Soil_Type_23=0, Soil_Type_24=0, Soil_Type_25=0, Soil_Type_26=0, Soil_Type_27=0, Soil_Type_28=1, Soil_Type_29=0, Soil_Type_30=0, Soil_Type_31=0, Soil_Type_32=0, Soil_Type_33=0, Soil_Type_34=0, Soil_Type_35=0, Soil_Type_36=0, Soil_Type_37=0, Soil_Type_38=0, Soil_Type_39=0, Cover_Type=5.0, wilderness=0.0)]

In [33]:
soilCols = []
for i in range(40):
    soilCols += ["Soil_Type_"+str(i),]


In [34]:
soilAssembler = VectorAssembler(
        inputCols=soilCols,
        outputCol="soil")

withWilderness = soilAssembler.transform(withWilderness)
unencData = withWilderness\
    .drop(*soilCols)\
    .withColumn("soil", unhotudf(withWilderness['soil']))
unencData.take(1)

[Row(Elevation=2596, Aspect=51, Slope=3, Horizontal_Distance_To_Hydrology=258, Vertical_Distance_To_Hydrology=0, Horizontal_Distance_To_Roadways=510, Hillshade_9am=221, Hillshade_Noon=232, Hillshade_3pm=148, Horizontal_Distance_To_Fire_Points=6279, Cover_Type=5.0, wilderness=0.0, soil=28.0)]

#### Decision Tree Classifier with unencoded data

In [35]:
(unencTrainData, unencTestData) = unencData.randomSplit([0.9, 0.1])

In [36]:
from pyspark.ml.feature import VectorIndexer

inputCols = unencTrainData.drop('Cover_Type').columns
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
indexer = VectorIndexer(
    maxCategories=40,
    inputCol="featureVector",
    outputCol="indexedVector")
classifier = DecisionTreeClassifier(
    seed=42,
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, indexer, classifier])

#### Random Forest Classifier

In [37]:
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(
    seed=42,
    maxBins=40,
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, indexer, classifier])

In [39]:
paramGrid = ParamGridBuilder()\
    .addGrid(classifier.minInfoGain, [0.0, 0.05])\
    .addGrid(classifier.numTrees, [1, 10])\
    .build()

In [40]:
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy")

In [41]:
validator = TrainValidationSplit(
    seed=42,
    estimator=pipeline,
    evaluator=multiclassEval,
    estimatorParamMaps=paramGrid,
    trainRatio=0.9)

In [42]:
%%time
validatorModel = validator.fit(unencTrainData)
bestModel = validatorModel.bestModel
forestModel = bestModel.stages[-1]
print(forestModel.extractParamMap())

{Param(parent='RandomForestClassifier_487d8d8c18a1ce62d9c4', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False, Param(parent='RandomForestClassifier_487d8d8c18a1ce62d9c4', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10, Param(parent='RandomForestClassifier_487d8d8c18a1ce62d9c4', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto', Param(parent='RandomForestClassifier_487d8d8c18a1ce62d9c4', name='featuresCol', doc='features column name'): 'indexedVector', Param(parent='Rando

In [43]:
forestModel.getNumTrees

1

In [44]:
sorted(list(zip(inputCols, forestModel.featureImportances)), key=lambda x: x[1], reverse=True)

[('Elevation', 0.7830407302460989),
 ('soil', 0.11246799802207412),
 ('wilderness', 0.0360537582305858),
 ('Horizontal_Distance_To_Roadways', 0.03567176793162674),
 ('Horizontal_Distance_To_Fire_Points', 0.012947058869423229),
 ('Hillshade_Noon', 0.009230795355062984),
 ('Horizontal_Distance_To_Hydrology', 0.008267537972706721),
 ('Hillshade_3pm', 0.0023203533724215357),
 ('Aspect', 0.0),
 ('Slope', 0.0),
 ('Vertical_Distance_To_Hydrology', 0.0),
 ('Hillshade_9am', 0.0)]

In [45]:
testAccuracy = multiclassEval.evaluate(bestModel.transform(unencTestData))
testAccuracy

0.699871783913155

In [46]:
bestModel.transform(unencTestData.drop("Cover_Type")).show()

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+----------+----+--------------------+--------------------+--------------------+--------------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|wilderness|soil|       featureVector|       indexedVector|       rawPrediction|         probability|prediction|
+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+----------+----+--------------------+--------------------+--------------------+--------------------+----------+
|     1896|   337|   12|                              30|             

In [47]:
from pyspark.ml.classification import LogisticRegression

In [73]:
(trainData, testData) = data.randomSplit([0.9, 0.1])

In [74]:
inputCols = trainData.drop('Cover_Type').columns

In [75]:
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
assembledTrainData = assembler.transform(trainData)
assembledTrainData.select('featureVector').show(truncate=False)

+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,5,6,7,8,9,13,18],[1874.0,18.0,14.0,90.0,208.0,209.0,135.0,793.0,1.0,1.0])                 |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,14],[1889.0,28.0,22.0,150.0,23.0,120.0,205.0,185.0,108.0,759.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175.0,195.0,224.0,168.0,732.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1898.0,34.0,23.0,175.0,56.0,13

In [76]:
inputCols = testData.drop('Cover_Type').columns

In [77]:
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
assembledTestData = assembler.transform(testData)

In [60]:
lr = LogisticRegression(maxIter=10, 
                        regParam=0.3, 
                        elasticNetParam=0.8, 
                        labelCol="Cover_Type",
                        featuresCol="featureVector",
                        predictionCol="prediction")

In [61]:
lrModel = lr.fit(assembledTrainData)

In [62]:
predictions = lrModel.transform(assembledTrainData)
predictions.select(["Cover_Type", "prediction", "probability"]).show(truncate=False)

+----------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                                                                        |
+----------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|6.0       |2.0       |[1.911923718971162E-6,0.3642014745902969,0.487820912763859,0.06165824436851518,0.0047233157125177345,0.016397432599075345,0.029913001696554368,0.03528370634546256]|
|6.0       |2.0       |[1.911923718971162E-6,0.3642014745902969,0.487820912763859,0.06165824436851518,0.0047233157125177345,0.016397432599075345,0.029913001696554368,0.03528370634546256]|
|6.0       |2.0       |[1.911923718971162E-6,0.3642014745902

In [63]:
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")

In [64]:
evaluator.setMetricName("accuracy").evaluate(predictions)

0.4878233568849834

In [65]:
evaluator.setMetricName("f1").evaluate(predictions)

0.31989231305088367

In [79]:
test = lrModel.transform(assembledTestData)
test.select(["Cover_Type", "prediction", "probability"]).show(truncate=False)

+----------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                                                                        |
+----------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|6.0       |2.0       |[1.911923718971162E-6,0.3642014745902969,0.487820912763859,0.06165824436851518,0.0047233157125177345,0.016397432599075345,0.029913001696554368,0.03528370634546256]|
|3.0       |2.0       |[1.911923718971162E-6,0.3642014745902969,0.487820912763859,0.06165824436851518,0.0047233157125177345,0.016397432599075345,0.029913001696554368,0.03528370634546256]|
|6.0       |2.0       |[1.911923718971162E-6,0.3642014745902

In [80]:
evaluator.setMetricName("accuracy").evaluate(test)

0.48911947212772866

In [81]:
inputCols = trainData.columns[:-1]
assembler = VectorAssembler(inputCols=inputCols, outputCol="featureVector")
# classifier = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, lr])

In [83]:
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy")
multiclassEval.evaluate(predictions)

0.48911947212772866

In [86]:
paramGrid = ParamGridBuilder()\
    .addGrid(lr, ["gini", "entropy"])\
    .addGrid(lr, [1, 20])\
    .addGrid(lr, [40, 300])\
    .addGrid(lr, [0.0, 0.05])\
    .build()

In [87]:
validator = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9)
validatorModel = validator.fit(trainData)

In [88]:
bestModel = validatorModel.bestModel

In [89]:
bestModel.stages[-1].extractParamMap()

{Param(parent='LogisticRegression_4d62b19d0e0e79065969', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LogisticRegression_4d62b19d0e0e79065969', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.8,
 Param(parent='LogisticRegression_4d62b19d0e0e79065969', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial.'): 'auto',
 Param(parent='LogisticRegression_4d62b19d0e0e79065969', name='featuresCol', doc='features column name'): 'featureVector',
 Param(parent='LogisticRegression_4d62b19d0e0e79065969', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='LogisticRegression_4d62b19d0e0e79065969', name='labelCol', doc='label column name'): 'Cover_Type',
 Param(parent='LogisticRegression_4d62b19d0e0e79065

In [90]:
paramsAndMetrics = validatorModel.validationMetrics
paramsAndMetrics

[0.49310767345688467, 0.49310767345688467]

In [91]:
multiclassEval.evaluate(bestModel.transform(testData))

0.48911947212772866